From 39536331671cf3aa9303dcdfef80f13dbc57dc9e Mon Sep 17 00:00:00 2001 From: Li Jiajia Date: Wed, 4 Mar 2026 18:33:42 +0800 Subject: [PATCH 1/3] Fix SigV4 auth to use base64-encoded content SHA256 and custom canonical request --- pyiceberg/catalog/rest/__init__.py | 32 ++++++++++--- tests/catalog/test_rest.py | 72 ++++++++++++++++++++++++++++-- 2 files changed, 94 insertions(+), 10 deletions(-) diff --git a/pyiceberg/catalog/rest/__init__.py b/pyiceberg/catalog/rest/__init__.py index 38373ff809..fcd41f029b 100644 --- a/pyiceberg/catalog/rest/__init__.py +++ b/pyiceberg/catalog/rest/__init__.py @@ -678,6 +678,8 @@ def _split_identifier_for_json(self, identifier: str | Identifier) -> dict[str, return {"namespace": identifier_tuple[:-1], "name": identifier_tuple[-1]} def _init_sigv4(self, session: Session) -> None: + import base64 + import hashlib from urllib import parse import boto3 @@ -686,6 +688,12 @@ def _init_sigv4(self, session: Session) -> None: from requests import PreparedRequest from requests.adapters import HTTPAdapter + class _IcebergSigV4Auth(SigV4Auth): + def canonical_request(self, request: Any) -> str: + cr = super().canonical_request(request) + # Replace the last line (body_checksum) with hex-encoded payload hash. 
+ return cr.rsplit("\n", 1)[0] + "\n" + self.payload(request) + class SigV4Adapter(HTTPAdapter): def __init__(self, **properties: str): super().__init__() @@ -710,17 +718,27 @@ def add_headers(self, request: PreparedRequest, **kwargs: Any) -> None: # pylin # remove the connection header as it will be updated after signing if "connection" in request.headers: del request.headers["connection"] - # For empty bodies, explicitly set the content hash header to the SHA256 of an empty string - if not request.body: - request.headers["x-amz-content-sha256"] = EMPTY_BODY_SHA256 + + # Compute the x-amz-content-sha256 header to match Iceberg Java SDK: + # - empty body → hex (EMPTY_BODY_SHA256) + # - non-empty body → base64 + if request.body: + body_bytes = request.body.encode("utf-8") if isinstance(request.body, str) else request.body + content_sha256_header = base64.b64encode(hashlib.sha256(body_bytes).digest()).decode() + else: + content_sha256_header = EMPTY_BODY_SHA256 + + signing_headers = dict(request.headers) + signing_headers["x-amz-content-sha256"] = content_sha256_header aws_request = AWSRequest( - method=request.method, url=url, params=params, data=request.body, headers=dict(request.headers) + method=request.method, url=url, params=params, data=request.body, headers=signing_headers ) - SigV4Auth(credentials, service, region).add_auth(aws_request) - original_header = request.headers - signed_headers = aws_request.headers + _IcebergSigV4Auth(credentials, service, region).add_auth(aws_request) + + original_header = dict(request.headers) + signed_headers = dict(aws_request.headers) relocated_headers = {} # relocate headers if there is a conflict with signed headers diff --git a/tests/catalog/test_rest.py b/tests/catalog/test_rest.py index c45d899388..fdc9156040 100644 --- a/tests/catalog/test_rest.py +++ b/tests/catalog/test_rest.py @@ -493,9 +493,10 @@ def test_sigv4_sign_request_without_body(rest_mock: Mocker) -> None: assert isinstance(adapter, HTTPAdapter) 
adapter.add_headers(prepared) - assert prepared.headers["Authorization"].startswith("AWS4-HMAC-SHA256") + assert prepared.headers["Authorization"].startswith("AWS4-HMAC-SHA256 Credential=") assert prepared.headers["Original-Authorization"] == f"Bearer {existing_token}" assert prepared.headers["x-amz-content-sha256"] == EMPTY_BODY_SHA256 + assert "SignedHeaders=" in prepared.headers["Authorization"] def test_sigv4_sign_request_with_body(rest_mock: Mocker) -> None: @@ -524,9 +525,74 @@ def test_sigv4_sign_request_with_body(rest_mock: Mocker) -> None: assert isinstance(adapter, HTTPAdapter) adapter.add_headers(prepared) - assert prepared.headers["Authorization"].startswith("AWS4-HMAC-SHA256") + assert prepared.headers["Authorization"].startswith("AWS4-HMAC-SHA256 Credential=") + assert "SignedHeaders=" in prepared.headers["Authorization"] + # Conflicting Authorization header is relocated assert prepared.headers["Original-Authorization"] == f"Bearer {existing_token}" - assert prepared.headers.get("x-amz-content-sha256") != EMPTY_BODY_SHA256 + assert prepared.headers["x-amz-content-sha256"] == "nhKdVGKGU3IMGjYlod9xKUVc7/H5K6zTWj60yJOM80k=" + + +def test_sigv4_content_sha256_with_bytes_body(rest_mock: Mocker) -> None: + existing_token = "existing_token" + + catalog = RestCatalog( + "rest", + **{ + "uri": TEST_URI, + "token": existing_token, + "rest.sigv4-enabled": "true", + "rest.signing-region": "us-west-2", + "client.access-key-id": "id", + "client.secret-access-key": "secret", + }, + ) + + body_content = b'{"namespace": "test_namespace"}' + prepared = catalog._session.prepare_request( + Request( + "POST", + f"{TEST_URI}v1/namespaces", + data=body_content, + ) + ) + adapter = catalog._session.adapters[catalog.uri] + assert isinstance(adapter, HTTPAdapter) + adapter.add_headers(prepared) + + assert prepared.headers["Authorization"].startswith("AWS4-HMAC-SHA256 Credential=") + assert "SignedHeaders=" in prepared.headers["Authorization"] + assert 
prepared.headers["x-amz-content-sha256"] == "sD20bEQP+WnwKPT7jxn7PIACGciAeWjQPlzFCK5Fifo=" + + +def test_sigv4_conflicting_sigv4_headers(rest_mock: Mocker) -> None: + catalog = RestCatalog( + "rest", + **{ + "uri": TEST_URI, + "rest.sigv4-enabled": "true", + "rest.signing-region": "us-west-2", + "client.access-key-id": "id", + "client.secret-access-key": "secret", + }, + ) + + prepared = catalog._session.prepare_request(Request("GET", f"{TEST_URI}v1/config")) + adapter = catalog._session.adapters[catalog.uri] + assert isinstance(adapter, HTTPAdapter) + + # Inject conflicting SigV4 headers before signing + prepared.headers["x-amz-content-sha256"] = "fake" + prepared.headers["X-Amz-Date"] = "fake" + + adapter.add_headers(prepared) + + # Matching Java SDK: conflicting headers are relocated with "Original-" prefix + assert prepared.headers.get("Original-x-amz-content-sha256") == "fake" + assert prepared.headers.get("Original-X-Amz-Date") == "fake" + # SigV4 headers are set correctly after signing + assert prepared.headers["Authorization"].startswith("AWS4-HMAC-SHA256 Credential=") + assert prepared.headers["x-amz-content-sha256"] == EMPTY_BODY_SHA256 + assert "X-Amz-Date" in prepared.headers def test_list_tables_404(rest_mock: Mocker) -> None: From b043a0120066acc5c53f77e2d315c4eed898dc3d Mon Sep 17 00:00:00 2001 From: Li Jiajia Date: Sat, 21 Mar 2026 16:43:35 +0800 Subject: [PATCH 2/3] Refactor _IcebergSigV4Auth to reuse canonical_request logic instead of rsplit --- pyiceberg/catalog/rest/__init__.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/pyiceberg/catalog/rest/__init__.py b/pyiceberg/catalog/rest/__init__.py index fcd41f029b..8fd02024d8 100644 --- a/pyiceberg/catalog/rest/__init__.py +++ b/pyiceberg/catalog/rest/__init__.py @@ -690,9 +690,20 @@ def _init_sigv4(self, session: Session) -> None: class _IcebergSigV4Auth(SigV4Auth): def canonical_request(self, request: Any) -> str: - cr = super().canonical_request(request) - # 
Replace the last line (body_checksum) with hex-encoded payload hash. - return cr.rsplit("\n", 1)[0] + "\n" + self.payload(request) + # Reuses the logic from botocore's SigV4Auth.canonical_request + # (https://github.com/boto/botocore/blob/develop/botocore/auth.py) + # but always uses self.payload(request) for the body checksum. + cr = [request.method.upper()] + path = self._normalize_url_path(parse.urlsplit(request.url).path) + cr.append(path) + cr.append(self.canonical_query_string(request)) + headers_to_sign = self.headers_to_sign(request) + cr.append(self.canonical_headers(headers_to_sign) + "\n") + cr.append(self.signed_headers(headers_to_sign)) + # Always use hex-encoded payload hash per SigV4 spec, + # regardless of the x-amz-content-sha256 header value (which may be base64). + cr.append(self.payload(request)) + return "\n".join(cr) class SigV4Adapter(HTTPAdapter): def __init__(self, **properties: str): From c95670af04e44921f3f0ab5579b02a6d832e0abc Mon Sep 17 00:00:00 2001 From: Li Jiajia Date: Sat, 21 Mar 2026 17:19:09 +0800 Subject: [PATCH 3/3] Strengthen SigV4 test assertions for signed headers and base64 content SHA256 --- tests/catalog/test_rest.py | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/tests/catalog/test_rest.py b/tests/catalog/test_rest.py index fdc9156040..b908ece7bf 100644 --- a/tests/catalog/test_rest.py +++ b/tests/catalog/test_rest.py @@ -16,6 +16,7 @@ # under the License. 
# pylint: disable=redefined-outer-name,unused-argument import base64 +import hashlib import os from collections.abc import Callable from typing import Any, cast @@ -493,10 +494,16 @@ def test_sigv4_sign_request_without_body(rest_mock: Mocker) -> None: assert isinstance(adapter, HTTPAdapter) adapter.add_headers(prepared) - assert prepared.headers["Authorization"].startswith("AWS4-HMAC-SHA256 Credential=") + auth_header = prepared.headers["Authorization"] + assert auth_header.startswith("AWS4-HMAC-SHA256 Credential=") assert prepared.headers["Original-Authorization"] == f"Bearer {existing_token}" assert prepared.headers["x-amz-content-sha256"] == EMPTY_BODY_SHA256 - assert "SignedHeaders=" in prepared.headers["Authorization"] + # Verify the signature format: Credential, SignedHeaders, Signature + assert "Credential=" in auth_header + assert "SignedHeaders=" in auth_header + assert "Signature=" in auth_header + # x-amz-content-sha256 should be in signed headers + assert "x-amz-content-sha256" in auth_header def test_sigv4_sign_request_with_body(rest_mock: Mocker) -> None: @@ -525,11 +532,19 @@ def test_sigv4_sign_request_with_body(rest_mock: Mocker) -> None: assert isinstance(adapter, HTTPAdapter) adapter.add_headers(prepared) - assert prepared.headers["Authorization"].startswith("AWS4-HMAC-SHA256 Credential=") - assert "SignedHeaders=" in prepared.headers["Authorization"] + auth_header = prepared.headers["Authorization"] + assert auth_header.startswith("AWS4-HMAC-SHA256 Credential=") + assert "SignedHeaders=" in auth_header # Conflicting Authorization header is relocated assert prepared.headers["Original-Authorization"] == f"Bearer {existing_token}" - assert prepared.headers["x-amz-content-sha256"] == "nhKdVGKGU3IMGjYlod9xKUVc7/H5K6zTWj60yJOM80k=" + # Non-empty body should have base64-encoded SHA256 + content_sha256 = prepared.headers["x-amz-content-sha256"] + assert content_sha256 == "nhKdVGKGU3IMGjYlod9xKUVc7/H5K6zTWj60yJOM80k=" + # Verify it's valid base64 and 
matches the body + decoded = base64.b64decode(content_sha256) + assert len(decoded) == 32 # SHA256 produces 32 bytes + # x-amz-content-sha256 should be in signed headers + assert "x-amz-content-sha256" in auth_header def test_sigv4_content_sha256_with_bytes_body(rest_mock: Mocker) -> None: @@ -561,7 +576,12 @@ def test_sigv4_content_sha256_with_bytes_body(rest_mock: Mocker) -> None: assert prepared.headers["Authorization"].startswith("AWS4-HMAC-SHA256 Credential=") assert "SignedHeaders=" in prepared.headers["Authorization"] - assert prepared.headers["x-amz-content-sha256"] == "sD20bEQP+WnwKPT7jxn7PIACGciAeWjQPlzFCK5Fifo=" + content_sha256 = prepared.headers["x-amz-content-sha256"] + assert content_sha256 == "sD20bEQP+WnwKPT7jxn7PIACGciAeWjQPlzFCK5Fifo=" + # Verify it's valid base64 and matches the body + decoded = base64.b64decode(content_sha256) + assert len(decoded) == 32 # SHA256 produces 32 bytes + assert decoded == hashlib.sha256(body_content).digest() def test_sigv4_conflicting_sigv4_headers(rest_mock: Mocker) -> None: