diff --git a/.github/scripts/aiter_prebuild_upload.sh b/.github/scripts/aiter_prebuild_upload.sh
index 473ef1c75..4f039264e 100755
--- a/.github/scripts/aiter_prebuild_upload.sh
+++ b/.github/scripts/aiter_prebuild_upload.sh
@@ -7,8 +7,96 @@ set -euo pipefail
 # Inputs for upload (optional):
 #   NVTE_AITER_PREBUILT_BASE_URL - base URL for prebuilts
 #   NVTE_AITER_PREBUILT_UPLOAD_TOKEN - bearer token for Artifactory
-# Optional flag:
-#   --build : build aiter libs before packaging/uploading; default is package-only.
+# Optional flags:
+#   --preflight --upload
+#       Validate upload path: Artifactory ping, then HEAD on the probe URL with the bearer token.
+#       Use in CI before uploading prebuilts.
+#   --preflight --download
+#       Validate download path: same ping, then HEAD on the probe URL without credentials.
+#       Matches what CMake file(DOWNLOAD) sees when fetching prebuilts (no token).
+#   --build : build AITER libs before packaging/uploading; default is package-only.
+
+# Set the global ping/probe URLs from NVTE_AITER_PREBUILT_BASE_URL; exits 1 if it is unset or empty.
+_aiter_set_artifactory_check_urls() {
+  if [[ -z "${NVTE_AITER_PREBUILT_BASE_URL:-}" ]]; then
+    echo "Missing vars.NVTE_AITER_PREBUILT_BASE_URL" >&2
+    exit 1
+  fi
+  local BASE="${NVTE_AITER_PREBUILT_BASE_URL%/}"
+  local ROOT_PREFIX="${BASE%%/artifactory/*}"
+  _AITER_ARTIFACTORY_SYSTEM_PING_URL="${ROOT_PREFIX}/artifactory/api/system/ping"
+  _AITER_PREBUILT_BASE_ACCESS_PROBE_URL="${BASE}/__aiter_repo_access_probe_not_a_real_artifact"
+}
+
+# GET the system ping URL; curl -f turns an HTTP error status into a non-zero exit (fatal under set -e).
+_aiter_curl_artifactory_system_ping() {
+  echo "[AITER-PREBUILT] Preflight: GET ${_AITER_ARTIFACTORY_SYSTEM_PING_URL} ..."
+  curl -fsS --connect-timeout 25 --max-time 60 "${_AITER_ARTIFACTORY_SYSTEM_PING_URL}" >/dev/null
+}
+
+# Judge the probe HEAD result: HTTP 200 or 404 is success, any other code exits 1.
+#   $1 - mode label used in the log line, $2 - HTTP status code captured from curl
+_aiter_preflight_head_ok() {
+  local mode=$1
+  local code=$2
+  case "${code}" in
+    404|200)
+      echo "[AITER-PREBUILT] Preflight ${mode}: HTTP ${code} (success)"
+      ;;
+    *)
+      echo "[AITER-PREBUILT] Preflight ${mode}: HTTP ${code} (failed)" >&2
+      exit 1
+      ;;
+  esac
+}
+
+# Upload preflight: ping, then authenticated HEAD on the probe URL; exits 1 if the token is missing.
+_aiter_check_artifactory_upload() {
+  _aiter_set_artifactory_check_urls
+  if [[ -z "${NVTE_AITER_PREBUILT_UPLOAD_TOKEN:-}" ]]; then
+    echo "Missing secrets.AITER_ARTIFACTORY_TOKEN" >&2
+    exit 1
+  fi
+  _aiter_curl_artifactory_system_ping
+  echo "[AITER-PREBUILT] Preflight (upload): HEAD ${_AITER_PREBUILT_BASE_ACCESS_PROBE_URL} (authenticated) ..."
+  local code
+  # The Authorization header is read from stdin (-H @-) so the token never appears in curl's argv.
+  # curl's own exit status is ignored (|| true); the captured HTTP code is validated below instead.
+  code="$(curl -sS -o /dev/null -w "%{http_code}" --connect-timeout 25 --max-time 90 \
+    -H @- \
+    -I "${_AITER_PREBUILT_BASE_ACCESS_PROBE_URL}" \
+    <<<"Authorization: Bearer ${NVTE_AITER_PREBUILT_UPLOAD_TOKEN}" || true)"
+  _aiter_preflight_head_ok upload "${code}"
+}
+
+# Download preflight: ping, then HEAD on the probe URL with no credentials.
+_aiter_check_artifactory_download() {
+  _aiter_set_artifactory_check_urls
+  _aiter_curl_artifactory_system_ping
+  echo "[AITER-PREBUILT] Preflight (download): HEAD ${_AITER_PREBUILT_BASE_ACCESS_PROBE_URL} (anonymous) ..."
+  local code
+  code="$(curl -sS -o /dev/null -w "%{http_code}" --connect-timeout 25 --max-time 90 \
+    -I "${_AITER_PREBUILT_BASE_ACCESS_PROBE_URL}" || true)"
+  _aiter_preflight_head_ok download "${code}"
+}
+
+# Preflight mode: run the requested check and exit before the rest of the script.
+if [[ "${1:-}" == "--preflight" ]]; then
+  shift
+  case "${1:-}" in
+    --upload)
+      _aiter_check_artifactory_upload
+      ;;
+    --download)
+      _aiter_check_artifactory_download
+      ;;
+    *)
+      echo "Usage: $(basename "$0") --preflight --upload | --preflight --download" >&2
+      exit 1
+      ;;
+  esac
+  exit 0
+fi
 
 # Derive ROCm version and aiter commit -> cache key
 ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
diff --git a/.github/workflows/aiter-prebuilt-upload.yml b/.github/workflows/aiter-prebuilt-upload.yml
index a45350a79..7ee13a919 100644
--- a/.github/workflows/aiter-prebuilt-upload.yml
+++ b/.github/workflows/aiter-prebuilt-upload.yml
@@ -10,10 +10,19 @@ on:
         description: "Docker image"
         required: false
         default: ""
+  workflow_call:
+    inputs:
+      docker_image:
+        description: "Docker image URI from rocm-ci select_image.outputs.image-tag"
+        required: true
+        type: string
 
 jobs:
   upload:
     runs-on: build-only-te
+    env:
+      NVTE_AITER_PREBUILT_BASE_URL: ${{ vars.NVTE_AITER_PREBUILT_BASE_URL }}
+      NVTE_AITER_PREBUILT_UPLOAD_TOKEN: ${{ secrets.AITER_ARTIFACTORY_TOKEN }}
     steps:
       - name: Checkout source
         uses: actions/checkout@v6
@@ -22,9 +31,34 @@ jobs:
           submodules: recursive
           fetch-depth: 0
 
+      # Verify this runner can reach Artifactory for uploads
+      - name: "Preflight: Artifactory upload reachability"
+        run: |
+          set -euo pipefail
+          if bash .github/scripts/aiter_prebuild_upload.sh --preflight --upload; then
+            echo "::notice::Preflight upload reachability succeeded"
+            exit 0
+          fi
+          echo "::error::Preflight upload reachability failed"
+          exit 1
+
       - name: Resolve docker image
         id: cfg
         run: |
+          set -euo pipefail
+          EVENT="${{ github.event_name }}"
+          # NOTE(review): in a called workflow github.event_name is the caller's event, so this
+          # branch may never run and calls fall through to the generic path below - confirm.
+          if [ "$EVENT" = "workflow_call" ]; then
+            IMAGE="${{ inputs.docker_image }}"
+            if [ -z "$IMAGE" ]; then
+              echo "workflow_call requires non-empty docker_image." >&2
+              exit 1
+            fi
+            echo "Using docker_image from caller."
+            echo "image=${IMAGE}" >> "$GITHUB_OUTPUT"
+            exit 0
+          fi
           IMAGE="${{ inputs.docker_image }}"
           if [ -z "$IMAGE" ]; then
             IMAGE="${{ vars.DEV_DOCKER_IMAGE }}"
@@ -33,7 +67,7 @@ jobs:
             echo "No docker image provided and vars.DEV_DOCKER_IMAGE is empty." >&2
             exit 1
           fi
-          echo "image=${IMAGE}" >> $GITHUB_OUTPUT
+          echo "image=${IMAGE}" >> "$GITHUB_OUTPUT"
 
       - name: Pull docker image
         run: docker pull ${{ steps.cfg.outputs.image }}
@@ -50,19 +84,12 @@ jobs:
             ${{ steps.cfg.outputs.image }}
 
       - name: Build and upload aiter prebuilt
-        env:
-          NVTE_AITER_PREBUILT_BASE_URL: https://compute-artifactory.amd.com:5000/artifactory/rocm-generic-local/te-ci/aiter-prebuilts
-          NVTE_AITER_PREBUILT_UPLOAD_TOKEN: ${{ secrets.AITER_ARTIFACTORY_TOKEN }}
         run: |
           docker exec \
             -e NVTE_AITER_PREBUILT_BASE_URL=${NVTE_AITER_PREBUILT_BASE_URL} \
             -e NVTE_AITER_PREBUILT_UPLOAD_TOKEN=${NVTE_AITER_PREBUILT_UPLOAD_TOKEN} \
             te-aiter-upload bash -c "$(cat <<'EOF'
           set -ex
-          if [ -z "${NVTE_AITER_PREBUILT_UPLOAD_TOKEN}" ]; then
-            echo "Missing secrets.AITER_ARTIFACTORY_TOKEN" >&2
-            exit 1
-          fi
           export HIP_PATH=""
           git config --global --add safe.directory '*'
           bash .github/scripts/aiter_prebuild_upload.sh --build
diff --git a/.github/workflows/rocm-ci-dispatch.yml b/.github/workflows/rocm-ci-dispatch.yml
index e679ece46..7b1718608 100644
--- a/.github/workflows/rocm-ci-dispatch.yml
+++ b/.github/workflows/rocm-ci-dispatch.yml
@@ -10,8 +10,66 @@ on:
 
 permissions:
   contents: read
+  pull-requests: read
 
 jobs:
+  # To determine whether to upload AITER prebuilt to Artifactory
+  aiter_prebuilt_upload_trigger:
+    runs-on: ubuntu-latest
+    outputs:
+      trigger_aiter_upload: ${{ steps.set.outputs.trigger_aiter_upload }}
+    steps:
+      - name: Detect PR changes under 3rdparty/aiter
+        uses: dorny/paths-filter@v4
+        id: paths
+        if: github.event.action == 'synchronize'
+        with:
+          filters: |
+            aiter:
+              - '3rdparty/aiter/**'
+
+      - name: Detect skip_aiter_upload label
+        id: skip_label
+        uses: actions/github-script@v8
+        with:
+          script: |
+            const labels = context.payload.pull_request?.labels || [];
+            const skip = labels.some((l) => l.name === 'skip_aiter_upload');
+            core.info(`skip_aiter_upload label : ${skip}`);
+            core.setOutput('skip', skip ? 'true' : 'false');
+
+      - name: Set trigger_aiter_upload from paths and labels
+        id: set
+        run: |
+          set -euo pipefail
+          ACTION='${{ github.event.action }}'
+          echo "PR action=${ACTION}"
+
+          if [ "$ACTION" != "synchronize" ]; then
+            echo "trigger_aiter_upload=false" >> "$GITHUB_OUTPUT"
+            echo "Not synchronize - trigger_aiter_upload = false"
+            exit 0
+          fi
+
+          SKIP='${{ steps.skip_label.outputs.skip }}'
+          echo "skip_aiter_upload label : ${SKIP}"
+
+          if [ "$SKIP" = 'true' ]; then
+            echo "trigger_aiter_upload=false" >> "$GITHUB_OUTPUT"
+            echo "skip_aiter_upload label set - trigger_aiter_upload = false"
+            exit 0
+          fi
+
+          AITER_PATHS='${{ steps.paths.outputs.aiter }}'
+
+          if [ "$AITER_PATHS" = "true" ]; then
+            echo "trigger_aiter_upload=true" >> "$GITHUB_OUTPUT"
+            echo "3rdparty/aiter changed on PR - trigger_aiter_upload = true"
+          else
+            echo "trigger_aiter_upload=false" >> "$GITHUB_OUTPUT"
+            echo "No 3rdparty/aiter changes on PR - trigger_aiter_upload = false"
+          fi
+
   determine_level:
     runs-on: ubuntu-latest
     outputs:
@@ -53,9 +111,10 @@ jobs:
     # - A commit was pushed with existing ci-level label(s)
     # - The PR was reopened or opened with existing ci-level label(s)
     if: ${{ needs.determine_level.outputs.test_level != '' }}
-    needs: determine_level
+    needs: [determine_level, aiter_prebuilt_upload_trigger]
     name: CI Level ${{ needs.determine_level.outputs.test_level }}
     uses: ./.github/workflows/rocm-ci.yml
     secrets: inherit
     with:
       test_level: ${{ needs.determine_level.outputs.test_level }}
+      trigger_aiter_upload: ${{ needs.aiter_prebuilt_upload_trigger.outputs.trigger_aiter_upload == 'true' }}
diff --git a/.github/workflows/rocm-ci.yml b/.github/workflows/rocm-ci.yml
index 5e0ae242c..51414a461 100644
--- a/.github/workflows/rocm-ci.yml
+++ b/.github/workflows/rocm-ci.yml
@@ -26,6 +26,11 @@ on:
         required: false
         default: false
         type: boolean
+      trigger_aiter_upload:
+        description: 'True when 3rdparty/aiter changed on the PR (set by rocm-ci-dispatch)'
+        required: false
+        default: false
+        type: boolean
   workflow_dispatch:
     inputs:
       test_level:
@@ -99,8 +104,21 @@ jobs:
           echo "Selected image: $IMAGE_TO_USE"
           echo "image-tag=$IMAGE_TO_USE" >> $GITHUB_OUTPUT
 
+  upload_aiter_prebuilt:
+    name: Build and upload AITER prebuilt
+    needs: select_image
+    # trigger_aiter_upload is a boolean input, so test it directly: `== 'true'` is never true for a boolean.
+    # No github.event_name gate: in a called workflow it is the caller's event, never 'workflow_call'.
+    if: ${{ inputs.trigger_aiter_upload }}
+    uses: ./.github/workflows/aiter-prebuilt-upload.yml
+    with:
+      docker_image: ${{ needs.select_image.outputs.image-tag }}
+    secrets: inherit
+
   build:
     # Delegate wheel building to the reusable workflow on dev. It produces a core .whl plus framework .tar.gz sdists under artifact name `te-rocm-wheels`.
+    needs: [select_image, upload_aiter_prebuilt]
+    if: always() && needs.select_image.result == 'success' && (needs.upload_aiter_prebuilt.result == 'skipped' || needs.upload_aiter_prebuilt.result == 'success')
     uses: ./.github/workflows/rocm-wheels-build.yml
     secrets: inherit
 
diff --git a/.github/workflows/rocm-wheels-build.yml b/.github/workflows/rocm-wheels-build.yml
index c1a8ea087..afaf5024f 100644
--- a/.github/workflows/rocm-wheels-build.yml
+++ b/.github/workflows/rocm-wheels-build.yml
@@ -76,6 +76,8 @@ jobs:
   build-rocm-wheels:
     name: Build ROCm Docker image and TransformerEngine wheels
     runs-on: build-only-te
+    env:
+      NVTE_AITER_PREBUILT_BASE_URL: ${{ vars.NVTE_AITER_PREBUILT_BASE_URL }}
 
     steps:
       - name: Checkout repository
@@ -89,6 +91,19 @@ jobs:
             3rdparty/QoLA \
             3rdparty/hipify_torch
 
+      # Verify this runner can reach Artifactory for anonymous prebuilt downloads
+      - name: "Preflight: Artifactory download reachability"
+        if: ${{ inputs.use_prebuilt_aiter }}
+        continue-on-error: true
+        run: |
+          set -euo pipefail
+          if bash .github/scripts/aiter_prebuild_upload.sh --preflight --download; then
+            echo "::notice::Preflight download reachability succeeded"
+            exit 0
+          fi
+          echo "::warning::Preflight download reachability failed"
+          exit 1
+
       - name: Derive Docker image tag
         id: set-tag
         run: |
@@ -187,7 +202,7 @@ jobs:
       # The container writes all wheels and logs under /wheelhouse.
       - name: Build TransformerEngine wheels
         run: |
-          NVTE_AITER_PREBUILT_BASE_URL="https://compute-artifactory.amd.com:5000/artifactory/rocm-generic-local/te-ci/aiter-prebuilts"
+          set -euo pipefail
           docker run --rm \
             --env LOCAL_TREE_BUILD=1 \
             --env NVTE_SKIP_SUBMODULE_CHECKS_DURING_BUILD=1 \