From d96a2f03e27659fd8cf812f941865a1cd65bc31d Mon Sep 17 00:00:00 2001 From: leo-automation Date: Wed, 22 Apr 2026 18:23:01 +0200 Subject: [PATCH 1/9] CI: Refactor ROCm CI to use GPU-sized runners and build-only jobs --- .github/workflows/aiter-prebuilt-upload.yml | 6 +- .github/workflows/rocm-ci.yml | 660 ++++++++++++-------- ci/_utils.sh | 3 +- 3 files changed, 392 insertions(+), 277 deletions(-) diff --git a/.github/workflows/aiter-prebuilt-upload.yml b/.github/workflows/aiter-prebuilt-upload.yml index f9d2a91d7..a45350a79 100644 --- a/.github/workflows/aiter-prebuilt-upload.yml +++ b/.github/workflows/aiter-prebuilt-upload.yml @@ -13,7 +13,7 @@ on: jobs: upload: - runs-on: linux-te-mi325-8 + runs-on: build-only-te steps: - name: Checkout source uses: actions/checkout@v6 @@ -44,11 +44,7 @@ jobs: --rm \ --name te-aiter-upload \ --network=host \ - --device=/dev/dri --device=/dev/kfd \ - --shm-size=16G \ --pid=host \ - --group-add $(getent group render | cut -d: -f3) \ - --group-add $(getent group video | cut -d: -f3) \ -v "${{ github.workspace }}:/workspace" \ -w /workspace \ ${{ steps.cfg.outputs.image }} diff --git a/.github/workflows/rocm-ci.yml b/.github/workflows/rocm-ci.yml index c85f1bed2..c377bad37 100644 --- a/.github/workflows/rocm-ci.yml +++ b/.github/workflows/rocm-ci.yml @@ -45,132 +45,157 @@ concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true +env: + TEST_LEVEL: ${{ (github.event_name == 'push' && '3') || inputs.test_level || '1' }} + jobs: - build_and_test: - name: Build and Test on GPU (${{ matrix.runner }}) - Level ${{ (github.event_name == 'push' && '3') || inputs.test_level || '1' }} - timeout-minutes: 720 - runs-on: ${{ matrix.runner }} - strategy: - fail-fast: false - matrix: - runner: [linux-te-mi325-8, linux-te-mi35x-8] + select_image: + name: Select Docker Image + runs-on: ubuntu-latest + timeout-minutes: 10 + outputs: + image-tag: ${{ steps.select-image.outputs.image-tag }} steps: - name: Checkout repository uses: actions/checkout@v6 with: - submodules: 'recursive' - - - name: Host Diagnostics & Environment Setup - id: host-setup - run: | - # Host Activity Checks - echo "::group::Host Diagnostics" - - echo ">>> Active Containers:" - docker ps -a - - echo ">>> ROCm Installation:" - (ls -d /opt/rocm/core-* || ls -d /opt/rocm-* || echo "No default ROCm path found") 2>/dev/null || true - echo ">>> GPU info:" - ls -l /dev/dri - ls -l /dev/kfd - rocm-smi - - echo ">>> Kernel Command Line:" - cat /proc/cmdline - echo "::endgroup::" - - # Calculate Test Level - # Default to input (or '1' if input is missing/null) - CALC_LEVEL="${{ inputs.test_level || '1' }}" - - # Only force Level 3 if this is a direct PUSH to dev or a release branch - if [[ "${{ github.event_name }}" == "push" ]]; then - echo "::notice::Push to monitored branch (${{ github.ref_name }}) detected. Forcing Level 3." - CALC_LEVEL="3" - fi - - echo "TEST_LEVEL=$CALC_LEVEL" >> $GITHUB_ENV - - # Print Final Environment - echo "::group::Environment & Parameters" - echo "Final Test Level: $CALC_LEVEL" - echo "Event Name: ${{ github.event_name }}" - echo "Ref Name: ${{ github.ref_name }}" - echo "Base Ref: ${{ github.base_ref }}" - env | sort - echo "::endgroup::" + ref: ${{ inputs.test_config_from_source && github.ref_name || github.event.repository.default_branch || 'dev' }} + sparse-checkout: ci/ci_config.json + sparse-checkout-cone-mode: false - name: Select Docker Image Tag id: select-image run: | - # Determine config source - # Default we are fetching from 'dev' branch - CONFIG_BRANCH="dev" - - # If manual run requesting source config, switch branch if [[ "${{ inputs.test_config_from_source }}" == "true" ]]; then - CONFIG_BRANCH="${{ github.ref_name }}" - echo "::notice::Debugging mode: Fetching config from current branch ($CONFIG_BRANCH)" + echo "::notice::Debugging mode: Using ci/ci_config.json from ${{ github.ref_name }}" + else + echo "::notice::Using ci/ci_config.json from ${{ github.event.repository.default_branch || 'dev' }}" fi - # Download config - CONFIG_URL="https://raw.githubusercontent.com/ROCm/TransformerEngine/${CONFIG_BRANCH}/ci/ci_config.json" - echo "Attempting to fetch image config from: $CONFIG_URL" - - if curl -s -f -o docker_config.json "$CONFIG_URL"; then - echo "Successfully downloaded config from $CONFIG_BRANCH." - else - echo "::warning::Failed to fetch config from $CONFIG_BRANCH (File might not exist yet)." - - # Fallback: Check source branch file - if [[ -f "ci/ci_config.json" ]]; then - echo "::notice::Falling back to local 'ci/ci_config.json' from checkout." - cp ci/ci_config.json docker_config.json - else - echo "::error::Config file not found in $CONFIG_BRANCH OR locally." - exit 1 - fi + if [[ ! -f "ci/ci_config.json" ]]; then + echo "::error::Config file not found in checkout." + exit 1 fi - # Determine image key BRANCH_NAME="${{ github.base_ref || github.ref_name }}" echo "Determining image for branch: $BRANCH_NAME" - - # Logic: Check if branch matches "release_vX.X". - # If so, look for that key in JSON. Otherwise default. - JSON_KEY="default" - - if [[ $BRANCH_NAME =~ ^release_v([0-9]+\.[0-9]+)_rocm$ ]]; then - VERSION_KEY="release_v${BASH_REMATCH[1]}" - # Check if this specific version key exists in the JSON - if [[ $(jq "(.docker_images | has(\"$VERSION_KEY\"))" docker_config.json) == "true" ]]; then - JSON_KEY="$VERSION_KEY" - fi + VERSION_KEY="$BRANCH_NAME" + + if jq -e --arg key "$VERSION_KEY" '.docker_images[$key]' ci/ci_config.json > /dev/null; then + JSON_KEY="$VERSION_KEY" + else + JSON_KEY="default" fi - - echo "Selected config key: $JSON_KEY" - # Extract image name from json - IMAGE_TO_USE=$(jq -r ".docker_images.\"$JSON_KEY\"" docker_config.json) + echo "Selected config key: $JSON_KEY" + IMAGE_TO_USE=$(jq -r --arg key "$JSON_KEY" '.docker_images[$key]' ci/ci_config.json) - # Check input from workflow_dispatch overriding the image MANUAL_OVERRIDE="${{ inputs.docker_image_override }}" if [[ -n "$MANUAL_OVERRIDE" ]]; then echo "::notice::Manual override detected: $MANUAL_OVERRIDE" IMAGE_TO_USE="$MANUAL_OVERRIDE" fi - + echo "Selected image: $IMAGE_TO_USE" echo "image-tag=$IMAGE_TO_USE" >> $GITHUB_OUTPUT + build: + name: Build Wheel + needs: select_image + timeout-minutes: 120 + runs-on: build-only-te + steps: + - name: Checkout repository + uses: actions/checkout@v6 + + - name: Initialize required submodules + run: | + git submodule update --init --recursive \ + 3rdparty/aotriton \ + 3rdparty/aiter \ + 3rdparty/hipify_torch + + - name: Pull Docker Image + run: | + docker pull ${{ needs.select_image.outputs.image-tag }} + + - name: Build Wheel + run: | + docker run --rm \ + --network=host \ + -v "${{ github.workspace }}:/workspace" \ + -w /workspace \ + ${{ needs.select_image.outputs.image-tag }} \ + bash -c "$(cat <<'EOF' + set -ex + + export HIP_PATH="" + export PYTORCH_ROCM_ARCH="gfx942;gfx950" + export NVTE_ROCM_ARCH="gfx942;gfx950" + export NVTE_SKIP_SUBMODULE_CHECKS_DURING_BUILD=1 + export NVTE_AITER_PREBUILT_BASE_URL=https://compute-artifactory.amd.com:5000/artifactory/rocm-generic-local/te-ci/aiter-prebuilts + pip install ninja + git config --global --add safe.directory /workspace + git submodule foreach --recursive 'git config --global --add safe.directory /workspace/$sm_path' + NVTE_RELEASE_BUILD=1 pip wheel --no-build-isolation --no-deps -v . -w /workspace/dist/ 2>&1 + + mkdir -p /tmp/sdist + (cd transformer_engine/pytorch && NVTE_RELEASE_BUILD=1 python setup.py sdist -d /tmp/sdist/) + (cd transformer_engine/jax && NVTE_RELEASE_BUILD=1 python setup.py sdist -d /tmp/sdist/) + + pip wheel --no-build-isolation --no-deps -v /tmp/sdist/transformer_engine_rocm_torch-*.tar.gz -w /workspace/dist/ 2>&1 + pip wheel --no-build-isolation --no-deps -v /tmp/sdist/transformer_engine_rocm_jax-*.tar.gz -w /workspace/dist/ 2>&1 + EOF + )" + + - name: Upload build artifacts + uses: actions/upload-artifact@v4 + with: + name: te-wheel + path: dist/* + retention-days: 1 + + sgpu_tests: + name: sGPU Tests (${{ matrix.arch_label }}) + needs: [select_image, build] + timeout-minutes: 360 + runs-on: ${{ matrix.arch_label == 'mi325' && 'linux-te-mi325-4' || 'linux-te-mi35x-4' }} + strategy: + fail-fast: false + matrix: + arch_label: [mi325, mi35x] + steps: + - name: Checkout repository + uses: actions/checkout@v6 + + - name: Initialize required submodules + run: | + git submodule update --init --recursive \ + 3rdparty/googletest \ + 3rdparty/hipify_torch + + - name: Download build artifacts + uses: actions/download-artifact@v4 + with: + path: dist/ + + - name: Host Diagnostics + run: | + echo "::group::Host Diagnostics" + echo ">>> GPU info:" + ls -l /dev/dri + ls -l /dev/kfd + rocm-smi + echo "::endgroup::" + - name: Pull Docker Image run: | - docker pull ${{ steps.select-image.outputs.image-tag }} + docker pull ${{ needs.select_image.outputs.image-tag }} - name: Run Container run: | docker run -dt \ + --rm \ --name te-runner \ --network=host \ --device=/dev/dri --device=/dev/kfd \ @@ -180,257 +205,330 @@ jobs: --group-add $(getent group video | cut -d: -f3) \ -v "${{ github.workspace }}:/workspace" \ -w /workspace \ - ${{ steps.select-image.outputs.image-tag}} + ${{ needs.select_image.outputs.image-tag }} - - name: Container Diagnostics & GPU Setup - id: container-diag + - name: Install packages run: | - echo "::group::Container Configuration" - # Check Shared Memory Size inside container - echo ">>> /dev/shm size:" - docker exec te-runner df -h /dev/shm - - # Check OS/Kernel inside container - echo ">>> Container OS:" - docker exec te-runner cat /etc/os-release | grep PRETTY_NAME - echo "::endgroup::" + docker exec te-runner bash -c "$(cat <<'EOF' + set -ex + # core (cpp) tests build via cmake inside the repo; allow git ops in-tree. + git config --global --add safe.directory '*' - echo "::group::ROCm Diagnostics (Host vs Container)" - echo ">>> CONTAINER rocm-smi:" - docker exec te-runner rocm-smi || true - echo "::endgroup::" + install_if_present() { + pkg="$1" + label="$2" + if [ -n "$pkg" ]; then + echo "Installing $label package: $pkg" + pip install --no-build-isolation --no-deps "$pkg" 2>&1 + else + echo "No $label package found; using the monolithic wheel layout." + fi + } - # Determine Architecture - # Run rocminfo inside the container and capture the output - ARCH=$(docker exec te-runner bash -c "rocminfo | grep -m 1 -oP 'gfx[0-9a-fA-F]+'") - - if [ -z "$ARCH" ]; then - echo "::error::Could not determine GPU architecture using rocminfo inside the container." - docker exec te-runner rocminfo - exit 1 + TE_CORE_PKG=$(find /workspace/dist -type f -name 'transformer_engine_rocm[0-9]*.whl' | sort | head -n 1) + TE_SPLIT_LAYOUT=1 + if [ -z "$TE_CORE_PKG" ]; then + TE_CORE_PKG=$(find /workspace/dist -type f -name 'transformer_engine-[0-9]*.whl' | sort | head -n 1) + TE_SPLIT_LAYOUT=0 fi - - echo "Detected GPU Arch: $ARCH" - echo "arch=$ARCH" >> $GITHUB_OUTPUT + test -n "$TE_CORE_PKG" + pip install --no-deps "$TE_CORE_PKG" 2>&1 - - name: Build Project - run: | - docker exec \ - -e GPU_ARCH=${{ steps.container-diag.outputs.arch }} \ - te-runner bash -c "$(cat <<'EOF' - set -ex - - export HIP_PATH="" - export PYTORCH_ROCM_ARCH=$GPU_ARCH - export NVTE_ROCM_ARCH=$GPU_ARCH - export NVTE_AITER_PREBUILT_BASE_URL=https://compute-artifactory.amd.com:5000/artifactory/rocm-generic-local/te-ci/aiter-prebuilts - pip install ninja - git config --global --add safe.directory '*' - pip install --no-build-isolation -v . 2>&1 + if [ "$TE_SPLIT_LAYOUT" = "1" ]; then + TE_TORCH_PKG=$(find /workspace/dist -type f -name 'transformer_engine_rocm_torch*.whl' | sort | head -n 1) + TE_JAX_PKG=$(find /workspace/dist -type f -name 'transformer_engine_rocm_jax*.whl' | sort | head -n 1) + else + TE_TORCH_PKG="" + TE_JAX_PKG="" + fi + + if [ -n "$TE_TORCH_PKG$TE_JAX_PKG" ]; then + pip install ninja pybind11[global] 2>&1 + fi + + install_if_present "$TE_TORCH_PKG" "PyTorch" + install_if_present "$TE_JAX_PKG" "JAX" EOF )" - - name: Run sGPU tests - id: sgpu-tests - continue-on-error: true + - name: Run sGPU tests in parallel (pytorch, jax, examples, core) + id: run-tests + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} run: | - # Cleanup previous failure markers if any. Don't actually do anything on k8s pods rm -f FAIL_* docker exec \ -e TEST_SGPU=1 \ -e TEST_LEVEL=${{ env.TEST_LEVEL }} \ + -e HF_TOKEN="$HF_TOKEN" \ te-runner bash -c "$(cat <<'EOF' #!/usr/bin/bash set -x -o pipefail ulimit -c 0 # Disable core dumps - HIP_VISIBLE_DEVICES=1 ci/pytorch.sh > /workspace/torch_sgpu.log 2>&1 & - torch_pid=$!; echo Pytorch test pid $! - - HIP_VISIBLE_DEVICES=2 ci/jax.sh > /workspace/jax_sgpu.log 2>&1 & - jax_pid=$!; echo JAX test pid $! - - HIP_VISIBLE_DEVICES=3 ci/core.sh > /workspace/core_sgpu.log 2>&1 & - core_pid=$!; echo Core test pid $! - - wait $core_pid; core_rc=$? - wait $jax_pid; jax_rc=$? - wait $torch_pid; torch_rc=$? - - # /workspace/FAIL_* files are for failure markers we can extract to the host runner and process later - # Check PyTorch - if [ $torch_rc -ne 0 ]; then - echo "::group::[FAILED] PyTorch sGPU Log" - cat /workspace/torch_sgpu.log + HIP_VISIBLE_DEVICES=0 ci/pytorch.sh > /workspace/torch.log 2>&1 & + TORCH_PID=$! + + HIP_VISIBLE_DEVICES=1 ci/jax.sh > /workspace/jax.log 2>&1 & + JAX_PID=$! + + ( + set -e + python -c "import os; print('HF_TOKEN set:', bool(os.environ.get('HF_TOKEN')))" + + JAX_CONSTRAINTS=/tmp/jax-constraints.txt + pip freeze | grep -iE '^(jax|jaxlib|jax[_-]rocm|jax[_-]plugins)[=@]' > "$JAX_CONSTRAINTS" || true + + export HIP_VISIBLE_DEVICES=2 + + cd /workspace/examples/pytorch/mnist + python main.py + python main.py --use-te + python main.py --use-fp8 + + cd /workspace/examples/jax/mnist + pip3 install -c "$JAX_CONSTRAINTS" -r requirements.txt + python test_single_gpu_mnist.py + python test_single_gpu_mnist.py --use-te + python test_single_gpu_mnist.py --use-fp8 + + cd /workspace/examples/jax/encoder + pip3 install -c "$JAX_CONSTRAINTS" -r requirements.txt + python test_single_gpu_encoder.py + python test_single_gpu_encoder.py --use-fp8 + ) > /workspace/examples.log 2>&1 & + EXAMPLES_PID=$! + + HIP_VISIBLE_DEVICES=3 ci/core.sh > /workspace/core.log 2>&1 & + CORE_PID=$! + + wait $TORCH_PID; torch_rc=$? + wait $JAX_PID; jax_rc=$? + wait $EXAMPLES_PID; examples_rc=$? + wait $CORE_PID; core_rc=$? + + if [ $torch_rc -ne 0 ]; then + echo "::group::[FAILED] PyTorch Log" + cat /workspace/torch.log echo "::endgroup::" - echo "::error::Pytorch sGPU test FAILED." - touch /workspace/FAIL_TORCH_SGPU + echo "::error::PyTorch tests FAILED." + touch /workspace/FAIL_TORCH fi - # Check JAX - if [ $jax_rc -ne 0 ]; then - echo "::group::[FAILED] JAX sGPU Log" - cat /workspace/jax_sgpu.log + if [ $jax_rc -ne 0 ]; then + echo "::group::[FAILED] JAX Log" + cat /workspace/jax.log echo "::endgroup::" - echo "::error::JAX sGPU test FAILED." - touch /workspace/FAIL_JAX_SGPU + echo "::error::JAX tests FAILED." + touch /workspace/FAIL_JAX fi - # Check Core - if [ $core_rc -ne 0 ]; then - echo "::group::[FAILED] Core sGPU Log" - cat /workspace/core_sgpu.log + if [ $examples_rc -ne 0 ]; then + echo "::group::[FAILED] Examples Log" + cat /workspace/examples.log echo "::endgroup::" - echo "::error::Core sGPU test FAILED." - touch /workspace/FAIL_CORE_SGPU + echo "::error::Examples FAILED." + touch /workspace/FAIL_EXAMPLES fi - - test $torch_rc -eq 0 -a $jax_rc -eq 0 -a $core_rc -eq 0 + + if [ $core_rc -ne 0 ]; then + echo "::group::[FAILED] Core Log" + cat /workspace/core.log + echo "::endgroup::" + echo "::error::Core tests FAILED." + touch /workspace/FAIL_CORE + fi + + test $torch_rc -eq 0 -a $jax_rc -eq 0 -a $examples_rc -eq 0 -a $core_rc -eq 0 + EOF + )" + + - name: Check suite failure status + if: always() + run: | + EXIT_STATUS=0 + if [[ -f FAIL_TORCH ]]; then + echo "::error::PyTorch tests failed." + EXIT_STATUS=1 + fi + if [[ -f FAIL_JAX ]]; then + echo "::error::JAX tests failed." + EXIT_STATUS=1 + fi + if [[ -f FAIL_EXAMPLES ]]; then + echo "::error::Examples failed." + EXIT_STATUS=1 + fi + if [[ -f FAIL_CORE ]]; then + echo "::error::Core tests failed." + EXIT_STATUS=1 + fi + exit $EXIT_STATUS + + - name: Upload logs + if: always() + uses: actions/upload-artifact@v4 + with: + name: logs-sgpu-${{ matrix.arch_label }} + path: | + *.log + if-no-files-found: ignore + retention-days: 5 + + - name: Cleanup container + if: always() + run: docker rm -f te-runner || true + + mgpu_tests: + name: mGPU Tests (${{ matrix.arch_label }}) + needs: [select_image, build] + timeout-minutes: 360 + runs-on: ${{ matrix.arch_label == 'mi300' && 'linux-te-mi300-8' || 'linux-te-mi35x-8' }} + strategy: + fail-fast: false + matrix: + arch_label: [mi300, mi35x] + steps: + - name: Checkout repository + uses: actions/checkout@v6 + + - name: Download build artifacts + uses: actions/download-artifact@v4 + with: + path: dist/ + + - name: Host Diagnostics + run: | + echo "::group::Host Diagnostics" + echo ">>> GPU info:" + ls -l /dev/dri + ls -l /dev/kfd + rocm-smi + echo "::endgroup::" + + - name: Pull Docker Image + run: | + docker pull ${{ needs.select_image.outputs.image-tag }} + + - name: Run Container + run: | + docker run -dt \ + --rm \ + --name te-runner \ + --network=host \ + --device=/dev/dri --device=/dev/kfd \ + --shm-size=16G \ + --pid=host \ + --group-add $(getent group render | cut -d: -f3) \ + --group-add $(getent group video | cut -d: -f3) \ + -v "${{ github.workspace }}:/workspace" \ + -w /workspace \ + ${{ needs.select_image.outputs.image-tag }} + + - name: Install packages + run: | + docker exec te-runner bash -c "$(cat <<'EOF' + set -ex + + install_if_present() { + pkg="$1" + label="$2" + if [ -n "$pkg" ]; then + echo "Installing $label package: $pkg" + pip install --no-build-isolation --no-deps "$pkg" 2>&1 + else + echo "No $label package found; using the monolithic wheel layout." + fi + } + + TE_CORE_PKG=$(find /workspace/dist -type f -name 'transformer_engine_rocm[0-9]*.whl' | sort | head -n 1) + TE_SPLIT_LAYOUT=1 + if [ -z "$TE_CORE_PKG" ]; then + TE_CORE_PKG=$(find /workspace/dist -type f -name 'transformer_engine-[0-9]*.whl' | sort | head -n 1) + TE_SPLIT_LAYOUT=0 + fi + test -n "$TE_CORE_PKG" + pip install --no-deps "$TE_CORE_PKG" 2>&1 + + if [ "$TE_SPLIT_LAYOUT" = "1" ]; then + TE_TORCH_PKG=$(find /workspace/dist -type f -name 'transformer_engine_rocm_torch*.whl' | sort | head -n 1) + TE_JAX_PKG=$(find /workspace/dist -type f -name 'transformer_engine_rocm_jax*.whl' | sort | head -n 1) + else + TE_TORCH_PKG="" + TE_JAX_PKG="" + fi + + if [ -n "$TE_TORCH_PKG$TE_JAX_PKG" ]; then + pip install ninja pybind11[global] 2>&1 + fi + + install_if_present "$TE_TORCH_PKG" "PyTorch" + install_if_present "$TE_JAX_PKG" "JAX" EOF )" - - # Export failed tests statuses to host runner - if [ -f FAIL_TORCH_SGPU ]; then echo "torch=fail" >> $GITHUB_OUTPUT; fi - if [ -f FAIL_JAX_SGPU ]; then echo "jax=fail" >> $GITHUB_OUTPUT; fi - if [ -f FAIL_CORE_SGPU ]; then echo "core=fail" >> $GITHUB_OUTPUT; fi - name: Run mGPU tests id: mgpu-tests - continue-on-error: true + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} run: | + rm -f FAIL_* + docker exec \ -e TEST_MGPU=1 \ -e TEST_LEVEL=${{ env.TEST_LEVEL }} \ + -e HF_TOKEN="$HF_TOKEN" \ te-runner bash -c "$(cat <<'EOF' #!/usr/bin/bash set -x -o pipefail ulimit -c 0 # Disable core dumps - - # Run PyTorch + ci/pytorch.sh > /workspace/torch_mgpu.log 2>&1 torch_rc=$? - - # Run JAX + ci/jax.sh > /workspace/jax_mgpu.log 2>&1 jax_rc=$? - - # /workspace/FAIL_* files are for failure markers we can extract to the host runner and process later - if [ $torch_rc -ne 0 ]; then + + if [ $torch_rc -ne 0 ]; then echo "::group::[FAILED] PyTorch mGPU Log" cat /workspace/torch_mgpu.log echo "::endgroup::" - echo "::error::Pytorch mGPU test FAILED." + echo "::error::PyTorch mGPU tests FAILED." touch /workspace/FAIL_TORCH_MGPU fi - if [ $jax_rc -ne 0 ]; then + if [ $jax_rc -ne 0 ]; then echo "::group::[FAILED] JAX mGPU Log" cat /workspace/jax_mgpu.log echo "::endgroup::" - echo "::error::JAX mGPU test FAILED." + echo "::error::JAX mGPU tests FAILED." touch /workspace/FAIL_JAX_MGPU fi - - test $torch_rc -eq 0 -a $jax_rc -eq 0 - EOF - )" - - # Export failed tests statuses to host runner - if [ -f FAIL_TORCH_MGPU ]; then echo "torch=fail" >> $GITHUB_OUTPUT; fi - if [ -f FAIL_JAX_MGPU ]; then echo "jax=fail" >> $GITHUB_OUTPUT; fi - - name: Run Examples - id: examples-tests - continue-on-error: true - env: - HF_TOKEN: ${{ secrets.HF_TOKEN }} - run: | - docker exec -e HF_TOKEN="$HF_TOKEN" te-runner bash -c "$(cat <<'EOF' - #!/usr/bin/bash - set -ex -o pipefail - ulimit -c 0 # Disable core dumps - - # Check whether the HF_TOKEN is present - python -c "import os; print('HF_TOKEN set:', bool(os.environ.get('HF_TOKEN')))" - - cd /workspace/examples/pytorch/mnist - python main.py 2>&1 | tee /workspace/examples.log - python main.py --use-te 2>&1 | tee -a /workspace/examples.log - python main.py --use-fp8 2>&1 | tee -a /workspace/examples.log - - cd /workspace/examples/jax/mnist - pip3 install -r requirements.txt - python test_single_gpu_mnist.py 2>&1 | tee -a /workspace/examples.log - python test_single_gpu_mnist.py --use-te 2>&1 | tee -a /workspace/examples.log - python test_single_gpu_mnist.py --use-fp8 2>&1 | tee -a /workspace/examples.log - - cd /workspace/examples/jax/encoder - pip3 install -r requirements.txt - python test_single_gpu_encoder.py 2>&1 | tee -a /workspace/examples.log - python test_single_gpu_encoder.py --use-fp8 2>&1 | tee -a /workspace/examples.log + test $torch_rc -eq 0 -a $jax_rc -eq 0 EOF )" - - name: Check Test Failure Status + - name: Check mGPU failure status if: always() run: | EXIT_STATUS=0 - # Check outcomes of the specific test steps - # "outcome" will be 'failure' even if continue-on-error was true - - # sGPU CHECKS - # We check for the file existence directly because the 'Run sGPU tests' step - # halts immediately on docker failure, skipping the lines that set step outputs. - if [[ -f FAIL_CORE_SGPU ]]; then - echo "::error::Core sGPU Tests Failed." - EXIT_STATUS=1 - fi - if [[ -f FAIL_TORCH_SGPU ]]; then - echo "::error::PyTorch sGPU Tests Failed." - EXIT_STATUS=1 - fi - if [[ -f FAIL_JAX_SGPU ]]; then - echo "::error::JAX sGPU Tests Failed." - EXIT_STATUS=1 - fi - - # mGPU CHECKS if [[ -f FAIL_TORCH_MGPU ]]; then - echo "::error::PyTorch mGPU Tests Failed." + echo "::error::PyTorch mGPU tests failed." EXIT_STATUS=1 fi if [[ -f FAIL_JAX_MGPU ]]; then - echo "::error::JAX mGPU Tests Failed." - EXIT_STATUS=1 - fi - - # EXAMPLES CHECK - # Examples script does not use marker files, so we rely on step outcome - if [[ "${{ steps.examples-tests.outcome }}" == "failure" ]]; then - echo "::error::Example Tests Failed." + echo "::error::JAX mGPU tests failed." EXIT_STATUS=1 fi + exit $EXIT_STATUS - # Fail the job if any errors were detected - if [[ "$EXIT_STATUS" == "1" ]]; then - exit 1 - fi - - - name: Copy logs and reports from container - if: always() - run: | - docker cp te-runner:/workspace/torch_sgpu.log ./torch_sgpu.log || true - docker cp te-runner:/workspace/jax_sgpu.log ./jax_sgpu.log || true - docker cp te-runner:/workspace/core_sgpu.log ./core_sgpu.log || true - docker cp te-runner:/workspace/torch_mgpu.log ./torch_mgpu.log || true - docker cp te-runner:/workspace/jax_mgpu.log ./jax_mgpu.log || true - - - name: Upload logs and test reports + - name: Upload logs if: always() uses: actions/upload-artifact@v4 with: - name: logs-and-reports-${{ matrix.runner }} + name: logs-mgpu-${{ matrix.arch_label }} path: | *.log if-no-files-found: ignore @@ -439,3 +537,23 @@ jobs: - name: Cleanup container if: always() run: docker rm -f te-runner || true + + check_results: + name: CI Result + if: always() + needs: [build, sgpu_tests, mgpu_tests] + runs-on: ubuntu-latest + timeout-minutes: 5 + steps: + - name: Evaluate job results + run: | + echo "Build: ${{ needs.build.result }}" + echo "sGPU Tests: ${{ needs.sgpu_tests.result }}" + echo "mGPU Tests: ${{ needs.mgpu_tests.result }}" + + if [[ "${{ needs.build.result }}" != "success" ]] || \ + [[ "${{ needs.sgpu_tests.result }}" != "success" ]] || \ + [[ "${{ needs.mgpu_tests.result }}" != "success" ]]; then + echo "::error::One or more CI jobs did not succeed." + exit 1 + fi diff --git a/ci/_utils.sh b/ci/_utils.sh index b4aae9cc7..0e5ef8bfc 100644 --- a/ci/_utils.sh +++ b/ci/_utils.sh @@ -266,6 +266,7 @@ pytest_run() { check_test_filter $_test_name_tag || return _start_ts=`date +%s` echo "Run [$_test_variant_tag] $@ at `time_elapsed $TEST_START_TS`" - pytest -v -rfEs `get_pytest_junitxml $_test_name_tag` $TEST_PYTEST_ARGS "$TEST_DIR/$@" || test_run_error "[$_test_variant_tag] $1" + pytest -v -rfEs `get_pytest_junitxml $_test_name_tag` "$TEST_DIR/$@" + test $? -eq 0 || test_run_error "[$_test_variant_tag] $1" echo "Done [$_test_variant_tag] $1 in `time_elapsed $_start_ts`" } From 4ce11586dd95714f93aa7059cb414e8565392f54 Mon Sep 17 00:00:00 2001 From: leo-automation Date: Thu, 23 Apr 2026 18:03:32 +0200 Subject: [PATCH 2/9] Update labels --- .github/workflows/rocm-ci.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/rocm-ci.yml b/.github/workflows/rocm-ci.yml index c377bad37..8ad91986d 100644 --- a/.github/workflows/rocm-ci.yml +++ b/.github/workflows/rocm-ci.yml @@ -159,11 +159,11 @@ jobs: name: sGPU Tests (${{ matrix.arch_label }}) needs: [select_image, build] timeout-minutes: 360 - runs-on: ${{ matrix.arch_label == 'mi325' && 'linux-te-mi325-4' || 'linux-te-mi35x-4' }} + runs-on: ${{ matrix.arch_label == 'mi30x' && 'linux-te-mi30x-4' || 'linux-te-mi35x-4' }} strategy: fail-fast: false matrix: - arch_label: [mi325, mi35x] + arch_label: [mi30x, mi35x] steps: - name: Checkout repository uses: actions/checkout@v6 @@ -384,11 +384,11 @@ jobs: name: mGPU Tests (${{ matrix.arch_label }}) needs: [select_image, build] timeout-minutes: 360 - runs-on: ${{ matrix.arch_label == 'mi300' && 'linux-te-mi300-8' || 'linux-te-mi35x-8' }} + runs-on: ${{ matrix.arch_label == 'mi30x' && 'linux-te-mi30x-8' || 'linux-te-mi35x-8' }} strategy: fail-fast: false matrix: - arch_label: [mi300, mi35x] + arch_label: [mi30x, mi35x] steps: - name: Checkout repository uses: actions/checkout@v6 From b58d24cab376e18afdc64d728654a2f70f8c9e1c Mon Sep 17 00:00:00 2001 From: leo-automation Date: Thu, 23 Apr 2026 18:52:29 +0200 Subject: [PATCH 3/9] Shallow clone --- .github/workflows/rocm-ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/rocm-ci.yml b/.github/workflows/rocm-ci.yml index 8ad91986d..9d04262a1 100644 --- a/.github/workflows/rocm-ci.yml +++ b/.github/workflows/rocm-ci.yml @@ -110,7 +110,7 @@ jobs: - name: Initialize required submodules run: | - git submodule update --init --recursive \ + git submodule update --init --recursive --depth 1 \ 3rdparty/aotriton \ 3rdparty/aiter \ 3rdparty/hipify_torch @@ -170,7 +170,7 @@ jobs: - name: Initialize required submodules run: | - git submodule update --init --recursive \ + git submodule update --init --recursive --depth 1 \ 3rdparty/googletest \ 3rdparty/hipify_torch From 217515551f15e8f0b8a83c0c29c15967b23cdae5 Mon Sep 17 00:00:00 2001 From: leo-automation Date: Mon, 27 Apr 2026 15:17:51 +0200 Subject: [PATCH 4/9] Address comments --- .github/workflows/rocm-ci.yml | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/.github/workflows/rocm-ci.yml b/.github/workflows/rocm-ci.yml index 9d04262a1..9ce099937 100644 --- a/.github/workflows/rocm-ci.yml +++ b/.github/workflows/rocm-ci.yml @@ -139,12 +139,9 @@ jobs: git submodule foreach --recursive 'git config --global --add safe.directory /workspace/$sm_path' NVTE_RELEASE_BUILD=1 pip wheel --no-build-isolation --no-deps -v . -w /workspace/dist/ 2>&1 - mkdir -p /tmp/sdist - (cd transformer_engine/pytorch && NVTE_RELEASE_BUILD=1 python setup.py sdist -d /tmp/sdist/) - (cd transformer_engine/jax && NVTE_RELEASE_BUILD=1 python setup.py sdist -d /tmp/sdist/) - - pip wheel --no-build-isolation --no-deps -v /tmp/sdist/transformer_engine_rocm_torch-*.tar.gz -w /workspace/dist/ 2>&1 - pip wheel --no-build-isolation --no-deps -v /tmp/sdist/transformer_engine_rocm_jax-*.tar.gz -w /workspace/dist/ 2>&1 + # Framework extensions ship as sdists as tar.gz. Wheels are build against the consumer's torch/jax at install time + (cd transformer_engine/pytorch && NVTE_RELEASE_BUILD=1 python setup.py sdist -d /workspace/dist/) + (cd transformer_engine/jax && NVTE_RELEASE_BUILD=1 python setup.py sdist -d /workspace/dist/) EOF )" @@ -235,8 +232,8 @@ jobs: pip install --no-deps "$TE_CORE_PKG" 2>&1 if [ "$TE_SPLIT_LAYOUT" = "1" ]; then - TE_TORCH_PKG=$(find /workspace/dist -type f -name 'transformer_engine_rocm_torch*.whl' | sort | head -n 1) - TE_JAX_PKG=$(find /workspace/dist -type f -name 'transformer_engine_rocm_jax*.whl' | sort | head -n 1) + TE_TORCH_PKG=$(find /workspace/dist -type f -name 'transformer_engine_rocm_torch*.tar.gz' | sort | head -n 1) + TE_JAX_PKG=$(find /workspace/dist -type f -name 'transformer_engine_rocm_jax*.tar.gz' | sort | head -n 1) else TE_TORCH_PKG="" TE_JAX_PKG="" @@ -452,8 +449,8 @@ jobs: pip install --no-deps "$TE_CORE_PKG" 2>&1 if [ "$TE_SPLIT_LAYOUT" = "1" ]; then - TE_TORCH_PKG=$(find /workspace/dist -type f -name 'transformer_engine_rocm_torch*.whl' | sort | head -n 1) - TE_JAX_PKG=$(find /workspace/dist -type f -name 'transformer_engine_rocm_jax*.whl' | sort | head -n 1) + TE_TORCH_PKG=$(find /workspace/dist -type f -name 'transformer_engine_rocm_torch*.tar.gz' | sort | head -n 1) + TE_JAX_PKG=$(find /workspace/dist -type f -name 'transformer_engine_rocm_jax*.tar.gz' | sort | head -n 1) else TE_TORCH_PKG="" TE_JAX_PKG="" From 9d5ac7cf1bd91618c0c27cde3561dd9d309ce95a Mon Sep 17 00:00:00 2001 From: leo-automation Date: Mon, 27 Apr 2026 15:33:43 +0200 Subject: [PATCH 5/9] Add a missing submodule --- .github/workflows/rocm-ci.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/rocm-ci.yml b/.github/workflows/rocm-ci.yml index 9ce099937..e3aa49b69 100644 --- a/.github/workflows/rocm-ci.yml +++ b/.github/workflows/rocm-ci.yml @@ -113,7 +113,8 @@ jobs: git submodule update --init --recursive --depth 1 \ 3rdparty/aotriton \ 3rdparty/aiter \ - 3rdparty/hipify_torch + 3rdparty/hipify_torch \ + 3rdparty/QoLA - name: Pull Docker Image run: | From aebffd51a446ef4fb606d83e93d13a2a78f8cf0a Mon Sep 17 00:00:00 2001 From: leo-automation Date: Tue, 28 Apr 2026 13:37:54 +0200 Subject: [PATCH 6/9] Address comments --- .github/workflows/rocm-ci.yml | 111 +++++++--------------------------- ci/_utils.sh | 2 +- 2 files changed, 24 insertions(+), 89 deletions(-) diff --git a/.github/workflows/rocm-ci.yml b/.github/workflows/rocm-ci.yml index e3aa49b69..35626f303 100644 --- a/.github/workflows/rocm-ci.yml +++ b/.github/workflows/rocm-ci.yml @@ -100,58 +100,9 @@ jobs: echo "image-tag=$IMAGE_TO_USE" >> $GITHUB_OUTPUT build: - name: Build Wheel - needs: select_image - timeout-minutes: 120 - runs-on: build-only-te - steps: - - name: Checkout repository - uses: actions/checkout@v6 - - - name: Initialize required submodules - run: | - git submodule update --init --recursive --depth 1 \ - 3rdparty/aotriton \ - 3rdparty/aiter \ - 3rdparty/hipify_torch \ - 3rdparty/QoLA - - - name: Pull Docker Image - run: | - docker pull ${{ needs.select_image.outputs.image-tag }} - - - name: Build Wheel - run: | - docker run --rm \ - --network=host \ - -v "${{ github.workspace }}:/workspace" \ - -w /workspace \ - ${{ needs.select_image.outputs.image-tag }} \ - bash -c "$(cat <<'EOF' - set -ex - - export HIP_PATH="" - export PYTORCH_ROCM_ARCH="gfx942;gfx950" - export NVTE_ROCM_ARCH="gfx942;gfx950" - export NVTE_SKIP_SUBMODULE_CHECKS_DURING_BUILD=1 - export NVTE_AITER_PREBUILT_BASE_URL=https://compute-artifactory.amd.com:5000/artifactory/rocm-generic-local/te-ci/aiter-prebuilts - pip install ninja - git config --global --add safe.directory /workspace - git submodule foreach --recursive 'git config --global --add safe.directory /workspace/$sm_path' - NVTE_RELEASE_BUILD=1 pip wheel --no-build-isolation --no-deps -v . -w /workspace/dist/ 2>&1 - - # Framework extensions ship as sdists as tar.gz. Wheels are build against the consumer's torch/jax at install time - (cd transformer_engine/pytorch && NVTE_RELEASE_BUILD=1 python setup.py sdist -d /workspace/dist/) - (cd transformer_engine/jax && NVTE_RELEASE_BUILD=1 python setup.py sdist -d /workspace/dist/) - EOF - )" - - - name: Upload build artifacts - uses: actions/upload-artifact@v4 - with: - name: te-wheel - path: dist/* - retention-days: 1 + # Delegate wheel building to the reusable workflow on dev. It produces a core .whl plus framework .tar.gz sdists under artifact name `te-rocm-wheels`. + uses: ./.github/workflows/rocm-wheels-build.yml + secrets: inherit sgpu_tests: name: sGPU Tests (${{ matrix.arch_label }}) @@ -175,6 +126,7 @@ jobs: - name: Download build artifacts uses: actions/download-artifact@v4 with: + name: te-rocm-wheels path: dist/ - name: Host Diagnostics @@ -379,7 +331,7 @@ jobs: run: docker rm -f te-runner || true mgpu_tests: - name: mGPU Tests (${{ matrix.arch_label }}) + name: mGPU ${{ matrix.framework == 'pytorch' && 'Torch' || 'JAX' }} (${{ matrix.arch_label }}) needs: [select_image, build] timeout-minutes: 360 runs-on: ${{ matrix.arch_label == 'mi30x' && 'linux-te-mi30x-8' || 'linux-te-mi35x-8' }} @@ -387,6 +339,7 @@ jobs: fail-fast: false matrix: arch_label: [mi30x, mi35x] + framework: [pytorch, jax] steps: - name: Checkout repository uses: actions/checkout@v6 @@ -394,6 +347,7 @@ jobs: - name: Download build artifacts uses: actions/download-artifact@v4 with: + name: te-rocm-wheels path: dist/ - name: Host Diagnostics @@ -471,62 +425,43 @@ jobs: env: HF_TOKEN: ${{ secrets.HF_TOKEN }} run: | - rm -f FAIL_* + case "${{ matrix.framework }}" in + pytorch) TEST_SCRIPT=ci/pytorch.sh; LOG_FILE=/workspace/torch_mgpu.log; SUITE_NAME=PyTorch ;; + jax) TEST_SCRIPT=ci/jax.sh; LOG_FILE=/workspace/jax_mgpu.log; SUITE_NAME=JAX ;; + *) echo "::error::Unknown framework: ${{ matrix.framework }}"; exit 1 ;; + esac docker exec \ -e TEST_MGPU=1 \ -e TEST_LEVEL=${{ env.TEST_LEVEL }} \ + -e TEST_SCRIPT=$TEST_SCRIPT \ + -e LOG_FILE=$LOG_FILE \ + -e SUITE_NAME=$SUITE_NAME \ -e HF_TOKEN="$HF_TOKEN" \ te-runner bash -c "$(cat <<'EOF' #!/usr/bin/bash set -x -o pipefail ulimit -c 0 # Disable core dumps - ci/pytorch.sh > /workspace/torch_mgpu.log 2>&1 - torch_rc=$? - - ci/jax.sh > /workspace/jax_mgpu.log 2>&1 - jax_rc=$? + "$TEST_SCRIPT" > "$LOG_FILE" 2>&1 + test_rc=$? - if [ $torch_rc -ne 0 ]; then - echo "::group::[FAILED] PyTorch mGPU Log" - cat /workspace/torch_mgpu.log - echo "::endgroup::" - echo "::error::PyTorch mGPU tests FAILED." - touch /workspace/FAIL_TORCH_MGPU - fi - - if [ $jax_rc -ne 0 ]; then - echo "::group::[FAILED] JAX mGPU Log" - cat /workspace/jax_mgpu.log + if [ $test_rc -ne 0 ]; then + echo "::group::[FAILED] ${SUITE_NAME} mGPU Log" + cat "$LOG_FILE" echo "::endgroup::" - echo "::error::JAX mGPU tests FAILED." - touch /workspace/FAIL_JAX_MGPU + echo "::error::${SUITE_NAME} mGPU tests FAILED." fi - test $torch_rc -eq 0 -a $jax_rc -eq 0 + exit $test_rc EOF )" - - name: Check mGPU failure status - if: always() - run: | - EXIT_STATUS=0 - if [[ -f FAIL_TORCH_MGPU ]]; then - echo "::error::PyTorch mGPU tests failed." - EXIT_STATUS=1 - fi - if [[ -f FAIL_JAX_MGPU ]]; then - echo "::error::JAX mGPU tests failed." - EXIT_STATUS=1 - fi - exit $EXIT_STATUS - - name: Upload logs if: always() uses: actions/upload-artifact@v4 with: - name: logs-mgpu-${{ matrix.arch_label }} + name: logs-mgpu-${{ matrix.arch_label }}-${{ matrix.framework }} path: | *.log if-no-files-found: ignore diff --git a/ci/_utils.sh b/ci/_utils.sh index 0e5ef8bfc..25c5d9a74 100644 --- a/ci/_utils.sh +++ b/ci/_utils.sh @@ -266,7 +266,7 @@ pytest_run() { check_test_filter $_test_name_tag || return _start_ts=`date +%s` echo "Run [$_test_variant_tag] $@ at `time_elapsed $TEST_START_TS`" - pytest -v -rfEs `get_pytest_junitxml $_test_name_tag` "$TEST_DIR/$@" + pytest -v -rfEs `get_pytest_junitxml $_test_name_tag` $TEST_PYTEST_ARGS "$TEST_DIR/$@" test $? -eq 0 || test_run_error "[$_test_variant_tag] $1" echo "Done [$_test_variant_tag] $1 in `time_elapsed $_start_ts`" } From 7af0ebeb271aeb3141aab7592c2963d8f5b60129 Mon Sep 17 00:00:00 2001 From: leo-automation Date: Tue, 28 Apr 2026 13:53:20 +0200 Subject: [PATCH 7/9] Cleanup --- .github/workflows/rocm-ci.yml | 82 +++++++---------------------------- ci/_utils.sh | 3 +- 2 files changed, 17 insertions(+), 68 deletions(-) diff --git a/.github/workflows/rocm-ci.yml b/.github/workflows/rocm-ci.yml index 35626f303..ce86f8963 100644 --- a/.github/workflows/rocm-ci.yml +++ b/.github/workflows/rocm-ci.yml @@ -164,40 +164,15 @@ jobs: # core (cpp) tests build via cmake inside the repo; allow git ops in-tree. git config --global --add safe.directory '*' - install_if_present() { - pkg="$1" - label="$2" - if [ -n "$pkg" ]; then - echo "Installing $label package: $pkg" - pip install --no-build-isolation --no-deps "$pkg" 2>&1 - else - echo "No $label package found; using the monolithic wheel layout." - fi - } - TE_CORE_PKG=$(find /workspace/dist -type f -name 'transformer_engine_rocm[0-9]*.whl' | sort | head -n 1) - TE_SPLIT_LAYOUT=1 - if [ -z "$TE_CORE_PKG" ]; then - TE_CORE_PKG=$(find /workspace/dist -type f -name 'transformer_engine-[0-9]*.whl' | sort | head -n 1) - TE_SPLIT_LAYOUT=0 - fi - test -n "$TE_CORE_PKG" - pip install --no-deps "$TE_CORE_PKG" 2>&1 - - if [ "$TE_SPLIT_LAYOUT" = "1" ]; then - TE_TORCH_PKG=$(find /workspace/dist -type f -name 'transformer_engine_rocm_torch*.tar.gz' | sort | head -n 1) - TE_JAX_PKG=$(find /workspace/dist -type f -name 'transformer_engine_rocm_jax*.tar.gz' | sort | head -n 1) - else - TE_TORCH_PKG="" - TE_JAX_PKG="" - fi - - if [ -n "$TE_TORCH_PKG$TE_JAX_PKG" ]; then - pip install ninja pybind11[global] 2>&1 - fi - - install_if_present "$TE_TORCH_PKG" "PyTorch" - install_if_present "$TE_JAX_PKG" "JAX" + TE_TORCH_PKG=$(find /workspace/dist -type f -name 'transformer_engine_rocm_torch*.tar.gz' | sort | head -n 1) + TE_JAX_PKG=$(find /workspace/dist -type f -name 'transformer_engine_rocm_jax*.tar.gz' | sort | head -n 1) + test -n "$TE_CORE_PKG" && test -n "$TE_TORCH_PKG" && test -n "$TE_JAX_PKG" + + pip install --no-deps "$TE_CORE_PKG" + pip install ninja pybind11[global] + pip install --no-build-isolation --no-deps "$TE_TORCH_PKG" + pip install --no-build-isolation --no-deps "$TE_JAX_PKG" EOF )" @@ -383,40 +358,15 @@ jobs: docker exec te-runner bash -c "$(cat <<'EOF' set -ex - install_if_present() { - pkg="$1" - label="$2" - if [ -n "$pkg" ]; then - echo "Installing $label package: $pkg" - pip install --no-build-isolation --no-deps "$pkg" 2>&1 - else - echo "No $label package found; using the monolithic wheel layout." - fi - } - TE_CORE_PKG=$(find /workspace/dist -type f -name 'transformer_engine_rocm[0-9]*.whl' | sort | head -n 1) - TE_SPLIT_LAYOUT=1 - if [ -z "$TE_CORE_PKG" ]; then - TE_CORE_PKG=$(find /workspace/dist -type f -name 'transformer_engine-[0-9]*.whl' | sort | head -n 1) - TE_SPLIT_LAYOUT=0 - fi - test -n "$TE_CORE_PKG" - pip install --no-deps "$TE_CORE_PKG" 2>&1 - - if [ "$TE_SPLIT_LAYOUT" = "1" ]; then - TE_TORCH_PKG=$(find /workspace/dist -type f -name 'transformer_engine_rocm_torch*.tar.gz' | sort | head -n 1) - TE_JAX_PKG=$(find /workspace/dist -type f -name 'transformer_engine_rocm_jax*.tar.gz' | sort | head -n 1) - else - TE_TORCH_PKG="" - TE_JAX_PKG="" - fi - - if [ -n "$TE_TORCH_PKG$TE_JAX_PKG" ]; then - pip install ninja pybind11[global] 2>&1 - fi - - install_if_present "$TE_TORCH_PKG" "PyTorch" - install_if_present "$TE_JAX_PKG" "JAX" + TE_TORCH_PKG=$(find /workspace/dist -type f -name 'transformer_engine_rocm_torch*.tar.gz' | sort | head -n 1) + TE_JAX_PKG=$(find /workspace/dist -type f -name 'transformer_engine_rocm_jax*.tar.gz' | sort | head -n 1) + test -n "$TE_CORE_PKG" && test -n "$TE_TORCH_PKG" && test -n "$TE_JAX_PKG" + + pip install --no-deps "$TE_CORE_PKG" + pip install ninja pybind11[global] + pip install --no-build-isolation --no-deps "$TE_TORCH_PKG" + pip install --no-build-isolation --no-deps "$TE_JAX_PKG" EOF )" diff --git a/ci/_utils.sh b/ci/_utils.sh index 25c5d9a74..b4aae9cc7 100644 --- a/ci/_utils.sh +++ b/ci/_utils.sh @@ -266,7 +266,6 @@ pytest_run() { check_test_filter $_test_name_tag || return _start_ts=`date +%s` echo "Run [$_test_variant_tag] $@ at `time_elapsed $TEST_START_TS`" - pytest -v -rfEs `get_pytest_junitxml $_test_name_tag` $TEST_PYTEST_ARGS "$TEST_DIR/$@" - test $? -eq 0 || test_run_error "[$_test_variant_tag] $1" + pytest -v -rfEs `get_pytest_junitxml $_test_name_tag` $TEST_PYTEST_ARGS "$TEST_DIR/$@" || test_run_error "[$_test_variant_tag] $1" echo "Done [$_test_variant_tag] $1 in `time_elapsed $_start_ts`" } From 7e5b1af14c0dd488d5149ef425151eae890f51ec Mon Sep 17 00:00:00 2001 From: leo-automation Date: Tue, 28 Apr 2026 19:43:04 +0200 Subject: [PATCH 8/9] Address comments --- .github/workflows/rocm-ci.yml | 36 ++++++++++------------------------- 1 file changed, 10 insertions(+), 26 deletions(-) diff --git a/.github/workflows/rocm-ci.yml b/.github/workflows/rocm-ci.yml index ce86f8963..a12bfe663 100644 --- a/.github/workflows/rocm-ci.yml +++ b/.github/workflows/rocm-ci.yml @@ -354,19 +354,23 @@ jobs: ${{ needs.select_image.outputs.image-tag }} - name: Install packages + env: + FRAMEWORK: ${{ matrix.framework }} run: | - docker exec te-runner bash -c "$(cat <<'EOF' + docker exec -e FRAMEWORK="$FRAMEWORK" te-runner bash -c "$(cat <<'EOF' set -ex TE_CORE_PKG=$(find /workspace/dist -type f -name 'transformer_engine_rocm[0-9]*.whl' | sort | head -n 1) - TE_TORCH_PKG=$(find /workspace/dist -type f -name 'transformer_engine_rocm_torch*.tar.gz' | sort | head -n 1) - TE_JAX_PKG=$(find /workspace/dist -type f -name 'transformer_engine_rocm_jax*.tar.gz' | sort | head -n 1) - test -n "$TE_CORE_PKG" && test -n "$TE_TORCH_PKG" && test -n "$TE_JAX_PKG" + if [ "$FRAMEWORK" = "pytorch" ]; then + TE_FW_PKG=$(find /workspace/dist -type f -name 'transformer_engine_rocm_torch*.tar.gz' | sort | head -n 1) + else + TE_FW_PKG=$(find /workspace/dist -type f -name 'transformer_engine_rocm_jax*.tar.gz' | sort | head -n 1) + fi + test -n "$TE_CORE_PKG" && test -n "$TE_FW_PKG" pip install --no-deps "$TE_CORE_PKG" pip install ninja pybind11[global] - pip install --no-build-isolation --no-deps "$TE_TORCH_PKG" - pip install --no-build-isolation --no-deps "$TE_JAX_PKG" + pip install --no-build-isolation --no-deps "$TE_FW_PKG" EOF )" @@ -420,23 +424,3 @@ jobs: - name: Cleanup container if: always() run: docker rm -f te-runner || true - - check_results: - name: CI Result - if: always() - needs: [build, sgpu_tests, mgpu_tests] - runs-on: ubuntu-latest - timeout-minutes: 5 - steps: - - name: Evaluate job results - run: | - echo "Build: ${{ needs.build.result }}" - echo "sGPU Tests: ${{ needs.sgpu_tests.result }}" - echo "mGPU Tests: ${{ needs.mgpu_tests.result }}" - - if [[ "${{ needs.build.result }}" != "success" ]] || \ - [[ "${{ needs.sgpu_tests.result }}" != "success" ]] || \ - [[ "${{ needs.mgpu_tests.result }}" != "success" ]]; then - echo "::error::One or more CI jobs did not succeed." - exit 1 - fi From b4dc30f475a2da657fb846e5d90098544a0d8304 Mon Sep 17 00:00:00 2001 From: leo-automation Date: Tue, 28 Apr 2026 20:22:25 +0200 Subject: [PATCH 9/9] Fix NVTE_FRAMEWORK --- .github/workflows/rocm-ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/rocm-ci.yml b/.github/workflows/rocm-ci.yml index a12bfe663..5e0ae242c 100644 --- a/.github/workflows/rocm-ci.yml +++ b/.github/workflows/rocm-ci.yml @@ -391,6 +391,7 @@ jobs: -e TEST_SCRIPT=$TEST_SCRIPT \ -e LOG_FILE=$LOG_FILE \ -e SUITE_NAME=$SUITE_NAME \ + -e NVTE_FRAMEWORK=${{ matrix.framework }} \ -e HF_TOKEN="$HF_TOKEN" \ te-runner bash -c "$(cat <<'EOF' #!/usr/bin/bash