diff --git a/docs/developer/guides/kaiju-cluster-setup.md b/docs/developer/guides/kaiju-cluster-setup.md new file mode 100644 index 00000000..760f7435 --- /dev/null +++ b/docs/developer/guides/kaiju-cluster-setup.md @@ -0,0 +1,302 @@ +# Kaiju Cluster Setup + +This guide covers installing and running Underworld3 on the **Kaiju** cluster — a Rocky Linux 8.10 HPC system using Spack for module management and Slurm for job scheduling. + +Python packages are managed by **pixi** (the same tool used for local development). MPI-dependent packages — `mpi4py`, PETSc+AMR tools, `petsc4py`, and `h5py` — are built from source against Spack's OpenMPI to ensure compatibility with Slurm's parallel interconnect. + +--- + +## Hardware Overview + +| Resource | Specification | +|----------|--------------| +| Head node | 1× Intel Xeon Silver 4210R, 40 CPUs @ 2.4 GHz | +| Compute nodes | 8× Intel Xeon Gold 6230R, 104 CPUs @ 2.1 GHz each | +| Shared storage | `/opt/cluster` via NFS (cluster-wide) | +| Scheduler | Slurm with Munge authentication | + +--- + +## Why pixi + spack? + +Pixi manages the Python environment consistently with the developer's local machine (same `pixi.toml`, same package versions). Spack provides the cluster's OpenMPI, which is what Slurm uses for inter-node communication. + +The key constraint is that **anything linked against MPI must use the same MPI as Slurm**. This means `mpi4py`, `h5py`, PETSc, and `petsc4py` are built from source against Spack's OpenMPI — not from conda-forge (which bundles MPICH). + +``` +pixi kaiju env → Python 3.12, sympy, scipy, pint, pydantic, ... 
(conda-forge, no MPI) +spack → openmpi@4.1.6 (cluster MPI) +source build → mpi4py, PETSc+AMR+petsc4py, h5py (linked to spack MPI) +``` + +--- + +## Prerequisites + +Spack must have OpenMPI available: + +```bash +spack find openmpi +# openmpi@4.1.6 +``` + +Pixi must be installed in your user space (no root needed): + +```bash +# Check if already installed +pixi --version + +# Install if missing +curl -fsSL https://pixi.sh/install.sh | bash +``` + +--- + +## Installation + +Use the install script at `uw3_install_kaiju_amr.sh` from the [kaiju-admin-notes](https://github.com/jcgraciosa/kaiju-admin-notes) repo. + +### Step 1: Edit configuration + +Open the script and set the variables at the top: + +```bash +SPACK_MPI_VERSION="openmpi@4.1.6" # Spack MPI module to load +INSTALL_PATH="${HOME}/uw3-installation" # Root directory for everything +UW3_BRANCH="development" # UW3 git branch +``` + +### Step 2: Run the full install + +```bash +source uw3_install_kaiju_amr.sh install +``` + +This runs the following steps in order: + +| Step | Function | Time | +|------|----------|------| +| Install pixi | `setup_pixi` | ~1 min | +| Clone Underworld3 | `clone_uw3` | ~1 min | +| Install pixi kaiju env | `install_pixi_env` | ~3 min | +| Build mpi4py from source | `install_mpi4py` | ~2 min | +| Build PETSc + AMR tools | `install_petsc` | ~1 hour | +| Build MPI-enabled h5py | `install_h5py` | ~2 min | +| Install Underworld3 | `install_uw3` | ~2 min | +| Verify | `verify_install` | ~1 min | + +You can also run individual steps after sourcing: + +```bash +source uw3_install_kaiju_amr.sh +install_petsc # run just one step +``` + +### What PETSc builds + +PETSc is compiled from source (`petsc-custom/build-petsc-kaiju.sh`) with: + +- **AMR tools**: mmg, parmmg, pragmatic, eigen, bison +- **Solvers**: mumps, scalapack, slepc +- **Partitioners**: metis, parmetis, ptscotch +- **MPI**: Spack's OpenMPI (`--with-mpi-dir`) +- **HDF5**: downloaded and built with MPI support +- **BLAS/LAPACK**: 
fblaslapack (Rocky Linux 8 has no guaranteed system BLAS)
+- **cmake**: downloaded (not in Spack)
+- **petsc4py**: built during configure (`--with-petsc4py=1`)
+
+---
+
+## Activating the Environment
+
+In every new session (interactive or job), source the install script:
+
+```bash
+source ~/install_scripts/uw3_install_kaiju_amr.sh
+```
+
+This:
+1. Loads `spack openmpi@4.1.6`
+2. Activates the pixi `kaiju` environment via `pixi shell-hook`
+3. Sets `PETSC_DIR`, `PETSC_ARCH`, and `PYTHONPATH` for petsc4py
+4. Sets `PMIX_MCA_psec=native` and `OMPI_MCA_btl_tcp_if_include=eno1`
+
+```{note}
+`pixi shell-hook` is used instead of `pixi shell` because it activates the environment in the current shell without spawning a new one. This is required for Slurm batch jobs.
+```
+
+---
+
+## Running with Slurm
+
+Two job script templates are available in the [kaiju-admin-notes](https://github.com/jcgraciosa/kaiju-admin-notes) repo:
+
+| Script | Use when |
+|--------|----------|
+| `uw3_slurm_job.sh` | Per-user install (sources `uw3_install_kaiju_amr.sh`) |
+| `uw3_slurm_job_shared.sh` | Shared install (`module load underworld3/...`) |
+
+### Submitting a job
+
+```bash
+sbatch uw3_slurm_job.sh          # per-user install
+sbatch uw3_slurm_job_shared.sh   # shared install
+```
+
+Monitor progress:
+
+```bash
+squeue -u $USER
+tail -f uw3_<jobid>.out
+```
+
+### The `srun` invocation
+
+`--mpi=pmix` is **required** on Kaiju (Spack has `pmix@5.0.3`):
+
+```bash
+srun --mpi=pmix python3 my_model.py
+```
+
+### Scaling examples
+
+```bash
+# 1 node, 30 ranks
+sbatch --nodes=1 --ntasks-per-node=30 uw3_slurm_job.sh
+
+# 4 nodes, 120 ranks
+sbatch --nodes=4 --ntasks-per-node=30 uw3_slurm_job.sh
+```
+
+---
+
+## Shared Installation (Admin)
+
+A system-wide installation can be deployed to `/opt/cluster/software/underworld3/` so all users access it via Environment Modules:
+
+```bash
+module load underworld3/development-12Mar26
+```
+
+Run as an admin with write access to `/opt/cluster/software`:
+
+```bash +source uw3_install_kaiju_shared.sh install +``` + +This script is identical to the per-user script except: +- `INSTALL_PATH=/opt/cluster/software` +- Adds `fix_permissions()` — sets world-readable permissions after install +- Adds `install_modulefile()` — copies the TCL modulefile with a date-stamped name to `/opt/cluster/modulefiles/underworld3/` + +The modulefile (`modulefiles/underworld3/development.tcl`) hardcodes the spack OpenMPI and pixi env paths. If spack is rebuilt (hash changes), update `mpi_root` in the modulefile. + +### Slurm job script (shared install) + +Users with the shared install should use `uw3_slurm_job_shared.sh`: + +```bash +# Edit UW3_MODULE and SCRIPT at the top, then: +sbatch uw3_slurm_job_shared.sh +``` + +The key difference from the per-user job script is environment setup: + +```bash +# Shared install: load module +module load underworld3/development-12Mar26 + +# Per-user install: source install script +source ~/install_scripts/uw3_install_kaiju_amr.sh +``` + +--- + +## Troubleshooting + +### `import underworld3` fails on compute nodes + +Sourcing the install script in the job script (not the login shell) ensures all paths propagate to compute nodes. The `uw3_slurm_job.sh` template does this correctly. + +### h5py HDF5 version mismatch + +h5py must be built against the same HDF5 that PETSc built. If you see HDF5 errors, rebuild: + +```bash +source uw3_install_kaiju_amr.sh +install_h5py +``` + +### PETSc needs rebuilding after Spack module update + +PETSc links against Spack's OpenMPI at build time. If `openmpi@4.1.6` is reinstalled or updated, rebuild PETSc: + +```bash +source uw3_install_kaiju_amr.sh +rm -rf ~/uw3-installation/underworld3/petsc-custom/petsc +install_petsc +install_h5py +``` + +### h5py replaces source-built mpi4py + +`pip install h5py` without `--no-deps` silently replaces the source-built mpi4py (spack OpenMPI) with a pre-built wheel linked to a different MPI. Always use `--no-deps` when installing h5py. 
The install script handles this correctly.
+
+If mpi4py was accidentally replaced, rebuild it from source:
+```bash
+source uw3_install_kaiju_amr.sh
+pip install --no-binary :all: --no-cache-dir --force-reinstall "mpi4py>=4,<5"
+```
+
+Verify it links to spack OpenMPI:
+```bash
+ldd "$(python3 -c "import mpi4py; print(mpi4py.__file__.replace('__init__.py',''))")"MPI*.so \
+    | grep mpi
+# Should show: libmpi.so.40 => /opt/cluster/spack/.../openmpi-4.1.6-.../lib/libmpi.so.40
+```
+
+### numpy ABI mismatch after h5py install
+
+If numpy is upgraded after petsc4py is compiled, `import petsc4py` fails with:
+```
+ValueError: numpy.dtype size changed, may indicate binary incompatibility.
+```
+
+Fix: restore the numpy version used during the PETSc build, then rebuild h5py:
+```bash
+pip install --force-reinstall "numpy==1.26.4"
+CC=mpicc HDF5_MPI="ON" HDF5_DIR="${PETSC_DIR}/${PETSC_ARCH}" \
+    pip install --no-binary=h5py --no-cache-dir --force-reinstall --no-deps h5py
+```
+
+### PARMMG configure failure (pixi ld + spack transitive deps)
+
+pixi's conda linker (`ld` 14.x) requires transitive shared library dependencies to be explicitly linked. `libmmg.so` built with SCOTCH support causes PARMMG's `MMG_WORKS` link test to fail because `libscotch.so` is not explicitly passed. This is fixed in `petsc-custom/build-petsc-kaiju.sh` by building MMG without SCOTCH (`-DUSE_SCOTCH=OFF`). PARMMG uses ptscotch separately for parallel partitioning, which is unaffected.
+
+### Checking what's installed
+
+```bash
+source uw3_install_kaiju_amr.sh
+verify_install
+```
+
+---
+
+## Rebuilding Underworld3 after source changes
+
+After pulling new UW3 code:
+
+```bash
+source uw3_install_kaiju_amr.sh
+cd ~/uw3-installation/underworld3
+git pull
+pip install -e .
+```
+
+---
+
+## Related
+
+- [Development Setup](development-setup.md) — local development with pixi
+- [Branching Strategy](branching-strategy.md) — git workflow
+- [Parallel Computing](../../advanced/parallel-computing.md) — writing parallel-safe UW3 code
diff --git a/docs/developer/index.md b/docs/developer/index.md
index 823ff7a2..faff147f 100644
--- a/docs/developer/index.md
+++ b/docs/developer/index.md
@@ -114,6 +114,7 @@ guides/SPELLING_CONVENTION
 guides/version-management
 guides/branching-strategy
 guides/BINDER_CONTAINER_SETUP
+guides/kaiju-cluster-setup
 ```
 
 ```{toctree}
diff --git a/petsc-custom/build-petsc-kaiju.sh b/petsc-custom/build-petsc-kaiju.sh
new file mode 100644
index 00000000..d46852ba
--- /dev/null
+++ b/petsc-custom/build-petsc-kaiju.sh
@@ -0,0 +1,181 @@
+#!/bin/bash
+#
+# Build PETSc with AMR tools for the Kaiju cluster (Rocky Linux 8, Spack OpenMPI)
+#
+# Differences from build-petsc.sh (local macOS/pixi):
+#   MPI taken from PATH (spack load puts mpicc in PATH; --with-mpi-dir is derived from `which mpicc`)
+#   --download-hdf5        → PETSc downloads HDF5 (not provided by pixi)
+#   --download-fblaslapack → no guaranteed system BLAS on Rocky Linux 8
+#   --download-cmake       → spack does not have cmake
+#   --with-petsc4py        → built during configure (not a separate step)
+#
+# This script builds the same AMR tool set as build-petsc.sh:
+#   pragmatic, mmg, parmmg, slepc, mumps, metis, parmetis, ptscotch, scalapack
+#
+# Usage (must be inside a pixi kaiju shell with spack OpenMPI loaded):
+#   spack load openmpi@4.1.6
+#   pixi shell -e kaiju
+#   ./build-petsc-kaiju.sh            # Full build
+#   ./build-petsc-kaiju.sh configure  # Just reconfigure
+#   ./build-petsc-kaiju.sh build      # Just make
+#   ./build-petsc-kaiju.sh clean      # Remove PETSc directory
+#
+# Build time: ~1 hour
+#
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PETSC_DIR="${SCRIPT_DIR}/petsc"
+PETSC_ARCH="petsc-4-uw"
+
+# Require spack OpenMPI to be loaded
+if ! 
command -v mpicc &>/dev/null; then + echo "Error: mpicc not found. Load spack OpenMPI first:" + echo " spack load openmpi@4.1.6" + exit 1 +fi + +# Require pixi kaiju environment +# Check PATH since PIXI_ENVIRONMENT is not set by pixi shell-hook (only by pixi shell) +if ! echo "$PATH" | tr ':' '\n' | grep -q "\.pixi/envs/kaiju/bin"; then + echo "Error: must be run inside the pixi kaiju environment" + echo " source uw3_install_kaiju_amr.sh (sets up env via pixi shell-hook)" + exit 1 +fi + +echo "==========================================" +echo "PETSc AMR Build Script (Kaiju)" +echo "==========================================" +echo "PETSC_DIR: $PETSC_DIR" +echo "PETSC_ARCH: $PETSC_ARCH" +echo "mpicc: $(which mpicc)" +echo "==========================================" + +clone_petsc() { + if [ -d "$PETSC_DIR" ]; then + echo "PETSc directory already exists. Skipping clone." + echo "To force fresh clone, run: ./build-petsc-kaiju.sh clean" + return 0 + fi + + echo "Cloning PETSc release branch..." + git clone -b release https://gitlab.com/petsc/petsc.git "$PETSC_DIR" + echo "Clone complete." +} + +configure_petsc() { + echo "Configuring PETSc with AMR tools..." + cd "$PETSC_DIR" + + # Downloads and builds: + # AMR: mmg, parmmg, pragmatic, eigen, bison + # Solvers: mumps, scalapack, slepc + # Partitions: metis, parmetis, ptscotch + # BLAS/LAPACK: fblaslapack (Rocky Linux 8 has no guaranteed system BLAS) + # HDF5: downloaded (not provided by pixi in kaiju env) + # cmake: downloaded (spack does not have cmake) + # MPI: spack OpenMPI (not downloaded) + # petsc4py: built during configure + # MPI_DIR is computed from `which mpicc` (spack OpenMPI in PATH). + # LD_LIBRARY_PATH must include $MPI_DIR/lib so PETSc configure test binaries + # can find libmpi.so at runtime (spack uses RPATH for its own binaries but + # does not set LD_LIBRARY_PATH — load_env in uw3_install_kaiju_amr.sh sets it). 
+ MPI_DIR="$(dirname "$(dirname "$(which mpicc)")")" + python3 ./configure \ + --with-petsc-arch="$PETSC_ARCH" \ + --with-debugging=0 \ + --with-mpi-dir="$MPI_DIR" \ + --download-hdf5=1 \ + --download-fblaslapack=1 \ + --download-cmake=1 \ + --download-bison=1 \ + --download-eigen=1 \ + --download-metis=1 \ + --download-parmetis=1 \ + --download-mumps=1 \ + --download-scalapack=1 \ + --download-slepc=1 \ + --download-ptscotch=1 \ + --download-mmg=1 \ + --download-mmg-cmake-arguments="-DMMG_INSTALL_PRIVATE_HEADERS=ON -DUSE_SCOTCH=OFF" \ + --download-parmmg=1 \ + --download-pragmatic=1 \ + --with-pragmatic=1 \ + --with-petsc4py=1 \ + --with-x=0 \ + --with-make-np=40 + + echo "Configure complete." +} + +build_petsc() { + echo "Building PETSc..." + cd "$PETSC_DIR" + + export PETSC_DIR + export PETSC_ARCH + + make all + echo "PETSc build complete." +} + +test_petsc() { + echo "Testing PETSc..." + cd "$PETSC_DIR" + + export PETSC_DIR + export PETSC_ARCH + + make check + echo "PETSc tests complete." +} + +clean_petsc() { + echo "Removing PETSc directory..." + if [ -d "$PETSC_DIR" ]; then + rm -rf "$PETSC_DIR" + echo "Cleaned." + else + echo "Nothing to clean." + fi +} + +show_help() { + echo "Usage: $0 [command]" + echo "" + echo "Commands:" + echo " (none) Full build: clone, configure, build" + echo " clone Clone PETSc repository" + echo " configure Configure PETSc with AMR tools" + echo " build Build PETSc" + echo " test Run PETSc tests" + echo " clean Remove PETSc directory" + echo " help Show this help" +} + +case "${1:-all}" in + all) + clone_petsc + configure_petsc + build_petsc + echo "" + echo "==========================================" + echo "PETSc AMR build complete!" 
+ echo "Set these environment variables:" + echo " export PETSC_DIR=$PETSC_DIR" + echo " export PETSC_ARCH=$PETSC_ARCH" + echo " export PYTHONPATH=\$PETSC_DIR/\$PETSC_ARCH/lib:\$PYTHONPATH" + echo "==========================================" + ;; + clone) clone_petsc ;; + configure) configure_petsc ;; + build) build_petsc ;; + test) test_petsc ;; + clean) clean_petsc ;; + help|--help|-h) show_help ;; + *) + echo "Unknown command: $1" + show_help + exit 1 + ;; +esac diff --git a/pixi.toml b/pixi.toml index 26c7b8d7..ddd20fbf 100644 --- a/pixi.toml +++ b/pixi.toml @@ -229,6 +229,18 @@ PETSC_ARCH = "petsc-4-uw-openmpi" petsc-local-build = { cmd = "./build-petsc.sh", cwd = "petsc-custom" } petsc-local-clean = { cmd = "./build-petsc.sh clean", cwd = "petsc-custom" } +# ============================================ +# KAIJU CLUSTER FEATURE +# ============================================ +# For the Kaiju HPC cluster (Rocky Linux 8, Spack OpenMPI, Slurm) +# Pure Python only — base dependencies cover all pure-Python needs. +# mpi4py, h5py, petsc, petsc4py are built from source against +# spack's OpenMPI using petsc-custom/build-petsc-kaiju.sh +# See: docs/developer/guides/kaiju-cluster-setup.md + +[feature.kaiju] +platforms = ["linux-64"] + # ============================================ # RUNTIME FEATURE (for tutorials/examples) # ============================================ @@ -312,3 +324,7 @@ openmpi-dev = { features = ["conda-petsc-openmpi", "runtime", "dev"], solve-gr amr-openmpi = { features = ["amr-openmpi"], solve-group = "amr-openmpi" } amr-openmpi-dev = { features = ["amr-openmpi", "runtime", "dev"], solve-group = "amr-openmpi" } + +# --- Kaiju Cluster Track (linux-64 only) --- +# Pure Python from pixi; MPI/PETSc/h5py built from source against spack OpenMPI +kaiju = { features = ["kaiju"], solve-group = "kaiju" }