From ff3b0e6108e021e0af9915371db48a867471fe8b Mon Sep 17 00:00:00 2001 From: wcwxy <26245345+ChaoWao@users.noreply.github.com> Date: Wed, 29 Apr 2026 15:31:37 +0800 Subject: [PATCH] Refactor: remove aicpu_build_graph runtime All aicpu_build_graph features have been merged into tensormap_and_ringbuffer; the standalone runtime is no longer needed. - Delete src/a2a3/runtime/aicpu_build_graph/ and the corresponding ST test tree under tests/st/a2a3/aicpu_build_graph/ - Drop the test_discovers_aicpu_build_graph discovery test and the matching skip clause in tests/conftest.py - Drop ABG_EXAMPLE_CASES and the runtime branch from tools/benchmark_rounds.sh; redirect the verify_packaging.sh smoke test to the tensormap_and_ringbuffer paged_attention_unroll case - Remove the runtime from issue templates, .claude rules/skills/commands, READMEs, the per-arch runtime docs, the tensor-dump and dynamic-linking docs, and the L2 perf header comments - Fix stale verification-matrix copy: tools/README.md, the verify_packaging.sh banner, and docs/python-packaging.md all said "5 install paths x 4 entry points" but only two user-facing entry points exist (pytest, standalone test_*.py); update to "x 2" - Update tools/README.md to refer to the single TMR_EXAMPLE_CASES map in benchmark_rounds.sh (was "EXAMPLE_CASES maps ... per runtime" back when ABG had its own map) - Widen the scene-test retry from rc==124 only to any non-zero rc in all four spots (st-sim-a2a3, st-sim-a5, st-onboard-a2a3, st-onboard-a5), so transient PTO-ISA git-clone failures (e.g. SSL_ERROR_SYSCALL) trigger the pinned-commit retry instead of failing the job outright --- .claude/commands/perf-runtime-device.md | 2 +- .claude/commands/test-runtime-device.md | 2 +- .claude/commands/test-runtime-sim.md | 2 +- .claude/rules/architecture.md | 2 +- .claude/skills/benchmark/SKILL.md | 14 +- .github/ISSUE_TEMPLATE/bug_report.yml | 1 - .github/ISSUE_TEMPLATE/performance_issue.yml | 1 - .github/workflows/ci.yml | 14 +- README.md | 3 +- docs/developer-guide.md | 2 - docs/dynamic-linking.md | 2 +- docs/python-packaging.md | 2 +- docs/tensor-dump.md | 17 +- docs/testing.md | 1 - examples/workers/README.md | 4 +- simpler_setup/kernel_compiler.py | 2 +- simpler_setup/tools/swimlane_converter.py | 2 +- src/a2a3/docs/runtimes.md | 34 +- .../include/aicore/l2_perf_collector_aicore.h | 4 +- .../include/aicpu/l2_perf_collector_aicpu.h | 7 +- .../include/common/l2_perf_profiling.h | 8 +- .../aicore/aicore_executor.cpp | 153 -- .../aicpu/aicpu_executor.cpp | 2341 ----------------- .../runtime/aicpu_build_graph/build_config.py | 30 - .../aicpu_build_graph/docs/RUNTIME_LOGIC.md | 31 - .../host/runtime_compile_info.cpp | 27 - .../aicpu_build_graph/host/runtime_maker.cpp | 379 --- .../orchestration/common.cpp | 166 -- .../orchestration/pto_orchestration_api.h | 194 -- .../aicpu_build_graph/runtime/common.h | 70 - .../runtime/pto2_dispatch_payload.h | 43 - .../runtime/pto_orchestrator.cpp | 608 ----- .../runtime/pto_orchestrator.h | 275 -- .../runtime/pto_ring_buffer.cpp | 116 - .../runtime/pto_ring_buffer.h | 619 ----- .../runtime/pto_runtime2.cpp | 183 -- .../aicpu_build_graph/runtime/pto_runtime2.h | 281 -- .../runtime/pto_runtime2_types.h | 431 --- .../runtime/pto_scheduler.cpp | 241 -- .../aicpu_build_graph/runtime/pto_scheduler.h | 729 ----- .../runtime/pto_shared_memory.cpp | 276 -- .../runtime/pto_shared_memory.h | 233 -- .../runtime/pto_submit_types.h | 106 - .../aicpu_build_graph/runtime/pto_types.h | 279 -- .../aicpu_build_graph/runtime/runtime.cpp | 146 - 
.../aicpu_build_graph/runtime/runtime.h | 293 --- .../aicpu_build_graph/runtime/tensor.h | 409 --- .../docs/RUNTIME_LOGIC.md | 12 +- .../include/aicore/l2_perf_collector_aicore.h | 4 +- .../include/aicpu/l2_perf_collector_aicpu.h | 7 +- .../include/common/l2_perf_profiling.h | 8 +- .../docs/RUNTIME_LOGIC.md | 12 +- tests/conftest.py | 5 - .../st/a2a3/aicpu_build_graph/bgemm/README.md | 86 - .../bgemm/kernels/aic/kernel_gemm_tile.cpp | 122 - .../bgemm/kernels/aiv/kernel_tile_add.cpp | 75 - .../kernels/orchestration/bgemm_orch.cpp | 137 - .../aicpu_build_graph/bgemm/test_bgemm.py | 94 - .../orchestration/example_orchestration.cpp | 55 - .../orch_so_cache/test_orch_so_cache.py | 107 - .../paged_attention/kernels/aic/aic_hub.cpp | 28 - .../kernels/aic/aic_pv_matmul.cpp | 113 - .../kernels/aic/aic_qk_matmul.cpp | 114 - .../paged_attention/kernels/aiv/aiv_hub.cpp | 28 - .../kernels/aiv/aiv_online_update.cpp | 255 -- .../kernels/aiv/aiv_softmax_prepare.cpp | 154 -- .../orchestration/paged_attention_orch.cpp | 196 -- .../paged_attention/test_paged_attention.py | 129 - .../kernels/aic/aic_hub.cpp | 28 - .../kernels/aic/aic_pv_matmul.cpp | 152 -- .../kernels/aic/aic_qk_matmul.cpp | 127 - .../kernels/aiv/aiv_hub.cpp | 28 - .../kernels/aiv/aiv_online_update.cpp | 255 -- .../kernels/aiv/aiv_softmax_prepare.cpp | 263 -- .../orchestration/paged_attention_orch.cpp | 370 --- .../test_paged_attention_unroll.py | 152 -- .../vector_example/README.md | 21 - .../vector_example/kernels/aiv/kernel_add.cpp | 73 - .../kernels/aiv/kernel_add_scalar.cpp | 74 - .../vector_example/kernels/aiv/kernel_mul.cpp | 73 - .../kernels/orchestration/orchestration.cpp | 86 - .../vector_example/test_vector_example.py | 85 - tests/ut/py/test_runtime_builder.py | 8 - tools/README.md | 6 +- tools/benchmark_rounds.sh | 18 +- tools/verify_packaging.sh | 4 +- 86 files changed, 69 insertions(+), 12280 deletions(-) delete mode 100644 src/a2a3/runtime/aicpu_build_graph/aicore/aicore_executor.cpp delete mode 100644 src/a2a3/runtime/aicpu_build_graph/aicpu/aicpu_executor.cpp delete mode 100644 src/a2a3/runtime/aicpu_build_graph/build_config.py delete mode 100644 src/a2a3/runtime/aicpu_build_graph/docs/RUNTIME_LOGIC.md delete mode 100644 src/a2a3/runtime/aicpu_build_graph/host/runtime_compile_info.cpp delete mode 100644 src/a2a3/runtime/aicpu_build_graph/host/runtime_maker.cpp delete mode 100644 src/a2a3/runtime/aicpu_build_graph/orchestration/common.cpp delete mode 100644 src/a2a3/runtime/aicpu_build_graph/orchestration/pto_orchestration_api.h delete mode 100644 src/a2a3/runtime/aicpu_build_graph/runtime/common.h delete mode 100644 src/a2a3/runtime/aicpu_build_graph/runtime/pto2_dispatch_payload.h delete mode 100644 src/a2a3/runtime/aicpu_build_graph/runtime/pto_orchestrator.cpp delete mode 100644 src/a2a3/runtime/aicpu_build_graph/runtime/pto_orchestrator.h delete mode 100644 src/a2a3/runtime/aicpu_build_graph/runtime/pto_ring_buffer.cpp delete mode 100644 src/a2a3/runtime/aicpu_build_graph/runtime/pto_ring_buffer.h delete mode 100644 src/a2a3/runtime/aicpu_build_graph/runtime/pto_runtime2.cpp delete mode 100644 src/a2a3/runtime/aicpu_build_graph/runtime/pto_runtime2.h delete mode 100644 src/a2a3/runtime/aicpu_build_graph/runtime/pto_runtime2_types.h delete mode 100644 src/a2a3/runtime/aicpu_build_graph/runtime/pto_scheduler.cpp delete mode 100644 src/a2a3/runtime/aicpu_build_graph/runtime/pto_scheduler.h delete mode 100644 src/a2a3/runtime/aicpu_build_graph/runtime/pto_shared_memory.cpp delete mode 100644 
src/a2a3/runtime/aicpu_build_graph/runtime/pto_shared_memory.h delete mode 100644 src/a2a3/runtime/aicpu_build_graph/runtime/pto_submit_types.h delete mode 100644 src/a2a3/runtime/aicpu_build_graph/runtime/pto_types.h delete mode 100644 src/a2a3/runtime/aicpu_build_graph/runtime/runtime.cpp delete mode 100644 src/a2a3/runtime/aicpu_build_graph/runtime/runtime.h delete mode 100644 src/a2a3/runtime/aicpu_build_graph/runtime/tensor.h delete mode 100644 tests/st/a2a3/aicpu_build_graph/bgemm/README.md delete mode 100644 tests/st/a2a3/aicpu_build_graph/bgemm/kernels/aic/kernel_gemm_tile.cpp delete mode 100644 tests/st/a2a3/aicpu_build_graph/bgemm/kernels/aiv/kernel_tile_add.cpp delete mode 100644 tests/st/a2a3/aicpu_build_graph/bgemm/kernels/orchestration/bgemm_orch.cpp delete mode 100644 tests/st/a2a3/aicpu_build_graph/bgemm/test_bgemm.py delete mode 100644 tests/st/a2a3/aicpu_build_graph/orch_so_cache/kernels/orchestration/example_orchestration.cpp delete mode 100644 tests/st/a2a3/aicpu_build_graph/orch_so_cache/test_orch_so_cache.py delete mode 100644 tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aic/aic_hub.cpp delete mode 100644 tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp delete mode 100644 tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp delete mode 100644 tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aiv/aiv_hub.cpp delete mode 100644 tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp delete mode 100644 tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp delete mode 100644 tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp delete mode 100644 tests/st/a2a3/aicpu_build_graph/paged_attention/test_paged_attention.py delete mode 100644 tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/kernels/aic/aic_hub.cpp delete mode 100644 tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/kernels/aic/aic_pv_matmul.cpp delete mode 100644 tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/kernels/aic/aic_qk_matmul.cpp delete mode 100644 tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/kernels/aiv/aiv_hub.cpp delete mode 100644 tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/kernels/aiv/aiv_online_update.cpp delete mode 100644 tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/kernels/aiv/aiv_softmax_prepare.cpp delete mode 100644 tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp delete mode 100644 tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/test_paged_attention_unroll.py delete mode 100644 tests/st/a2a3/aicpu_build_graph/vector_example/README.md delete mode 100644 tests/st/a2a3/aicpu_build_graph/vector_example/kernels/aiv/kernel_add.cpp delete mode 100644 tests/st/a2a3/aicpu_build_graph/vector_example/kernels/aiv/kernel_add_scalar.cpp delete mode 100644 tests/st/a2a3/aicpu_build_graph/vector_example/kernels/aiv/kernel_mul.cpp delete mode 100644 tests/st/a2a3/aicpu_build_graph/vector_example/kernels/orchestration/orchestration.cpp delete mode 100644 tests/st/a2a3/aicpu_build_graph/vector_example/test_vector_example.py diff --git a/.claude/commands/perf-runtime-device.md b/.claude/commands/perf-runtime-device.md index 4b4958103..6ca944a29 100644 --- a/.claude/commands/perf-runtime-device.md +++ b/.claude/commands/perf-runtime-device.md @@ -4,7 +4,7 @@ If `$ARGUMENTS` is provided, use it as the runtime name. 
Otherwise, default to `tensormap_and_ringbuffer`. Reference `tools/benchmark_rounds.sh` for the full implementation pattern (device log resolution, timing parsing, reporting format). -1. Validate the runtime is one of: `host_build_graph`, `aicpu_build_graph`, `tensormap_and_ringbuffer`. If not, list valid runtimes and stop. +1. Validate the runtime is one of: `host_build_graph`, `tensormap_and_ringbuffer`. If not, list valid runtimes and stop. 2. Check `command -v npu-smi` — if not found, tell the user this requires hardware and stop. 3. **Detect platform**: Run `npu-smi info` and parse the chip name. Map `910B`/`910C` → `a2a3`, `950` → `a5`. If unrecognized, warn and default to `a2a3`. 4. Find the lowest-ID idle device (HBM-Usage = 0) from the `npu-smi info` output. If none, stop. diff --git a/.claude/commands/test-runtime-device.md b/.claude/commands/test-runtime-device.md index 5551d1d12..889f67417 100644 --- a/.claude/commands/test-runtime-device.md +++ b/.claude/commands/test-runtime-device.md @@ -1,6 +1,6 @@ # Run hardware device tests for a single runtime specified by $ARGUMENTS -1. Validate that `$ARGUMENTS` is one of: `host_build_graph`, `aicpu_build_graph`, `tensormap_and_ringbuffer`. If not, list the valid runtimes and stop. +1. Validate that `$ARGUMENTS` is one of: `host_build_graph`, `tensormap_and_ringbuffer`. If not, list the valid runtimes and stop. 2. Check `command -v npu-smi` — if not found, tell the user to use `/test-runtime-sim` instead and stop. 3. **Detect platform**: Run `npu-smi info` and parse the chip name. Map `910B`/`910C` → `a2a3`, `950` → `a5`. If unrecognized, warn and default to `a2a3`. 4. Read `.github/workflows/ci.yml` to extract the current `--pto-isa-commit` and `--pto-session-timeout` values from the `st-onboard-` job's `pytest` invocation. diff --git a/.claude/commands/test-runtime-sim.md b/.claude/commands/test-runtime-sim.md index 59e2844c7..3f0a9e9da 100644 --- a/.claude/commands/test-runtime-sim.md +++ b/.claude/commands/test-runtime-sim.md @@ -1,6 +1,6 @@ # Run simulation tests for a single runtime specified by $ARGUMENTS -1. Validate that `$ARGUMENTS` is one of: `host_build_graph`, `aicpu_build_graph`, `tensormap_and_ringbuffer`. If not, list the valid runtimes and stop. +1. Validate that `$ARGUMENTS` is one of: `host_build_graph`, `tensormap_and_ringbuffer`. If not, list the valid runtimes and stop. 2. Read `.github/workflows/ci.yml` to extract the current `--pto-isa-commit` and `--pto-session-timeout` values from the `st-sim-*` jobs' `pytest` invocations. 3. **Detect platform**: If `npu-smi` is available, parse the chip name from `npu-smi info`. Map `910B`/`910C` → `a2a3sim`, `950` → `a5sim`. If `npu-smi` is not found, default to `a2a3sim`. 4. 
Run: diff --git a/.claude/rules/architecture.md b/.claude/rules/architecture.md index 905422467..7302f0a52 100644 --- a/.claude/rules/architecture.md +++ b/.claude/rules/architecture.md @@ -5,7 +5,7 @@ See [docs/chip-level-arch.md](../../docs/chip-level-arch.md) for the full diagra ## Key Concepts - **Three programs**: Host `.so`, AICPU `.so`, AICore `.o` — compiled independently, linked at runtime -- **Three runtimes** under `src/{arch}/runtime/`: `host_build_graph`, `aicpu_build_graph`, `tensormap_and_ringbuffer` +- **Two runtimes** under `src/{arch}/runtime/`: `host_build_graph`, `tensormap_and_ringbuffer` - **Two platform backends** under `src/{arch}/platform/`: `onboard/` (hardware), `sim/` (simulation) ## Python Package Layout diff --git a/.claude/skills/benchmark/SKILL.md b/.claude/skills/benchmark/SKILL.md index 39bd24a34..130f87ea3 100644 --- a/.claude/skills/benchmark/SKILL.md +++ b/.claude/skills/benchmark/SKILL.md @@ -45,20 +45,8 @@ The `-d` flag specifies NPU device IDs. `tools/benchmark_rounds.sh` supports `-r <runtime>`: - `tensormap_and_ringbuffer` (default) -- `aicpu_build_graph` -Each runtime has its own example list defined at the top of the script (`TMR_EXAMPLE_CASES` / `ABG_EXAMPLE_CASES`). - -**Auto-detection (compare mode only):** Always benchmark TMR. Also benchmark `aicpu_build_graph` if the diff touches its files: - -```bash -RUNTIMES_TO_BENCH=(tensormap_and_ringbuffer) -if git diff --name-only "$MERGE_BASE"...HEAD | grep -q 'aicpu_build_graph'; then - RUNTIMES_TO_BENCH+=(aicpu_build_graph) -fi -``` - -Run `benchmark_rounds.sh` once per runtime, with `-r <runtime>` appended. **Runtimes are always benchmarked serially** — finish all baseline+current runs for one runtime before starting the next. This ensures no device ever runs two benchmark processes concurrently. +The example list is defined at the top of the script (`TMR_EXAMPLE_CASES`). ## Step 1: Detect Mode diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index 81131c554..6a9532444 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -29,7 +29,6 @@ body: description: Which runtime variant is affected? options: - tensormap_and_ringbuffer - - aicpu_build_graph - host_build_graph - All / Unknown validations: diff --git a/.github/ISSUE_TEMPLATE/performance_issue.yml b/.github/ISSUE_TEMPLATE/performance_issue.yml index fffdfa5ea..918363032 100644 --- a/.github/ISSUE_TEMPLATE/performance_issue.yml +++ b/.github/ISSUE_TEMPLATE/performance_issue.yml @@ -29,7 +29,6 @@ body: description: Which runtime variant is affected? options: - tensormap_and_ringbuffer - - aicpu_build_graph - host_build_graph - All / Unknown validations: diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index fb32ca3f5..ddc403802 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -209,8 +209,8 @@ jobs: set +e pytest examples tests/st --platform a2a3sim --device 0-15 -v --pto-session-timeout 600 --clone-protocol https rc=$? - if [ $rc -eq 124 ]; then - echo "pytest timed out; retrying with pinned PTO-ISA commit" + if [ $rc -ne 0 ]; then + echo "pytest failed with rc=$rc; retrying with pinned PTO-ISA commit" pytest examples tests/st --platform a2a3sim --device 0-15 -v \ --pto-session-timeout 600 --pto-isa-commit ${{ env.PTO_ISA_COMMIT }} --clone-protocol https rc=$? @@ -267,8 +267,8 @@ jobs: set +e pytest examples tests/st --platform a5sim --device 0-15 -v --pto-session-timeout 600 --clone-protocol https rc=$? 
- if [ $rc -eq 124 ]; then - echo "pytest timed out; retrying with pinned PTO-ISA commit" + if [ $rc -ne 0 ]; then + echo "pytest failed with rc=$rc; retrying with pinned PTO-ISA commit" pytest examples tests/st --platform a5sim --device 0-15 -v \ --pto-session-timeout 600 --pto-isa-commit ${{ env.PTO_ISA_COMMIT }} --clone-protocol https rc=$? @@ -338,8 +338,8 @@ jobs: source .venv/bin/activate python -m pytest examples tests/st --platform a2a3 --device ${DEVICE_RANGE} -v --pto-session-timeout 600 --clone-protocol https rc=$? - if [ $rc -eq 124 ]; then - echo "pytest timed out; retrying with pinned PTO-ISA commit" + if [ $rc -ne 0 ]; then + echo "pytest failed with rc=$rc; retrying with pinned PTO-ISA commit" python -m pytest examples tests/st --platform a2a3 --device ${DEVICE_RANGE} -v \ --pto-session-timeout 600 --pto-isa-commit ${{ env.PTO_ISA_COMMIT }} --clone-protocol https rc=$? @@ -450,4 +450,4 @@ jobs: source .venv/bin/activate DEVICE_LIST=$(python -c "s,e='${DEVICE_RANGE}'.split('-'); print(','.join(str(i) for i in range(int(s),int(e)+1)))") PYTEST="python -m pytest examples tests/st --platform a5 --device ${DEVICE_RANGE} -v --clone-protocol https" - task-submit --timeout 1800 --max-time 1800 --device "$DEVICE_LIST" --run "set +e; $PYTEST --pto-session-timeout 1200; rc=\$?; if [ \$rc -eq 124 ]; then echo 'pytest timed out; retrying with pinned PTO-ISA commit'; $PYTEST --pto-session-timeout 1200 --pto-isa-commit ${{ env.PTO_ISA_COMMIT }} --clone-protocol https; rc=\$?; fi; exit \$rc" + task-submit --timeout 1800 --max-time 1800 --device "$DEVICE_LIST" --run "set +e; $PYTEST --pto-session-timeout 1200; rc=\$?; if [ \$rc -ne 0 ]; then echo \"pytest failed with rc=\$rc; retrying with pinned PTO-ISA commit\"; $PYTEST --pto-session-timeout 1200 --pto-isa-commit ${{ env.PTO_ISA_COMMIT }} --clone-protocol https; rc=\$?; fi; exit \$rc" diff --git a/README.md b/README.md index 8d7652d48..3dda00399 100644 --- a/README.md +++ b/README.md @@ -29,12 +29,11 @@ PTO ISA headers are automatically cloned on first run. See [Getting Started](doc ## Runtime Variants -Three runtimes under `src/{arch}/runtime/`, each with a different graph-building strategy: +Two runtimes under `src/{arch}/runtime/`, each with a different graph-building strategy: | Runtime | Graph built on | Use case | | ------- | -------------- | -------- | | `host_build_graph` | Host CPU | Development, debugging | -| `aicpu_build_graph` | AICPU (device) | Reduced host-device transfer | | `tensormap_and_ringbuffer` | AICPU (device) | Production workloads | See runtime docs per arch: [a2a3](src/a2a3/docs/runtimes.md), [a5](src/a5/docs/runtimes.md). 
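For reviewers, the retry shape now shared by the four scene-test jobs reduces to the sketch below. This is a minimal, hypothetical standalone form of the st-sim-a2a3 step above, not a file in the repo; `$PTO_ISA_COMMIT` stands in for the workflow's `${{ env.PTO_ISA_COMMIT }}`.

```bash
# Sketch of the widened retry pattern (assumed standalone form of the
# st-sim-a2a3 step; the real jobs inline this logic in ci.yml).
set +e
pytest examples tests/st --platform a2a3sim --device 0-15 -v \
  --pto-session-timeout 600 --clone-protocol https
rc=$?
if [ $rc -ne 0 ]; then
  # Previously only rc==124 (timeout) retried; now any failure does, so a
  # transient PTO-ISA clone error (e.g. SSL_ERROR_SYSCALL) also gets the
  # pinned-commit retry instead of failing the job outright.
  echo "pytest failed with rc=$rc; retrying with pinned PTO-ISA commit"
  pytest examples tests/st --platform a2a3sim --device 0-15 -v \
    --pto-session-timeout 600 --pto-isa-commit "$PTO_ISA_COMMIT" --clone-protocol https
  rc=$?
fi
exit $rc
```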
diff --git a/docs/developer-guide.md b/docs/developer-guide.md index b7722d989..e106f792f 100644 --- a/docs/developer-guide.md +++ b/docs/developer-guide.md @@ -22,7 +22,6 @@ pto-runtime/ │ └── runtime/ # Runtime implementations │ ├── common/ # Shared components across runtimes │ ├── host_build_graph/ # Host-built graph runtime -│ ├── aicpu_build_graph/ # AICPU-built graph runtime │ └── tensormap_and_ringbuffer/ # Advanced production runtime │ ├── python/ # Language bindings @@ -55,7 +54,6 @@ pto-runtime/ ├── examples/ # Working examples │ └── {arch}/ # Architecture-specific examples │ ├── host_build_graph/ -│ ├── aicpu_build_graph/ │ └── tensormap_and_ringbuffer/ │ ├── tests/ # Test suite diff --git a/docs/dynamic-linking.md b/docs/dynamic-linking.md index 546ce30ab..2c622cf3d 100644 --- a/docs/dynamic-linking.md +++ b/docs/dynamic-linking.md @@ -221,7 +221,7 @@ SchedulerContext owns its own teardown: (`initialized_`, `init_done_`, `init_failed_`, `finished_`, `thread_idx_`, `finished_count_`). -Applies to all 5 runtime executors: a2a3 (abg, hbg, tmr), a5 (hbg, tmr). +Applies to all 4 runtime executors: a2a3 (hbg, tmr), a5 (hbg, tmr). ## SO Handle Caching and Reuse diff --git a/docs/python-packaging.md b/docs/python-packaging.md index a64dc4c7a..05da8ec8b 100644 --- a/docs/python-packaging.md +++ b/docs/python-packaging.md @@ -96,7 +96,7 @@ Plus one build-time entry point invoked by CMake during `pip install`: ## Install modes -Five install paths × four entry points = the verification matrix. CI enforces the matrix on macOS and Ubuntu via `.github/workflows/ci.yml::packaging-matrix`. +Five install paths × two entry points = the verification matrix. CI enforces the matrix on macOS and Ubuntu via `.github/workflows/ci.yml::packaging-matrix`. ### Mode-by-mode diff --git a/docs/tensor-dump.md b/docs/tensor-dump.md index 0f44061e3..1a79fa40a 100644 --- a/docs/tensor-dump.md +++ b/docs/tensor-dump.md @@ -6,8 +6,8 @@ runtime observability feature: host pre-allocates buffers on device, AICPU writes records during execution, host collects data and exports JSON manifest + binary payload. -Supported on both architectures (`a2a3` / `a5`) and all three runtimes -(`host_build_graph`, `aicpu_build_graph`, `tensormap_and_ringbuffer`). +Supported on both architectures (`a2a3` / `a5`) and both runtimes +(`host_build_graph`, `tensormap_and_ringbuffer`). Opt-in via `--dump-tensor` — zero overhead when disabled. The **primary design** (a2a3) uses shared memory (`halHostRegister`) + @@ -250,8 +250,8 @@ all device-side writes were globally visible. AICPU only has device addresses and sizes — it does **not** know the logical shape / dtype / view geometry of each tensor unless the runtime -registers it. Each of the three runtimes exposes metadata through a -slightly different path, but they all converge on `TensorInfo` (see +registers it. Each runtime exposes metadata through a slightly different +path, but they all converge on `TensorInfo` (see [`tensor_info.h`](../src/a5/runtime/host_build_graph/runtime/tensor_info.h)): - **`host_build_graph`** — two orchestration-side APIs: @@ -261,11 +261,10 @@ slightly different path, but they all converge on `TensorInfo` (see See [`dump_tensor_orch.cpp`](../tests/st/a5/host_build_graph/dump_tensor_example/kernels/orchestration/dump_tensor_orch.cpp) for both styles in one file. -- **`aicpu_build_graph`** — runtime layer fills `TensorInfo` from - `PTO2TaskPayload::tensors[]` directly. No orchestration API needed. 
-- **`tensormap_and_ringbuffer`** — identical to `aicpu_build_graph`; - the ring buffer carries `PTO2TaskPayload` which already contains - shape/offset arrays. +- **`tensormap_and_ringbuffer`** — runtime layer fills `TensorInfo` + from `PTO2TaskPayload::tensors[]` directly. The ring buffer carries + `PTO2TaskPayload` which already contains shape/offset arrays, so no + orchestration API is needed. When metadata is missing or inconsistent, the task is **skipped for dump** and a single `LOG_WARN` is emitted (guarded by diff --git a/docs/testing.md b/docs/testing.md index 52a80e76b..d2a73ada0 100644 --- a/docs/testing.md +++ b/docs/testing.md @@ -420,7 +420,6 @@ pytest tests/ut --platform a2a3 Small, fast examples that run on both simulation and real hardware. Organized by runtime: - `host_build_graph/` — HBG examples -- `aicpu_build_graph/` — ABG examples - `tensormap_and_ringbuffer/` — TMR examples Each example has a `golden.py` with `generate_inputs()` and `compute_golden()` for result validation. diff --git a/examples/workers/README.md b/examples/workers/README.md index da193e1be..a7c5176d1 100644 --- a/examples/workers/README.md +++ b/examples/workers/README.md @@ -35,8 +35,8 @@ workers/ Why no `tensormap_and_ringbuffer/` layer? Because every example here hard-codes `runtime="tensormap_and_ringbuffer"` in its `Worker(...)` call — that is the -default user-facing runtime. Other runtimes (`host_build_graph`, -`aicpu_build_graph`) are covered by scene tests under `tests/st/`, not here. +default user-facing runtime. The other runtime (`host_build_graph`) is +covered by scene tests under `tests/st/`, not here. ## Prerequisites diff --git a/simpler_setup/kernel_compiler.py b/simpler_setup/kernel_compiler.py index 831d83adb..ef5f8be94 100644 --- a/simpler_setup/kernel_compiler.py +++ b/simpler_setup/kernel_compiler.py @@ -372,7 +372,7 @@ def compile_orchestration( Args: runtime_name: Name of the runtime (e.g., "host_build_graph", - "tensormap_and_ringbuffer", "aicpu_build_graph") + "tensormap_and_ringbuffer") source_path: Path to orchestration source file (.cpp) extra_include_dirs: Additional include directories (merged with the runtime/platform include dirs) diff --git a/simpler_setup/tools/swimlane_converter.py b/simpler_setup/tools/swimlane_converter.py index e30ad3b04..3d3c789c9 100644 --- a/simpler_setup/tools/swimlane_converter.py +++ b/simpler_setup/tools/swimlane_converter.py @@ -1015,7 +1015,7 @@ def generate_chrome_trace_json( # noqa: PLR0912, PLR0915 # Orchestrator → scheduler dispatch: # - Prefer orch_fanin end → dispatch (explicit deps / fanin path). - # - If no orch_fanin for this task (e.g. aicpu_build_graph without fanin records), use orch_params end → dispatch. + # - If no orch_fanin for this task, use orch_params end → dispatch. if orchestrator_phases and scheduler_phases: orch_fanin_by_task = {} orch_params_by_task = {} diff --git a/src/a2a3/docs/runtimes.md b/src/a2a3/docs/runtimes.md index de5ff380a..a7cd861ee 100644 --- a/src/a2a3/docs/runtimes.md +++ b/src/a2a3/docs/runtimes.md @@ -1,20 +1,20 @@ # Runtime Variants (a2a3) -Three runtime implementations live under `src/a2a3/runtime/`, each providing a different graph-building strategy. The `RUNTIME_CONFIG.runtime` field in `kernel_config.py` selects which runtime to use. +Two runtime implementations live under `src/a2a3/runtime/`, each providing a different graph-building strategy. The `RUNTIME_CONFIG.runtime` field in `kernel_config.py` selects which runtime to use. 
## Comparison -| Feature | host_build_graph | aicpu_build_graph | tensormap_and_ringbuffer | -| ------- | ---------------- | ----------------- | ------------------------ | -| Graph built on | Host CPU | AICPU (device) | AICPU (device) | -| Task storage | Fixed `Task[]` array | Fixed `Task[]` array | Ring buffer (`PTO2TaskDescriptor[]`) | -| Dependencies | Explicit edges | Explicit edges | Auto-derived via TensorMap | -| Memory management | Host-side | Host + device malloc | Ring buffer heap (GM) | -| Concurrent build+schedule | No | Optional (`build_mode=1`) | Yes (always) | -| Profiling support | Basic | Basic | Multi-level hierarchy | -| Batch/streaming | No | No | Yes (flow control, back-pressure) | -| Thread model | N scheduler threads | 1 builder + N schedulers | 1 orchestrator + 3 schedulers | -| Use case | Development, debugging | Reduced host-device transfer | Production workloads | +| Feature | host_build_graph | tensormap_and_ringbuffer | +| ------- | ---------------- | ------------------------ | +| Graph built on | Host CPU | AICPU (device) | +| Task storage | Fixed `Task[]` array | Ring buffer (`PTO2TaskDescriptor[]`) | +| Dependencies | Explicit edges | Auto-derived via TensorMap | +| Memory management | Host-side | Ring buffer heap (GM) | +| Concurrent build+schedule | No | Yes (always) | +| Profiling support | Basic | Multi-level hierarchy | +| Batch/streaming | No | Yes (flow control, back-pressure) | +| Thread model | N scheduler threads | 1 orchestrator + 3 schedulers | +| Use case | Development, debugging | Production workloads | ## host_build_graph @@ -26,16 +26,6 @@ The simplest runtime. The host CPU builds the complete task dependency graph bef See [host_build_graph/docs/RUNTIME_LOGIC.md](../runtime/host_build_graph/docs/RUNTIME_LOGIC.md) for details. -## aicpu_build_graph - -Orchestration runs on an AICPU thread, building the task graph on device. Supports concurrent build + schedule (`build_mode=1`). - -- Same task array as host_build_graph -- Device-side API: `add_task`, `add_successor_conditional`, `publish_task`, `device_malloc` -- Reduces host-device data transfer; graph can depend on device-side data - -See [aicpu_build_graph/docs/RUNTIME_LOGIC.md](../runtime/aicpu_build_graph/docs/RUNTIME_LOGIC.md) for details. - ## tensormap_and_ringbuffer (PTO2) The primary production runtime. Uses ring buffers for task slots and output memory, with a TensorMap for automatic dependency tracking. diff --git a/src/a2a3/platform/include/aicore/l2_perf_collector_aicore.h b/src/a2a3/platform/include/aicore/l2_perf_collector_aicore.h index 554e4af62..95c317117 100644 --- a/src/a2a3/platform/include/aicore/l2_perf_collector_aicore.h +++ b/src/a2a3/platform/include/aicore/l2_perf_collector_aicore.h @@ -38,8 +38,8 @@ * Buffer management and final commit are handled by AICPU. * * AICore writes L2PerfRecord.task_id as the register dispatch token (low 32 bits, zero-extended). - * For multi-ring runtimes (tensormap_and_ringbuffer, aicpu_build_graph), AICPU overwrites - * with the full (ring_id << 32) | local_id encoding after handshake match. + * For tensormap_and_ringbuffer, AICPU overwrites with the full (ring_id << 32) | local_id + * encoding after handshake match. 
* * @param l2_perf_buf Performance buffer pointer * @param task_id Register dispatch id (DATA_MAIN_BASE), stored in task_id low 32 bits diff --git a/src/a2a3/platform/include/aicpu/l2_perf_collector_aicpu.h b/src/a2a3/platform/include/aicpu/l2_perf_collector_aicpu.h index 80b62c88a..131420cbb 100644 --- a/src/a2a3/platform/include/aicpu/l2_perf_collector_aicpu.h +++ b/src/a2a3/platform/include/aicpu/l2_perf_collector_aicpu.h @@ -127,7 +127,7 @@ void l2_perf_aicpu_init_phase_profiling(Runtime *runtime, int num_sched_threads) * @param loop_iter Current loop iteration number * @param tasks_processed Number of tasks processed in this batch (scheduler phases), or * full PTO2 task_id encoding (ring_id << 32) | local_id (orchestrator - * phases in multi-ring runtimes: tensormap_and_ringbuffer, aicpu_build_graph) + * phases in tensormap_and_ringbuffer) */ void l2_perf_aicpu_record_phase( int thread_idx, AicpuPhaseId phase_id, uint64_t start_time, uint64_t end_time, uint32_t loop_iter, @@ -164,9 +164,8 @@ void l2_perf_aicpu_set_orch_thread_idx(int thread_idx); * @param start_time Phase start timestamp * @param end_time Phase end timestamp * @param submit_idx Task submission index (acts as loop_iter) - * @param task_id Task identifier. For multi-ring runtimes (tensormap_and_ringbuffer, aicpu_build_graph), this is the - * full PTO2 encoding: (ring_id << 32) | local_id, enabling cross-view correlation between orchestrator and scheduler - * swimlanes. + * @param task_id Task identifier. For tensormap_and_ringbuffer, this is the full PTO2 encoding: + * (ring_id << 32) | local_id, enabling cross-view correlation between orchestrator and scheduler swimlanes. */ void l2_perf_aicpu_record_orch_phase( AicpuPhaseId phase_id, uint64_t start_time, uint64_t end_time, uint32_t submit_idx, uint64_t task_id diff --git a/src/a2a3/platform/include/common/l2_perf_profiling.h b/src/a2a3/platform/include/common/l2_perf_profiling.h index d13bcd94a..3f82eeb4e 100644 --- a/src/a2a3/platform/include/common/l2_perf_profiling.h +++ b/src/a2a3/platform/include/common/l2_perf_profiling.h @@ -83,8 +83,8 @@ struct L2PerfRecord { uint64_t finish_time; // AICPU timestamp: when AICPU observed task completion // AICore writes the register dispatch token (low 32 bits only) zero-extended into task_id. - // For multi-ring runtimes (tensormap_and_ringbuffer, aicpu_build_graph), AICPU overwrites - // with the full PTO2 encoding (ring_id << 32) | local_id after FIN/perf row match. + // For tensormap_and_ringbuffer, AICPU overwrites with the full PTO2 encoding + // (ring_id << 32) | local_id after FIN/perf row match. // For host_build_graph, task_id stays as the plain integer task index (ring_id = 0). uint64_t task_id; uint32_t func_id; // Kernel function identifier @@ -273,8 +273,8 @@ struct AicpuPhaseRecord { uint32_t loop_iter; // Loop iteration number AicpuPhaseId phase_id; // Phase type union { - uint64_t task_id; // Multi-ring runtimes (tensormap_and_ringbuffer, aicpu_build_graph): - // full PTO2 encoding (ring_id << 32) | local_id for cross-view correlation. + uint64_t task_id; // tensormap_and_ringbuffer: full PTO2 encoding + // (ring_id << 32) | local_id for cross-view correlation. 
uint64_t tasks_processed; // Scheduler phases: number of tasks processed in this batch }; }; diff --git a/src/a2a3/runtime/aicpu_build_graph/aicore/aicore_executor.cpp b/src/a2a3/runtime/aicpu_build_graph/aicore/aicore_executor.cpp deleted file mode 100644 index 2a356485d..000000000 --- a/src/a2a3/runtime/aicpu_build_graph/aicore/aicore_executor.cpp +++ /dev/null @@ -1,153 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ - -#include "aicore/aicore.h" -#include "aicore/l2_perf_collector_aicore.h" -#include "aicore/pmu_collector_aicore.h" -#include "common/l2_perf_profiling.h" -#include "common/platform_config.h" // Register-based communication -#include "pto2_dispatch_payload.h" -#include "runtime.h" - -/** - * Unified function pointer type for kernel dispatch - * - * All kernels follow the same signature: void kernel(__gm__ int64_t* args) - * This enables simple, switch-free dispatch. - */ -typedef void (*UnifiedKernelFunc)(__gm__ int64_t *); - -/** - * Execute task from PTO2DispatchPayload. - * - * Reads function_bin_addr and args from the dispatch payload. - * - * @param payload Pointer to PTO2DispatchPayload in global memory - */ -__aicore__ __attribute__((always_inline)) static void execute_task(__gm__ PTO2DispatchPayload *payload) { - if (payload == nullptr || payload->function_bin_addr == 0) { - return; - } - - UnifiedKernelFunc kernel = (UnifiedKernelFunc)payload->function_bin_addr; - kernel(reinterpret_cast<__gm__ int64_t *>(payload->args)); - OUT_OF_ORDER_STORE_BARRIER(); -} - -/** - * AICore main execution loop - * - * Implements the AICPU-AICore register-based dispatch protocol: - * 1. Wait for AICPU ready signal via handshake buffer - * 2. Report physical core ID and core type, signal AICore ready - * 3. Poll DATA_MAIN_BASE register for task dispatch until exit signal - * - * Task dispatch reads PTO2DispatchPayload address from Handshake.task. - * Task ID is derived from the register value (task_id + 1 encoding). 
- * - * @param runtime Pointer to Runtime in global memory - * @param block_idx Block index (core ID) - * @param core_type Core type (AIC or AIV) - */ -__aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime *runtime, int block_idx, CoreType core_type) { - __gm__ Handshake *my_hank = (__gm__ Handshake *)(&runtime->workers[block_idx]); - - // Phase 1: Wait for AICPU initialization signal - while (my_hank->aicpu_ready == 0) { - dcci(my_hank, SINGLE_CACHE_LINE); - } - - // Phase 2: Report physical core ID, signal ready - my_hank->physical_core_id = get_physical_core_id(); - OUT_OF_ORDER_STORE_BARRIER(); - my_hank->aicore_regs_ready = 1; - dcci(&my_hank->aicore_regs_ready, SINGLE_CACHE_LINE, CACHELINE_OUT); - while (my_hank->aicpu_regs_ready == 0) { - dcci(&my_hank->aicpu_regs_ready, SINGLE_CACHE_LINE); - } - // Report initial idle status via register - write_reg(RegId::COND, AICORE_IDLE_VALUE); - - // Phase 3: Report core type, signal ready - my_hank->core_type = core_type; - OUT_OF_ORDER_STORE_BARRIER(); - my_hank->aicore_done = block_idx + 1; // Signal ready (use block_idx + 1 to avoid 0) - - dcci(my_hank, SINGLE_CACHE_LINE, CACHELINE_OUT); - - // Cache payload address (set once by AICPU during initialization, never changes) - __gm__ PTO2DispatchPayload *payload = reinterpret_cast<__gm__ PTO2DispatchPayload *>(my_hank->task); - - bool l2_perf_enabled = GET_PROFILING_FLAG(my_hank->enable_profiling_flag, PROFILING_FLAG_L2_SWIMLANE); - bool dump_tensor_enabled = GET_PROFILING_FLAG(my_hank->enable_profiling_flag, PROFILING_FLAG_DUMP_TENSOR); - bool pmu_enabled = GET_PROFILING_FLAG(my_hank->enable_profiling_flag, PROFILING_FLAG_PMU); - - // Phase 4: Main execution loop - poll register for tasks until exit signal - // Register encoding: AICPU_IDLE_TASK_ID=idle, task_id=task, AICORE_EXIT_SIGNAL=exit - uint32_t reg_val = AICPU_IDLE_TASK_ID; - uint32_t last_reg_val = AICPU_IDLE_TASK_ID; - - while (true) { - reg_val = static_cast(read_reg(RegId::DATA_MAIN_BASE)); - if (reg_val == AICORE_EXIT_SIGNAL) { - // Signal exit acknowledgment to AICPU - write_reg(RegId::COND, AICORE_EXITED_VALUE); - break; - } - - // Execute task if new (reg_val encoding: AICPU_IDLE_TASK_ID=idle, task_id=task) - if (reg_val == AICPU_IDLE_TASK_ID || reg_val == last_reg_val) { - SPIN_WAIT_HINT(); - continue; - } - - { - uint32_t task_id = reg_val; // Decode: register holds task_id directly - - // Invalidate payload buffer (AICPU updates its content each dispatch) - dcci(payload, ENTIRE_DATA_CACHE); - - write_reg(RegId::COND, MAKE_ACK_VALUE(task_id)); - - // Performance profiling: record start time - uint64_t start_time = get_sys_cnt_aicore(); - - if (pmu_enabled) { - pmu_aicore_begin(); - } - - // Execute the task - execute_task(payload); - - if (pmu_enabled) { - pmu_aicore_end(); - } - - if (dump_tensor_enabled) { - pipe_barrier(PIPE_ALL); - } - - // Performance profiling: record task execution - // (func_id and core_type are filled by AICPU at completion time) - if (l2_perf_enabled) { - uint64_t end_time = get_sys_cnt_aicore(); - __gm__ L2PerfBuffer *l2_perf_buf = (__gm__ L2PerfBuffer *)my_hank->l2_perf_records_addr; - l2_perf_aicore_record_task(l2_perf_buf, task_id, start_time, end_time); - } - - last_reg_val = reg_val; - write_reg(RegId::COND, MAKE_FIN_VALUE(task_id)); - } - } - - // Flush all dirty cache lines to HBM before kernel exit. 
- dcci(my_hank, SINGLE_CACHE_LINE, CACHELINE_OUT); -} diff --git a/src/a2a3/runtime/aicpu_build_graph/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/aicpu_build_graph/aicpu/aicpu_executor.cpp deleted file mode 100644 index 13cdd52f7..000000000 --- a/src/a2a3/runtime/aicpu_build_graph/aicpu/aicpu_executor.cpp +++ /dev/null @@ -1,2341 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#ifdef __linux__ -#include -#endif - -#include "aicpu/device_log.h" -#include "aicpu/device_time.h" -#include "aicpu/orch_so_file.h" -#include "pto2_dispatch_payload.h" -#include "runtime.h" -#include "spin_hint.h" - -// Runtime headers (full struct definition for create/destroy + PTO2_SCOPE) -#include "pto_runtime2.h" -#include "pto_runtime2_types.h" -#include "pto_shared_memory.h" - -// Performance profiling headers -#include "aicpu/l2_perf_collector_aicpu.h" -#include "aicpu/pmu_collector_aicpu.h" -#include "aicpu/tensor_dump_aicpu.h" -#include "common/memory_barrier.h" -#include "common/l2_perf_profiling.h" -#include "common/unified_log.h" - -// Register-based communication -#include "aicpu/platform_regs.h" -#include "common/platform_config.h" - -// Core type definitions -#include "common/core_type.h" - -// CoreCallable for resolved dispatch address -#include "callable.h" - -#if PTO2_PROFILING -// Accumulated nanoseconds per sub-step -#define CYCLE_COUNT_START() uint64_t _t0 = get_sys_cnt_aicpu(), _t1 -#define CYCLE_COUNT_LAP(acc) \ - do { \ - _t1 = get_sys_cnt_aicpu(); \ - acc += (_t1 - _t0); \ - _t0 = _t1; \ - } while (0) -#else -#define CYCLE_COUNT_START() -#define CYCLE_COUNT_LAP(acc) -#endif - -// Device orchestration function signature (loaded via dlopen). -// The orchestration .so receives a PTO2Runtime* (with ops table populated) -// instead of a raw shared-memory pointer. 
-typedef void (*DeviceOrchestrationFunc)(PTO2Runtime *rt, const ChipStorageTaskArgs &orch_args); - -// Config function exported by orchestration .so -typedef PTO2OrchestrationConfig (*DeviceOrchestrationConfigFunc)(const ChipStorageTaskArgs &orch_args); - -constexpr int32_t MAX_AICPU_THREADS = PLATFORM_MAX_AICPU_THREADS; -constexpr int32_t MAX_AIC_PER_THREAD = PLATFORM_MAX_AIC_PER_THREAD; -constexpr int32_t MAX_CORES_PER_THREAD = PLATFORM_MAX_CORES_PER_THREAD; - -constexpr int32_t MAX_IDLE_ITERATIONS = 800000; // ~20s idle then scheduler gives up (avoid long hang) -constexpr int32_t STALL_LOG_INTERVAL = 50000; // DEV_ALWAYS every N idle iters to debug hang -constexpr int32_t FATAL_ERROR_CHECK_INTERVAL = 1024; // Check orchestrator error every N idle iters -constexpr int32_t STALL_DUMP_READY_MAX = 8; -constexpr int32_t STALL_DUMP_WAIT_MAX = 4; -constexpr int32_t STALL_DUMP_CORE_MAX = 8; -constexpr int32_t PROGRESS_VERBOSE_THRESHOLD = 10; // log every completion for the first N tasks -constexpr int32_t PROGRESS_LOG_INTERVAL = 250; // log every N completions after threshold - -static PTO2Runtime *rt{nullptr}; - -// Per-core dispatch payload storage (one per physical core) -static PTO2DispatchPayload s_pto2_payload_per_core[RUNTIME_MAX_WORKER]; - -// Core information for discovery (with register address for fast dispatch) -struct CoreInfo { - int32_t worker_id; // Index in runtime.workers[] - uint32_t physical_core_id; // Hardware physical core ID (from AICore) - uint64_t reg_addr; // Cached register address for fast access - CoreType core_type; -}; - -struct CoreTypeTracker { - int32_t idle_count; - int32_t running_count; - int32_t idle[MAX_CORES_PER_THREAD]; - int32_t running[MAX_CORES_PER_THREAD]; - - void move_idle_to_running(int32_t idx) { - running[running_count++] = idle[idx]; - idle[idx] = idle[--idle_count]; - } - - void move_running_to_idle(int32_t idx) { - idle[idle_count++] = running[idx]; - running[idx] = running[--running_count]; - } - - int32_t find_idle_index(int32_t core_id) { - for (int32_t i = 0; i < idle_count; i++) { - if (idle[i] == core_id) return i; - } - return -1; - } -}; - -struct Cluster { - int32_t aic_core_id; - int32_t aiv_core_ids[2]; -}; - -struct CoreStateTracker { - CoreTypeTracker by_type[2]; // indexed by static_cast(CoreType) - Cluster clusters[MAX_AIC_PER_THREAD]; - int32_t cluster_count; - - CoreTypeTracker &aic() { return by_type[0]; } - CoreTypeTracker &aiv() { return by_type[1]; } - - template - CoreTypeTracker &get() { - return by_type[static_cast(CT)]; - } - - int32_t find_cluster_for_shape(PTO2ResourceShape shape, bool *core_idle) { - for (int32_t i = 0; i < cluster_count; i++) { - Cluster &c = clusters[i]; - switch (shape) { - case PTO2ResourceShape::AIC_ONLY: - if (core_idle[c.aic_core_id]) return i; - break; - case PTO2ResourceShape::AIV_X1: - if (core_idle[c.aiv_core_ids[0]] || core_idle[c.aiv_core_ids[1]]) return i; - break; - case PTO2ResourceShape::AIV_X2: - if (core_idle[c.aiv_core_ids[0]] && core_idle[c.aiv_core_ids[1]]) return i; - break; - case PTO2ResourceShape::AIC_AIV_X1: - if (core_idle[c.aic_core_id] && (core_idle[c.aiv_core_ids[0]] || core_idle[c.aiv_core_ids[1]])) - return i; - break; - case PTO2ResourceShape::AIC_AIV_X2: - if (core_idle[c.aic_core_id] && core_idle[c.aiv_core_ids[0]] && core_idle[c.aiv_core_ids[1]]) return i; - break; - } - } - return -1; - } -}; - -struct AicpuExecutor { - int32_t sched_thread_num_; - bool orch_to_sched_{false}; - - // ===== Thread management state ===== - std::atomic thread_idx_{0}; - 
std::atomic initialized_{false}; - std::atomic init_done_{false}; - std::atomic init_failed_{false}; - std::atomic finished_{false}; - - int32_t thread_num_{0}; - int32_t cores_total_num_{0}; - int32_t thread_cores_num_{0}; // Cores per scheduler thread (0 for orchestrator when thread_num_==4) - int32_t core_count_per_thread_[MAX_AICPU_THREADS]; // Actual core count per thread - int32_t core_assignments_[MAX_AICPU_THREADS][MAX_CORES_PER_THREAD]; - - // Core discovery arrays (with register addresses) - CoreInfo aic_cores_[MAX_CORES_PER_THREAD]; - CoreInfo aiv_cores_[MAX_CORES_PER_THREAD]; - int32_t aic_count_{0}; - int32_t aiv_count_{0}; - -#if PTO2_PROFILING - // Logical core_id -> hardware physical core id, collected during handshake. - // Handed to pmu_aicpu_init() so the platform can resolve per-core PMU MMIO - // bases. - uint32_t physical_core_ids_[RUNTIME_MAX_WORKER]; -#endif - - // Fast lookup: core_id -> reg_addr (for register-based dispatch) - uint64_t core_id_to_reg_addr_[MAX_CORES_PER_THREAD]; - - // Per-core monotonic dispatch counter for register protocol uniqueness. - // Multi-ring task_ids can collide in the lower 32 bits (e.g., ring 0 local 0 - // and ring 1 local 0 both truncate to 0), breaking the AICore's last_reg_val - // duplicate detection and causing false-positive COND completion. A per-core - // counter guarantees each dispatch writes a unique DATA_MAIN_BASE value. - uint32_t dispatch_seq_by_core_[RUNTIME_MAX_WORKER]{}; - - // Per-core subtask slot tracking (which PTO2SubtaskSlot is running on each core) - PTO2SubtaskSlot executing_subslot_by_core_[RUNTIME_MAX_WORKER]{}; - - // Per-core slot state tracking (PTO2TaskSlotState* for the running task on each core) - PTO2TaskSlotState *executing_slot_state_by_core_[RUNTIME_MAX_WORKER]{}; - - // Platform register base address array (set via get_platform_regs()) - uint64_t regs_{0}; - - // Track executing register task_id per core (AICPU_TASK_INVALID = idle). - // NOTE: this is NOT the task_id; it is the per-core dispatch id used by the - // register protocol (derived from dispatch_seq_by_core_ and masked by TASK_ID_MASK). - int32_t executing_reg_task_ids_[MAX_CORES_PER_THREAD]; - CoreStateTracker trackers_[MAX_AICPU_THREADS]; - bool core_idle_[MAX_CORES_PER_THREAD]; - - // ===== Task queue state (managed by scheduler ready queues) ===== - - // Task execution tracking - std::atomic completed_tasks_{0}; - int32_t total_tasks_{0}; - std::atomic finished_count_{0}; - // Device orchestration: set by last orchestrator when graph is built; schedulers poll it. - // volatile prevents the compiler from hoisting the load out of spin loops. 
- volatile bool orchestrator_done_{false}; - std::atomic pto2_init_done_{false}; - std::atomic runtime_init_ready_{false}; - std::atomic pto2_init_complete_{false}; // init block finished; others wait for this - - // ===== Dynamic core transition state ===== - std::atomic transition_requested_{false}; - std::atomic wait_reassign_{0}; - std::atomic reassigned_{false}; - std::atomic completed_{false}; - - // Orchestration SO handle - defer dlclose until all tasks complete - void *orch_so_handle_{nullptr}; - char orch_so_path_[256]{}; // Path to orchestration SO file for cleanup - - // Shared orchestration function pointer (loaded by first orch thread, used by all) - DeviceOrchestrationFunc orch_func_{nullptr}; - const ChipStorageTaskArgs *orch_args_cached_{nullptr}; - - // ===== Performance profiling state ===== - uint64_t dispatch_timestamps_[RUNTIME_MAX_WORKER]; // Per-core AICPU dispatch timestamp - uint32_t - core_dispatch_counts_[RUNTIME_MAX_WORKER]; // Per-core total dispatched task counter (for buffer management) - - uint64_t *func_id_to_addr_; - uint64_t get_function_bin_addr(int func_id) const { - if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) return 0; - return func_id_to_addr_[func_id]; - } - - // ===== Methods ===== - int32_t init(Runtime *runtime); - int32_t handshake_all_cores(Runtime *runtime); - void assign_cores_to_threads(); - void reassign_cores_for_all_threads(); - int32_t resolve_and_dispatch_pto2(Runtime *runtime, int32_t thread_idx); - int32_t shutdown_aicore(Runtime *runtime, int32_t thread_idx, const int32_t *cur_thread_cores, int32_t core_num); - int32_t run(Runtime *runtime); - void deinit(Runtime *runtime); - void emergency_shutdown(Runtime *runtime); - void diagnose_stuck_state( - Runtime *runtime, int32_t thread_idx, const int32_t *cur_thread_cores, int32_t core_num, Handshake *hank - ); - - // Build slim PTO2DispatchPayload: only function_bin_addr + args. - // Metadata (task_id, subslot, kernel_id, core_type) stays in TaskDescriptor. - // Dispatch order: tensor args first, then scalar args. 
- void build_pto2_payload(PTO2DispatchPayload &out, int32_t kernel_id, PTO2TaskPayload &task_pl) { - uint64_t callable_addr = get_function_bin_addr(kernel_id); - const CoreCallable *callable = reinterpret_cast(callable_addr); - out.function_bin_addr = callable->resolved_addr(); - int32_t n = 0; - for (int32_t i = 0; i < task_pl.tensor_count; i++) { - task_pl.tensors[i].update_start_offset(); - out.args[n++] = reinterpret_cast(&task_pl.tensors[i]); - } - for (int32_t i = 0; i < task_pl.scalar_count; i++) { - out.args[n++] = task_pl.scalars[i]; - } - } - - // Template methods for Phase 1 and Phase 2 - template - void check_running_cores_for_completion( - int32_t thread_idx, CoreTypeTracker &ct, Handshake *hank, int32_t &completed_this_turn, - int32_t &cur_thread_completed, bool &made_progress, PTO2TaskSlotState *deferred_release_slot_states[], - int32_t &deferred_release_count, PTO2LocalReadyBuffer *local_bufs -#if PTO2_PROFILING - , - bool l2_perf_enabled, uint32_t &phase_complete_count -#endif -#if PTO2_SCHED_PROFILING - , - uint64_t &complete_probe_count, uint64_t &complete_hit_count, uint64_t ¬ify_edges_total, - int32_t ¬ify_max_degree, uint64_t ¬ify_tasks_enqueued, uint64_t &fanin_edges_total, - int32_t &fanin_max_degree, uint64_t &sched_complete_perf_cycle -#endif - ) { - for (int32_t i = ct.running_count - 1; i >= 0; i--) { - int32_t core_id = ct.running[i]; - uint64_t reg_addr = core_id_to_reg_addr_[core_id]; - - int32_t expected_reg_task_id = executing_reg_task_ids_[core_id]; - uint64_t reg_val = read_reg(reg_addr, RegId::COND); - int32_t reg_task_id = EXTRACT_TASK_ID(reg_val); - int32_t reg_state = EXTRACT_TASK_STATE(reg_val); - bool done = reg_task_id == expected_reg_task_id && reg_state == TASK_FIN_STATE; -#if PTO2_SCHED_PROFILING - if (l2_perf_enabled) { - complete_probe_count++; - if (done) { - complete_hit_count++; - } - } -#endif - - if (done) { - executing_reg_task_ids_[core_id] = AICPU_TASK_INVALID; - PTO2SubtaskSlot subslot = executing_subslot_by_core_[core_id]; - PTO2TaskSlotState &slot_state = *executing_slot_state_by_core_[core_id]; - - // Two-stage completion: mark subtask done, then handle mixed-task completion - bool mixed_complete = rt->scheduler.on_subtask_complete(slot_state, subslot); - if (mixed_complete) { -#if PTO2_SCHED_PROFILING - PTO2CompletionStats cstats = - rt->scheduler.on_mixed_task_complete(slot_state, thread_idx, local_bufs); - notify_edges_total += cstats.fanout_edges; - if (cstats.fanout_edges > notify_max_degree) notify_max_degree = cstats.fanout_edges; - notify_tasks_enqueued += cstats.tasks_enqueued; - phase_complete_count++; -#else - rt->scheduler.on_mixed_task_complete(slot_state, local_bufs); -#if PTO2_PROFILING - phase_complete_count++; -#endif -#endif - if (deferred_release_count < 256) { - deferred_release_slot_states[deferred_release_count++] = &slot_state; - } else { - DEV_ALWAYS("Thread %d: release", thread_idx); - while (deferred_release_count > 0) { -#if PTO2_SCHED_PROFILING - int32_t fe = rt->scheduler.on_task_release( - *deferred_release_slot_states[--deferred_release_count], thread_idx - ); -#else - int32_t fe = - rt->scheduler.on_task_release(*deferred_release_slot_states[--deferred_release_count]); -#endif - (void)fe; -#if PTO2_SCHED_PROFILING - fanin_edges_total += fe; - if (fe > fanin_max_degree) fanin_max_degree = fe; -#endif - } - deferred_release_slot_states[deferred_release_count++] = &slot_state; - } - } - ct.move_running_to_idle(i); - core_idle_[core_id] = true; -#if PTO2_PROFILING - if (l2_perf_enabled) { -#if 
PTO2_SCHED_PROFILING - uint64_t t_perf_start = get_sys_cnt_aicpu(); -#endif - Handshake *h = &hank[core_id]; - uint64_t finish_ts = get_sys_cnt_aicpu(); - L2PerfBuffer *l2_perf_buf = reinterpret_cast(h->l2_perf_records_addr); - - // Pre-extract fanout (platform layer cannot depend on PTO2DepListEntry) - uint64_t fanout_arr[RUNTIME_MAX_FANOUT]; - int32_t fanout_n = 0; - PTO2DepListEntry *cur = slot_state.fanout_head; - while (cur != nullptr && fanout_n < RUNTIME_MAX_FANOUT) { - fanout_arr[fanout_n++] = cur->slot_state->task->task_id.raw; - cur = cur->next; - } - - int32_t perf_slot_idx = static_cast(executing_subslot_by_core_[core_id]); - if (l2_perf_aicpu_complete_record( - l2_perf_buf, static_cast(expected_reg_task_id), slot_state.task->task_id.raw, - slot_state.task->kernel_id[perf_slot_idx], CT, dispatch_timestamps_[core_id], finish_ts, - fanout_arr, fanout_n - ) != 0) { - DEV_ERROR( - "Core %d: l2_perf_aicpu_complete_record failed for task 0x%" PRIx64, core_id, - static_cast(slot_state.task->task_id.raw) - ); - } -#if PTO2_SCHED_PROFILING - sched_complete_perf_cycle += (get_sys_cnt_aicpu() - t_perf_start); -#endif - } -#endif - -#if PTO2_PROFILING - if (is_pmu_enabled()) { - pmu_aicpu_record_task( - core_id, thread_idx, slot_state.task->task_id.raw, - slot_state.task->kernel_id[static_cast(subslot)], hank[core_id].core_type - ); - } -#endif - - DEV_DEBUG( - "Thread %d: %s core %d completed PTO2 task %d (mixed_complete=%d)", thread_idx, - CT == CoreType::AIC ? "AIC" : "AIV", core_id, expected_reg_task_id, mixed_complete ? 1 : 0 - ); - cur_thread_completed++; - if (mixed_complete) { -#if PTO2_PROFILING - if (is_dump_tensor_enabled()) { - dump_tensors_for_task( - thread_idx, slot_state, TensorDumpStage::AFTER_COMPLETION, - [](uint8_t active_mask, uint8_t raw_subtask_id) { - return pto2_subtask_active(active_mask, static_cast(raw_subtask_id)); - }, - [this](int32_t func_id) { - return get_function_bin_addr(func_id); - } - ); - } -#endif - completed_this_turn++; - } - made_progress = true; - } - } - } - - static const char *shape_name(PTO2ResourceShape shape) { - switch (shape) { - case PTO2ResourceShape::AIC_ONLY: - return "AIC_ONLY"; - case PTO2ResourceShape::AIV_X1: - return "AIV_X1"; - case PTO2ResourceShape::AIV_X2: - return "AIV_X2"; - case PTO2ResourceShape::AIC_AIV_X1: - return "AIC_AIV_X1"; - case PTO2ResourceShape::AIC_AIV_X2: - return "AIC_AIV_X2"; - } - return "UNKNOWN"; - } - - struct ResourceCount { - int32_t aic; - int32_t aiv; - }; - - static constexpr ResourceCount shape_resource_count(PTO2ResourceShape shape) { - constexpr ResourceCount kTable[PTO2_NUM_RESOURCE_SHAPES] = { - {1, 0}, // AIC_ONLY = 0 - {0, 1}, // AIV_X1 = 1 - {0, 2}, // AIV_X2 = 2 - {1, 1}, // AIC_AIV_X1 = 3 - {1, 2}, // AIC_AIV_X2 = 4 - }; - return kTable[static_cast(shape)]; - } - - /** - * Returns the dispatch probe order for a given scheduler thread. - * Widest shapes first to avoid consuming cluster resources with narrow tasks. - * Even/odd threads use different fallback orders (AIC-first vs AIV-first) - * to reduce contention on the same ready queue across adjacent threads. 
- */ - static const PTO2ResourceShape *get_dispatch_order(int32_t thread_idx) { - // Even threads: AIC-first fallback after widest - static constexpr PTO2ResourceShape kEvenOrder[PTO2_NUM_RESOURCE_SHAPES] = { - PTO2ResourceShape::AIC_AIV_X2, PTO2ResourceShape::AIC_AIV_X1, PTO2ResourceShape::AIC_ONLY, - PTO2ResourceShape::AIV_X2, PTO2ResourceShape::AIV_X1, - }; - // Odd threads: AIV-first fallback after widest - static constexpr PTO2ResourceShape kOddOrder[PTO2_NUM_RESOURCE_SHAPES] = { - PTO2ResourceShape::AIC_AIV_X2, PTO2ResourceShape::AIV_X2, PTO2ResourceShape::AIC_AIV_X1, - PTO2ResourceShape::AIV_X1, PTO2ResourceShape::AIC_ONLY, - }; - return (thread_idx % 2 == 0) ? kEvenOrder : kOddOrder; - } - - PTO2TaskSlotState *pop_ready_task( - PTO2ResourceShape shape, int32_t thread_idx -#if PTO2_SCHED_PROFILING - , - uint64_t &pop_hit, uint64_t &pop_miss, uint64_t &sched_dispatch_pop_cycle -#endif - ) { - (void)thread_idx; -#if PTO2_SCHED_PROFILING - extern uint64_t g_sched_pop_atomic_count[], g_sched_pop_wait_cycle[]; - uint64_t t_pop_start = get_sys_cnt_aicpu(); - PTO2TaskSlotState *slot_state = rt->scheduler.get_ready_task( - shape, g_sched_pop_atomic_count[thread_idx], g_sched_pop_wait_cycle[thread_idx] - ); - sched_dispatch_pop_cycle += (get_sys_cnt_aicpu() - t_pop_start); -#else - PTO2TaskSlotState *slot_state = rt->scheduler.get_ready_task(shape); -#endif - if (slot_state) { -#if PTO2_SCHED_PROFILING - pop_hit++; -#endif - } else { -#if PTO2_SCHED_PROFILING - pop_miss++; -#endif - } - return slot_state; - } - - void dispatch_subtask_to_core( - Runtime *runtime, CoreStateTracker &tracker, int32_t core_id, CoreType core_type, PTO2TaskSlotState &slot_state, - PTO2SubtaskSlot subslot -#if PTO2_PROFILING - , - bool l2_perf_enabled -#endif -#if PTO2_PROFILING - , - int32_t thread_idx -#endif - ) { -#if !PTO2_PROFILING - (void)runtime; // NOLINT(readability/casting) -#endif - PTO2DispatchPayload &payload = s_pto2_payload_per_core[core_id]; - PTO2TaskDescriptor &task = *slot_state.task; - int32_t slot_idx = static_cast(subslot); - build_pto2_payload(payload, task.kernel_id[slot_idx], *slot_state.payload); - executing_subslot_by_core_[core_id] = subslot; - executing_slot_state_by_core_[core_id] = &slot_state; -#if PTO2_PROFILING - if (l2_perf_enabled) { - dispatch_timestamps_[core_id] = get_sys_cnt_aicpu(); - if (core_dispatch_counts_[core_id] >= PLATFORM_PROF_BUFFER_SIZE) { - l2_perf_aicpu_switch_buffer(runtime, core_id, thread_idx); - core_dispatch_counts_[core_id] = 0; - } - core_dispatch_counts_[core_id]++; - } -#endif - - // Per-core monotonic counter for register protocol uniqueness. - // PTO2 task_id encodes (ring_id << 32 | local_id); truncation to uint32 loses ring_id, - // so tasks from different rings with the same local_id would write identical DATA_MAIN_BASE - // values. The AICore uses last_reg_val to detect new dispatches and would skip the - // duplicate, while the stale COND register from the previous task (same local_id) would - // cause a false-positive completion. - dispatch_seq_by_core_[core_id]++; - uint32_t reg_task_id = dispatch_seq_by_core_[core_id] & TASK_ID_MASK; - // Skip reserved sentinel range [AICORE_EXIT_SIGNAL, 0x7FFFFFFF]: jump directly to 0. 
- if (reg_task_id >= AICORE_EXIT_SIGNAL) { - dispatch_seq_by_core_[core_id] += (TASK_ID_MASK - reg_task_id + 1); - reg_task_id = dispatch_seq_by_core_[core_id] & TASK_ID_MASK; - } - write_reg(core_id_to_reg_addr_[core_id], RegId::DATA_MAIN_BASE, static_cast(reg_task_id)); - - CoreTypeTracker &ct = tracker.by_type[static_cast(core_type)]; - int32_t idle_idx = ct.find_idle_index(core_id); - ct.move_idle_to_running(idle_idx); - core_idle_[core_id] = false; - executing_reg_task_ids_[core_id] = reg_task_id; - } -}; - -static AicpuExecutor g_aicpu_executor; - -// ===== AicpuExecutor Method Implementations ===== - -/** - * Handshake with all cores and discover their types - * Sets up register addresses for fast dispatch. - */ -int32_t AicpuExecutor::handshake_all_cores(Runtime *runtime) { - Handshake *all_handshakes = reinterpret_cast(runtime->workers); - cores_total_num_ = runtime->worker_count; - - // Validate cores_total_num_ before using as array index - if (cores_total_num_ == 0 || cores_total_num_ > MAX_CORES_PER_THREAD) { - DEV_ERROR("Invalid cores_total_num %d (expected 1-%d)", cores_total_num_, MAX_CORES_PER_THREAD); - return -1; - } - - aic_count_ = 0; - aiv_count_ = 0; - - DEV_INFO("Handshaking with %d cores", cores_total_num_); - - // Step 1: Write per-core payload addresses and send handshake signal - // OUT_OF_ORDER_STORE_BARRIER() ensures task is globally visible before - // aicpu_ready=1, so AICore reads the correct payload pointer after waking up. - for (int32_t i = 0; i < cores_total_num_; i++) { - all_handshakes[i].task = reinterpret_cast(&s_pto2_payload_per_core[i]); - OUT_OF_ORDER_STORE_BARRIER(); - all_handshakes[i].aicpu_ready = 1; - } - OUT_OF_ORDER_STORE_BARRIER(); - - // Get platform physical cores count for validation - uint32_t max_physical_cores_count = platform_get_physical_cores_count(); - - // Step 2: Wait for all cores to respond, collect core type and register addresses - bool handshake_failed = false; - for (int32_t i = 0; i < cores_total_num_; i++) { - Handshake *hank = &all_handshakes[i]; - - while (hank->aicore_regs_ready == 0) {} - - uint32_t physical_core_id = hank->physical_core_id; - - // Validate physical_core_id before using as array index - if (physical_core_id >= max_physical_cores_count) { - DEV_ERROR( - "Core %d reported invalid physical_core_id=%u (platform max=%u)", i, physical_core_id, - max_physical_cores_count - ); - handshake_failed = true; - continue; - } - - // Get register address using physical_core_id - uint64_t *regs = reinterpret_cast(regs_); - uint64_t reg_addr = regs[physical_core_id]; - - // Initialize AICore registers after discovery (first round) - platform_init_aicore_regs(reg_addr); - OUT_OF_ORDER_STORE_BARRIER(); - hank->aicpu_regs_ready = 1; - - OUT_OF_ORDER_STORE_BARRIER(); - - while (hank->aicore_done == 0) {} - - CoreType type = hank->core_type; - - if (type == CoreType::AIC) { - aic_cores_[aic_count_].worker_id = i; - aic_cores_[aic_count_].physical_core_id = physical_core_id; - aic_cores_[aic_count_].reg_addr = reg_addr; - aic_cores_[aic_count_].core_type = type; - aic_count_++; - DEV_INFO("Core %d: AIC, physical_id=%u, reg_addr=0x%lx", i, physical_core_id, reg_addr); - } else { - aiv_cores_[aiv_count_].worker_id = i; - aiv_cores_[aiv_count_].physical_core_id = physical_core_id; - aiv_cores_[aiv_count_].reg_addr = reg_addr; - aiv_cores_[aiv_count_].core_type = type; - aiv_count_++; - DEV_INFO("Core %d: AIV, physical_id=%u, reg_addr=0x%lx", i, physical_core_id, reg_addr); - } - - core_id_to_reg_addr_[i] = reg_addr; -#if 
PTO2_PROFILING - physical_core_ids_[i] = physical_core_id; -#endif - } - - if (handshake_failed) { - emergency_shutdown(runtime); - return -1; - } - - DEV_INFO("Core discovery complete: %d AIC, %d AIV", aic_count_, aiv_count_); - return 0; -} - -/** - * Assign discovered cores to scheduler threads - * (Aligned with host_build_graph mechanism) - */ -void AicpuExecutor::assign_cores_to_threads() { - // Cluster-aligned round-robin assignment: cluster ci -> sched thread ci % divisor. - // Each cluster = 1 AIC + 2 adjacent AIV; the triple is always kept together. - int32_t divisor = (sched_thread_num_ > 0) ? sched_thread_num_ : thread_num_; - int32_t cluster_count = aic_count_; - - DEV_INFO( - "Assigning cores (round-robin): %d clusters across %d sched threads (%d AIC, %d AIV)", cluster_count, divisor, - aic_count_, aiv_count_ - ); - - memset(core_idle_, true, sizeof(core_idle_)); - for (int32_t i = 0; i < MAX_CORES_PER_THREAD; i++) { - executing_reg_task_ids_[i] = AICPU_TASK_INVALID; - } - for (int32_t i = 0; i < thread_num_; i++) { - trackers_[i].aic().running_count = 0; - trackers_[i].aiv().running_count = 0; - trackers_[i].aic().idle_count = 0; - trackers_[i].aiv().idle_count = 0; - trackers_[i].cluster_count = 0; - core_count_per_thread_[i] = 0; - } - - // Per-sched-thread running core index used while filling core_assignments_. - int32_t core_idx[MAX_AICPU_THREADS] = {}; - - for (int32_t ci = 0; ci < cluster_count; ci++) { - int32_t t = ci % divisor; - CoreStateTracker &tracker = trackers_[t]; - int32_t &idx = core_idx[t]; - - int32_t aic_wid = aic_cores_[ci].worker_id; - int32_t aiv0_wid = aiv_cores_[2 * ci].worker_id; - int32_t aiv1_wid = aiv_cores_[2 * ci + 1].worker_id; - - tracker.clusters[tracker.cluster_count++] = {aic_wid, {aiv0_wid, aiv1_wid}}; - - core_assignments_[t][idx++] = aic_wid; - tracker.aic().idle[tracker.aic().idle_count++] = aic_wid; - - core_assignments_[t][idx++] = aiv0_wid; - core_assignments_[t][idx++] = aiv1_wid; - tracker.aiv().idle[tracker.aiv().idle_count++] = aiv0_wid; - tracker.aiv().idle[tracker.aiv().idle_count++] = aiv1_wid; - - DEV_INFO("Thread %d: cluster %d (AIC=%d, AIV0=%d, AIV1=%d)", t, ci, aic_wid, aiv0_wid, aiv1_wid); - } - - for (int32_t t = 0; t < divisor; t++) { - core_count_per_thread_[t] = core_idx[t]; - DEV_INFO("Thread %d: total %d cores (%d clusters)", t, core_idx[t], trackers_[t].cluster_count); - } - - // Max clusters any single sched thread can hold: ceil(cluster_count / divisor). - int32_t max_clusters_per_thread = (cluster_count + divisor - 1) / divisor; - thread_cores_num_ = max_clusters_per_thread * 3; -} - -/** - * Reassign all cores evenly across all threads (schedulers + orchestrators). - * Called by the last orchestrator thread when orchestration completes. - * Writes into new_core_assignments_ / new_core_count_per_thread_. 
- */ -void AicpuExecutor::reassign_cores_for_all_threads() { - DEV_INFO("Reassigning cores (cluster-aligned) for %d threads: %d AIC, %d AIV", thread_num_, aic_count_, aiv_count_); - - // Collect running/idle state from all threads before reassignment - bool running_cores[MAX_CORES_PER_THREAD]; - memset(running_cores, 0, sizeof(running_cores)); - - for (int32_t i = 0; i < thread_num_; i++) { - for (int32_t j = 0; j < trackers_[i].aic().running_count; j++) { - int32_t core_id = trackers_[i].aic().running[j]; - running_cores[core_id] = true; - } - for (int32_t j = 0; j < trackers_[i].aiv().running_count; j++) { - int32_t core_id = trackers_[i].aiv().running[j]; - running_cores[core_id] = true; - } - } - - // Reset all trackers - for (int32_t i = 0; i < thread_num_; i++) { - core_count_per_thread_[i] = 0; - trackers_[i].aic().running_count = 0; - trackers_[i].aic().idle_count = 0; - trackers_[i].aiv().running_count = 0; - trackers_[i].aiv().idle_count = 0; - trackers_[i].cluster_count = 0; - } - - // Restore a single core's running/idle state into its new thread's tracker - auto reassign_core = [&](int32_t worker_id, CoreTypeTracker &type_tracker, int32_t thread_idx) { - core_assignments_[thread_idx][core_count_per_thread_[thread_idx]++] = worker_id; - if (running_cores[worker_id]) { - type_tracker.running[type_tracker.running_count++] = worker_id; - } else { - type_tracker.idle[type_tracker.idle_count++] = worker_id; - } - }; - - // Assign whole clusters round-robin across all threads - for (int32_t ci = 0; ci < aic_count_; ci++) { - int32_t t = ci % thread_num_; - CoreStateTracker &tracker = trackers_[t]; - - int32_t aic_wid = aic_cores_[ci].worker_id; - int32_t aiv0_wid = aiv_cores_[2 * ci].worker_id; - int32_t aiv1_wid = aiv_cores_[2 * ci + 1].worker_id; - - tracker.clusters[tracker.cluster_count++] = {aic_wid, {aiv0_wid, aiv1_wid}}; - - reassign_core(aic_wid, tracker.aic(), t); - reassign_core(aiv0_wid, tracker.aiv(), t); - reassign_core(aiv1_wid, tracker.aiv(), t); - } - - // Log final distribution for verification - DEV_INFO("Core reassignment complete:"); - for (int32_t t = 0; t < thread_num_; t++) { - DEV_INFO( - " Thread %d: %d cores, %d clusters (AIC: running=%d idle=%d, AIV: running=%d idle=%d)", t, - core_count_per_thread_[t], trackers_[t].cluster_count, trackers_[t].aic().running_count, - trackers_[t].aic().idle_count, trackers_[t].aiv().running_count, trackers_[t].aiv().idle_count - ); - } -} - -int32_t AicpuExecutor::init(Runtime *runtime) { - bool expected = false; - if (!initialized_.compare_exchange_strong(expected, true, std::memory_order_acq_rel, std::memory_order_acquire)) { - return 0; - } - - DEV_INFO("AicpuExecutor: Initializing"); - - if (runtime == nullptr) { - DEV_ERROR("runtime is nullptr"); - init_failed_.store(true, std::memory_order_release); - return -1; - } - - func_id_to_addr_ = runtime->func_id_to_addr_; - - // Read execution parameters from runtime - thread_num_ = runtime->sche_cpu_num; - if (thread_num_ == 0) thread_num_ = 1; - sched_thread_num_ = thread_num_ - 1; - orch_to_sched_ = runtime->orch_to_sched; - - if (thread_num_ < 1 || thread_num_ > MAX_AICPU_THREADS) { - DEV_ERROR("Invalid thread_num: %d", thread_num_); - init_failed_.store(true, std::memory_order_release); - return -1; - } - - // Initialize core_id_to_reg_addr_ array to 0 before handshake - for (int32_t i = 0; i < MAX_CORES_PER_THREAD; i++) { - core_id_to_reg_addr_[i] = 0; - } - - // Use handshake mechanism to discover cores (aligned with host_build_graph) - int32_t rc = 
handshake_all_cores(runtime); - if (rc != 0) { - DEV_ERROR("handshake_all_cores failed"); - init_failed_.store(true, std::memory_order_release); - return -1; - } - - // Dynamically assign cores to threads - assign_cores_to_threads(); - - DEV_INFO("Config: threads=%d, cores=%d, cores_per_thread=%d", thread_num_, cores_total_num_, thread_cores_num_); - - // Initialize runtime execution state - // Task count comes from PTO2 shared memory - if (runtime->get_gm_sm_ptr()) { - auto *header = static_cast(runtime->get_gm_sm_ptr()); - int32_t pto2_count = 0; - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - pto2_count += header->rings[r].fc.current_task_index.load(std::memory_order_acquire); - } - total_tasks_ = pto2_count > 0 ? pto2_count : 0; - } else { - total_tasks_ = 0; - } - completed_tasks_.store(0, std::memory_order_release); - // Host orchestration: graph already built, no wait needed. Device orch: Thread 3 will set this. - bool orch_on_host = runtime->get_orch_built_on_host(); - DEV_INFO("Init: orch_built_on_host=%d", orch_on_host ? 1 : 0); - orchestrator_done_ = orch_on_host; - - // Initial ready tasks will be populated via scheduler ready queues - - // Reset per-core dispatch timestamps and task counters - for (int32_t i = 0; i < RUNTIME_MAX_WORKER; i++) { - dispatch_timestamps_[i] = 0; - core_dispatch_counts_[i] = 0; - } - - // Clear per-core dispatch payloads and subslot tracking - memset(s_pto2_payload_per_core, 0, sizeof(s_pto2_payload_per_core)); - memset(dispatch_seq_by_core_, 0, sizeof(dispatch_seq_by_core_)); - memset(executing_subslot_by_core_, 0, sizeof(executing_subslot_by_core_)); - memset(executing_slot_state_by_core_, 0, sizeof(executing_slot_state_by_core_)); - - DEV_INFO("Init: PTO2 mode, task count from shared memory"); - - finished_count_.store(0, std::memory_order_release); - - init_done_.store(true, std::memory_order_release); - DEV_INFO("AicpuExecutor: Init complete"); - return 0; -} - -/** - * Shutdown AICore - Send exit signal via registers to all AICore kernels - */ -int32_t AicpuExecutor::shutdown_aicore( - Runtime *runtime, int32_t thread_idx, const int32_t *cur_thread_cores, int32_t core_num -) { - (void)runtime; - if (core_num == 0) return 0; - - DEV_INFO("Thread %d: Shutting down %d cores", thread_idx, core_num); - - for (int32_t i = 0; i < core_num; i++) { - int32_t core_id = cur_thread_cores[i]; - uint64_t reg_addr = core_id_to_reg_addr_[core_id]; - if (reg_addr != 0) { - platform_deinit_aicore_regs(reg_addr); - } else { - DEV_ERROR("Thread %d: Core %d has invalid register address", thread_idx, core_id); - } - } - DEV_INFO("Thread %d: Shutdown complete", thread_idx); - return 0; -} - -int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t thread_idx) { - int32_t &core_num = core_count_per_thread_[thread_idx]; - CoreStateTracker &tracker = trackers_[thread_idx]; - DEV_INFO("Thread %d: resolve_and_dispatch_pto2 entry", thread_idx); - - void *sm_base = runtime->get_gm_sm_ptr(); - if (!sm_base) { - DEV_ERROR("PTO2 dispatch: sm_base is null"); - return -1; - } - DEV_INFO("Thread %d: sm_base=%p", thread_idx, sm_base); - - PTO2SharedMemoryHeader *header = static_cast(sm_base); - DEV_INFO( - "Thread %d: header=%p, task_desc_offset[0]=%lu, window_size=%lu", thread_idx, static_cast(header), - static_cast(header->rings[0].task_descriptors_offset), - static_cast(header->rings[0].task_window_size) - ); - - Handshake *hank = static_cast(runtime->workers); - DEV_INFO( - "Thread %d: hank=%p, window_size=%lu", thread_idx, static_cast(hank), - 
static_cast(header->rings[0].task_window_size) - ); - - // One-time init: assign perf buffers (one thread does it; others wait) - if (!pto2_init_done_.exchange(true, std::memory_order_acq_rel)) { - DEV_INFO("Thread %d: doing one-time init", thread_idx); - -#if PTO2_PROFILING - // Assign perf buffers to cores early so profiling captures all tasks - // (total_tasks written to header later when orchestrator completes) - if (is_l2_swimlane_enabled()) { - l2_perf_aicpu_init_profiling(runtime); - // Initialize phase profiling for scheduler threads + orchestrator threads - l2_perf_aicpu_init_phase_profiling(runtime, sched_thread_num_); - l2_perf_aicpu_set_orch_thread_idx(sched_thread_num_); - } -#endif -#if PTO2_PROFILING - if (is_dump_tensor_enabled()) { - dump_tensor_init(orch_to_sched_ ? thread_num_ : sched_thread_num_); - } -#endif - -#if PTO2_PROFILING - // Initialize PMU: program events, start counters, and pop initial buffers - if (is_pmu_enabled()) { - pmu_aicpu_init(physical_core_ids_, cores_total_num_); - DEV_INFO("PMU profiling started on %d cores", cores_total_num_); - } -#endif - - DEV_INFO("Thread %d: one-time init done", thread_idx); - pto2_init_complete_.store(true, std::memory_order_release); - } else { - while (!pto2_init_complete_.load(std::memory_order_acquire)) { - SPIN_WAIT_HINT(); - } - } - - DEV_INFO("Thread %d: PTO2 dispatch starting with %d cores", thread_idx, core_num); - int32_t cur_thread_completed = 0; - int32_t idle_iterations = 0; - int32_t last_progress_count = 0; -#if PTO2_PROFILING - bool l2_perf_enabled = is_l2_swimlane_enabled(); -#endif - - // Scheduler profiling counters -#if PTO2_PROFILING - uint64_t sched_scan_cycle = 0; - uint64_t sched_complete_cycle = 0; - uint64_t sched_dispatch_cycle = 0; - uint64_t sched_idle_cycle = 0; - uint64_t sched_loop_count = 0; - uint32_t phase_complete_count = 0; - uint32_t phase_dispatch_count = 0; -#if PTO2_SCHED_PROFILING - uint64_t complete_probe_count = 0; - uint64_t complete_hit_count = 0; - uint64_t notify_edges_total = 0; - int32_t notify_max_degree = 0; - uint64_t notify_tasks_enqueued = 0; - uint64_t fanin_edges_total = 0; - int32_t fanin_max_degree = 0; - uint64_t pop_hit = 0; - uint64_t pop_miss = 0; - uint64_t local_dispatch_count = 0; - uint64_t local_overflow_count = 0; - uint64_t sched_complete_perf_cycle = 0; - uint64_t sched_dispatch_pop_cycle = 0; - uint64_t sched_dispatch_setup_cycle = 0; -#endif -#endif - - // Local-first dispatch buffers (stack-allocated, one per CoreType per scheduling thread). - // Initialized once; must be empty at the start of each iteration. 
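
A minimal sketch of what such a local ready buffer can look like, assuming (as the reset/push/pop usage below suggests) a plain LIFO pointer stack over caller-provided storage; the real PTO2LocalReadyBuffer definition lived in the deleted runtime headers.

```cpp
#include <cstdint>

struct Task; // opaque stand-in for the runtime's slot-state type

// Fixed-capacity pointer stack: no allocation, no atomics, strictly
// thread-local, which is what makes "local-first" dispatch cheap.
struct LocalReadyBuffer {
    Task **slots = nullptr;
    int32_t capacity = 0;
    int32_t count = 0;

    void reset(Task **storage, int32_t cap) {
        slots = storage;
        capacity = cap;
        count = 0;
    }
    // Overflow is the caller's problem: spill to the global ready queue.
    bool push(Task *t) {
        if (count >= capacity) return false;
        slots[count++] = t;
        return true;
    }
    Task *pop() { return count > 0 ? slots[--count] : nullptr; }
};
```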
- constexpr int LOCAL_READY_CAP_PER_TYPE = 256; - PTO2TaskSlotState *local_aic_ptrs[LOCAL_READY_CAP_PER_TYPE]; - PTO2TaskSlotState *local_aiv_ptrs[LOCAL_READY_CAP_PER_TYPE]; - PTO2LocalReadyBuffer local_bufs[PTO2_LOCAL_DISPATCH_TYPE_NUM]; // [0]=AIC, [1]=AIV - local_bufs[0].reset(local_aic_ptrs, LOCAL_READY_CAP_PER_TYPE); - local_bufs[1].reset(local_aiv_ptrs, LOCAL_READY_CAP_PER_TYPE); - PTO2TaskSlotState *deferred_release_slot_states[256]; - int32_t deferred_release_count = 0; - - bool cores_released = false; - - while (true) { - bool made_progress = false; -#if PTO2_PROFILING - CYCLE_COUNT_START(); - sched_loop_count++; - uint64_t _t0_phase = _t0; -#endif - int32_t task_count = 0; - if (tracker.aic().running_count == 0 && tracker.aiv().running_count == 0) { - bool orch_done = orchestrator_done_; - if (orch_done) { - // Check for orchestrator fatal error — exit immediately - int32_t orch_err = header->orch_error_code.load(std::memory_order_acquire); - if (orch_err != PTO2_ERROR_NONE) { - DEV_ERROR( - "Thread %d: Fatal error (code=%d), sending EXIT_SIGNAL to all cores. " - "completed_tasks=%d, total_tasks=%d", - thread_idx, orch_err, completed_tasks_.load(std::memory_order_relaxed), total_tasks_ - ); - emergency_shutdown(runtime); - completed_.store(true, std::memory_order_release); - break; - } - - // Normal exit: all tasks complete - task_count = total_tasks_; - if (task_count > 0 && completed_tasks_.load(std::memory_order_relaxed) >= task_count) { - completed_.store(true, std::memory_order_release); - DEV_INFO( - "Thread %d: PTO2 completed tasks %d/%d", thread_idx, - completed_tasks_.load(std::memory_order_relaxed), task_count - ); - break; - } - } - } - - // Check for core transition request (execute once per thread) - if (!cores_released && orch_to_sched_ && transition_requested_.load(std::memory_order_acquire)) { - if (!reassigned_.load(std::memory_order_acquire)) { - wait_reassign_.fetch_add(1, std::memory_order_release); - while (!reassigned_.load(std::memory_order_acquire)) { - if (completed_.load(std::memory_order_acquire)) { - break; - } - SPIN_WAIT_HINT(); - } - if (completed_.load(std::memory_order_acquire)) { - break; - } - } - cores_released = true; - } - -#if PTO2_PROFILING - CYCLE_COUNT_LAP(sched_idle_cycle); -#endif - - // Process completed and dispatch FIRST to minimize Sched (dispatch→finish) latency. - // Sched time = finish_ts - dispatch_ts; recording finish_ts here at loop start reduces - // tail overhead (time from AICore done to AICPU recording finish). 
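
For orientation, a compilable skeleton of the loop ordering this comment describes; the three phase functions are stubs standing in for the real completion polling and dispatch work.

```cpp
#include <atomic>

// Stubs standing in for the real scheduler phases; each reports whether it
// made progress, mirroring made_progress in the loop below.
static bool poll_completions() { return false; }
static bool dispatch_local() { return false; }
static bool dispatch_global() { return false; }

// Completions are polled before dispatch so finish_ts is captured as close
// as possible to the AICore's actual completion in every iteration.
void scheduler_loop(std::atomic<bool> &done) {
    while (!done.load(std::memory_order_acquire)) {
        bool progress = poll_completions(); // Phase 1: record finish_ts first
        progress |= dispatch_local();       // Phase 2: thread-local buffers
        progress |= dispatch_global();      // Phase 3: shared ready queues
        if (!progress) {
            // idle path: batch deferred releases, stall diagnostics, spin hint
        }
    }
}
```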
- - // Phase 1: Check running cores for completion, process and move to idle - int32_t completed_this_turn = 0; - - // Check AIC running cores - bool try_completed = false; - always_assert( - local_bufs[0].count == 0 && local_bufs[1].count == 0 - ); // Invariant: previous iteration fully consumed - if (tracker.aic().running_count > 0) { - try_completed = true; - check_running_cores_for_completion( - thread_idx, tracker.aic(), hank, completed_this_turn, cur_thread_completed, made_progress, - deferred_release_slot_states, deferred_release_count, local_bufs -#if PTO2_PROFILING - , - l2_perf_enabled, phase_complete_count -#endif -#if PTO2_SCHED_PROFILING - , - complete_probe_count, complete_hit_count, notify_edges_total, notify_max_degree, notify_tasks_enqueued, - fanin_edges_total, fanin_max_degree, sched_complete_perf_cycle -#endif - ); - } - - // Check AIV running cores - if (tracker.aiv().running_count > 0) { - try_completed = true; - check_running_cores_for_completion( - thread_idx, tracker.aiv(), hank, completed_this_turn, cur_thread_completed, made_progress, - deferred_release_slot_states, deferred_release_count, local_bufs -#if PTO2_PROFILING - , - l2_perf_enabled, phase_complete_count -#endif -#if PTO2_SCHED_PROFILING - , - complete_probe_count, complete_hit_count, notify_edges_total, notify_max_degree, notify_tasks_enqueued, - fanin_edges_total, fanin_max_degree, sched_complete_perf_cycle -#endif - ); - } - if (completed_this_turn > 0) { -#if PTO2_SCHED_PROFILING - rt->scheduler.tasks_completed.fetch_add(completed_this_turn, std::memory_order_relaxed); -#endif - int32_t prev = completed_tasks_.fetch_add(completed_this_turn, std::memory_order_relaxed); - int32_t new_total = prev + completed_this_turn; - last_progress_count = new_total; - if (thread_idx == 0 && task_count > 0) { - if (new_total <= PROGRESS_VERBOSE_THRESHOLD || - new_total / PROGRESS_LOG_INTERVAL != prev / PROGRESS_LOG_INTERVAL || new_total >= task_count) { - DEV_ALWAYS( - "PTO2 progress: completed=%d total=%d (%.1f%%)", new_total, task_count, - 100.0 * new_total / task_count - ); - } - } - } - -#if PTO2_PROFILING - if (!try_completed) { - CYCLE_COUNT_LAP(sched_idle_cycle); - } else { - CYCLE_COUNT_LAP(sched_complete_cycle); - if (l2_perf_enabled && phase_complete_count > 0) { - l2_perf_aicpu_record_phase( - thread_idx, AicpuPhaseId::SCHED_COMPLETE, _t0_phase, _t1, sched_loop_count, phase_complete_count - ); - _t0_phase = _t1; - phase_complete_count = 0; - } - } -#endif - - // Phase 2: Local dispatch — drain local_bufs, match to idle clusters (zero MPMC operations) - // Phase 3: Global queue — push overflow to readyQ + fill remaining idle cores from readyQ - bool try_pushed = false; - - // Local dispatch: drain both per-CoreType local_bufs, match to idle clusters by shape - PTO2TaskSlotState *overflow_ptrs[LOCAL_READY_CAP_PER_TYPE * PTO2_LOCAL_DISPATCH_TYPE_NUM]; - int overflow_count = 0; - for (int bi = 0; bi < PTO2_LOCAL_DISPATCH_TYPE_NUM; bi++) { - while (local_bufs[bi].count > 0) { - PTO2TaskSlotState *slot_state = local_bufs[bi].pop(); - PTO2ResourceShape shape = pto2_active_mask_to_shape(slot_state->active_mask); - int32_t ci = tracker.find_cluster_for_shape(shape, core_idle_); - - if (ci >= 0) { - try_pushed = true; - Cluster &c = tracker.clusters[ci]; -#if PTO2_SCHED_PROFILING - uint64_t t_setup_start = get_sys_cnt_aicpu(); -#endif - ResourceCount rc = shape_resource_count(shape); -#if PTO2_PROFILING - if (is_dump_tensor_enabled()) { - dump_tensors_for_task( - thread_idx, *slot_state, 
TensorDumpStage::BEFORE_DISPATCH, - [](uint8_t active_mask, uint8_t raw_subtask_id) { - return pto2_subtask_active(active_mask, static_cast(raw_subtask_id)); - }, - [this](int32_t func_id) { - return get_function_bin_addr(func_id); - } - ); - } -#endif - if (rc.aic) { - dispatch_subtask_to_core( - runtime, tracker, c.aic_core_id, CoreType::AIC, *slot_state, PTO2SubtaskSlot::AIC -#if PTO2_PROFILING - , - l2_perf_enabled -#endif -#if PTO2_PROFILING - , - thread_idx -#endif - ); - } - if (rc.aiv >= 1) { - int32_t aiv0 = core_idle_[c.aiv_core_ids[0]] ? c.aiv_core_ids[0] : c.aiv_core_ids[1]; - dispatch_subtask_to_core( - runtime, tracker, aiv0, CoreType::AIV, *slot_state, PTO2SubtaskSlot::AIV0 -#if PTO2_PROFILING - , - l2_perf_enabled -#endif -#if PTO2_PROFILING - , - thread_idx -#endif - ); - } - if (rc.aiv >= 2) { - dispatch_subtask_to_core( - runtime, tracker, c.aiv_core_ids[1], CoreType::AIV, *slot_state, PTO2SubtaskSlot::AIV1 -#if PTO2_PROFILING - , - l2_perf_enabled -#endif -#if PTO2_PROFILING - , - thread_idx -#endif - ); - } -#if PTO2_PROFILING - phase_dispatch_count++; -#endif -#if PTO2_SCHED_PROFILING - pop_hit++; - local_dispatch_count++; - sched_dispatch_setup_cycle += (get_sys_cnt_aicpu() - t_setup_start); -#endif - made_progress = true; - DEV_DEBUG( - "Thread %d: Dispatching %s task %" PRId64 " to cluster %d (local)", thread_idx, - shape_name(shape), static_cast(slot_state->task->task_id.raw), ci - ); - } else { - overflow_ptrs[overflow_count++] = slot_state; -#if PTO2_SCHED_PROFILING - local_overflow_count++; -#endif - } - } - } - - // Push overflow to global readyQ (shape-based) - for (int i = 0; i < overflow_count; i++) { - rt->scheduler.requeue_ready_task(*overflow_ptrs[i]); - } - - // Phase 3: Global dispatch — fill remaining idle cores from global readyQ (cluster-based) - const PTO2ResourceShape *dispatch_order = get_dispatch_order(thread_idx); - - for (int32_t si = 0; si < PTO2_NUM_RESOURCE_SHAPES; si++) { - PTO2ResourceShape shape = dispatch_order[si]; - if (rt->scheduler.ready_queues[static_cast(shape)].size() == 0) continue; - - while (true) { - int32_t ci = tracker.find_cluster_for_shape(shape, core_idle_); - if (ci < 0) break; - - PTO2TaskSlotState *slot_state = pop_ready_task( - shape, thread_idx -#if PTO2_SCHED_PROFILING - , - pop_hit, pop_miss, sched_dispatch_pop_cycle -#endif - ); - if (!slot_state) break; - - try_pushed = true; -#if PTO2_PROFILING - phase_dispatch_count++; -#endif -#if PTO2_SCHED_PROFILING - uint64_t t_setup_start = get_sys_cnt_aicpu(); -#endif - Cluster &c = tracker.clusters[ci]; - ResourceCount rc = shape_resource_count(shape); -#if PTO2_PROFILING - if (is_dump_tensor_enabled()) { - dump_tensors_for_task( - thread_idx, *slot_state, TensorDumpStage::BEFORE_DISPATCH, - [](uint8_t active_mask, uint8_t raw_subtask_id) { - return pto2_subtask_active(active_mask, static_cast(raw_subtask_id)); - }, - [this](int32_t func_id) { - return get_function_bin_addr(func_id); - } - ); - } -#endif - if (rc.aic) { - dispatch_subtask_to_core( - runtime, tracker, c.aic_core_id, CoreType::AIC, *slot_state, PTO2SubtaskSlot::AIC -#if PTO2_PROFILING - , - l2_perf_enabled -#endif -#if PTO2_PROFILING - , - thread_idx -#endif - ); - } - if (rc.aiv >= 1) { - int32_t aiv_id = core_idle_[c.aiv_core_ids[0]] ? 
c.aiv_core_ids[0] : c.aiv_core_ids[1]; - dispatch_subtask_to_core( - runtime, tracker, aiv_id, CoreType::AIV, *slot_state, PTO2SubtaskSlot::AIV0 -#if PTO2_PROFILING - , - l2_perf_enabled -#endif -#if PTO2_PROFILING - , - thread_idx -#endif - ); - } - if (rc.aiv >= 2) { - dispatch_subtask_to_core( - runtime, tracker, c.aiv_core_ids[1], CoreType::AIV, *slot_state, PTO2SubtaskSlot::AIV1 -#if PTO2_PROFILING - , - l2_perf_enabled -#endif -#if PTO2_PROFILING - , - thread_idx -#endif - ); - } - made_progress = true; -#if PTO2_SCHED_PROFILING - sched_dispatch_setup_cycle += (get_sys_cnt_aicpu() - t_setup_start); -#endif - DEV_DEBUG( - "Thread %d: Dispatching %s task %" PRId64 " to cluster %d", thread_idx, shape_name(shape), - static_cast(slot_state->task->task_id.raw), ci - ); - } - } - -#if PTO2_PROFILING - if (!try_pushed) { - CYCLE_COUNT_LAP(sched_idle_cycle); - } else { - CYCLE_COUNT_LAP(sched_dispatch_cycle); - if (l2_perf_enabled && phase_dispatch_count > 0) { - l2_perf_aicpu_record_phase( - thread_idx, AicpuPhaseId::SCHED_DISPATCH, _t0_phase, _t1, sched_loop_count, phase_dispatch_count - ); - _t0_phase = _t1; - phase_dispatch_count = 0; - } -#endif - } - - if (made_progress) { - idle_iterations = 0; - } else { - // Batch deferred fanin releases during idle. - // Processing all pending releases at once advances the ring faster, - // freeing heap space for the orchestrator without blocking completion polling. - while (deferred_release_count > 0) { -#if PTO2_SCHED_PROFILING - int32_t fe = - rt->scheduler.on_task_release(*deferred_release_slot_states[--deferred_release_count], thread_idx); -#else - int32_t fe = rt->scheduler.on_task_release(*deferred_release_slot_states[--deferred_release_count]); -#endif - (void)fe; -#if PTO2_SCHED_PROFILING - fanin_edges_total += fe; - if (fe > fanin_max_degree) fanin_max_degree = fe; -#endif - } - idle_iterations++; - - // Check for orchestrator fatal error during idle (every 1024 iterations) - // orch_error_code is set in shared memory by the orchestrator's spin loop - // BEFORE orchestrator_done_ is set, so this catches errors earlier. 
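
The batch drain a few lines above reduces to this sketch; on_task_release is stubbed here, whereas the real scheduler hook returns the task's fanin edge count.

```cpp
#include <cstdint>

struct SlotState {}; // opaque stand-in

// Stub; the real hook recycles the task's ring slot and reports fanin edges.
static int32_t on_task_release(SlotState &) { return 0; }

// Idle-time batch drain: releases deferred while the scheduler was busy are
// processed LIFO once no progress is being made, so ring-slot recycling
// never delays completion polling.
static void drain_deferred(SlotState **deferred, int32_t &count) {
    while (count > 0) {
        (void)on_task_release(*deferred[--count]);
    }
}
```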
- if (idle_iterations % FATAL_ERROR_CHECK_INTERVAL == 0) { - int32_t orch_err = header->orch_error_code.load(std::memory_order_acquire); - if (orch_err != PTO2_ERROR_NONE) { - DEV_ERROR( - "Thread %d: Fatal error detected (code=%d), sending EXIT_SIGNAL to all cores", thread_idx, - orch_err - ); - emergency_shutdown(runtime); - completed_.store(true, std::memory_order_release); - break; - } - } - - if (thread_idx == 0 && task_count > 0 && idle_iterations % STALL_LOG_INTERVAL == 0) { - int32_t c = completed_tasks_.load(std::memory_order_relaxed); - DEV_ALWAYS( - "PTO2 stall: no progress for %d iterations, completed=%d total=%d (last progress at %d)", - idle_iterations, c, task_count, last_progress_count - ); - // Scan all task slots to find truly stuck tasks using scheduler state - PTO2SchedulerState *sched = &rt->scheduler; - PTO2SharedMemoryHeader *sm_header_diag = static_cast(sm_base); - int32_t cnt_ready = 0, cnt_waiting = 0, cnt_inflight = 0; - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - int32_t ring_task_count = - sm_header_diag->rings[r].fc.current_task_index.load(std::memory_order_relaxed); - for (int32_t si = 0; si < ring_task_count; si++) { - PTO2TaskSlotState &slot_state = sched->get_slot_state(r, si); - PTO2TaskState st = slot_state.task_state.load(std::memory_order_relaxed); - int32_t rc = slot_state.fanin_refcount.load(std::memory_order_relaxed); - int32_t fi = slot_state.fanin_count; - int32_t kid = slot_state.task->kernel_id[0]; - if (st >= PTO2_TASK_COMPLETED) continue; // Already done - if (st == PTO2_TASK_READY || st == PTO2_TASK_RUNNING) { - cnt_inflight++; - continue; - } - // PENDING - if (rc >= fi) { - // Ready (all deps satisfied) but not enqueued — this is the real bug - cnt_ready++; - if (cnt_ready <= STALL_DUMP_READY_MAX) { - DEV_ALWAYS( - " STUCK-READY ring=%d task_id=%" PRId64 - " kernel_id=%d refcount=%d fanin=%d state=%d", - r, static_cast(slot_state.task->task_id.raw), kid, rc, fi, - static_cast(st) - ); - } - } else { - cnt_waiting++; - if (cnt_waiting <= STALL_DUMP_WAIT_MAX) { - DEV_ALWAYS( - " STUCK-WAIT ring=%d task_id=%" PRId64 - " kernel_id=%d refcount=%d fanin=%d state=%d", - r, static_cast(slot_state.task->task_id.raw), kid, rc, fi, - static_cast(st) - ); - } - } - } - } - DEV_ALWAYS( - " scan result: stuck_ready=%d stuck_waiting=%d in_flight=%d", cnt_ready, cnt_waiting, cnt_inflight - ); - // Log this thread's dispatch state - int32_t total_idle = tracker.aic().idle_count + tracker.aiv().idle_count; - int32_t total_running = tracker.aic().running_count + tracker.aiv().running_count; - DEV_ALWAYS( - " thread=%d idle_cores=%d (AIC=%d AIV=%d) running_cores=%d (AIC=%d AIV=%d) core_num=%d", - thread_idx, total_idle, tracker.aic().idle_count, tracker.aiv().idle_count, total_running, - tracker.aic().running_count, tracker.aiv().running_count, core_num - ); - // Dump AIC running cores - for (int32_t ci = 0; ci < tracker.aic().running_count && ci < STALL_DUMP_CORE_MAX; ci++) { - int32_t cid = tracker.aic().running[ci]; - int32_t sw_tid = executing_reg_task_ids_[cid]; - int32_t hw_kernel = -1; - if (sw_tid >= 0 && executing_slot_state_by_core_[cid]) { - int32_t diag_slot = static_cast(executing_subslot_by_core_[cid]); - hw_kernel = executing_slot_state_by_core_[cid]->task->kernel_id[diag_slot]; - } - uint64_t cond_reg = read_reg(core_id_to_reg_addr_[cid], RegId::COND); - DEV_ALWAYS( - " core=%d cond=0x%x(state=%d,id=%d) exec_id=%d kernel=%d", cid, - static_cast(cond_reg), EXTRACT_TASK_STATE(cond_reg), EXTRACT_TASK_ID(cond_reg), - sw_tid, hw_kernel - ); - 
} - // Dump AIV running cores - for (int32_t ci = 0; ci < tracker.aiv().running_count && ci < STALL_DUMP_CORE_MAX; ci++) { - int32_t cid = tracker.aiv().running[ci]; - int32_t sw_tid = executing_reg_task_ids_[cid]; - int32_t hw_kernel = -1; - if (sw_tid >= 0 && executing_slot_state_by_core_[cid]) { - int32_t diag_slot = static_cast(executing_subslot_by_core_[cid]); - hw_kernel = executing_slot_state_by_core_[cid]->task->kernel_id[diag_slot]; - } - uint64_t cond_reg = read_reg(core_id_to_reg_addr_[cid], RegId::COND); - DEV_ALWAYS( - " core=%d cond=0x%x(state=%d,id=%d) exec_id=%d kernel=%d", cid, - static_cast(cond_reg), EXTRACT_TASK_STATE(cond_reg), EXTRACT_TASK_ID(cond_reg), - sw_tid, hw_kernel - ); - } - // Dump cluster state - for (int32_t cli = 0; cli < tracker.cluster_count && cli < STALL_DUMP_CORE_MAX; cli++) { - Cluster &cl = tracker.clusters[cli]; - DEV_ALWAYS( - " cluster[%d] aic=%d(%s) aiv0=%d(%s) aiv1=%d(%s)", cli, cl.aic_core_id, - core_idle_[cl.aic_core_id] ? "idle" : "busy", cl.aiv_core_ids[0], - core_idle_[cl.aiv_core_ids[0]] ? "idle" : "busy", cl.aiv_core_ids[1], - core_idle_[cl.aiv_core_ids[1]] ? "idle" : "busy" - ); - } - } - if (idle_iterations > MAX_IDLE_ITERATIONS) { - DEV_ERROR("Thread %d: PTO2 timeout after %d idle iterations", thread_idx, idle_iterations); - return -1; - } else { - SPIN_WAIT_HINT(); - } -#if PTO2_PROFILING - CYCLE_COUNT_LAP(sched_idle_cycle); - if (l2_perf_enabled) { - l2_perf_aicpu_record_phase( - thread_idx, AicpuPhaseId::SCHED_IDLE_WAIT, _t0_phase, _t1, sched_loop_count, 0 - ); - _t0_phase = _t1; - } -#endif - } - } - -#if PTO2_PROFILING - // Scheduler summary logging (always print when PTO2_PROFILING=1) - uint64_t sched_total = sched_complete_cycle + sched_scan_cycle + sched_dispatch_cycle + sched_idle_cycle; - if (sched_total == 0) sched_total = 1; // avoid div-by-zero - -#if PTO2_SCHED_PROFILING - // Two-level tree display: sub-phase breakdown within complete and dispatch - { - PTO2SchedProfilingData sp = pto2_scheduler_get_profiling(thread_idx); - uint64_t otc_total = sp.lock_cycle + sp.fanout_cycle + sp.fanin_cycle + sp.self_consumed_cycle; - uint64_t complete_poll = (sched_complete_cycle > otc_total + sched_complete_perf_cycle) ? - (sched_complete_cycle - otc_total - sched_complete_perf_cycle) : - 0; - uint64_t dispatch_poll = (sched_dispatch_cycle > sched_dispatch_pop_cycle + sched_dispatch_setup_cycle) ? - (sched_dispatch_cycle - sched_dispatch_pop_cycle - sched_dispatch_setup_cycle) : - 0; - - DEV_ALWAYS( - "Thread %d: === Scheduler Phase Breakdown: total=%.3fus, %d tasks ===", thread_idx, - cycles_to_us(sched_total), cur_thread_completed - ); - - // Level 1: complete - double notify_avg = - cur_thread_completed > 0 ? static_cast(notify_edges_total) / cur_thread_completed : 0.0; - double fanin_avg = - cur_thread_completed > 0 ? static_cast(fanin_edges_total) / cur_thread_completed : 0.0; - DEV_ALWAYS( - "Thread %d: complete : %.3fus (%.1f%%) [fanout: edges=%" PRIu64 - ", max_degree=%d, avg=%.1f] [fanin: " - "edges=%" PRIu64 ", max_degree=%d, avg=%.1f]", - thread_idx, cycles_to_us(sched_complete_cycle), sched_complete_cycle * 100.0 / sched_total, - static_cast(notify_edges_total), notify_max_degree, notify_avg, - static_cast(fanin_edges_total), fanin_max_degree, fanin_avg - ); - - // Level 2: complete sub-phases (percentage relative to complete) - uint64_t c_parent = sched_complete_cycle > 0 ? sched_complete_cycle : 1; - uint64_t complete_miss_count = - (complete_probe_count > complete_hit_count) ? 
(complete_probe_count - complete_hit_count) : 0;
-        double complete_hit_rate = complete_probe_count > 0 ? complete_hit_count * 100.0 / complete_probe_count : 0.0;
-        DEV_ALWAYS(
-            "Thread %d: poll : %.3fus (%.1f%%) hit=%" PRIu64 ", miss=%" PRIu64 ", hit_rate=%.1f%%",
-            thread_idx, cycles_to_us(complete_poll), complete_poll * 100.0 / c_parent,
-            static_cast<uint64_t>(complete_hit_count), static_cast<uint64_t>(complete_miss_count), complete_hit_rate
-        );
-        DEV_ALWAYS(
-            "Thread %d: otc_lock : %.3fus (%.1f%%) work=%.3fus wait=%.3fus atomics=%" PRIu64 "", thread_idx,
-            cycles_to_us(sp.lock_cycle), sp.lock_cycle * 100.0 / c_parent,
-            cycles_to_us(sp.lock_cycle - sp.lock_wait_cycle), cycles_to_us(sp.lock_wait_cycle),
-            static_cast<uint64_t>(sp.lock_atomic_count)
-        );
-        DEV_ALWAYS(
-            "Thread %d: otc_fanout : %.3fus (%.1f%%) work=%.3fus wait=%.3fus atomics=%" PRIu64 "", thread_idx,
-            cycles_to_us(sp.fanout_cycle), sp.fanout_cycle * 100.0 / c_parent,
-            cycles_to_us(sp.fanout_cycle - sp.push_wait_cycle), cycles_to_us(sp.push_wait_cycle),
-            static_cast<uint64_t>(sp.fanout_atomic_count)
-        );
-        DEV_ALWAYS(
-            "Thread %d: otc_fanin : %.3fus (%.1f%%) atomics=%" PRIu64 "", thread_idx,
-            cycles_to_us(sp.fanin_cycle), sp.fanin_cycle * 100.0 / c_parent,
-            static_cast<uint64_t>(sp.fanin_atomic_count)
-        );
-        DEV_ALWAYS(
-            "Thread %d: otc_self : %.3fus (%.1f%%) atomics=%" PRIu64 "", thread_idx,
-            cycles_to_us(sp.self_consumed_cycle), sp.self_consumed_cycle * 100.0 / c_parent,
-            static_cast<uint64_t>(sp.self_atomic_count)
-        );
-        DEV_ALWAYS(
-            "Thread %d: perf : %.3fus (%.1f%%)", thread_idx, cycles_to_us(sched_complete_perf_cycle),
-            sched_complete_perf_cycle * 100.0 / c_parent
-        );
-
-        // Level 1: dispatch
-        uint64_t pop_total = pop_hit + pop_miss;
-        double pop_hit_rate = pop_total > 0 ? pop_hit * 100.0 / pop_total : 0.0;
-        DEV_ALWAYS(
-            "Thread %d: dispatch : %.3fus (%.1f%%) [pop: hit=%" PRIu64 ", miss=%" PRIu64 ", hit_rate=%.1f%%]",
-            thread_idx, cycles_to_us(sched_dispatch_cycle), sched_dispatch_cycle * 100.0 / sched_total,
-            static_cast<uint64_t>(pop_hit), static_cast<uint64_t>(pop_miss), pop_hit_rate
-        );
-        uint64_t global_dispatch_count = pop_hit - local_dispatch_count;
-        uint64_t total_dispatched = local_dispatch_count + global_dispatch_count;
-        double local_hit_rate = total_dispatched > 0 ? local_dispatch_count * 100.0 / total_dispatched : 0.0;
-        DEV_ALWAYS(
-            "Thread %d: local_disp : local=%" PRIu64 ", global=%" PRIu64 ", overflow=%" PRIu64
-            ", local_rate=%.1f%%",
-            thread_idx, static_cast<uint64_t>(local_dispatch_count), static_cast<uint64_t>(global_dispatch_count),
-            static_cast<uint64_t>(local_overflow_count), local_hit_rate
-        );
-
-        // Level 2: dispatch sub-phases (percentage relative to dispatch)
-        uint64_t d_parent = sched_dispatch_cycle > 0 ?
sched_dispatch_cycle : 1; - DEV_ALWAYS( - "Thread %d: poll : %.3fus (%.1f%%)", thread_idx, cycles_to_us(dispatch_poll), - dispatch_poll * 100.0 / d_parent - ); - DEV_ALWAYS( - "Thread %d: pop : %.3fus (%.1f%%) work=%.3fus wait=%.3fus atomics=%" PRIu64 "", thread_idx, - cycles_to_us(sched_dispatch_pop_cycle), sched_dispatch_pop_cycle * 100.0 / d_parent, - cycles_to_us(sched_dispatch_pop_cycle - sp.pop_wait_cycle), cycles_to_us(sp.pop_wait_cycle), - static_cast(sp.pop_atomic_count) - ); - DEV_ALWAYS( - "Thread %d: setup : %.3fus (%.1f%%)", thread_idx, cycles_to_us(sched_dispatch_setup_cycle), - sched_dispatch_setup_cycle * 100.0 / d_parent - ); - - // Level 1: scan - DEV_ALWAYS( - "Thread %d: scan : %.3fus (%.1f%%)", thread_idx, cycles_to_us(sched_scan_cycle), - sched_scan_cycle * 100.0 / sched_total - ); - - // Level 1: idle - DEV_ALWAYS( - "Thread %d: idle : %.3fus (%.1f%%)", thread_idx, cycles_to_us(sched_idle_cycle), - sched_idle_cycle * 100.0 / sched_total - ); - - // Average per completion - if (cur_thread_completed > 0) { - DEV_ALWAYS( - "Thread %d: avg/complete : %.3fus", thread_idx, - cycles_to_us(sched_complete_cycle) / cur_thread_completed - ); - } - } -#endif - // Summary line (always print when PTO2_PROFILING=1) - DEV_ALWAYS( - "Thread %d: Scheduler summary: total_time=%.3fus, loops=%" PRIu64 ", tasks_scheduled=%d", thread_idx, - cycles_to_us(sched_total), static_cast(sched_loop_count), cur_thread_completed - ); -#endif - -#if PTO2_PROFILING - // Flush performance buffers for cores managed by this thread - if (l2_perf_enabled) { - l2_perf_aicpu_flush_buffers(thread_idx, core_assignments_[thread_idx], core_num); - l2_perf_aicpu_flush_phase_buffers(thread_idx); - } - if (is_pmu_enabled()) { - pmu_aicpu_flush_buffers(thread_idx, core_assignments_[thread_idx], core_num); - } -#endif -#if PTO2_PROFILING - if (is_dump_tensor_enabled()) { - dump_tensor_flush(thread_idx); - } -#endif - - return cur_thread_completed; -} - -int32_t AicpuExecutor::run(Runtime *runtime) { - int32_t thread_idx = thread_idx_++; - - DEV_ALWAYS("Thread %d: Start", thread_idx); - - // Orchestrator check - if (thread_idx >= sched_thread_num_) { - if (runtime->get_orch_built_on_host()) { - DEV_INFO("Thread %d: Host orchestration mode, no-op", thread_idx); - } else { - DEV_INFO("Thread %d: Orchestrator, loading SO via dlopen", thread_idx); - - const void *so_data = reinterpret_cast(runtime->get_dev_orch_so_addr()); - size_t so_size = runtime->get_dev_orch_so_size(); - - if (so_data == nullptr || so_size == 0) { - DEV_ERROR("Thread %d: Device orchestration SO not set", thread_idx); - // Unblock scheduler threads before returning so they don't spin forever. 
- runtime_init_ready_.store(true, std::memory_order_release); - return -1; - } - - // Try multiple paths that may allow execution on AICPU - char so_path[256]; - bool file_created = false; - const char *candidate_dirs[] = { - "/usr/lib64/aicpu_kernels/0/aicpu_kernels_device", "/usr/lib64", "/lib64", "/var/tmp", "/tmp" - }; - const int32_t num_candidates = sizeof(candidate_dirs) / sizeof(candidate_dirs[0]); - - for (int32_t i = 0; i < num_candidates && !file_created; i++) { - int32_t fd = create_orch_so_file(candidate_dirs[i], so_path, sizeof(so_path)); - if (fd < 0) { - DEV_INFO( - "Thread %d: Cannot create SO at %s (errno=%d), trying next path", thread_idx, so_path, errno - ); - continue; - } - ssize_t written = write(fd, so_data, so_size); - close(fd); - if (written != static_cast(so_size)) { - DEV_INFO( - "Thread %d: Cannot write SO to %s (errno=%d), trying next path", thread_idx, so_path, errno - ); - unlink(so_path); - continue; - } - file_created = true; - DEV_INFO("Thread %d: Created SO file at %s (%zu bytes)", thread_idx, so_path, so_size); - } - - if (!file_created) { - DEV_ERROR("Thread %d: Failed to create SO file in any candidate path", thread_idx); - // Unblock scheduler threads before returning so they don't spin forever. - runtime_init_ready_.store(true, std::memory_order_release); - return -1; - } - - dlerror(); - void *handle = dlopen(so_path, RTLD_LAZY | RTLD_LOCAL); - const char *dlopen_err = dlerror(); - if (handle == nullptr) { - DEV_ERROR("Thread %d: dlopen failed: %s", thread_idx, dlopen_err ? dlopen_err : "unknown"); - unlink(so_path); - // Unblock scheduler threads before returning so they don't spin forever. - runtime_init_ready_.store(true, std::memory_order_release); - return -1; - } - DEV_INFO("Thread %d: dlopen succeeded, handle=%p", thread_idx, handle); - - dlerror(); - auto config_func = - reinterpret_cast(dlsym(handle, "aicpu_orchestration_config")); - - dlerror(); - DeviceOrchestrationFunc orch_func = - reinterpret_cast(dlsym(handle, "aicpu_orchestration_entry")); - const char *dlsym_error = dlerror(); - if (dlsym_error != nullptr) { - DEV_ERROR("Thread %d: dlsym failed: %s", thread_idx, dlsym_error); - dlclose(handle); - unlink(so_path); - // Unblock scheduler threads before returning so they don't spin forever. - runtime_init_ready_.store(true, std::memory_order_release); - return -1; - } - if (orch_func == nullptr) { - DEV_ERROR("Thread %d: dlsym returned NULL for aicpu_orchestration_entry", thread_idx); - dlclose(handle); - unlink(so_path); - // Unblock scheduler threads before returning so they don't spin forever. 
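
The load sequence above (write the embedded image to the first writable directory, then dlopen and dlsym it with dlerror checks) follows a standard pattern. A sketch, with a shortened candidate list and POSIX mkstemp standing in for the runtime's own create_orch_so_file helper, whose internals were deleted with this runtime.

```cpp
#include <dlfcn.h>
#include <unistd.h>
#include <cstdio>
#include <cstdlib>

// Persist an in-memory .so image to the first writable candidate directory,
// then dlopen it. Returns the handle, or nullptr if every path fails.
void *load_embedded_so(const void *data, size_t size, char *path_out, size_t path_cap) {
    const char *dirs[] = {"/var/tmp", "/tmp"};
    for (const char *dir : dirs) {
        snprintf(path_out, path_cap, "%s/orch_soXXXXXX", dir);
        int fd = mkstemp(path_out); // creates the file, rewriting XXXXXX in place
        if (fd < 0) continue;
        ssize_t written = write(fd, data, size);
        close(fd);
        if (written != static_cast<ssize_t>(size)) {
            unlink(path_out);
            continue;
        }
        dlerror(); // clear any stale error before dlopen
        void *handle = dlopen(path_out, RTLD_LAZY | RTLD_LOCAL);
        if (handle == nullptr) {
            unlink(path_out);
            continue;
        }
        return handle; // caller dlsym()s the entry symbols, unlinks on teardown
    }
    return nullptr;
}
```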
- runtime_init_ready_.store(true, std::memory_order_release); - return -1; - } - - const ChipStorageTaskArgs &args = runtime->get_orch_args(); - int32_t arg_count = args.tensor_count() + args.scalar_count(); - DEV_INFO("Thread %d: sm_ptr=%p, arg_count=%d", thread_idx, runtime->get_gm_sm_ptr(), arg_count); - for (int32_t i = 0; i < args.tensor_count() && i < 20; i++) { - const ContinuousTensor &t = args.tensor(i); - DEV_INFO( - "Thread %d: orch_args[%d] = TENSOR(data=0x%lx, ndims=%u, dtype=%u)", thread_idx, i, - static_cast(t.data), t.ndims, static_cast(t.dtype) - ); - } - for (int32_t i = 0; i < args.scalar_count() && (args.tensor_count() + i) < 20; i++) { - DEV_INFO( - "Thread %d: orch_args[%d] = SCALAR(0x%lx)", thread_idx, args.tensor_count() + i, - static_cast(args.scalar(i)) - ); - } - - uint64_t task_window_size = PTO2_TASK_WINDOW_SIZE; - uint64_t heap_size = PTO2_HEAP_SIZE; - int32_t expected_arg_count = 0; - if (config_func) { - PTO2OrchestrationConfig cfg = config_func(args); - expected_arg_count = cfg.expected_arg_count; - DEV_INFO("Thread %d: Config: expected_args=%d", thread_idx, expected_arg_count); - } else { - DEV_INFO("Thread %d: No config function, using defaults", thread_idx); - } - - if (expected_arg_count > 0 && arg_count < expected_arg_count) { - DEV_ERROR("Thread %d: arg_count %d < expected %d", thread_idx, arg_count, expected_arg_count); - dlclose(handle); - unlink(so_path); - // Unblock scheduler threads before returning so they don't spin forever. - runtime_init_ready_.store(true, std::memory_order_release); - return -1; - } - - if (runtime->task_window_size > 0) { - task_window_size = runtime->task_window_size; - } - if (runtime->heap_size > 0) { - heap_size = runtime->heap_size; - } - int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE; - if (runtime->dep_pool_size > 0) { - dep_pool_capacity = static_cast(runtime->dep_pool_size); - } - DEV_INFO( - "Thread %d: Ring sizes: task_window=%lu, heap=%lu, dep_pool=%d", thread_idx, - static_cast(task_window_size), static_cast(heap_size), dep_pool_capacity - ); - - void *sm_ptr = runtime->get_gm_sm_ptr(); - void *gm_heap = runtime->get_gm_heap_ptr(); - - uint64_t sm_size = pto2_sm_calculate_size(task_window_size); - PTO2SharedMemoryHandle *sm_handle = - pto2_sm_create_from_buffer(sm_ptr, sm_size, task_window_size, heap_size); - if (!sm_handle) { - DEV_ERROR("Thread %d: Failed to create shared memory handle", thread_idx); - dlclose(handle); - unlink(so_path); - // Unblock scheduler threads before returning so they don't spin forever. - runtime_init_ready_.store(true, std::memory_order_release); - return -1; - } - - rt = pto2_runtime_create_from_sm(PTO2_MODE_EXECUTE, sm_handle, gm_heap, heap_size, dep_pool_capacity); - if (!rt) { - DEV_ERROR("Thread %d: Failed to create PTO2Runtime", thread_idx); - pto2_sm_destroy(sm_handle); - dlclose(handle); - unlink(so_path); - // Unblock scheduler threads before returning so they don't spin forever. - runtime_init_ready_.store(true, std::memory_order_release); - return -1; - } - -#if PTO2_PROFILING - rt->orchestrator.enable_l2_swimlane = is_l2_swimlane_enabled(); -#endif - - // With multi-ring, slot_states are per-ring inside the scheduler. 
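
The ring sizing above follows an override-if-set pattern: a zero-valued Runtime field means "keep the compiled-in PTO2_* default". A sketch with hypothetical default values:

```cpp
#include <cstdint>

// Hypothetical defaults; the real values are the PTO2_TASK_WINDOW_SIZE and
// PTO2_HEAP_SIZE constants from the deleted headers.
constexpr uint64_t kDefaultTaskWindow = 4096;
constexpr uint64_t kDefaultHeapSize = 64ull << 20;

struct RingSizing {
    uint64_t task_window_size;
    uint64_t heap_size;
};

// Zero means "not configured": fall back to the compiled-in default.
RingSizing resolve_sizing(uint64_t rt_window, uint64_t rt_heap) {
    return {
        rt_window > 0 ? rt_window : kDefaultTaskWindow,
        rt_heap > 0 ? rt_heap : kDefaultHeapSize,
    };
}
```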
- runtime->set_slot_states_ptr(nullptr); - - // Store shared state for orchestrator thread - orch_func_ = orch_func; - orch_args_cached_ = &args; - orch_so_handle_ = handle; - snprintf(orch_so_path_, sizeof(orch_so_path_), "%s", so_path); - - runtime_init_ready_.store(true, std::memory_order_release); - - // Wait for scheduler's one-time init to complete - while (!pto2_init_complete_.load(std::memory_order_acquire)) { - SPIN_WAIT_HINT(); - } - -#if PTO2_PROFILING - // Each orchestrator thread sets its own phase buffer index (thread-local) - if (is_l2_swimlane_enabled()) { - l2_perf_aicpu_set_orch_thread_idx(thread_idx); - } -#endif - - // Call orchestration function wrapped in an outer scope - DEV_ALWAYS("Thread %d: Calling aicpu_orchestration_entry from SO", thread_idx); -#if PTO2_PROFILING - uint64_t orch_cycle_start = get_sys_cnt_aicpu(); -#endif - PTO2_SCOPE(rt) { orch_func_(rt, *orch_args_cached_); } -#if PTO2_PROFILING - uint64_t orch_cycle_end = get_sys_cnt_aicpu(); - DEV_ALWAYS( - "Thread %d: orch_start=%" PRIu64 " orch_func_cost=%.3fus", thread_idx, - static_cast(orch_cycle_start), cycles_to_us(orch_cycle_end - orch_cycle_start) - ); -#endif - - // Print orchestrator profiling data -#if PTO2_ORCH_PROFILING - PTO2OrchProfilingData p = pto2_orchestrator_get_profiling(); - uint64_t total = p.alloc_cycle + p.args_cycle + p.heap_cycle + p.fanin_cycle; - if (total == 0) total = 1; // avoid div-by-zero - DEV_ALWAYS( - "Thread %d: === Orchestrator Profiling: %" PRId64 " tasks, total=%.3fus ===", thread_idx, - static_cast(p.submit_count), cycles_to_us(total) - ); - DEV_ALWAYS( - "Thread %d: task_ring_alloc: %.3fus (%.1f%%) work=%.3fus wait=%.3fus atomics=%" PRIu64 "", - thread_idx, cycles_to_us(p.alloc_cycle), p.alloc_cycle * 100.0 / total, - cycles_to_us(p.alloc_cycle - p.alloc_wait_cycle), cycles_to_us(p.alloc_wait_cycle), - static_cast(p.alloc_atomic_count) - ); - DEV_ALWAYS( - "Thread %d: heap_alloc : %.3fus (%.1f%%) work=%.3fus wait=%.3fus atomics=%" PRIu64 "", - thread_idx, cycles_to_us(p.heap_cycle), p.heap_cycle * 100.0 / total, - cycles_to_us(p.heap_cycle - p.heap_wait_cycle), cycles_to_us(p.heap_wait_cycle), - static_cast(p.heap_atomic_count) - ); - DEV_ALWAYS( - "Thread %d: param_copy : %.3fus (%.1f%%) atomics=%" PRIu64 "", thread_idx, - cycles_to_us(p.args_cycle), p.args_cycle * 100.0 / total, static_cast(p.args_atomic_count) - ); - DEV_ALWAYS( - "Thread %d: fanin+ready : %.3fus (%.1f%%) work=%.3fus wait=%.3fus atomics=%" PRIu64 "", - thread_idx, cycles_to_us(p.fanin_cycle), p.fanin_cycle * 100.0 / total, - cycles_to_us(p.fanin_cycle - p.fanin_wait_cycle), cycles_to_us(p.fanin_wait_cycle), - static_cast(p.fanin_atomic_count) - ); - DEV_ALWAYS( - "Thread %d: avg/task : %.3fus", thread_idx, - p.submit_count > 0 ? 
cycles_to_us(total) / p.submit_count : 0.0 - ); - -#if PTO2_PROFILING - // Write orchestrator summary to shared memory for host-side export (only if profiling enabled) - if (is_l2_swimlane_enabled()) { - AicpuOrchSummary orch_summary = {}; - orch_summary.start_time = orch_cycle_start; - orch_summary.end_time = orch_cycle_end; - orch_summary.sync_cycle = 0; - orch_summary.alloc_cycle = p.alloc_cycle; - orch_summary.args_cycle = p.args_cycle; - orch_summary.lookup_cycle = 0; - orch_summary.heap_cycle = p.heap_cycle; - orch_summary.insert_cycle = 0; - orch_summary.fanin_cycle = p.fanin_cycle; - orch_summary.scope_end_cycle = p.scope_end_cycle; - orch_summary.submit_count = p.submit_count; - l2_perf_aicpu_write_orch_summary(&orch_summary); - } -#endif -#endif - -#if PTO2_PROFILING - // Write core-to-thread mapping (one-time, after orchestration) - if (is_l2_swimlane_enabled()) { - l2_perf_aicpu_init_core_assignments(cores_total_num_); - for (int32_t t = 0; t < sched_thread_num_; t++) { - l2_perf_aicpu_write_core_assignments_for_thread(t, core_assignments_[t], core_count_per_thread_[t]); - } - // Flush orchestrator's phase record buffer - l2_perf_aicpu_flush_phase_buffers(thread_idx); - } -#endif - - // Signal completion and trigger core transition - rt_orchestration_done(rt); - - void *sm = runtime->get_gm_sm_ptr(); - PTO2SharedMemoryHeader *sm_header = static_cast(sm); - int32_t pto2_task_count = 0; - if (sm_header) { - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - pto2_task_count += sm_header->rings[r].fc.current_task_index.load(std::memory_order_acquire); - } - } -#if PTO2_PROFILING - DEV_ALWAYS( - "PTO2 total submitted tasks = %d, already executed %d tasks", pto2_task_count, - completed_tasks_.load(std::memory_order_acquire) - ); -#endif - total_tasks_ = pto2_task_count; - if (is_l2_swimlane_enabled() && pto2_task_count > 0) { - l2_perf_aicpu_update_total_tasks(static_cast(pto2_task_count)); - } - orchestrator_done_ = true; - { - int32_t orch_err = 0; - void *sm = runtime->get_gm_sm_ptr(); - if (sm) { - orch_err = - static_cast(sm)->orch_error_code.load(std::memory_order_relaxed); - } - - // Fatal error: shutdown AICore immediately before core transition. 
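
The task-count accounting above sums the submit index of every ring, since PTO2 distributes submissions across rings. In sketch form, with a hypothetical ring count standing in for PTO2_MAX_RING_DEPTH:

```cpp
#include <atomic>
#include <cstdint>

constexpr int kMaxRings = 4; // assumption; the real bound is PTO2_MAX_RING_DEPTH

struct RingCounters {
    std::atomic<int32_t> current_task_index{0};
};

// Grand total = sum of each ring's submit index, read with acquire ordering
// after the orchestrator has signalled completion.
int32_t total_submitted(const RingCounters (&rings)[kMaxRings]) {
    int32_t total = 0;
    for (const auto &r : rings) {
        total += r.current_task_index.load(std::memory_order_acquire);
    }
    return total;
}
```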
- if (orch_err != PTO2_ERROR_NONE) { - emergency_shutdown(runtime); - completed_.store(true, std::memory_order_release); - } - } - -#if PTO2_ORCH_PROFILING - uint64_t reassign_cycle_start = get_sys_cnt_aicpu(); -#endif - - // Skip core transition on fatal error — cores already shut down above - if (completed_.load(std::memory_order_acquire)) { - // Signal transition to unblock scheduler threads waiting at core transition - transition_requested_.store(true, std::memory_order_release); - reassigned_.store(true, std::memory_order_release); - } else if (orch_to_sched_) { - // Compute new core assignments for all threads and initialize donated slots - DEV_INFO("Thread %d: Set orchestrator_done=true, requesting core transition", thread_idx); -#if PTO2_PROFILING - uint64_t orch_stage_end_ts = get_sys_cnt_aicpu(); -#endif - transition_requested_.store(true, std::memory_order_release); -#if PTO2_PROFILING - DEV_ALWAYS( - "Thread %d: orch_stage_end=%" PRIu64 "", thread_idx, static_cast(orch_stage_end_ts) - ); -#endif - - // Wait for scheduler threads to acknowledge transition request - if (sched_thread_num_ > 0) { - while (wait_reassign_.load(std::memory_order_acquire) != sched_thread_num_) { - if (completed_.load(std::memory_order_acquire)) { - break; - } - SPIN_WAIT_HINT(); - } - } - if (!completed_.load(std::memory_order_acquire)) { - reassign_cores_for_all_threads(); - reassigned_.store(true, std::memory_order_release); - } - } - -#if PTO2_ORCH_PROFILING - uint64_t reassign_cycle_end = get_sys_cnt_aicpu(); - DEV_ALWAYS( - "Thread %d: reassign, cost %.3fus", thread_idx, cycles_to_us(reassign_cycle_end - reassign_cycle_start) - ); -#endif - } - DEV_INFO("Thread %d: Orchestrator completed", thread_idx); - } - - // Scheduler thread (orchestrator threads skip dispatch when orch_to_sched_ is false) - if (!completed_.load(std::memory_order_acquire) && (thread_idx < sched_thread_num_ || orch_to_sched_)) { - DEV_ALWAYS("Thread %d: Starting PTO2 dispatch", thread_idx); - // Device orchestration: wait for primary orchestrator to initialize SM header - if (!runtime->get_orch_built_on_host()) { - while (!runtime_init_ready_.load(std::memory_order_acquire)) { - SPIN_WAIT_HINT(); - } - } - if (rt == nullptr) { - DEV_ERROR("Thread %d: rt is null after orchestrator error, skipping dispatch", thread_idx); - } else { - int32_t completed = resolve_and_dispatch_pto2(runtime, thread_idx); - DEV_INFO("Thread %d: Executed %d tasks from runtime", thread_idx, completed); - } - } - - // Always shutdown AICore — even if completed_ was already true. - // platform_deinit_aicore_regs is idempotent; orchestrator threads have - // core_count_per_thread_ == 0 so they skip the loop harmlessly. 
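
The core-transition protocol above is a three-flag handshake between the orchestrator and the scheduler threads. A sketch of both sides; the real code additionally bails out whenever completed_ becomes true, omitted here for brevity.

```cpp
#include <atomic>

struct TransitionFlags {
    std::atomic<bool> requested{false};
    std::atomic<int> parked{0};
    std::atomic<bool> reassigned{false};
};

// Orchestrator: raise the request, wait until every scheduler thread has
// parked, recompute assignments while they are quiescent, then release them.
void orchestrator_side(TransitionFlags &f, int sched_threads, void (*reassign)()) {
    f.requested.store(true, std::memory_order_release);
    while (f.parked.load(std::memory_order_acquire) != sched_threads) { /* spin */ }
    reassign(); // safe: no scheduler is touching the trackers now
    f.reassigned.store(true, std::memory_order_release);
}

// Scheduler: acknowledge the request by parking, then wait for the new
// assignments before resuming dispatch.
void scheduler_side(TransitionFlags &f) {
    if (!f.requested.load(std::memory_order_acquire)) return;
    f.parked.fetch_add(1, std::memory_order_release);
    while (!f.reassigned.load(std::memory_order_acquire)) { /* spin */ }
}
```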
-    {
-        const int32_t *shutdown_cores = core_assignments_[thread_idx];
-        int32_t shutdown_count = core_count_per_thread_[thread_idx];
-#if PTO2_PROFILING
-        if (shutdown_count > 0) {
-            uint64_t sched_end_ts = get_sys_cnt_aicpu();
-            DEV_ALWAYS("Thread %d: sched_end=%" PRIu64 "", thread_idx, static_cast<uint64_t>(sched_end_ts));
-        }
-#endif
-        if (shutdown_count > 0) {
-#if PTO2_PROFILING
-            // Restore PMU CTRL registers for this thread's cores before AICore shutdown
-            if (is_pmu_enabled()) {
-                pmu_aicpu_finalize(shutdown_cores, shutdown_count);
-            }
-#endif
-            auto rc = shutdown_aicore(runtime, thread_idx, shutdown_cores, shutdown_count);
-            if (rc != 0) {
-                return rc;
-            }
-        }
-    }
-
-    DEV_INFO("Thread %d: Completed", thread_idx);
-
-    // Check if this is the last thread to finish
-    int32_t prev_finished = finished_count_.fetch_add(1, std::memory_order_acq_rel);
-    if (prev_finished + 1 == thread_num_) {
-        finished_.store(true, std::memory_order_release);
-        // Destroy PTO2 runtime and close orchestration SO (moved from orchestrator path)
-        if (!runtime->get_orch_built_on_host() && orch_so_handle_ != nullptr) {
-            pto2_runtime_destroy(rt);
-        }
-        DEV_ALWAYS("Thread %d: Last thread, marking executor finished", thread_idx);
-    }
-
-    return 0;
-}
-
-void AicpuExecutor::deinit(Runtime *runtime) {
-    // 1. Invalidate AICPU cache for Runtime address range.
-    //    Next round's Host DMA (rtMemcpy) writes fresh Runtime to HBM but
-    //    bypasses this cache. Invalidating now ensures next round reads from HBM.
-    cache_invalidate_range(runtime, sizeof(Runtime));
-
-    // Reset per-core dispatch timestamps and task counters
-    for (int32_t i = 0; i < RUNTIME_MAX_WORKER; i++) {
-        dispatch_timestamps_[i] = 0;
-        core_dispatch_counts_[i] = 0;
-    }
-
-    // Clear per-core dispatch payloads and subslot tracking
-    memset(s_pto2_payload_per_core, 0, sizeof(s_pto2_payload_per_core));
-    memset(dispatch_seq_by_core_, 0, sizeof(dispatch_seq_by_core_));
-    memset(executing_subslot_by_core_, 0, sizeof(executing_subslot_by_core_));
-    memset(executing_slot_state_by_core_, 0, sizeof(executing_slot_state_by_core_));
-
-    completed_tasks_.store(0, std::memory_order_release);
-    total_tasks_ = 0;
-    finished_count_.store(0, std::memory_order_release);
-    orchestrator_done_ = false;
-    pto2_init_done_.store(false, std::memory_order_release);
-    pto2_init_complete_.store(false, std::memory_order_release);
-    runtime_init_ready_.store(false, std::memory_order_release);
-
-    // Reset core transition state
-    transition_requested_.store(false, std::memory_order_release);
-    wait_reassign_.store(0, std::memory_order_release);
-    reassigned_.store(false, std::memory_order_release);
-    completed_.store(false, std::memory_order_release);
-
-    // Reset core discovery and assignment state
-    aic_count_ = 0;
-    aiv_count_ = 0;
-    cores_total_num_ = 0;
-    thread_num_ = 0;
-    sched_thread_num_ = 0;
-    thread_cores_num_ = 0;
-    orch_to_sched_ = false;
-    memset(trackers_, 0, sizeof(trackers_));
-    memset(core_idle_, 0, sizeof(core_idle_));
-    memset(core_assignments_, 0, sizeof(core_assignments_));
-    memset(core_count_per_thread_, 0, sizeof(core_count_per_thread_));
-
-    // Reset orchestration SO state (handle freed by last thread before deinit)
-    orch_func_ = nullptr;
-    orch_args_cached_ = nullptr;
-    if (orch_so_handle_ != nullptr) {
-        dlclose(orch_so_handle_);
-    }
-    if (orch_so_path_[0] != '\0') {
-        unlink(orch_so_path_);
-    }
-    orch_so_handle_ = nullptr;
-    orch_so_path_[0] = '\0';
-
-    // Reset register-related state
-    for (int32_t i = 0; i < MAX_CORES_PER_THREAD; i++) {
-        core_id_to_reg_addr_[i] = 0;
-        executing_reg_task_ids_[i] = AICPU_TASK_INVALID;
-    }
-    regs_ = 0;
-
-    // Clear file-scope PTO2Runtime pointer (freed by orchestrator thread before deinit)
-    rt = nullptr;
-
-    DEV_INFO("DeInit: Runtime execution state reset");
-
-    initialized_.store(false, std::memory_order_release);
-    init_done_.store(false, std::memory_order_release);
-    init_failed_.store(false, std::memory_order_release);
-    thread_idx_.store(0, std::memory_order_release);
-    finished_.store(false, std::memory_order_release);
-
-    DEV_INFO("DeInit: AicpuExecutor reset complete");
-}
-
-void AicpuExecutor::emergency_shutdown(Runtime *runtime) {
-    DEV_WARN("Emergency shutdown: sending exit signal to all initialized cores");
-    Handshake *all_handshakes = reinterpret_cast<Handshake *>(runtime->workers);
-    for (int32_t i = 0; i < cores_total_num_; i++) {
-        Handshake *hank = &all_handshakes[i];
-        OUT_OF_ORDER_STORE_BARRIER();
-        hank->aicpu_regs_ready = 1;
-        if (core_id_to_reg_addr_[i] != 0) {
-            platform_deinit_aicore_regs(core_id_to_reg_addr_[i]);
-        }
-    }
-
-    DEV_WARN("Emergency shutdown complete");
-}
-
-void AicpuExecutor::diagnose_stuck_state(
-    Runtime *runtime, int32_t thread_idx, const int32_t *cur_thread_cores, int32_t core_num, Handshake *hank
-) {
-    (void)runtime;
-    PTO2SchedulerState *sched = &rt->scheduler;
-    DEV_ALWAYS("========== DIAGNOSTIC REPORT: Thread %d ==========", thread_idx);
-
-    int32_t completed = completed_tasks_.load(std::memory_order_acquire);
-    int32_t total = total_tasks_;
-    DEV_ALWAYS("Progress: %d/%d tasks (%.1f%%)", completed, total, total > 0 ? completed * 100.0 / total : 0.0);
-
-    uint64_t aic_ready = 0, aiv_ready = 0, aiv_x2_ready = 0, mixed_x1_ready = 0, mixed_x2_ready = 0;
-    if (rt) {
-        aic_ready = sched->ready_queues[static_cast<int32_t>(PTO2ResourceShape::AIC_ONLY)].size();
-        aiv_ready = sched->ready_queues[static_cast<int32_t>(PTO2ResourceShape::AIV_X1)].size();
-        aiv_x2_ready = sched->ready_queues[static_cast<int32_t>(PTO2ResourceShape::AIV_X2)].size();
-        mixed_x1_ready = sched->ready_queues[static_cast<int32_t>(PTO2ResourceShape::AIC_AIV_X1)].size();
-        mixed_x2_ready = sched->ready_queues[static_cast<int32_t>(PTO2ResourceShape::AIC_AIV_X2)].size();
-    }
-    DEV_ALWAYS(
-        "Ready Queues: AIC=%lu, AIV=%lu, AIV_X2=%lu, AIC_AIV_X1=%lu, AIC_AIV_X2=%lu", aic_ready, aiv_ready,
-        aiv_x2_ready, mixed_x1_ready, mixed_x2_ready
-    );
-
-    int32_t busy_cores = 0;
-    int32_t idle_cores = 0;
-
-    DEV_ALWAYS("Core Status:");
-    for (int32_t i = 0; i < core_num; i++) {
-        int32_t core_id = cur_thread_cores[i];
-        Handshake *h = &hank[core_id];
-        const char *core_type_str = core_type_to_string(h->core_type);
-
-        uint64_t reg_addr = core_id_to_reg_addr_[core_id];
-        uint64_t reg_val = read_reg(reg_addr, RegId::COND);
-        int32_t reg_task_id = EXTRACT_TASK_ID(reg_val);
-        int32_t reg_state = EXTRACT_TASK_STATE(reg_val);
-        int32_t task_id = executing_reg_task_ids_[core_id];
-
-        if (reg_state != TASK_FIN_STATE || task_id >= 0) {
-            busy_cores++;
-            if (task_id >= 0) {
-                int32_t kernel_id = -1;
-                if (rt && rt->sm_handle && executing_slot_state_by_core_[core_id]) {
-                    int32_t diag_slot = static_cast<int32_t>(executing_subslot_by_core_[core_id]);
-                    kernel_id = executing_slot_state_by_core_[core_id]->task->kernel_id[diag_slot];
-                }
-                DEV_ALWAYS(
-                    "  Core %d [%s, BUSY]: COND=0x%lx (reg_task_id=%d, reg_state=%s), executing_reg_task_id=%d, "
-                    "kernel_id=%d",
-                    core_id, core_type_str, reg_val, reg_task_id, reg_state == TASK_FIN_STATE ? "FIN" : "ACK", task_id,
-                    kernel_id
-                );
-            } else {
-                DEV_ALWAYS(
-                    "  Core %d [%s, BUSY]: COND=0x%lx (reg_task_id=%d, reg_state=%s) but task_id not tracked", core_id,
-                    core_type_str, reg_val, reg_task_id, reg_state == TASK_FIN_STATE ? "FIN" : "ACK"
-                );
-            }
-        } else {
-            idle_cores++;
-        }
-    }
-
-    DEV_ALWAYS("Summary: %d busy, %d idle", busy_cores, idle_cores);
-
-    // Diagnose deadlock vs livelock
-    if (busy_cores == 0 && aic_ready == 0 && aiv_ready == 0 && completed < total) {
-        DEV_ALWAYS("*** DEADLOCK DETECTED ***");
-        DEV_ALWAYS("All cores idle, no ready tasks, but %d tasks incomplete", total - completed);
-        DEV_ALWAYS("Check PTO2 shared memory for task dependency state");
-    } else if (busy_cores > 0) {
-        DEV_ALWAYS("*** LIVELOCK / HUNG TASK ***");
-        DEV_ALWAYS("%d cores executing but no progress", busy_cores);
-    }
-
-    DEV_ALWAYS("========== END DIAGNOSTIC ==========");
-}
-
-// ===== Public Entry Point =====
-
-/**
- * aicpu_execute - Main AICPU kernel execution entry point
- *
- * This is called by DynTileFwkBackendKernelServer in kernel.cpp.
- * Orchestrates the complete task runtime execution:
- *   1. Initialize executor (thread-safe, first thread only)
- *   2. Wait for initialization to complete
- *   3. Execute tasks on managed cores
- *   4. Cleanup when last thread finishes
- *
- * @param runtime Pointer to Runtime structure
- * @return 0 on success, non-zero on error
- */
-extern "C" int32_t aicpu_execute(Runtime *runtime) {
-    if (runtime == nullptr) {
-        DEV_ERROR("%s", "Invalid argument: null Runtime pointer");
-        return -1;
-    }
-
-    DEV_INFO("%s", "aicpu_execute: Starting AICPU kernel execution");
-
-    // Get platform register addresses from platform-level global
-    g_aicpu_executor.regs_ = get_platform_regs();
-
-    g_aicpu_executor.init(runtime);
-
-    while (!g_aicpu_executor.init_done_.load(std::memory_order_acquire)) {
-        if (g_aicpu_executor.init_failed_.load(std::memory_order_acquire)) {
-            DEV_ERROR("%s", "aicpu_execute: Initialization failed, aborting execution");
-            return -1;
-        }
-    }
-
-    int32_t rc = g_aicpu_executor.run(runtime);
-    if (rc != 0) {
-        DEV_ERROR("aicpu_execute: Thread execution failed with rc=%d", rc);
-        return rc;
-    }
-
-    // Last thread cleans up
-    if (g_aicpu_executor.finished_.load(std::memory_order_acquire)) {
-        DEV_INFO("aicpu_execute: Last thread finished, cleaning up");
-        g_aicpu_executor.deinit(runtime);
-    }
-
-    DEV_INFO("%s", "aicpu_execute: Kernel execution completed successfully");
-    return 0;
-}
diff --git a/src/a2a3/runtime/aicpu_build_graph/build_config.py b/src/a2a3/runtime/aicpu_build_graph/build_config.py
deleted file mode 100644
index 17569e35f..000000000
--- a/src/a2a3/runtime/aicpu_build_graph/build_config.py
+++ /dev/null
@@ -1,30 +0,0 @@
-# AICPU Build Graph Runtime build configuration
-# All paths are relative to this file's directory (src/runtime/aicpu_build_graph/)
-#
-# This is a device-orchestration runtime where:
-#   - AICPU thread 3 runs the orchestrator (builds task graph on device)
-#   - AICPU threads 0/1/2 run schedulers (dispatch tasks to AICore)
-#   - AICore executes tasks via PTO2DispatchPayload
-#
-# The "orchestration" directory contains source files compiled into both
-# runtime targets AND the orchestration .so (e.g., tensor methods needed
-# by the Tensor constructor's validation logic).
-
-BUILD_CONFIG = {
-    "aicore": {
-        "include_dirs": ["runtime"],
-        "source_dirs": ["aicore", "orchestration"]
-    },
-    "aicpu": {
-        "include_dirs": ["runtime"],
-        "source_dirs": ["aicpu", "runtime", "orchestration"]
-    },
-    "host": {
-        "include_dirs": ["runtime"],
-        "source_dirs": ["host", "runtime", "orchestration"]
-    },
-    "orchestration": {
-        "include_dirs": ["runtime", "orchestration"],
-        "source_dirs": ["orchestration"]
-    }
-}
diff --git a/src/a2a3/runtime/aicpu_build_graph/docs/RUNTIME_LOGIC.md b/src/a2a3/runtime/aicpu_build_graph/docs/RUNTIME_LOGIC.md
deleted file mode 100644
index 561b1bb73..000000000
--- a/src/a2a3/runtime/aicpu_build_graph/docs/RUNTIME_LOGIC.md
+++ /dev/null
@@ -1,31 +0,0 @@
-# Runtime Logic: aicpu_build_graph
-
-## Overview
-The aicpu_build_graph runtime builds the task graph on AICPU using a small orchestration plugin. A dedicated builder thread runs the plugin and emits tasks into the shared Runtime object, while scheduler threads dispatch published tasks to AICore. This enables concurrent build and schedule on device.
-
-## Core Data Structures
-- `Runtime` stores task state, orchestration arguments, the kernel address table, and the embedded orchestration plugin. See `src/runtime/aicpu_build_graph/runtime/runtime.h`.
-- `Task` adds two concurrency flags, `published` and `completed`, so tasks can be made visible to schedulers only when fully defined.
-- `AicpuBuildApi` is a device-side function table used by orchestration plugins to add tasks, add edges, and publish tasks without linking against runtime symbols.
-- `HostApi` provides device memory ops used during host-side initialization.
-
-## Host Init Flow
-1. `init_runtime_impl` registers kernel binaries and fills `Runtime::kernel_addrs[]` so AICPU-side builders can resolve `func_id` to `function_bin_addr`. See `src/runtime/aicpu_build_graph/host/runtime_maker.cpp`.
-2. The host marshals orchestration arguments. Pointer args are allocated on device and copied; scalars are passed directly. Output and inout buffers are recorded with `runtime->record_tensor_pair`.
-3. The orchestration plugin SO is embedded into `Runtime` (`try_set_aicpu_orch_so`), and the entry symbol name is stored in `Runtime::aicpu_orch_func_name`.
-4. The build mode is set from `PTO_AICPU_BUILD_GRAPH_BUILD_MODE` (0 = sequential build then schedule, 1 = concurrent build and schedule).
-
-## Device Build And Schedule Flow
-1. AICPU thread 0 loads the embedded orchestration plugin via `dlopen` and calls its entry function. See `src/runtime/aicpu_build_graph/aicpu/aicpu_executor.cpp`.
-2. The plugin uses `Runtime::aicpu_build_api` to build the graph. The typical per-task sequence is `add_task`, `add_successor_conditional`, then `publish_task`.
-3. In concurrent mode, scheduler threads start immediately and only see tasks that have been published. In sequential mode, schedulers wait for the builder to finish.
-4. When a task completes, the scheduler decrements fanin counters and pushes newly-ready tasks to the ready queues.
-5. Tasks are dispatched to AICore using the same per-core handshake protocol as host_build_graph.
-
-## Finalize And Cleanup
-`validate_runtime_impl` copies recorded output tensors back to the host and frees any recorded device allocations. It also clears `tensor_pairs` and `device_allocs` for reuse. See `src/runtime/aicpu_build_graph/host/runtime_maker.cpp`.
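The per-task build sequence in the flow above is compact enough to show end to end. A hypothetical plugin entry, assuming `AicpuBuildApi` exposes `add_task`, `add_successor_conditional`, and `publish_task` roughly as the doc describes; the exact signatures lived in the deleted runtime.h and are paraphrased here:

```cpp
// Hypothetical orchestration plugin for the (now removed) aicpu_build_graph
// runtime. All signatures are paraphrased from the doc above, not authoritative.
extern "C" void example_orch_entry(Runtime *runtime) {
    AicpuBuildApi *api = &runtime->aicpu_build_api;

    // Build two tasks: producer (func_id 0) feeds consumer (func_id 1).
    int32_t producer = api->add_task(/*func_id=*/0, /*args=*/nullptr, /*arg_count=*/0);
    int32_t consumer = api->add_task(/*func_id=*/1, /*args=*/nullptr, /*arg_count=*/0);

    // Wire the edge before publishing so schedulers never see a half-built task.
    api->add_successor_conditional(producer, consumer, /*condition=*/0);

    // publish_task flips the `published` flag; in concurrent build mode the
    // schedulers may dispatch a task immediately after this call.
    api->publish_task(producer);
    api->publish_task(consumer);
}
```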
-
-## Key Files
-- `src/runtime/aicpu_build_graph/runtime/runtime.h`
-- `src/runtime/aicpu_build_graph/host/runtime_maker.cpp`
-- `src/runtime/aicpu_build_graph/aicpu/aicpu_executor.cpp`
diff --git a/src/a2a3/runtime/aicpu_build_graph/host/runtime_compile_info.cpp b/src/a2a3/runtime/aicpu_build_graph/host/runtime_compile_info.cpp
deleted file mode 100644
index 5dc3cf69d..000000000
--- a/src/a2a3/runtime/aicpu_build_graph/host/runtime_compile_info.cpp
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-#include "host/platform_compile_info.h"
-#include "host/runtime_compile_info.h"
-#include <string.h>
-
-extern "C" {
-
-ToolchainType get_incore_compiler(void) {
-    if (strcmp(get_platform(), "a2a3") == 0) return TOOLCHAIN_CCEC;
-    return TOOLCHAIN_HOST_GXX_15;
-}
-
-ToolchainType get_orchestration_compiler(void) {
-    // aicpu_build_graph: a2a3 needs aarch64 cross-compile (AICPU is aarch64)
-    if (strcmp(get_platform(), "a2a3") == 0) return TOOLCHAIN_AARCH64_GXX;
-    return TOOLCHAIN_HOST_GXX;
-}
-}
diff --git a/src/a2a3/runtime/aicpu_build_graph/host/runtime_maker.cpp b/src/a2a3/runtime/aicpu_build_graph/host/runtime_maker.cpp
deleted file mode 100644
index 32072f707..000000000
--- a/src/a2a3/runtime/aicpu_build_graph/host/runtime_maker.cpp
+++ /dev/null
@@ -1,379 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-/**
- * Runtime Builder - rt2 Implementation (Device Orchestration)
- *
- * Provides init_runtime_impl and validate_runtime_impl functions for the rt2 runtime.
- * Supports device orchestration where AICPU thread 3 runs the orchestrator.
- *
- * init_runtime_impl:
- *   - Converts host tensor pointers to device pointers (all tensors copied both directions)
- *   - Copies orchestration SO to device memory
- *   - Sets up runtime state for device orchestration
- *
- * validate_runtime_impl:
- *   - Copies recorded tensors back from device to host
- *   - Frees device memory
- */
-
-#include <cerrno>
-#include <cinttypes>
-#include <sys/time.h>
-
-#include <cstddef>
-#include <cstdint>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include <string>
-
-#include "../runtime/pto_shared_memory.h"
-#include "../runtime/runtime.h"
-#include "callable.h"
-#include "common/platform_config.h"
-#include "common/unified_log.h"
-
-// Helper: return current time in milliseconds
-static int64_t _now_ms() {
-    struct timeval tv;
-    gettimeofday(&tv, nullptr);
-    return static_cast<int64_t>(tv.tv_sec) * 1000 + tv.tv_usec / 1000;
-}
-
-/**
- * Parse an environment variable as uint64_t with optional power-of-2 constraint.
- * Returns the parsed value on success, or 0 if unset or validation fails.
- */
-static uint64_t parse_env_uint64(const char *name, uint64_t min_val, bool require_power_of_2) {
-    const char *env = std::getenv(name);
-    if (!env) return 0;
-    char *endptr;
-    errno = 0;
-    uint64_t val = strtoull(env, &endptr, 10);
-    if (errno == ERANGE || endptr == env || *endptr != '\0' || val < min_val) {
-        LOG_WARN("%s=%s invalid (must be a valid integer >= %" PRIu64 "), ignored", name, env, min_val);
-        return 0;
-    }
-    if (require_power_of_2 && (val & (val - 1)) != 0) {
-        LOG_WARN("%s=%s invalid (must be a power of 2, >= %" PRIu64 "), ignored", name, env, min_val);
-        return 0;
-    }
-    return static_cast<uint64_t>(val);
-}
-
-/**
- * Initialize a pre-allocated runtime for device orchestration.
- *
- * For rt2 runtime, orchestration runs on AICPU thread 3 (device-side).
- * This function:
- *   - Converts host pointers to device pointers
- *   - Copies all tensor data to device
- *   - Records all tensors for copy-back
- *   - Copies orchestration SO to device memory
- *   - Sets up runtime state for device orchestration
- *
- * @param runtime Pointer to pre-constructed Runtime
- * @param callable ChipCallable containing orch binary, func_name, and child kernels
- * @param orch_args Separated tensor/scalar arguments
- * @return 0 on success, -1 on failure
- */
-extern "C" int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const ChipStorageTaskArgs *orch_args) {
-    // Validate inputs
-    if (runtime == nullptr) {
-        LOG_ERROR("Runtime pointer is null");
-        return -1;
-    }
-
-    // Register kernel binaries from ChipCallable children
-    if (callable->child_count() > 0) {
-        LOG_INFO("Registering %d kernel(s) in init_runtime_impl", callable->child_count());
-        for (int32_t i = 0; i < callable->child_count(); i++) {
-            int func_id = callable->child_func_id(i);
-            if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) {
-                LOG_ERROR("func_id=%d is out of range [0, %d)", func_id, RUNTIME_MAX_FUNC_ID);
-                return -1;
-            }
-            const auto &kernel = callable->child(i);
-            uint64_t addr = runtime->host_api.upload_kernel_binary(
-                func_id, reinterpret_cast<const void *>(&kernel),
-                CoreCallable::binary_data_offset() + kernel.binary_size()
-            );
-            if (addr == 0) {
-                LOG_ERROR("Failed to upload kernel binary for func_id=%d", func_id);
-                return -1;
-            }
-            runtime->set_function_bin_addr(func_id, addr);
-        }
-    }
-
-    const uint8_t *orch_so_binary = static_cast<const uint8_t *>(callable->binary_data());
-    size_t orch_so_size = callable->binary_size();
-
-    if (orch_so_binary == nullptr || orch_so_size == 0) {
-        LOG_ERROR("Orchestration SO binary is required for device orchestration");
-        return -1;
-    }
-
-    if (orch_args == nullptr) {
-        LOG_ERROR("orch_args pointer is null");
-        return -1;
-    }
-
-    int tensor_count = orch_args->tensor_count();
-    int scalar_count = orch_args->scalar_count();
-    LOG_INFO("RT2 init: %d tensors + %d scalars, device orchestration mode", tensor_count, scalar_count);
-
-    int64_t t_total_start = _now_ms();
-
-    // Build device args: copy from input, replace host tensor pointers with device pointers
-    ChipStorageTaskArgs device_args;
-
-    int64_t t_args_start = _now_ms();
-    for (int i = 0; i < tensor_count; i++) {
-        ContinuousTensor t = orch_args->tensor(i);
-
-        if (t.is_child_memory()) {
-            LOG_INFO("  Tensor %d: child memory, pass-through (0x%" PRIx64 ")", i, t.data);
-            device_args.add_tensor(t);
-            continue;
-        }
-
-        void *host_ptr = reinterpret_cast<void *>(static_cast<uintptr_t>(t.data));
-        size_t size = static_cast<size_t>(t.nbytes());
-
-        void *dev_ptr = runtime->host_api.device_malloc(size);
-        if (dev_ptr == nullptr) {
-            LOG_ERROR("Failed to allocate device memory for tensor %d", i);
-            return -1;
-        }
-
-        int rc = runtime->host_api.copy_to_device(dev_ptr, host_ptr, size);
-        if (rc != 0) {
-            LOG_ERROR("Failed to copy tensor %d to device", i);
-            runtime->host_api.device_free(dev_ptr);
-            return -1;
-        }
-        runtime->record_tensor_pair(host_ptr, dev_ptr, size);
-        LOG_INFO("  Tensor %d: %zu bytes at %p", i, size, dev_ptr);
-
-        t.data = reinterpret_cast<uint64_t>(dev_ptr);
-        device_args.add_tensor(t);
-    }
-    for (int i = 0; i < scalar_count; i++) {
-        device_args.add_scalar(orch_args->scalar(i));
-    }
-    int64_t t_args_end = _now_ms();
-
-    // Stage the orchestration SO for DeviceRunner::prepare_orch_so to consume.
-    int64_t t_so_start = _now_ms();
-    runtime->pending_orch_so_data_ = orch_so_binary;
-    runtime->pending_orch_so_size_ = orch_so_size;
-    LOG_INFO("Orchestration SO: %zu bytes staged (host-only)", orch_so_size);
-    int64_t t_so_end = _now_ms();
-
-    // Read ready queue shard count from environment for AICPU scheduler
-    {
-        const char *env_shards = std::getenv("PTO2_READY_QUEUE_SHARDS");
-        if (env_shards) {
-            char *endptr;
-            int64_t val = strtol(env_shards, &endptr, 10);
-            if (endptr != env_shards && *endptr == '\0' && val >= 1 && val <= PLATFORM_MAX_AICPU_THREADS) {
-                runtime->ready_queue_shards = static_cast<int32_t>(val);
-            } else {
-                LOG_WARN(
-                    "PTO2_READY_QUEUE_SHARDS=%s is invalid or out of range [1,%d], using default %d", env_shards,
-                    PLATFORM_MAX_AICPU_THREADS, RUNTIME_DEFAULT_READY_QUEUE_SHARDS
-                );
-                runtime->ready_queue_shards = RUNTIME_DEFAULT_READY_QUEUE_SHARDS;
-            }
-        }
-        LOG_INFO("Ready queue shards: %d", runtime->ready_queue_shards);
-    }
-
-    // Read orchestrator-to-scheduler transition flag from environment
-    {
-        const char *env_val = std::getenv("PTO2_ORCH_TO_SCHED");
-        if (env_val && (env_val[0] == '1' || env_val[0] == 't' || env_val[0] == 'T')) {
-            runtime->orch_to_sched = true;
-        }
-        LOG_INFO("Orchestrator-to-scheduler transition: %s", runtime->orch_to_sched ? "enabled" : "disabled");
-    }
-
-    // Read ring buffer size overrides from environment
-    {
-        runtime->task_window_size = parse_env_uint64("PTO2_RING_TASK_WINDOW", 4, true);
-        runtime->heap_size = parse_env_uint64("PTO2_RING_HEAP", 1024, true);
-        runtime->dep_pool_size = parse_env_uint64("PTO2_RING_DEP_POOL", 4, false);
-        if (runtime->task_window_size || runtime->heap_size || runtime->dep_pool_size) {
-            LOG_INFO(
-                "Ring buffer overrides: task_window=%" PRIu64 " heap=%" PRIu64 " dep_pool=%" PRIu64,
-                static_cast<uint64_t>(runtime->task_window_size ? runtime->task_window_size : PTO2_TASK_WINDOW_SIZE),
-                static_cast<uint64_t>(runtime->heap_size ? runtime->heap_size : PTO2_HEAP_SIZE),
-                static_cast<uint64_t>(runtime->dep_pool_size ? runtime->dep_pool_size : PTO2_DEP_LIST_POOL_SIZE)
-            );
-        }
-    }
-
-    // Resolve effective sizes (env override or compile-time default)
-    uint64_t eff_heap_size = runtime->heap_size ? runtime->heap_size : PTO2_HEAP_SIZE;
-    uint64_t eff_task_window_size = runtime->task_window_size ? runtime->task_window_size : PTO2_TASK_WINDOW_SIZE;
-
-    // Allocate GM heap for orchestrator output buffers (all rings combined)
-    uint64_t total_heap_size = eff_heap_size * PTO2_MAX_RING_DEPTH;
-    int64_t t_heap_start = _now_ms();
-    void *gm_heap = runtime->host_api.device_malloc(total_heap_size);
-    int64_t t_heap_end = _now_ms();
-    if (gm_heap == nullptr) {
-        LOG_ERROR("Failed to allocate GM heap");
-        return -1;
-    }
-    runtime->record_tensor_pair(nullptr, gm_heap, total_heap_size);
-    runtime->set_gm_heap(gm_heap);
-
-    // Allocate PTO2 shared memory
-    int64_t t_sm_start = _now_ms();
-    uint64_t sm_size = pto2_sm_calculate_size(eff_task_window_size);
-    void *sm_ptr = runtime->host_api.device_malloc(sm_size);
-    int64_t t_sm_end = _now_ms();
-    if (sm_ptr == nullptr) {
-        LOG_ERROR("Failed to allocate PTO2 shared memory");
-        return -1;
-    }
-    runtime->set_gm_sm_ptr(sm_ptr);
-    runtime->record_tensor_pair(nullptr, sm_ptr, static_cast<size_t>(sm_size));
-
-    // Set up device orchestration state
-    runtime->set_orch_built_on_host(false);
-    runtime->set_orch_args(device_args);
-
-    LOG_INFO("Device orchestration ready: %d tensors + %d scalars", tensor_count, scalar_count);
-
-    int64_t t_total_end = _now_ms();
-    LOG_INFO("TIMING: args_malloc_copy = %" PRId64 "ms", t_args_end - t_args_start);
-    LOG_INFO("TIMING: orch_so_copy = %" PRId64 "ms", t_so_end - t_so_start);
-    LOG_INFO("TIMING: gm_heap_alloc(1GB) = %" PRId64 "ms", t_heap_end - t_heap_start);
-    LOG_INFO("TIMING: shared_mem_alloc = %" PRId64 "ms", t_sm_end - t_sm_start);
-    LOG_INFO("TIMING: total_init_runtime_impl = %" PRId64 "ms", t_total_end - t_total_start);
-
-    return 0;
-}
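`init_runtime_impl` reads all of its tuning knobs from the environment, so a host test could configure the ring dimensions before constructing the runtime. A hypothetical setup using only the variable names read above; the values are illustrative and must respect the power-of-2 / minimum constraints that `parse_env_uint64` enforces:

```cpp
#include <cstdlib>

// Illustrative values only: PTO2_RING_TASK_WINDOW and PTO2_RING_HEAP must be
// powers of 2 (>= 4 and >= 1024 respectively); PTO2_RING_DEP_POOL just >= 4.
void configure_rt2_env_for_test() {
    setenv("PTO2_RING_TASK_WINDOW", "256", /*overwrite=*/1);
    setenv("PTO2_RING_HEAP", "1048576", 1);     // 1 MiB per ring
    setenv("PTO2_RING_DEP_POOL", "4096", 1);
    setenv("PTO2_READY_QUEUE_SHARDS", "2", 1);  // clamped to [1, PLATFORM_MAX_AICPU_THREADS]
    setenv("PTO2_ORCH_TO_SCHED", "1", 1);       // '1'/'t'/'T' enables the transition
}
```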
-
-/**
- * Validate runtime results and cleanup.
- *
- * This function:
- *   1. Copies recorded tensors from device back to host
- *   2. Frees device memory for recorded tensors
- *   3. Clears tensor pair state
- *
- * @param runtime Pointer to Runtime
- * @return 0 on success, -1 on failure
- */
-extern "C" int validate_runtime_impl(Runtime *runtime) {
-    if (runtime == nullptr) {
-        LOG_ERROR("Runtime pointer is null");
-        return -1;
-    }
-
-    int rc = 0;
-
-    LOG_INFO("=== Copying Results Back to Host ===");
-
-    // Copy all recorded tensors from device back to host
-    TensorPair *tensor_pairs = runtime->get_tensor_pairs();
-    int tensor_pair_count = runtime->get_tensor_pair_count();
-
-    LOG_INFO("Tensor pairs to process: %d", tensor_pair_count);
-
-    // PTO2 (device orchestration): graph output may be in packed buffer
-    void *pto2_sm = runtime->get_gm_sm_ptr();
-    uint64_t graph_out_ptr = 0;
-    uint64_t graph_out_size = 0;
-
-    if (pto2_sm != nullptr) {
-        // Copy header from device to host to read graph_output_ptr/size
-        PTO2SharedMemoryHeader host_header;
-        int hdr_rc = runtime->host_api.copy_from_device(&host_header, pto2_sm, sizeof(PTO2SharedMemoryHeader));
-        if (hdr_rc == 0) {
-            graph_out_ptr = host_header.graph_output_ptr;
-            graph_out_size = host_header.graph_output_size;
-            if (graph_out_ptr != 0) {
-                LOG_INFO("Graph output buffer: ptr=0x%" PRIx64 ", size=%" PRIu64, graph_out_ptr, graph_out_size);
-            }
-        } else {
-            LOG_WARN("Failed to copy PTO2 header from device");
-        }
-    }
-
-    bool first_output_tensor = true;
-    for (int i = 0; i < tensor_pair_count; i++) {
-        const TensorPair &pair = tensor_pairs[i];
-
-        // Skip if device pointer is null
-        if (pair.dev_ptr == nullptr) {
-            LOG_WARN("Tensor %d has null device pointer, skipping", i);
-            continue;
-        }
-
-        // If host pointer is null, this is a device-only allocation (no copy-back)
-        if (pair.host_ptr == nullptr) {
-            LOG_INFO("Tensor %d: device-only allocation (no copy-back)", i);
-            continue;
-        }
-
-        void *src_ptr = pair.dev_ptr;
-        size_t copy_size = pair.size;
-
-        // Use graph_output_ptr for the first output tensor if available
-        if (first_output_tensor && graph_out_ptr != 0 && graph_out_size > 0) {
-            src_ptr = reinterpret_cast<void *>(static_cast<uintptr_t>(graph_out_ptr));
-            copy_size = static_cast<size_t>(graph_out_size);
-            LOG_INFO("Using packed output buffer for tensor %d", i);
-            first_output_tensor = false;
-        }
-
-        int copy_rc = runtime->host_api.copy_from_device(pair.host_ptr, src_ptr, copy_size);
-        if (copy_rc != 0) {
-            LOG_ERROR("Failed to copy tensor %d from device: %d", i, copy_rc);
-            rc = copy_rc;
-        } else {
-            LOG_INFO("Tensor %d: %zu bytes copied to host", i, pair.size);
-        }
-    }
-
-    // Cleanup device tensors
-    LOG_INFO("=== Cleaning Up ===");
-    for (int i = 0; i < tensor_pair_count; i++) {
-        if (tensor_pairs[i].dev_ptr != nullptr) {
-            runtime->host_api.device_free(tensor_pairs[i].dev_ptr);
-        }
-    }
-    LOG_INFO("Freed %d device allocations", tensor_pair_count);
-
-    // Cleanup kernel binaries
-    int kernel_count = runtime->get_registered_kernel_count();
-    for (int i = 0; i < kernel_count; i++) {
-        int func_id = runtime->get_registered_kernel_func_id(i);
-        runtime->host_api.remove_kernel_binary(func_id);
-        runtime->set_function_bin_addr(func_id, 0);
-    }
-    if (kernel_count > 0) {
-        LOG_INFO("Freed %d kernel binaries", kernel_count);
-    }
-    runtime->clear_registered_kernels();
-
-    // Clear tensor pairs
-    runtime->clear_tensor_pairs();
-
-    LOG_INFO("=== Finalize Complete ===");
-
-    return rc;
-}
diff --git a/src/a2a3/runtime/aicpu_build_graph/orchestration/common.cpp b/src/a2a3/runtime/aicpu_build_graph/orchestration/common.cpp
deleted file mode 100644
index 8ac00ea30..000000000
--- a/src/a2a3/runtime/aicpu_build_graph/orchestration/common.cpp
+++ /dev/null
@@ -1,166 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-#include "common.h"
-
-#ifdef __linux__
-#include <cxxabi.h>
-#include <dlfcn.h>
-#include <execinfo.h>
-#include <stdio.h>
-
-#include <array>
-#include <string>
-#include <vector>
-#endif
-
-/**
- * Use addr2line to convert an address to file:line information.
- * Uses the -i flag to expand inlines; returns the first line (innermost actual code location).
- * If inlining is present, also returns the outer call chain via inline_chain.
- */
-#ifdef __linux__
-static std::string addr_to_line(const char *executable, void *addr, std::string *inline_chain = nullptr) {
-    char cmd[512];
-    snprintf(cmd, sizeof(cmd), "addr2line -e %s -f -C -p -i %p 2>/dev/null", executable, addr);
-
-    std::array<char, 256> buffer;
-    std::string raw_output;
-
-    FILE *pipe = popen(cmd, "r");
-    if (pipe) {
-        while (fgets(buffer.data(), buffer.size(), pipe) != nullptr) {
-            raw_output += buffer.data();
-        }
-        pclose(pipe);
-    }
-
-    if (raw_output.empty() || raw_output.find("??") != std::string::npos) {
-        return "";
-    }
-
-    // Split by lines
-    std::vector<std::string> lines;
-    size_t pos = 0;
-    while (pos < raw_output.size()) {
-        size_t nl = raw_output.find('\n', pos);
-        if (nl == std::string::npos) nl = raw_output.size();
-        std::string line = raw_output.substr(pos, nl - pos);
-        while (!line.empty() && line.back() == '\r')
-            line.pop_back();
-        if (!line.empty()) lines.push_back(line);
-        pos = nl + 1;
-    }
-
-    if (lines.empty()) return "";
-
-    // First line is the innermost actual code location; subsequent lines are outer inline callers
-    if (inline_chain && lines.size() > 1) {
-        *inline_chain = "";
-        for (size_t j = 1; j < lines.size(); j++) {
-            *inline_chain += "    [inlined by] " + lines[j] + "\n";
-        }
-    }
-
-    return lines.front();
-}
-#endif
-
-/**
- * Get current stack trace information (including file paths and line numbers).
- * Uses dladdr to locate the shared library for each stack frame, then calls addr2line with relative addresses.
- */
-std::string get_stacktrace(int skip_frames) {
-    (void)skip_frames; // May be unused on non-Linux platforms
-    std::string result;
-#ifdef __linux__
-    const int max_frames = 64;
-    void *buffer[max_frames];
-    int nframes = backtrace(buffer, max_frames);
-    char **symbols = backtrace_symbols(buffer, nframes);
-
-    if (symbols) {
-        result = "Stack trace:\n";
-        for (int i = skip_frames; i < nframes; i++) {
-            std::string frame_info;
-
-            void *addr = (void *)((char *)buffer[i] - 1);
-
-            Dl_info dl_info;
-            std::string inline_chain;
-            if (dladdr(addr, &dl_info) && dl_info.dli_fname) {
-                void *rel_addr = (void *)((char *)addr - (char *)dl_info.dli_fbase);
-                std::string addr2line_result = addr_to_line(dl_info.dli_fname, rel_addr, &inline_chain);
-
-                if (addr2line_result.empty()) {
-                    addr2line_result = addr_to_line(dl_info.dli_fname, addr, &inline_chain);
-                }
-
-                if (!addr2line_result.empty()) {
-                    frame_info = std::string(dl_info.dli_fname) + ": " + addr2line_result;
-                }
-            }
-
-            if (frame_info.empty()) {
-                std::string frame(symbols[i]);
-
-                size_t start = frame.find('(');
-                size_t end = frame.find('+', start);
-                if (start != std::string::npos && end != std::string::npos) {
-                    std::string mangled = frame.substr(start + 1, end - start - 1);
-                    int status;
-                    char *demangled = abi::__cxa_demangle(mangled.c_str(), nullptr, nullptr, &status);
-                    if (status == 0 && demangled) {
-                        frame = frame.substr(0, start + 1) + demangled + frame.substr(end);
-                        free(demangled);
-                    }
-                }
-                frame_info = frame;
-            }
-
-            char buf[16];
-            snprintf(buf, sizeof(buf), "  #%d ", i - skip_frames);
-            result += buf + frame_info + "\n";
-            if (!inline_chain.empty()) {
-                result += inline_chain;
-            }
-        }
-        free(symbols);
-    }
-#else
-    result = "(Stack trace is only available on Linux)\n";
-#endif
-    return result;
-}
-
-// AssertionError constructor
-static std::string build_assert_message(const char *condition, const char *file, int line) {
-    std::string msg = "Assertion failed: " + std::string(condition) + "\n";
-    msg += "  Location: " + std::string(file) + ":" + std::to_string(line) + "\n";
-    msg += get_stacktrace(3);
-    return msg;
-}
-
-AssertionError::AssertionError(const char *condition, const char *file, int line) :
-    std::runtime_error(build_assert_message(condition, file, line)),
-    condition_(condition),
-    file_(file),
-    line_(line) {}
-
-[[noreturn]] void assert_impl(const char *condition, const char *file, int line) {
-    fprintf(stderr, "\n========================================\n");
-    fprintf(stderr, "Assertion failed: %s\n", condition);
-    fprintf(stderr, "Location: %s:%d\n", file, line);
-    fprintf(stderr, "%s", get_stacktrace(2).c_str());
-    fprintf(stderr, "========================================\n\n");
-    fflush(stderr);
-
-    throw AssertionError(condition, file, line);
-}
diff --git a/src/a2a3/runtime/aicpu_build_graph/orchestration/pto_orchestration_api.h b/src/a2a3/runtime/aicpu_build_graph/orchestration/pto_orchestration_api.h
deleted file mode 100644
index 25c6cbc23..000000000
--- a/src/a2a3/runtime/aicpu_build_graph/orchestration/pto_orchestration_api.h
+++ /dev/null
@@ -1,194 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-/**
- * PTO Orchestration API - Slim header for orchestration .so files
- *
- * This header provides everything an orchestration source needs without
- * pulling in runtime implementation headers. The orchestration .so has
- * zero link dependencies on runtime .cpp files; all runtime calls go
- * through the PTO2RuntimeOps function-pointer table embedded in
- * PTO2Runtime.
- *
- * Orchestration sources include ONLY this header:
- *     #include "pto_orchestration_api.h"
- *
- * Runtime sources continue to use pto_runtime2.h (which defines the
- * full PTO2Runtime struct with all internal fields).
- */
-
-#ifndef SRC_A2A3_RUNTIME_AICPU_BUILD_GRAPH_ORCHESTRATION_PTO_ORCHESTRATION_API_H_
-#define SRC_A2A3_RUNTIME_AICPU_BUILD_GRAPH_ORCHESTRATION_PTO_ORCHESTRATION_API_H_
-
-#include <cstdarg>
-#include <cstddef>
-#include <cstdint>
-
-// Type headers needed by orchestration
-#include "pto_runtime2_types.h" // PTO2TaskId
-#include "pto_submit_types.h"   // MixedKernels, INVALID_KERNEL_ID, subtask slots
-#include "pto_types.h"          // Arg, PTOTensorEntry, TensorArgType
-#include "task_args.h"          // ChipStorageTaskArgs, ContinuousTensor
-#include "tensor.h"             // Tensor, TensorCreateInfo, make_tensor_external
-
-// Convert ContinuousTensor to Tensor (needs make_tensor_external from tensor.h)
-static_assert(
-    CONTINUOUS_TENSOR_MAX_DIMS == RUNTIME_MAX_TENSOR_DIMS, "ContinuousTensor and runtime max dims must match"
-);
-inline Tensor from_tensor_arg(const ContinuousTensor &t, bool manual_dep = false, int32_t version = 0) {
-    return make_tensor_external(
-        reinterpret_cast<void *>(static_cast<uintptr_t>(t.data)), t.shapes, t.ndims, t.dtype, manual_dep, version
-    );
-}
-
-// =============================================================================
-// Ops Table and Opaque Runtime
-// =============================================================================
-
-/**
- * Forward declaration — the orchestration sees PTO2Runtime as a partial
- * struct whose first field is the ops pointer. The full definition
- * lives in pto_runtime2.h (used only by runtime .cpp files).
- */
-typedef struct PTO2Runtime PTO2Runtime;
-
-/**
- * Function-pointer table for runtime operations.
- * Populated by the runtime; called by orchestration through inline wrappers.
- */
-typedef struct PTO2RuntimeOps {
-    SubmitResult (*submit_task)(PTO2Runtime *rt, const MixedKernels &mixed_kernels, const Arg &args);
-    void (*add_dependency)(PTO2Runtime *rt, PTO2TaskId producer, PTO2TaskId consumer);
-    void (*scope_begin)(PTO2Runtime *rt);
-    void (*scope_end)(PTO2Runtime *rt);
-    void (*orchestration_done)(PTO2Runtime *rt);
-    bool (*is_fatal)(PTO2Runtime *rt);
-
-    // Logging (populated by runtime, called by orchestration)
-    void (*log_error)(const char *func, const char *fmt, ...);
-    void (*log_warn)(const char *func, const char *fmt, ...);
-    void (*log_info)(const char *func, const char *fmt, ...);
-    void (*log_debug)(const char *func, const char *fmt, ...);
-    void (*log_always)(const char *func, const char *fmt, ...);
-} PTO2RuntimeOps;
-
-/**
- * Partial PTO2Runtime definition for orchestration.
- *
- * Only the ops pointer is visible. The real struct (in pto_runtime2.h)
- * has the same first field, so accessing rt->ops through this definition
- * is well-defined (C struct layout guarantee).
- */
-struct PTO2Runtime {
-    const PTO2RuntimeOps *ops;
-};
-
-// =============================================================================
-// Inline Convenience Wrappers (call through ops table)
-// =============================================================================
-
-static inline SubmitResult rt_submit_task(PTO2Runtime *rt, const MixedKernels &mixed_kernels, const Arg &args) {
-    return rt->ops->submit_task(rt, mixed_kernels, args);
-}
-
-/**
- * Convenience wrapper: submit an AIC-only task.
- */
-static inline SubmitResult rt_submit_aic_task(PTO2Runtime *rt, int32_t kernel_id, const Arg &args) {
-    MixedKernels mk;
-    mk.aic_kernel_id = kernel_id;
-    return rt->ops->submit_task(rt, mk, args);
-}
-
-/**
- * Convenience wrapper: submit an AIV-only task (uses AIV0 slot).
- */
-static inline SubmitResult rt_submit_aiv_task(PTO2Runtime *rt, int32_t kernel_id, const Arg &args) {
-    MixedKernels mk;
-    mk.aiv0_kernel_id = kernel_id;
-    return rt->ops->submit_task(rt, mk, args);
-}
-
-/**
- * Add an explicit dependency: consumer waits for producer to complete.
- */
-static inline void rt_add_dependency(PTO2Runtime *rt, PTO2TaskId producer, PTO2TaskId consumer) {
-    rt->ops->add_dependency(rt, producer, consumer);
-}
-
-static inline void rt_scope_begin(PTO2Runtime *rt) { rt->ops->scope_begin(rt); }
-
-static inline void rt_scope_end(PTO2Runtime *rt) { rt->ops->scope_end(rt); }
-
-static inline void rt_orchestration_done(PTO2Runtime *rt) { rt->ops->orchestration_done(rt); }
-
-static inline bool rt_is_fatal(PTO2Runtime *rt) { return rt->ops->is_fatal(rt); }
-
-// =============================================================================
-// Logging Macros for Orchestration (call through ops table)
-// =============================================================================
-
-#define LOG_ERROR(rt, fmt, ...) (rt)->ops->log_error(__FUNCTION__, fmt, ##__VA_ARGS__)
-#define LOG_WARN(rt, fmt, ...) (rt)->ops->log_warn(__FUNCTION__, fmt, ##__VA_ARGS__)
-#define LOG_INFO(rt, fmt, ...) (rt)->ops->log_info(__FUNCTION__, fmt, ##__VA_ARGS__)
-#define LOG_DEBUG(rt, fmt, ...) (rt)->ops->log_debug(__FUNCTION__, fmt, ##__VA_ARGS__)
-#define LOG_ALWAYS(rt, fmt, ...) (rt)->ops->log_always(__FUNCTION__, fmt, ##__VA_ARGS__)
-
-// =============================================================================
-// C++ Scope Guards and Macros
-// =============================================================================
-
-/**
- * RAII Scope Guard (calls through ops table)
- */
-class PTO2ScopeGuard {
-public:
-    explicit PTO2ScopeGuard(PTO2Runtime *rt) :
-        rt_(rt) {
-        rt_->ops->scope_begin(rt_);
-    }
-    ~PTO2ScopeGuard() { rt_->ops->scope_end(rt_); }
-
-private:
-    PTO2Runtime *rt_;
-};
-
-#define _PTO2_CONCATENATE_IMPL(x, y) x##y
-#define _PTO2_CONCATENATE(x, y) _PTO2_CONCATENATE_IMPL(x, y)
-
-#define PTO2_SCOPE_GUARD(rt) [[maybe_unused]] PTO2ScopeGuard _PTO2_CONCATENATE(scope_guard_, __COUNTER__)(rt)
-
-/**
- * Scoped block macro:
- *     PTO2_SCOPE(rt) {
- *         rt_submit_task(rt, ...);
- *     }
- */
-#define PTO2_SCOPE(rt) if (PTO2_SCOPE_GUARD(rt); true)
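Putting the wrappers and scope macro together, an orchestration source for this runtime reduced to a few calls. A sketch of a plausible entry function; the kernel ids, the entry signature, and the default-constructed `Arg` are illustrative, while the `rt_*` helpers and `PTO2_SCOPE` come from the header above:

```cpp
#include "pto_orchestration_api.h"

// Hypothetical kernel ids registered by the host; illustrative only.
constexpr int32_t KID_TILE_MATMUL = 0;
constexpr int32_t KID_TILE_ADD = 1;

extern "C" void example_orchestration(PTO2Runtime *rt) {
    PTO2_SCOPE(rt) {
        // Submit an AIC producer and an AIV consumer, then wire the edge
        // explicitly: this runtime has no TensorMap to infer dependencies.
        Arg no_args{}; // real orchestrations build Arg from tensors/scalars
        SubmitResult a = rt_submit_aic_task(rt, KID_TILE_MATMUL, no_args);
        SubmitResult b = rt_submit_aiv_task(rt, KID_TILE_ADD, no_args);
        rt_add_dependency(rt, a.task_id, b.task_id);

        if (rt_is_fatal(rt)) return; // submission may trip flow-control errors
    } // scope_end batch-publishes both tasks to the schedulers

    rt_orchestration_done(rt);
}
```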
-
-// =============================================================================
-// Orchestration Config
-// =============================================================================
-
-/**
- * Configuration exported by orchestration .so via aicpu_orchestration_config().
- * The executor reads these values to set up shared memory and runtime.
- *
- * This struct is defined identically in pto_runtime2.h (with an include
- * guard) so the executor can use the same type without including this header.
- */
-#ifndef PTO2_ORCHESTRATION_CONFIG_DEFINED
-#define PTO2_ORCHESTRATION_CONFIG_DEFINED
-struct PTO2OrchestrationConfig {
-    int expected_arg_count;
-};
-#endif
-
-#endif // SRC_A2A3_RUNTIME_AICPU_BUILD_GRAPH_ORCHESTRATION_PTO_ORCHESTRATION_API_H_
diff --git a/src/a2a3/runtime/aicpu_build_graph/runtime/common.h b/src/a2a3/runtime/aicpu_build_graph/runtime/common.h
deleted file mode 100644
index 1cb9647ce..000000000
--- a/src/a2a3/runtime/aicpu_build_graph/runtime/common.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-#pragma once
-
-#include <cstdint>
-#include <cstdio>
-#include <stdexcept>
-#include <string>
-
-/**
- * Get current stack trace information (including file paths and line numbers).
- * Implemented in common.cpp.
- */
-std::string get_stacktrace(int skip_frames = 1);
-
-/**
- * Assertion failure exception, containing file, line number, condition, and stack trace information.
- */
-class AssertionError : public std::runtime_error {
-public:
-    AssertionError(const char *condition, const char *file, int line);
-
-    const char *condition() const { return condition_; }
-    const char *file() const { return file_; }
-    int line() const { return line_; }
-
-private:
-    const char *condition_;
-    const char *file_;
-    int line_;
-};
-
-/**
- * Handler function for assertion failures.
- * Implemented in common.cpp.
- */
-[[noreturn]] void assert_impl(const char *condition, const char *file, int line);
-
-/**
- * debug_assert macro - checks condition in debug mode; throws exception and prints stack trace on failure.
- * No-op in release mode (NDEBUG).
- */
-#ifdef NDEBUG
-#define debug_assert(cond) ((void)0)
-#else
-#define debug_assert(cond) \
-    do { \
-        if (!(cond)) { \
-            assert_impl(#cond, __FILE__, __LINE__); \
-        } \
-    } while (0)
-#endif
-
-/**
- * always_assert macro - checks condition in both debug and release modes.
- */
-#define always_assert(cond) \
-    do { \
-        if (!(cond)) { \
-            assert_impl(#cond, __FILE__, __LINE__); \
-        } \
-    } while (0)
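The two macros above differ only in whether NDEBUG compiles the check out; both funnel into `assert_impl`, which prints the trace and then throws. A small usage sketch (the function and buffer names are illustrative):

```cpp
#include <cstdint>
#include "common.h"

void consume_slot(int32_t *ring, int32_t capacity, int32_t index) {
    // Checked in debug builds only; compiled out under NDEBUG.
    debug_assert(ring != nullptr);

    // Checked in every build; a violation prints the stack trace from
    // get_stacktrace() to stderr and throws AssertionError.
    always_assert(index >= 0 && index < capacity);

    ring[index] = 0;
}
```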
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -/** - * @file pto2_dispatch_payload.h - * @brief Minimal dispatch payload for AICore kernel execution - * - * Shared between AICPU (builds in-place) and AICore (reads to run kernel). - * Handshake.task points to PTO2DispatchPayload embedded in PTO2TaskPayload. - * - * Only contains fields AICore needs to execute: function address + arguments. - * Metadata (task_id, kernel_id, core_type) lives in PTO2TaskDescriptor and - * is accessed by AICPU when needed (profiling, diagnostics). - */ - -#ifndef RT2_PTO2_DISPATCH_PAYLOAD_H_ -#define RT2_PTO2_DISPATCH_PAYLOAD_H_ - -#include - -/** Max arguments per task; must match RUNTIME_MAX_ARGS and PTO2_MAX_OUTPUTS */ -#ifndef PTO2_DISPATCH_MAX_ARGS -#define PTO2_DISPATCH_MAX_ARGS 128 -#endif - -/** - * Dispatch payload: minimal execution interface for AICore. - * Layout: function_bin_addr followed by args[]. - * AICore reads function_bin_addr, casts to UnifiedKernelFunc, calls with args. - */ -struct PTO2DispatchPayload { - uint64_t function_bin_addr; /**< Kernel entry in GM: (UnifiedKernelFunc)function_bin_addr */ - uint64_t args[PTO2_DISPATCH_MAX_ARGS]; /**< Kernel arguments (GM pointers + scalars) */ -}; - -#endif // RT2_PTO2_DISPATCH_PAYLOAD_H_ diff --git a/src/a2a3/runtime/aicpu_build_graph/runtime/pto_orchestrator.cpp b/src/a2a3/runtime/aicpu_build_graph/runtime/pto_orchestrator.cpp deleted file mode 100644 index adabc68e0..000000000 --- a/src/a2a3/runtime/aicpu_build_graph/runtime/pto_orchestrator.cpp +++ /dev/null @@ -1,608 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ - -/** - * PTO Runtime2 - Orchestrator Implementation (Explicit Dependency Variant) - * - * Implements orchestrator state management, scope handling, task submission - * with explicit dependencies, and scope-end batch publish. 
diff --git a/src/a2a3/runtime/aicpu_build_graph/runtime/pto_orchestrator.cpp b/src/a2a3/runtime/aicpu_build_graph/runtime/pto_orchestrator.cpp
deleted file mode 100644
index adabc68e0..000000000
--- a/src/a2a3/runtime/aicpu_build_graph/runtime/pto_orchestrator.cpp
+++ /dev/null
@@ -1,608 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-
-/**
- * PTO Runtime2 - Orchestrator Implementation (Explicit Dependency Variant)
- *
- * Implements orchestrator state management, scope handling, task submission
- * with explicit dependencies, and scope-end batch publish.
- *
- * Key differences from tensormap_and_ringbuffer:
- *   - No TensorMap: submit_task is a 3-step process (alloc, heap, write)
- *   - add_dependency: explicitly wires producer -> consumer edges
- *   - scope_end: batch-publishes all tasks (releases the +1 fanin redundancy)
- */
-
-#include "pto_orchestrator.h"
-
-#include <atomic>
-#include <cassert>
-#include <cstdint>
-#include <cstdlib>
-#include <cstring>
-
-#include "common/unified_log.h"
-#include "pto_runtime2_types.h"
-#include "pto_shared_memory.h"
-#include "pto_types.h"
-#include "tensor.h"
-
-// =============================================================================
-// Orchestrator Profiling (compile-time toggle)
-// =============================================================================
-#if PTO2_ORCH_PROFILING
-#include "aicpu/device_time.h"
-#include "aicpu/l2_perf_collector_aicpu.h"
-// Weak fallback for builds that don't link device_time.cpp (e.g. host).
-__attribute__((weak, visibility("hidden"))) uint64_t get_sys_cnt_aicpu() { return 0; }
-__attribute__((weak, visibility("hidden"))) void
-l2_perf_aicpu_record_orch_phase(AicpuPhaseId, uint64_t, uint64_t, uint32_t, uint64_t) {}
-static uint64_t g_orch_alloc_cycle = 0;
-static uint64_t g_orch_args_cycle = 0;
-static uint64_t g_orch_heap_cycle = 0;
-static uint64_t g_orch_fanin_cycle = 0;
-static uint64_t g_orch_scope_end_cycle = 0;
-static int64_t g_orch_submit_count = 0;
-static uint32_t g_orch_submit_idx = 0;
-uint64_t g_orch_alloc_wait_cycle = 0;
-uint64_t g_orch_heap_wait_cycle = 0;
-uint64_t g_orch_fanin_wait_cycle = 0;
-uint64_t g_orch_alloc_atomic_count = 0;
-uint64_t g_orch_args_atomic_count = 0;
-uint64_t g_orch_heap_atomic_count = 0;
-uint64_t g_orch_fanin_atomic_count = 0;
-uint64_t g_orch_scope_end_atomic_count = 0;
-#define CYCLE_COUNT_START() uint64_t _t0 = get_sys_cnt_aicpu(), _t1
-#define CYCLE_COUNT_LAP(acc) \
-    do { \
-        _t1 = get_sys_cnt_aicpu(); \
-        acc += (_t1 - _t0); \
-        _t0 = _t1; \
-    } while (0)
-#define CYCLE_COUNT_LAP_RECORD(acc, phase_id, tid) \
-    do { \
-        _t1 = get_sys_cnt_aicpu(); \
-        acc += (_t1 - _t0); \
-        l2_perf_aicpu_record_orch_phase((phase_id), _t0, _t1, g_orch_submit_idx, (tid)); \
-        _t0 = _t1; \
-    } while (0)
-#elif PTO2_PROFILING
-#include "aicpu/device_time.h"
-#include "aicpu/l2_perf_collector_aicpu.h"
-__attribute__((weak, visibility("hidden"))) uint64_t get_sys_cnt_aicpu() { return 0; }
-__attribute__((weak, visibility("hidden"))) void
-l2_perf_aicpu_record_orch_phase(AicpuPhaseId, uint64_t, uint64_t, uint32_t, uint64_t) {}
-static uint32_t g_orch_submit_idx = 0;
-#define CYCLE_COUNT_START() \
-    bool _prof_active = orch->enable_l2_swimlane; \
-    uint64_t _t0 = _prof_active ? get_sys_cnt_aicpu() : 0, _t1 = 0
-#define CYCLE_COUNT_LAP(acc) \
-    do { \
-    } while (0)
-#define CYCLE_COUNT_LAP_RECORD(acc, phase_id, tid) \
-    do { \
-        if (_prof_active) { \
-            _t1 = get_sys_cnt_aicpu(); \
-            l2_perf_aicpu_record_orch_phase((phase_id), _t0, _t1, g_orch_submit_idx, (tid)); \
-            _t0 = _t1; \
-        } \
-    } while (0)
-#else
-#define CYCLE_COUNT_START()
-#define CYCLE_COUNT_LAP(acc)
-#define CYCLE_COUNT_LAP_RECORD(acc, phase_id, tid)
-#endif
-
-// =============================================================================
-// Orchestrator Initialization
-// =============================================================================
-
-bool pto2_orchestrator_init(
-    PTO2OrchestratorState *orch, PTO2SharedMemoryHandle *sm_handle, void *gm_heap, uint64_t heap_size,
-    int32_t dep_pool_capacity
-) {
-    *orch = PTO2OrchestratorState{};
-
-    orch->sm_handle = sm_handle;
-    orch->gm_heap_base = gm_heap;
-    orch->gm_heap_size = heap_size * PTO2_MAX_RING_DEPTH;
-    orch->fatal = false;
-
-    // Initialize per-ring resources
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        void *ring_heap_base = reinterpret_cast<uint8_t *>(gm_heap) + r * heap_size;
-        auto &fc = sm_handle->header->rings[r].fc;
-
-        pto2_heap_ring_init(&orch->rings[r].heap_ring, ring_heap_base, heap_size, &fc.heap_tail, &fc.heap_top);
-        orch->rings[r].heap_ring.error_code_ptr = &sm_handle->header->orch_error_code;
-
-        pto2_task_ring_init(
-            &orch->rings[r].task_ring, sm_handle->task_descriptors[r], sm_handle->header->rings[r].task_window_size,
-            &fc.last_task_alive, &fc.current_task_index
-        );
-        orch->rings[r].task_ring.error_code_ptr = &sm_handle->header->orch_error_code;
-
-        PTO2DepListEntry *dep_entries =
-            reinterpret_cast<PTO2DepListEntry *>(calloc(dep_pool_capacity, sizeof(PTO2DepListEntry)));
-        if (!dep_entries) {
-            for (int j = 0; j < r; j++) {
-                free(orch->rings[j].dep_pool.base);
-            }
-            return false;
-        }
-        orch->rings[r].dep_pool.init(dep_entries, dep_pool_capacity, &sm_handle->header->orch_error_code);
-    }
-
-    // Initialize scope stack
-    uint64_t max_depth = PTO2_MAX_SCOPE_DEPTH;
-    int32_t init_cap = PTO2_SCOPE_TASKS_INIT_CAP;
-    orch->scope_tasks = reinterpret_cast<PTO2TaskSlotState **>(malloc(init_cap * sizeof(PTO2TaskSlotState *)));
-    orch->scope_begins = reinterpret_cast<int32_t *>(malloc(max_depth * sizeof(int32_t)));
-    if (!orch->scope_tasks || !orch->scope_begins) {
-        free(orch->scope_tasks);
-        free(orch->scope_begins);
-        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-            free(orch->rings[r].dep_pool.base);
-        }
-        return false;
-    }
-    orch->scope_tasks_size = 0;
-    orch->scope_tasks_capacity = init_cap;
-    orch->scope_stack_top = -1;
-    orch->scope_stack_capacity = max_depth;
-
-    return true;
-}
-
-void pto2_orchestrator_destroy(PTO2OrchestratorState *orch) {
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        free(orch->rings[r].dep_pool.base);
-        orch->rings[r].dep_pool.base = NULL;
-    }
-
-    free(orch->scope_tasks);
-    orch->scope_tasks = NULL;
-    free(orch->scope_begins);
-    orch->scope_begins = NULL;
-}
-
-void pto2_orchestrator_set_scheduler(PTO2OrchestratorState *orch, PTO2SchedulerState *scheduler) {
-    orch->scheduler = scheduler;
-}
-
-// =============================================================================
-// Scope Management
-// =============================================================================
-
-static void scope_tasks_push(PTO2OrchestratorState *orch, PTO2TaskSlotState *task_slot_state) {
-    if (orch->scope_tasks_size >= orch->scope_tasks_capacity) {
-        int32_t new_cap = orch->scope_tasks_capacity * 2;
-        PTO2TaskSlotState **new_buf =
-            reinterpret_cast<PTO2TaskSlotState **>(realloc(orch->scope_tasks, new_cap * sizeof(PTO2TaskSlotState *)));
-        assert(new_buf && "Failed to grow scope task buffer");
-        orch->scope_tasks = new_buf;
-        orch->scope_tasks_capacity = new_cap;
-    }
-    orch->scope_tasks[orch->scope_tasks_size++] = task_slot_state;
-}
-
-void pto2_scope_begin(PTO2OrchestratorState *orch) {
-    if (orch->fatal) {
-        return;
-    }
-    assert(orch->scope_stack_top < static_cast<int32_t>(orch->scope_stack_capacity - 1) && "Scope stack overflow");
-
-    ++orch->scope_stack_top;
-    orch->scope_begins[orch->scope_stack_top] = orch->scope_tasks_size;
-}
-
-void pto2_scope_end(PTO2OrchestratorState *orch) {
-    if (orch->fatal) {
-        return;
-    }
-    assert(orch->scope_stack_top >= 0 && "Scope stack underflow");
-
-#if PTO2_ORCH_PROFILING
-    uint64_t _se0 = get_sys_cnt_aicpu();
-#endif
-
-    int32_t begin = orch->scope_begins[orch->scope_stack_top--];
-    int32_t count = orch->scope_tasks_size - begin;
-
-    if (orch->scheduler && count > 0) {
-        PTO2TaskSlotState **tasks = &orch->scope_tasks[begin];
-
-        // Batch publish: release the "+1 redundancy" in fanin for each task.
-        // Tasks whose fanin is fully satisfied become READY and are pushed
-        // to the scheduler's ready queues.
-        for (int32_t i = 0; i < count; i++) {
-            PTO2TaskSlotState *slot = tasks[i];
-            if (!slot) continue;
-
-            // task_state is already PENDING from submit_task (defensive store)
-            slot->task_state.store(PTO2_TASK_PENDING, std::memory_order_relaxed);
-
-            // Release the +1 fanin redundancy
-            int32_t new_rc = slot->fanin_refcount.fetch_add(1, std::memory_order_acq_rel) + 1;
-            if (new_rc >= slot->fanin_count) {
-                PTO2ResourceShape shape = pto2_active_mask_to_shape(slot->active_mask);
-                orch->scheduler->ready_queues[static_cast<int32_t>(shape)].push(slot);
-            }
-        }
-
-        // Release the scope's fanout reference on each task (enables CONSUMED transition)
-        orch->scheduler->on_scope_end(tasks, count);
-    }
-
-    // Rewind the task buffer
-    orch->scope_tasks_size = begin;
-
-#if PTO2_ORCH_PROFILING
-    uint64_t _se1 = get_sys_cnt_aicpu();
-    g_orch_scope_end_cycle += (_se1 - _se0);
-#endif
-}
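The "+1 redundancy" that scope_end releases is the subtle part of this design: every task is born with `fanin_count = 1` so it can never become ready before its scope closes, even if all of its real producers finish first. A self-contained model of the counter arithmetic; the names are illustrative, and the real state lives in `PTO2TaskSlotState`:

```cpp
#include <atomic>
#include <cstdio>

// Minimal model of the publish gate: fanin_count = 1 (scope hold) + one per
// explicit dependency; the task is ready once fanin_refcount catches up.
struct SlotModel {
    int fanin_count = 1;                // the +1 scope redundancy at submit
    std::atomic<int> fanin_refcount{0};

    void add_dependency() { fanin_count += 1; }
    bool producer_finished() { // called per completed producer (or early finish)
        return fanin_refcount.fetch_add(1) + 1 >= fanin_count;
    }
    bool scope_end_release() { // releases the +1; may be the readiness trigger
        return fanin_refcount.fetch_add(1) + 1 >= fanin_count;
    }
};

int main() {
    SlotModel t;
    t.add_dependency();                 // one real producer -> fanin_count == 2
    bool ready = t.producer_finished(); // refcount 1 of 2: not ready yet
    ready = t.scope_end_release();      // refcount 2 of 2: ready now
    std::printf("ready=%d\n", ready);   // prints ready=1
}
```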
args.error_msg : "(unknown)"); - LOG_ERROR(" tensor_count: %d, scalar_count: %d", args.tensor_count(), args.scalar_count()); - LOG_ERROR("========================================"); - orch->sm_handle->header->orch_error_code.store(PTO2_ERROR_INVALID_ARGS, std::memory_order_release); - orch->fatal = true; - return result; - } - - uint8_t ring_id = orch->current_ring_id(); - auto &task_ring = orch->rings[ring_id].task_ring; - PTO2SchedulerState *sched = orch->scheduler; - - // Validate submit inputs - uint8_t active_mask = pto2_mixed_kernels_to_active_mask(mixed_kernels); - always_assert(active_mask != 0 && "MixedKernels must have at least one active slot"); - - // Normalize single-AIV tasks - MixedKernels normalized = mixed_kernels; - bool has_aiv0 = (active_mask & PTO2_SUBTASK_MASK_AIV0) != 0; - bool has_aiv1 = (active_mask & PTO2_SUBTASK_MASK_AIV1) != 0; - if (has_aiv1 && !has_aiv0) { - normalized.aiv0_kernel_id = normalized.aiv1_kernel_id; - normalized.aiv1_kernel_id = INVALID_KERNEL_ID; - active_mask = pto2_mixed_kernels_to_active_mask(normalized); - } - - always_assert(orch->scope_stack_top >= 0 && "Cannot submit task outside a scope"); - - // Scope deadlock pre-check - { - int32_t scope_task_count = orch->scope_tasks_size - orch->scope_begins[orch->scope_stack_top]; - if (scope_task_count >= task_ring.window_size - 1) { - int32_t total_submitted = task_ring.current_index_ptr->load(std::memory_order_acquire); - int32_t last_alive = task_ring.last_alive_ptr->load(std::memory_order_acquire); - int32_t active_count = total_submitted - last_alive; - - LOG_ERROR("========================================"); - LOG_ERROR("FATAL: Scope Deadlock Detected! (ring %d)", ring_id); - LOG_ERROR("========================================"); - LOG_ERROR("Tasks in current scope (%d) >= task_window_size (%d).", scope_task_count, task_ring.window_size); - LOG_ERROR(" scope_depth: %d", orch->scope_stack_top + 1); - LOG_ERROR(" ring_id: %d", ring_id); - LOG_ERROR(" scope_task_count: %d", scope_task_count); - LOG_ERROR(" total_submitted: %d", total_submitted); - LOG_ERROR(" last_task_alive: %d", last_alive); - LOG_ERROR(" active_tasks: %d / %d", active_count, task_ring.window_size); - LOG_ERROR("========================================"); - orch->sm_handle->header->orch_error_code.store(PTO2_ERROR_SCOPE_DEADLOCK, std::memory_order_release); - orch->fatal = true; - return result; - } - } - - // === STEP 1: Allocate task slot from Task Ring === - int32_t local_id = task_ring.pto2_task_ring_alloc(); - if (local_id < 0) { - orch->fatal = true; - return result; - } - int32_t slot = task_ring.get_task_slot(local_id); - PTO2TaskId task_id = pto2_make_task_id(ring_id, static_cast(local_id)); - - PTO2TaskDescriptor &task = task_ring.get_task_by_slot(slot); - PTO2TaskPayload *payload = &orch->sm_handle->task_payloads[ring_id][slot]; - - // Prefetch payload cache lines for write - for (int32_t i = 0; i < args.tensor_count(); i++) { - __builtin_prefetch(&payload->tensors[i], 1, 3); - __builtin_prefetch(reinterpret_cast(&payload->tensors[i]) + 64, 1, 3); - } - for (int32_t i = 0; i < args.scalar_count(); i += 8) { - __builtin_prefetch(&payload->scalars[i], 1, 3); - } - __builtin_prefetch(payload, 1, 3); - __builtin_prefetch(reinterpret_cast(payload) + 64, 1, 3); - __builtin_prefetch(reinterpret_cast(payload) + 128, 1, 3); - - // Initialize slot state - if (sched) { - auto &rs = sched->ring_sched_states[ring_id]; - PTO2TaskSlotState &slot_state = rs.get_slot_state_by_slot(slot); - // fanin_count starts at 1: the "+1 
redundance" released at scope_end - slot_state.fanin_count = 1; - slot_state.fanout_head = nullptr; - slot_state.fanout_lock.store(0, std::memory_order_relaxed); - // fanout_count = 1 (owning scope holds one reference) - slot_state.fanout_count = 1; - slot_state.fanout_refcount.store(0, std::memory_order_release); - slot_state.fanin_refcount.store(0, std::memory_order_release); - slot_state.payload = payload; - slot_state.task = &task; - slot_state.active_mask = active_mask; - slot_state.subtask_done_mask.store(0, std::memory_order_relaxed); - slot_state.ring_id = ring_id; - // Reset task_state so add_dependency doesn't see stale COMPLETED/CONSUMED - // from a previously-reused slot. The scheduler won't act on PENDING tasks - // until they're pushed to a ready queue at scope_end. - slot_state.task_state.store(PTO2_TASK_PENDING, std::memory_order_relaxed); - scope_tasks_push(orch, &slot_state); - } else { - scope_tasks_push(orch, nullptr); - } - - CYCLE_COUNT_LAP_RECORD(g_orch_alloc_cycle, AicpuPhaseId::ORCH_ALLOC, task_id.raw); - - // === STEP 2: Heap allocation for OUTPUT tensors === - int32_t total_output_size = 0; - for (int i = 0; i < args.tensor_count(); i++) { - if (args.tag(i) == TensorArgType::OUTPUT) { - total_output_size += - PTO2_ALIGN_UP(args.tensor(i).create_info.buffer_size_bytes(), PTO2_PACKED_OUTPUT_ALIGN); - } - } - - void *local_packed_base = nullptr; - void *local_packed_end = nullptr; - if (total_output_size > 0) { - local_packed_base = orch->pto2_alloc_packed_buffer(total_output_size); - if (!local_packed_base) { - orch->fatal = true; - return result; - } - local_packed_end = reinterpret_cast(local_packed_base) + total_output_size; - } - - // Materialize OUTPUT tensors into TaskOutputTensors - int32_t offset = 0; - for (int i = 0; i < args.tensor_count(); i++) { - if (args.tag(i) == TensorArgType::OUTPUT) { - const TensorCreateInfo &ci = args.tensor(i).create_info; - uint64_t buffer_size = ci.buffer_size_bytes(); - uint64_t alloc_addr = reinterpret_cast(reinterpret_cast(local_packed_base) + offset); - offset += PTO2_ALIGN_UP(buffer_size, PTO2_PACKED_OUTPUT_ALIGN); - result.outputs.materialize_output(ci, reinterpret_cast(alloc_addr), /*version=*/0); - } - } - - CYCLE_COUNT_LAP_RECORD(g_orch_heap_cycle, AicpuPhaseId::ORCH_HEAP, task_id.raw); - - // Periodically reclaim dep_pool entries from retired tasks - if (sched) { - int32_t sm_last_task_alive = task_ring.last_alive_ptr->load(std::memory_order_acquire); - orch->rings[ring_id].dep_pool.reclaim(*sched, ring_id, sm_last_task_alive); - } - - // === STEP 3: Write task descriptor and payload === - __builtin_prefetch(&task, 1, 1); - task.task_id = task_id; - task.kernel_id[static_cast(PTO2SubtaskSlot::AIC)] = normalized.aic_kernel_id; - task.kernel_id[static_cast(PTO2SubtaskSlot::AIV0)] = normalized.aiv0_kernel_id; - task.kernel_id[static_cast(PTO2SubtaskSlot::AIV1)] = normalized.aiv1_kernel_id; - task.packed_buffer_base = local_packed_base; - task.packed_buffer_end = local_packed_end; - - payload->fanin_actual_count = 0; - payload->init(args, result.outputs); - - CYCLE_COUNT_LAP_RECORD(g_orch_args_cycle, AicpuPhaseId::ORCH_PARAMS, task_id.raw); - - // Record dep pool watermark - if (sched) { - auto &rs = sched->ring_sched_states[ring_id]; - PTO2TaskSlotState &slot_state = rs.get_slot_state_by_slot(slot); - slot_state.dep_pool_mark = orch->rings[ring_id].dep_pool.top; - } - -#if PTO2_PROFILING - orch->tasks_submitted++; -#if PTO2_ORCH_PROFILING - g_orch_submit_count++; -#endif - g_orch_submit_idx++; -#endif - - 
result.task_id = task_id; - return result; -} - -// ============================================================================= -// Explicit Dependency Management -// ============================================================================= - -void pto2_add_dependency(PTO2OrchestratorState *orch, PTO2TaskId producer_id, PTO2TaskId consumer_id) { - if (orch->fatal) return; - - PTO2SchedulerState *sched = orch->scheduler; - if (!sched) return; - - uint8_t prod_ring = producer_id.ring(); - uint32_t prod_local = producer_id.local(); - uint8_t cons_ring = consumer_id.ring(); - uint32_t cons_local = consumer_id.local(); - - auto &prod_rs = sched->ring_sched_states[prod_ring]; - auto &cons_rs = sched->ring_sched_states[cons_ring]; - - PTO2TaskSlotState &prod_state = prod_rs.get_slot_state_by_task_id(prod_local); - PTO2TaskSlotState &cons_state = cons_rs.get_slot_state_by_task_id(cons_local); - - // Increment consumer's fanin_count (+1 for this dependency) - cons_state.fanin_count += 1; - - // Record producer in consumer's payload for DFX/debugging - PTO2TaskPayload *cons_payload = cons_state.payload; - if (cons_payload->fanin_actual_count < PTO2_MAX_INPUTS) { - cons_payload->fanin_slot_states[cons_payload->fanin_actual_count] = &prod_state; - cons_payload->fanin_actual_count++; - } - - // Wire the fanout edge from producer to consumer. - // Always use fanout_lock: the producer may be from a previous scope - // and already visible to the scheduler. - auto &dep_pool = orch->rings[cons_ring].dep_pool; - -#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING - pto2_fanout_lock(prod_state, g_orch_fanin_atomic_count, g_orch_fanin_wait_cycle); -#else - pto2_fanout_lock(prod_state); -#endif - - prod_state.fanout_count += 1; - int32_t prod_task_state = prod_state.task_state.load(std::memory_order_acquire); - - if (prod_task_state >= PTO2_TASK_COMPLETED) { - // Producer already completed — count as early finish - cons_state.fanin_refcount.fetch_add(1, std::memory_order_relaxed); - } else { - // Producer not yet completed — add consumer to producer's fanout list - prod_state.fanout_head = dep_pool.prepend(prod_state.fanout_head, &cons_state); - } - - pto2_fanout_unlock(prod_state); - -#if PTO2_ORCH_PROFILING - g_orch_fanin_atomic_count += 3; // lock CAS + load(task_state) + unlock store -#endif -} - -// ============================================================================= -// Flow Control -// ============================================================================= - -void pto2_orchestrator_done(PTO2OrchestratorState *orch) { - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - int32_t total_tasks = orch->rings[r].task_ring.current_index_ptr->load(std::memory_order_acquire); - if (total_tasks > 0) { - LOG_INFO("=== [Orchestrator] ring %d: total_tasks=%d ===", r, total_tasks); - } - auto &pool = orch->rings[r].dep_pool; - if (pool.top > 0) { - LOG_INFO( - "=== [DepPool %d] top=%d tail=%d used=%d high_water=%d capacity=%d ===", r, pool.top, pool.tail, - pool.top - pool.tail, pool.high_water, pool.capacity - ); - } - } - orch->sm_handle->header->orchestrator_done.store(1, std::memory_order_release); -#if !PTO2_ORCH_PROFILING && PTO2_PROFILING - g_orch_submit_idx = 0; -#endif -} - -// ============================================================================= -// Debug Utilities -// ============================================================================= - -void pto2_orchestrator_print_stats(PTO2OrchestratorState *orch) { - LOG_INFO("=== Orchestrator Statistics ==="); -#if PTO2_PROFILING - 
LOG_INFO("Tasks submitted: %" PRId64, orch->tasks_submitted); - LOG_INFO("Buffers allocated: %" PRId64, orch->buffers_allocated); - LOG_INFO("Bytes allocated: %" PRId64, orch->bytes_allocated); -#endif - LOG_INFO("Current scope depth: %d", orch->scope_stack_top + 1); - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - int32_t active = pto2_task_ring_active_count(&orch->rings[r].task_ring); - if (active > 0) { - LOG_INFO("Ring %d task active: %d", r, active); - LOG_INFO( - "Ring %d heap used: %" PRIu64 " / %" PRIu64, r, - orch->rings[r].heap_ring.top_ptr->load(std::memory_order_relaxed), orch->rings[r].heap_ring.size - ); - LOG_INFO( - "Ring %d dep pool: %d / %d", r, orch->rings[r].dep_pool.used(), orch->rings[r].dep_pool.capacity - ); - } - } - LOG_INFO("==============================="); -} - -void pto2_orchestrator_print_scope_stack(PTO2OrchestratorState *orch) { - LOG_INFO("=== Scope Stack ==="); - LOG_INFO("Depth: %d", orch->scope_stack_top + 1); - - for (int i = 0; i <= orch->scope_stack_top; i++) { - int32_t begin = orch->scope_begins[i]; - int32_t end = (i < orch->scope_stack_top) ? orch->scope_begins[i + 1] : orch->scope_tasks_size; - LOG_INFO(" [%d] tasks_owned = %d", i, end - begin); - } - - LOG_INFO("=================="); -} - -#if PTO2_ORCH_PROFILING -PTO2OrchProfilingData pto2_orchestrator_get_profiling() { - PTO2OrchProfilingData d; - d.alloc_cycle = g_orch_alloc_cycle; - d.args_cycle = g_orch_args_cycle; - d.heap_cycle = g_orch_heap_cycle; - d.fanin_cycle = g_orch_fanin_cycle; - d.scope_end_cycle = g_orch_scope_end_cycle; - d.submit_count = g_orch_submit_count; - d.alloc_wait_cycle = g_orch_alloc_wait_cycle; - d.heap_wait_cycle = g_orch_heap_wait_cycle; - d.fanin_wait_cycle = g_orch_fanin_wait_cycle; - d.alloc_atomic_count = g_orch_alloc_atomic_count; - d.args_atomic_count = g_orch_args_atomic_count; - d.heap_atomic_count = g_orch_heap_atomic_count; - d.fanin_atomic_count = g_orch_fanin_atomic_count; - d.scope_end_atomic_count = g_orch_scope_end_atomic_count; - - // Reset - g_orch_alloc_cycle = g_orch_args_cycle = 0; - g_orch_heap_cycle = g_orch_fanin_cycle = 0; - g_orch_scope_end_cycle = 0; - g_orch_submit_count = 0; - g_orch_submit_idx = 0; - g_orch_alloc_wait_cycle = 0; - g_orch_heap_wait_cycle = 0; - g_orch_fanin_wait_cycle = 0; - g_orch_alloc_atomic_count = 0; - g_orch_args_atomic_count = 0; - g_orch_heap_atomic_count = 0; - g_orch_fanin_atomic_count = 0; - g_orch_scope_end_atomic_count = 0; - return d; -} -#endif diff --git a/src/a2a3/runtime/aicpu_build_graph/runtime/pto_orchestrator.h b/src/a2a3/runtime/aicpu_build_graph/runtime/pto_orchestrator.h deleted file mode 100644 index 1e1ce5e2d..000000000 --- a/src/a2a3/runtime/aicpu_build_graph/runtime/pto_orchestrator.h +++ /dev/null @@ -1,275 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. 
- * ----------------------------------------------------------------------------------------------------------- - */ - -/** - * PTO Runtime2 - Orchestrator Interface (Explicit Dependency Variant) - * - * The Orchestrator is responsible for: - * 1. Executing the orchestration function (Turing-complete control flow) - * 2. Allocating intermediate buffers from the heap - * 3. Submitting tasks via async InCore function calls - * 4. Building the dependency graph via explicit add_dependency calls - * 5. Managing buffer scopes for lifecycle control - * - * Key differences from the tensormap_and_ringbuffer variant: - * - No TensorMap: dependencies are explicitly specified by orchestration code - * - Scope-end batch publish: tasks are invisible to the scheduler until scope_end - * - submit_task returns a SubmitResult whose task_id is used in add_dependency calls - */ - -#ifndef SRC_A2A3_RUNTIME_AICPU_BUILD_GRAPH_RUNTIME_PTO_ORCHESTRATOR_H_ -#define SRC_A2A3_RUNTIME_AICPU_BUILD_GRAPH_RUNTIME_PTO_ORCHESTRATOR_H_ - -#include "pto_ring_buffer.h" -#include "pto_runtime2_types.h" -#include "pto_scheduler.h" -#include "pto_shared_memory.h" -#include "pto_submit_types.h" -#include "pto_types.h" - -// ============================================================================= -// Orchestrator State -// ============================================================================= - -/** - * Orchestrator state structure (private to Orchestrator) - * - * Contains all state needed for task graph construction and buffer management. - * No TensorMap — dependencies are added explicitly via pto2_add_dependency(). - */ -struct PTO2OrchestratorState { - // === SHARED MEMORY ACCESS === - PTO2SharedMemoryHandle *sm_handle; - - // === PER-RING RESOURCES === - PTO2RingSet rings[PTO2_MAX_RING_DEPTH]; - - // === SCOPE STACK (Private) === - // Single contiguous buffer of task slot pointers, partitioned by scope level. - // scope_begins[i] is the index into scope_tasks where scope i starts. - // Tasks for the top scope occupy [scope_begins[top], scope_tasks_size). - PTO2TaskSlotState **scope_tasks; // Flat buffer of task slot state pointers (all scopes concatenated) - int32_t scope_tasks_size; // Number of task pointers currently in the buffer - int32_t scope_tasks_capacity; // Allocated capacity of scope_tasks - int32_t *scope_begins; // scope_begins[i] = start index of scope i in scope_tasks - int32_t scope_stack_top; // Current top of stack (-1 = no scope open) - uint64_t scope_stack_capacity; // Max nesting depth (PTO2_MAX_SCOPE_DEPTH) - - // === SCHEDULER REFERENCE === - // Note: In simulated mode, orchestrator and scheduler share address space - // In real mode, they communicate via shared memory only - PTO2SchedulerState *scheduler; // For simulated mode only -#if PTO2_PROFILING - // Runtime profiling switch copied from Runtime::enable_l2_swimlane. - bool enable_l2_swimlane; -#endif - - // === GM HEAP (for output buffers) === - void *gm_heap_base; // Base address of GM heap - uint64_t gm_heap_size; // Total size of GM heap (all rings) - - // === FATAL ERROR === - // Fatal error flag (single-thread access by orchestrator, no atomic needed) - // Cross-thread notification uses shared memory orch_error_code (atomic) - bool fatal; - - // === STATISTICS === -#if PTO2_PROFILING - int64_t tasks_submitted; - int64_t buffers_allocated; - int64_t bytes_allocated; -#endif - - /** - * Get current ring index from scope depth.
- * Maps scope depth to ring_id: min(scope_depth, PTO2_MAX_RING_DEPTH - 1) - */ - uint8_t current_ring_id() const { - int32_t depth = scope_stack_top; - if (depth < 0) depth = 0; - return depth < PTO2_MAX_RING_DEPTH ? static_cast<uint8_t>(depth) : PTO2_MAX_RING_DEPTH - 1; - } - - /** - * Allocate packed output buffer from current ring's heap - */ - void *pto2_alloc_packed_buffer(int32_t total_size) { - if (total_size <= 0) { - return NULL; - } - - uint8_t rid = current_ring_id(); - void *buffer = rings[rid].heap_ring.pto2_heap_ring_alloc(total_size); - -#if PTO2_PROFILING - buffers_allocated++; - bytes_allocated += total_size; -#endif - - return buffer; - } -}; - -// ============================================================================= -// Orchestrator API -// ============================================================================= - -/** - * Initialize orchestrator state - * - * @param orch Orchestrator state to initialize - * @param sm_handle Shared memory handle - * @param gm_heap GM heap memory for output buffers - * @param heap_size Size of GM heap - * @return true on success - */ -bool pto2_orchestrator_init( - PTO2OrchestratorState *orch, PTO2SharedMemoryHandle *sm_handle, void *gm_heap, uint64_t heap_size, - int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE -); - -/** - * Destroy orchestrator state and free resources - */ -void pto2_orchestrator_destroy(PTO2OrchestratorState *orch); - -/** - * Set scheduler reference (for simulated mode) - */ -void pto2_orchestrator_set_scheduler(PTO2OrchestratorState *orch, PTO2SchedulerState *scheduler); - -// ============================================================================= -// Scope Management -// ============================================================================= - -/** - * Begin a new scope - * - * Pushes a new empty task list onto the scope stack. - * Tasks submitted while this scope is at the top of the stack are - * owned by it and have their fanout_count initialized to 1. - */ -void pto2_scope_begin(PTO2OrchestratorState *orch); - -/** - * End current scope - * - * Batch-publishes all tasks in the scope: - * 1. For each task, releases the "+1 redundancy" in fanin_refcount - * 2. Tasks with all deps satisfied are pushed to the ready queue - * 3. Releases the scope's fanout reference (enables CONSUMED transition) - * - * This is the scope-end batch publish mechanism: tasks are invisible - * to the scheduler until this point. - */ -void pto2_scope_end(PTO2OrchestratorState *orch); - -// ============================================================================= -// Task Submission -// ============================================================================= - -/** - * Submit a task with InCore function and parameters - * - * Simplified flow (no TensorMap): - * 1. Allocates task slot from TaskRing (blocks until available) - * 2. Allocates packed output buffer from HeapRing (blocks until available) - * 3. Writes task descriptor and payload - * 4. Initializes fanin with +1 redundancy (released at scope_end) - * - * The task is NOT visible to the scheduler until scope_end. - * Dependencies must be added via pto2_add_dependency() before scope_end.
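 *
 * Illustrative call (editorial sketch; orch, kernels, and args are
 * hypothetical placeholders, not names from this codebase):
 *
 *   SubmitResult r = pto2_submit_mixed_task(orch, kernels, args);
 *   // r.task_id feeds pto2_add_dependency(); r.outputs carries the
 *   // materialized OUTPUT tensors.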
- * - * @param orch Orchestrator state - * @param mixed_kernels Kernel IDs for AIC/AIV0/AIV1 slots - * @param args Aggregated tensor and scalar parameters - * @return SubmitResult; its task_id field is passed to pto2_add_dependency() - */ -SubmitResult pto2_submit_mixed_task(PTO2OrchestratorState *orch, const MixedKernels &mixed_kernels, const Arg &args); - -// ============================================================================= -// Explicit Dependency Management -// ============================================================================= - -/** - * Add a dependency edge: producer -> consumer - * - * The consumer task will not become ready until the producer completes. - * Both tasks must have been created via pto2_submit_mixed_task(). - * - * For cross-scope dependencies (producer from a previous scope that is - * already visible to the scheduler), this uses the fanout_lock for - * thread safety and handles the case where the producer has already - * completed (early-finish optimization). - * - * @param orch Orchestrator state - * @param producer Producer task ID (must complete before consumer starts) - * @param consumer Consumer task ID (depends on producer) - */ -void pto2_add_dependency(PTO2OrchestratorState *orch, PTO2TaskId producer, PTO2TaskId consumer); - -// ============================================================================= -// Flow Control -// ============================================================================= - -/** - * Mark orchestration as complete - * - * Signals to scheduler that no more tasks will be submitted. - */ -void pto2_orchestrator_done(PTO2OrchestratorState *orch); - -// ============================================================================= -// Debug Utilities -// ============================================================================= - -/** - * Print orchestrator statistics - */ -void pto2_orchestrator_print_stats(PTO2OrchestratorState *orch); - -/** - * Print scope stack state - */ -void pto2_orchestrator_print_scope_stack(PTO2OrchestratorState *orch); - -// ============================================================================= -// Orchestrator Profiling Data -// ============================================================================= - -#if PTO2_ORCH_PROFILING -struct PTO2OrchProfilingData { - uint64_t alloc_cycle; - uint64_t args_cycle; - uint64_t heap_cycle; - uint64_t fanin_cycle; - uint64_t scope_end_cycle; - int64_t submit_count; - // Wait time tracking for blocking phases - uint64_t alloc_wait_cycle; // Cycles spent waiting in task_ring_alloc - uint64_t heap_wait_cycle; // Cycles spent waiting in heap_ring_alloc - uint64_t fanin_wait_cycle; // Cycles spent waiting in fanout_lock - // Atomic operation counts per phase - uint64_t alloc_atomic_count; - uint64_t args_atomic_count; - uint64_t heap_atomic_count; - uint64_t fanin_atomic_count; - uint64_t scope_end_atomic_count; -}; - -/** - * Get and reset orchestrator profiling data. - * Returns accumulated profiling data and resets counters. - */ -PTO2OrchProfilingData pto2_orchestrator_get_profiling(); -#endif - -#endif // SRC_A2A3_RUNTIME_AICPU_BUILD_GRAPH_RUNTIME_PTO_ORCHESTRATOR_H_ diff --git a/src/a2a3/runtime/aicpu_build_graph/runtime/pto_ring_buffer.cpp b/src/a2a3/runtime/aicpu_build_graph/runtime/pto_ring_buffer.cpp deleted file mode 100644 index 3ac6c8e31..000000000 --- a/src/a2a3/runtime/aicpu_build_graph/runtime/pto_ring_buffer.cpp +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -/** - * PTO Runtime2 - Ring Buffer Implementation - * - * Implements HeapRing, TaskRing, and DepListPool ring buffers - * for zero-overhead memory management. - * - * Based on: docs/RUNTIME_LOGIC.md - */ - -#include "pto_ring_buffer.h" -#include <atomic> -#include <cstdint> -#include <cstdlib> // for exit() -#include "common/unified_log.h" -#include "pto_scheduler.h" - -// ============================================================================= -// Heap Ring Buffer Implementation -// ============================================================================= - -void pto2_heap_ring_init( - PTO2HeapRing *ring, void *base, uint64_t size, std::atomic<uint64_t> *tail_ptr, std::atomic<uint64_t> *top_ptr - ) { - ring->base = base; - ring->size = size; - ring->top_ptr = top_ptr; - ring->tail_ptr = tail_ptr; -} - -// ============================================================================= -// Task Ring Buffer Implementation -// ============================================================================= - -void pto2_task_ring_init( - PTO2TaskRing *ring, PTO2TaskDescriptor *descriptors, int32_t window_size, std::atomic<int32_t> *last_alive_ptr, - std::atomic<int32_t> *current_index_ptr -) { - ring->descriptors = descriptors; - ring->window_size = window_size; - ring->current_index_ptr = current_index_ptr; - ring->last_alive_ptr = last_alive_ptr; -} - -// ============================================================================= -// Dependency List Pool Implementation -// ============================================================================= -void PTO2DepListPool::reclaim(PTO2SchedulerState &sched, uint8_t ring_id, int32_t sm_last_task_alive) { - if (sm_last_task_alive >= last_reclaimed + PTO2_DEP_POOL_CLEANUP_INTERVAL && sm_last_task_alive > 0) { - int32_t mark = sched.ring_sched_states[ring_id].get_slot_state_by_task_id(sm_last_task_alive - 1).dep_pool_mark; - if (mark > 0) { - advance_tail(mark); - } - last_reclaimed = sm_last_task_alive; - } -} - -void PTO2DepListPool::ensure_space( - PTO2SchedulerState &sched, PTO2RingFlowControl &fc, uint8_t ring_id, int32_t needed -) { - if (available() >= needed) return; - - int spin_count = 0; - int32_t prev_last_alive = fc.last_task_alive.load(std::memory_order_acquire); - while (available() < needed) { - reclaim(sched, ring_id, prev_last_alive); - if (available() >= needed) return; - - spin_count++; - - // Progress detection: reset spin counter if last_task_alive advances - int32_t cur_last_alive = fc.last_task_alive.load(std::memory_order_acquire); - if (cur_last_alive > prev_last_alive) { - spin_count = 0; - prev_last_alive = cur_last_alive; - } - - if (spin_count >= PTO2_DEP_POOL_SPIN_LIMIT) { - int32_t current = fc.current_task_index.load(std::memory_order_acquire); - LOG_ERROR("========================================"); - LOG_ERROR("FATAL: Dependency Pool Deadlock Detected!
(ring %d)", ring_id); - LOG_ERROR("========================================"); - LOG_ERROR("DepListPool cannot reclaim space after %d spins (no progress).", spin_count); - LOG_ERROR( - " - Pool used: %d / %d (%.1f%%)", used(), capacity, - (capacity > 0) ? (100.0 * used() / capacity) : 0.0 - ); - LOG_ERROR(" - Pool top: %d (linear)", top); - LOG_ERROR(" - Pool tail: %d (linear)", tail); - LOG_ERROR(" - High water: %d", high_water); - LOG_ERROR(" - Needed: %d entries", needed); - LOG_ERROR(" - last_task_alive: %d (stuck here)", cur_last_alive); - LOG_ERROR(" - current_task: %d", current); - LOG_ERROR(" - In-flight tasks: %d", current - cur_last_alive); - LOG_ERROR("Diagnosis:"); - LOG_ERROR(" last_task_alive is not advancing, so dep pool tail"); - LOG_ERROR(" cannot reclaim. Check TaskRing diagnostics for root cause."); - LOG_ERROR("Solution:"); - LOG_ERROR(" Increase dep pool capacity (current: %d, recommended: %d)", capacity, high_water * 2); - LOG_ERROR(" Compile-time: PTO2_DEP_LIST_POOL_SIZE in pto_runtime2_types.h"); - LOG_ERROR(" Runtime env: PTO2_RING_DEP_POOL=%d", high_water * 2); - LOG_ERROR("========================================"); - exit(1); - } - SPIN_WAIT_HINT(); - } -} diff --git a/src/a2a3/runtime/aicpu_build_graph/runtime/pto_ring_buffer.h b/src/a2a3/runtime/aicpu_build_graph/runtime/pto_ring_buffer.h deleted file mode 100644 index cc0c1bd56..000000000 --- a/src/a2a3/runtime/aicpu_build_graph/runtime/pto_ring_buffer.h +++ /dev/null @@ -1,619 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -/** - * PTO Runtime2 - Ring Buffer Data Structures - * - * Implements ring buffer designs for zero-overhead memory management: - * - * 1. HeapRing - Output buffer allocation from GM Heap - * - O(1) bump allocation - * - Wrap-around at end, skip to beginning if buffer doesn't fit - * - Implicit reclamation via heap_tail advancement - * - Back-pressure: stalls when no space available - * - * 2. TaskRing - Task slot allocation - * - Fixed window size (TASK_WINDOW_SIZE) - * - Wrap-around modulo window size - * - Implicit reclamation via last_task_alive advancement - * - Back-pressure: stalls when window is full - * - * 3. DepListPool - Dependency list entry allocation - * - Ring buffer for linked list entries - * - O(1) prepend operation - * - Implicit reclamation with task ring - * - * Based on: docs/RUNTIME_LOGIC.md - */ - -#ifndef PTO_RING_BUFFER_H -#define PTO_RING_BUFFER_H - -#include - -#include "pto_runtime2_types.h" -#include "pto_shared_memory.h" -#include "common/unified_log.h" - -struct PTO2SchedulerState; // Forward declaration for dep_pool reclaim - -// Set to 1 to enable periodic BLOCKED/Unblocked messages during spin-wait. 
-#ifndef PTO2_SPIN_VERBOSE_LOGGING -#define PTO2_SPIN_VERBOSE_LOGGING 1 -#endif - -// Block notification interval (in spin counts) -#define PTO2_BLOCK_NOTIFY_INTERVAL 10000 -// Heap ring spin limit - after this, report deadlock and exit -#define PTO2_HEAP_SPIN_LIMIT 100000 - -// Flow control spin limit - if exceeded, likely deadlock due to scope/fanout_count -#define PTO2_FLOW_CONTROL_SPIN_LIMIT 100000 - -// Dep pool spin limit - if exceeded, dep pool capacity too small for workload -#define PTO2_DEP_POOL_SPIN_LIMIT 100000 - -// ============================================================================= -// Heap Ring Buffer -// ============================================================================= - -/** - * Heap ring buffer structure - * - * Allocates output buffers from a contiguous GM Heap. - * Wrap-around design with implicit reclamation. - */ -struct PTO2HeapRing { - void *base; // GM_Heap_Base pointer - uint64_t size; // GM_Heap_Size (total heap size in bytes) - std::atomic<uint64_t> *top_ptr; // Allocation pointer (shared atomic in SM header) - - // Reference to shared memory tail (for back-pressure) - std::atomic<uint64_t> *tail_ptr; // Points to header->heap_tail - - // Error code pointer for fatal error reporting (→ sm_header->orch_error_code) - std::atomic<int32_t> *error_code_ptr = nullptr; - - /** - * Allocate memory from heap ring - * - * O(1) bump allocation with wrap-around. - * May STALL (spin-wait) if insufficient space (back-pressure). - * Never splits a buffer across the wrap-around boundary. - * - * @param size Requested size in bytes - * @return Pointer to allocated memory, or nullptr on fatal error - */ - void *pto2_heap_ring_alloc(uint64_t size) { - // Align size for DMA efficiency - size = PTO2_ALIGN_UP(size, PTO2_ALIGN_SIZE); - - // Spin-wait if insufficient space (back-pressure from Scheduler) - int spin_count = 0; - uint64_t prev_tail = tail_ptr->load(std::memory_order_acquire); -#if PTO2_SPIN_VERBOSE_LOGGING - bool notified = false; -#endif -#if PTO2_ORCH_PROFILING - uint64_t wait_start = 0; - bool waiting = false; -#endif - - while (1) { - void *ptr = pto2_heap_ring_try_alloc(size); - if (ptr != NULL) { -#if PTO2_SPIN_VERBOSE_LOGGING - if (notified) { - LOG_INFO("[HeapRing] Unblocked after %d spins", spin_count); - } -#endif -#if PTO2_ORCH_PROFILING - if (waiting) { - extern uint64_t g_orch_heap_wait_cycle; - g_orch_heap_wait_cycle += (get_sys_cnt_aicpu() - wait_start); - } - { - extern uint64_t g_orch_heap_atomic_count; - g_orch_heap_atomic_count += - spin_count + 1; // spin_count retries + 1 success (each try_alloc = 1 load) - } -#endif - return ptr; - } - - // No space available, spin-wait - spin_count++; -#if PTO2_ORCH_PROFILING - if (!waiting) { - wait_start = get_sys_cnt_aicpu(); - waiting = true; - } -#endif - - // Progress detection: reset spin counter if heap_tail advances - uint64_t cur_tail = tail_ptr->load(std::memory_order_acquire); - if (cur_tail != prev_tail) { -#if PTO2_SPIN_VERBOSE_LOGGING - LOG_INFO( - "[HeapRing] Progress: tail %" PRIu64 " -> %" PRIu64 " (reset spin_count=%d)", prev_tail, cur_tail, - spin_count - ); -#endif - spin_count = 0; - prev_tail = cur_tail; - } - -#if PTO2_SPIN_VERBOSE_LOGGING - // Periodic block notification - if (spin_count % PTO2_BLOCK_NOTIFY_INTERVAL == 0 && spin_count > 0 && spin_count < PTO2_HEAP_SPIN_LIMIT) { - uint64_t top = top_ptr->load(std::memory_order_acquire); - LOG_WARN( - "[HeapRing] BLOCKED: requesting %" PRIu64 " bytes" - ", top=%" PRIu64 ", tail=%" PRIu64 ", spins=%d", - size, top, cur_tail, spin_count - ); - notified
= true; - } -#endif - - if (spin_count >= PTO2_HEAP_SPIN_LIMIT) { - uint64_t top = top_ptr->load(std::memory_order_acquire); - LOG_ERROR("========================================"); - LOG_ERROR("FATAL: Heap Ring Deadlock Detected!"); - LOG_ERROR("========================================"); - LOG_ERROR("Orchestrator blocked waiting for heap space after %d spins (no tail progress).", spin_count); - LOG_ERROR(" - Requested: %" PRIu64 " bytes", size); - LOG_ERROR(" - Heap top: %" PRIu64, top); - LOG_ERROR(" - Heap tail: %" PRIu64 " (stuck here)", cur_tail); - LOG_ERROR(" - Heap size: %" PRIu64, this->size); - LOG_ERROR(" - Available: %" PRIu64 " bytes", pto2_heap_ring_available()); - LOG_ERROR("Diagnosis:"); - LOG_ERROR(" heap_tail is not advancing, which means last_task_alive"); - LOG_ERROR(" is stuck. Check TaskRing diagnostics for root cause."); - LOG_ERROR("Solution: Increase heap size or investigate task stall."); - LOG_ERROR(" Compile-time: PTO2_HEAP_SIZE in pto_runtime2_types.h"); - LOG_ERROR( - " Runtime env: PTO2_RING_HEAP=<bytes> (e.g. %lu)", (unsigned long)(this->size * 2) - ); - LOG_ERROR("========================================"); - if (error_code_ptr) { - error_code_ptr->store(PTO2_ERROR_HEAP_RING_DEADLOCK, std::memory_order_release); - } - return nullptr; - } - - SPIN_WAIT_HINT(); - } - } - - /** - * Try to allocate memory without stalling (thread-safe via CAS) - * - * @param size Requested size in bytes - * @return Pointer to allocated memory, or NULL if no space - */ - void *pto2_heap_ring_try_alloc(uint64_t alloc_size) { - // Align size for DMA efficiency - alloc_size = PTO2_ALIGN_UP(alloc_size, PTO2_ALIGN_SIZE); - - while (true) { - uint64_t top = top_ptr->load(std::memory_order_acquire); - // Read latest tail from shared memory (Scheduler updates this) - uint64_t tail = tail_ptr->load(std::memory_order_acquire); - uint64_t new_top; - void *result; - - if (top >= tail) { - // Case 1: top is at or ahead of tail (normal case) - uint64_t space_at_end = size - top; - - if (space_at_end >= alloc_size) { - new_top = top + alloc_size; - result = (char *)base + top; - } else if (tail > alloc_size) { - // Wrap to beginning - new_top = alloc_size; - result = base; - } else { - return NULL; - } - } else { - // Case 2: top has wrapped, tail is ahead - uint64_t gap = tail - top; - if (gap >= alloc_size) { - new_top = top + alloc_size; - result = (char *)base + top; - } else { - return NULL; - } - } - - if (top_ptr->compare_exchange_weak(top, new_top, std::memory_order_acq_rel, std::memory_order_acquire)) { - return result; - } - // CAS failed, retry with updated top - } - } - - /** - * Get available space in heap ring - */ - uint64_t pto2_heap_ring_available() { - uint64_t top = top_ptr->load(std::memory_order_acquire); - uint64_t tail = tail_ptr->load(std::memory_order_acquire); - - if (top >= tail) { - uint64_t at_end = size - top; - uint64_t at_begin = tail; - return at_end > at_begin ?
at_end : at_begin; - } else { - return tail - top; - } - } -}; - -/** - * Initialize heap ring buffer - * - * @param ring Heap ring to initialize - * @param base Base address of heap memory - * @param size Total heap size in bytes - * @param tail_ptr Pointer to shared memory heap_tail - * @param top_ptr Pointer to shared memory heap_top - */ -void pto2_heap_ring_init( - PTO2HeapRing *ring, void *base, uint64_t size, std::atomic<uint64_t> *tail_ptr, std::atomic<uint64_t> *top_ptr -); - -// ============================================================================= -// Task Ring Buffer -// ============================================================================= - -/** - * Task ring buffer structure - * - * Fixed-size sliding window for task management. - * Provides back-pressure when window is full. - */ -struct PTO2TaskRing { - PTO2TaskDescriptor *descriptors; // Task descriptor array (from shared memory) - int32_t window_size; // Window size (power of 2) - std::atomic<int32_t> *current_index_ptr; // Shared atomic in SM header - - // Reference to shared memory last_task_alive (for back-pressure) - std::atomic<int32_t> *last_alive_ptr; // Points to header->last_task_alive - - // Error code pointer for fatal error reporting (→ sm_header->orch_error_code) - std::atomic<int32_t> *error_code_ptr = nullptr; - - /** - * Allocate a task slot from task ring - * - * May STALL (spin-wait) if window is full (back-pressure). - * Initializes the task descriptor to default values. - * - * @return Allocated task ID (absolute, not wrapped) - */ - int32_t pto2_task_ring_alloc() { - // Spin-wait if window is full (back-pressure from Scheduler) - int spin_count = 0; - int32_t prev_last_alive = last_alive_ptr->load(std::memory_order_acquire); -#if PTO2_SPIN_VERBOSE_LOGGING - bool notified = false; -#endif -#if PTO2_ORCH_PROFILING - uint64_t wait_start = 0; - bool waiting = false; -#endif - - while (1) { - int32_t task_id = pto2_task_ring_try_alloc(); - if (task_id >= 0) { -#if PTO2_SPIN_VERBOSE_LOGGING - if (notified) { - LOG_INFO("[TaskRing] Unblocked after %d spins, task_id=%d", spin_count, task_id); - } -#endif -#if PTO2_ORCH_PROFILING - if (waiting) { - extern uint64_t g_orch_alloc_wait_cycle; - g_orch_alloc_wait_cycle += (get_sys_cnt_aicpu() - wait_start); - } - { - extern uint64_t g_orch_alloc_atomic_count; - g_orch_alloc_atomic_count += - spin_count + 1; // spin_count retries + 1 success (each try_alloc = 1 load) - } -#endif - return task_id; - } - - // Window is full, spin-wait (with yield to prevent CPU starvation) - spin_count++; -#if PTO2_ORCH_PROFILING - if (!waiting) { - wait_start = get_sys_cnt_aicpu(); - waiting = true; - } -#endif - - // Progress detection: reset spin counter if last_task_alive advances - int32_t cur_last_alive = last_alive_ptr->load(std::memory_order_acquire); - if (cur_last_alive > prev_last_alive) { -#if PTO2_SPIN_VERBOSE_LOGGING - LOG_INFO( - "[TaskRing] Progress: last_alive %d -> %d (reset spin_count=%d)", prev_last_alive, cur_last_alive, - spin_count - ); -#endif - spin_count = 0; - prev_last_alive = cur_last_alive; - } - -#if PTO2_SPIN_VERBOSE_LOGGING - // Periodic block notification - if (spin_count % PTO2_BLOCK_NOTIFY_INTERVAL == 0 && spin_count > 0 && - spin_count < PTO2_FLOW_CONTROL_SPIN_LIMIT) { - int32_t current = current_index_ptr->load(std::memory_order_acquire); - int32_t active_count = current - cur_last_alive; - LOG_WARN( - "[TaskRing] BLOCKED (Flow Control): current=%d, last_alive=%d, " - "active=%d/%d (%.1f%%), spins=%d", - current, cur_last_alive, active_count, window_size, 100.0 * active_count / window_size, spin_count - ); - notified = true; - }
-#endif - - // Deadlock: no progress after SPIN_LIMIT spins - if (spin_count >= PTO2_FLOW_CONTROL_SPIN_LIMIT) { - int32_t current = current_index_ptr->load(std::memory_order_acquire); - int32_t active_count = current - cur_last_alive; - - LOG_ERROR("========================================"); - LOG_ERROR("FATAL: Flow Control Deadlock Detected!"); - LOG_ERROR("========================================"); - LOG_ERROR("Task Ring is FULL and no progress after %d spins.", spin_count); - LOG_ERROR(" - Current task index: %d", current); - LOG_ERROR(" - Last task alive: %d (stuck here)", cur_last_alive); - LOG_ERROR(" - Active tasks: %d / %d", active_count, window_size); - LOG_ERROR(" - Window utilization: %.1f%%", 100.0 * active_count / window_size); - LOG_ERROR("Diagnosis:"); - LOG_ERROR(" last_task_alive is stuck at %d, meaning task %d", cur_last_alive, cur_last_alive); - LOG_ERROR(" cannot transition to CONSUMED. Possible causes:"); - LOG_ERROR(" 1. Task %d still executing (subtasks not complete)", cur_last_alive); - LOG_ERROR(" 2. Task %d fanout not fully released (downstream not done)", cur_last_alive); - LOG_ERROR(" 3. Scope reference not released (scope_end not called)"); - LOG_ERROR(" 4. Orchestrator blocked here -> can't call scope_end -> circular wait"); - LOG_ERROR("Solution:"); - LOG_ERROR(" Increase task window size (current: %d, recommended: %d)", window_size, active_count * 2); - LOG_ERROR(" Compile-time: PTO2_TASK_WINDOW_SIZE in pto_runtime2_types.h"); - LOG_ERROR(" Runtime env: PTO2_RING_TASK_WINDOW=<size> (e.g. %d)", active_count * 2); - LOG_ERROR("========================================"); - if (error_code_ptr) { - error_code_ptr->store(PTO2_ERROR_FLOW_CONTROL_DEADLOCK, std::memory_order_release); - } - return -1; - } - - SPIN_WAIT_HINT(); - } - } - - /** - * Try to allocate task slot without stalling (thread-safe via fetch_add) - * - * @return Task ID, or -1 if window is full - */ - int32_t pto2_task_ring_try_alloc() { - // Optimistically allocate a task ID - int32_t task_id = current_index_ptr->fetch_add(1, std::memory_order_acq_rel); - int32_t last_alive = last_alive_ptr->load(std::memory_order_acquire); - int32_t active_count = task_id - last_alive; - - // Check if there's room (leave at least 1 slot empty) - if (active_count < window_size - 1) { - return task_id; - } - - // Window is full — roll back the optimistic increment - current_index_ptr->fetch_sub(1, std::memory_order_release); - return -1; - } - - int32_t get_task_slot(int32_t task_id) const { return task_id & (window_size - 1); } - - /** - * Get task descriptor by ID - */ - PTO2TaskDescriptor &get_task(int32_t task_id) { return descriptors[task_id & (window_size - 1)]; } - - /** - * Get task descriptor by task slot - */ - PTO2TaskDescriptor &get_task_by_slot(int32_t task_slot) { return descriptors[task_slot]; } -}; - -/** - * Initialize task ring buffer - * - * @param ring Task ring to initialize - * @param descriptors Task descriptor array from shared memory - * @param window_size Window size (must be power of 2) - * @param last_alive_ptr Pointer to shared memory last_task_alive - * @param current_index_ptr Pointer to shared memory current task index - */ -void pto2_task_ring_init( - PTO2TaskRing *ring, PTO2TaskDescriptor *descriptors, int32_t window_size, std::atomic<int32_t> *last_alive_ptr, - std::atomic<int32_t> *current_index_ptr -); - -/** - * Get number of active tasks in window - */ -static inline int32_t pto2_task_ring_active_count(PTO2TaskRing *ring) { - int32_t last_alive = ring->last_alive_ptr->load(std::memory_order_acquire); - return ring->current_index_ptr->load(std::memory_order_acquire) -
last_alive; -} - -/** - * Check if task ring has space for more tasks - */ -static inline bool pto2_task_ring_has_space(PTO2TaskRing *ring) { - int32_t active = pto2_task_ring_active_count(ring); - return active < ring->window_size - 1; -} - -/** - * Get task descriptor by ID - */ -static inline PTO2TaskDescriptor *pto2_task_ring_get(PTO2TaskRing *ring, int32_t task_id) { - return &ring->descriptors[task_id & (ring->window_size - 1)]; -} - -// ============================================================================= -// Dependency List Pool -// ============================================================================= - -/** - * Dependency list pool structure - * - * True ring buffer for allocating linked list entries. - * Entries are reclaimed when their producer tasks become CONSUMED, - * as tracked by the orchestrator via dep_pool_mark per task. - * - * Linear counters (top, tail) grow monotonically; the physical index - * is obtained via modulo: base[linear_index % capacity]. - */ -struct PTO2DepListPool { - PTO2DepListEntry *base; // Pool base address - int32_t capacity; // Total number of entries - int32_t top; // Linear next-allocation counter (starts from 1) - int32_t tail; // Linear first-alive counter (entries before this are dead) - int32_t high_water; // Peak concurrent usage (top - tail) - int32_t last_reclaimed{0}; // last_task_alive at last successful reclamation - - // Error code pointer for fatal error reporting (→ sm_header->orch_error_code) - std::atomic<int32_t> *error_code_ptr = nullptr; - - /** - * Initialize dependency list pool - * - * @param base Pool base address from shared memory - * @param capacity Total number of entries - */ - void init(PTO2DepListEntry *in_base, int32_t in_capacity, std::atomic<int32_t> *in_error_code_ptr) { - base = in_base; - capacity = in_capacity; - top = 1; // Start from 1, 0 means NULL/empty - tail = 1; // Match initial top (no reclaimable entries yet) - high_water = 0; - last_reclaimed = 0; - - // Initialize entry 0 as NULL marker - base[0].slot_state = nullptr; - base[0].next = nullptr; - - error_code_ptr = in_error_code_ptr; - } - - /** - * Reclaim dead entries based on scheduler's slot state dep_pool_mark. - * Safe to call multiple times — only advances tail forward. - * - * @param sched Scheduler state (for reading slot dep_pool_mark) - * @param ring_id Ring layer index - * @param sm_last_task_alive Current last_task_alive from shared memory - */ - void reclaim(PTO2SchedulerState &sched, uint8_t ring_id, int32_t sm_last_task_alive); - - /** - * Ensure dep pool for a specific ring has at least `needed` entries available. - * Spin-waits for reclamation if under pressure. Detects deadlock if no progress.
- */ - void ensure_space(PTO2SchedulerState &sched, PTO2RingFlowControl &fc, uint8_t ring_id, int32_t needed); - - /** - * Allocate a single entry from the pool (single-thread per pool instance) - * - * @return Pointer to allocated entry, or nullptr on fatal error - */ - PTO2DepListEntry *alloc() { - int32_t used = top - tail; - if (used >= capacity) { - LOG_ERROR("========================================"); - LOG_ERROR("FATAL: Dependency Pool Overflow!"); - LOG_ERROR("========================================"); - LOG_ERROR("DepListPool exhausted: %d entries alive (capacity=%d).", used, capacity); - LOG_ERROR(" - Pool top: %d (linear)", top); - LOG_ERROR(" - Pool tail: %d (linear)", tail); - LOG_ERROR(" - High water: %d", high_water); - LOG_ERROR("Solution:"); - LOG_ERROR(" Increase dep pool capacity (current: %d, recommended: %d).", capacity, capacity * 2); - LOG_ERROR(" Compile-time: PTO2_DEP_LIST_POOL_SIZE in pto_runtime2_types.h"); - LOG_ERROR(" Runtime env: PTO2_RING_DEP_POOL=%d", capacity * 2); - LOG_ERROR("========================================"); - if (error_code_ptr) { - error_code_ptr->store(PTO2_ERROR_DEP_POOL_OVERFLOW, std::memory_order_release); - } - return nullptr; - } - int32_t idx = top % capacity; - top++; - used++; - if (used > high_water) high_water = used; - return &base[idx]; - } - - /** - * Advance the tail pointer, reclaiming dead entries. - * Called by the orchestrator based on last_task_alive advancement. - */ - void advance_tail(int32_t new_tail) { - if (new_tail > tail) { - tail = new_tail; - } - } - - /** - * Prepend a task slot state to a dependency list - * - * O(1) operation: allocates new entry and links to current head. - * - * @param cur Current list head (nullptr = empty list) - * @param slot_state Task slot state to prepend - * @return New head entry, or nullptr on fatal pool overflow - */ - PTO2DepListEntry *prepend(PTO2DepListEntry *cur, PTO2TaskSlotState *slot_state) { - PTO2DepListEntry *new_entry = alloc(); - if (!new_entry) return nullptr; - new_entry->slot_state = slot_state; - new_entry->next = cur; - return new_entry; - } - - int32_t used() const { return top - tail; } - - int32_t available() const { return capacity - used(); } -}; - -// ============================================================================= -// Ring Set (per-depth aggregate) -// ============================================================================= - -/** - * Groups a HeapRing, TaskRing, and DepPool into one per-depth unit. - * PTO2_MAX_RING_DEPTH instances provide independent reclamation per scope depth. - */ -struct PTO2RingSet { - PTO2HeapRing heap_ring; - PTO2TaskRing task_ring; - PTO2DepListPool dep_pool; -}; - -#endif // PTO_RING_BUFFER_H diff --git a/src/a2a3/runtime/aicpu_build_graph/runtime/pto_runtime2.cpp b/src/a2a3/runtime/aicpu_build_graph/runtime/pto_runtime2.cpp deleted file mode 100644 index 97d5486e9..000000000 --- a/src/a2a3/runtime/aicpu_build_graph/runtime/pto_runtime2.cpp +++ /dev/null @@ -1,183 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ - -/** - * PTO Runtime2 - Main Implementation - * - * Implements the unified runtime API that combines orchestrator and scheduler. - * - * Based on: docs/RUNTIME_LOGIC.md - */ - -#include "pto_runtime2.h" - -#include <cstdio> -#include <cstdlib> -#include <cstring> - -#include "common/unified_log.h" - -// ============================================================================= -// Orchestration Ops Table (function-pointer dispatch for orchestration .so) -// ============================================================================= - -static SubmitResult submit_task_impl(PTO2Runtime *rt, const MixedKernels &mixed_kernels, const Arg &args) { - return pto2_submit_mixed_task(&rt->orchestrator, mixed_kernels, args); -} - -static void add_dependency_impl(PTO2Runtime *rt, PTO2TaskId producer, PTO2TaskId consumer) { - pto2_add_dependency(&rt->orchestrator, producer, consumer); -} - -void rt_scope_begin(PTO2Runtime *rt) { pto2_scope_begin(&rt->orchestrator); } - -void rt_scope_end(PTO2Runtime *rt) { pto2_scope_end(&rt->orchestrator); } - -void rt_orchestration_done(PTO2Runtime *rt) { pto2_orchestrator_done(&rt->orchestrator); } - -static bool is_fatal_impl(PTO2Runtime *rt) { return rt->orchestrator.fatal; } - -static const PTO2RuntimeOps s_runtime_ops = { - .submit_task = submit_task_impl, - .add_dependency = add_dependency_impl, - .scope_begin = rt_scope_begin, - .scope_end = rt_scope_end, - .orchestration_done = rt_orchestration_done, - .is_fatal = is_fatal_impl, - .log_error = unified_log_error, - .log_warn = unified_log_warn, - .log_info = unified_log_info, - .log_debug = unified_log_debug, - .log_always = unified_log_always, -}; - -// ============================================================================= -// Runtime Creation and Destruction -// ============================================================================= - -PTO2Runtime *pto2_runtime_create(PTO2RuntimeMode mode) { - return pto2_runtime_create_custom(mode, PTO2_TASK_WINDOW_SIZE, PTO2_HEAP_SIZE); -} - -PTO2Runtime *pto2_runtime_create_custom( - PTO2RuntimeMode mode, uint64_t task_window_size, uint64_t heap_size, int32_t dep_pool_capacity -) { - // Allocate runtime context - PTO2Runtime *rt = reinterpret_cast<PTO2Runtime *>(calloc(1, sizeof(PTO2Runtime))); - if (!rt) { - return NULL; - } - - rt->ops = &s_runtime_ops; - rt->mode = mode; - rt->sm_handle = pto2_sm_create(task_window_size, heap_size); - if (!rt->sm_handle) { - free(rt); - return NULL; - } - - // Allocate GM heap for output buffers (all rings combined) - uint64_t total_heap_size = heap_size * PTO2_MAX_RING_DEPTH; - rt->gm_heap_size = total_heap_size; -#if defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE >= 200112L - if (posix_memalign(&rt->gm_heap, PTO2_ALIGN_SIZE, total_heap_size) != 0) { - pto2_sm_destroy(rt->sm_handle); - free(rt); - return NULL; - } -#else - rt->gm_heap = aligned_alloc(PTO2_ALIGN_SIZE, total_heap_size); - if (!rt->gm_heap) { - pto2_sm_destroy(rt->sm_handle); - free(rt); - return NULL; - } -#endif - rt->gm_heap_owned = true; - - // Initialize orchestrator - if (!pto2_orchestrator_init(&rt->orchestrator, rt->sm_handle, rt->gm_heap, heap_size, dep_pool_capacity)) { - free(rt->gm_heap); - pto2_sm_destroy(rt->sm_handle); - free(rt); - return NULL; - } - - // Initialize scheduler (heap_size = per-ring heap size) - if (!pto2_scheduler_init(&rt->scheduler, rt->sm_handle, rt->gm_heap,
heap_size)) { - pto2_orchestrator_destroy(&rt->orchestrator); - free(rt->gm_heap); - pto2_sm_destroy(rt->sm_handle); - free(rt); - return NULL; - } - - // Connect orchestrator to scheduler (for simulated mode) - pto2_orchestrator_set_scheduler(&rt->orchestrator, &rt->scheduler); - - return rt; -} - -PTO2Runtime *pto2_runtime_create_from_sm( - PTO2RuntimeMode mode, PTO2SharedMemoryHandle *sm_handle, void *gm_heap, uint64_t heap_size, - int32_t dep_pool_capacity -) { - if (!sm_handle) return NULL; - - PTO2Runtime *rt = reinterpret_cast<PTO2Runtime *>(calloc(1, sizeof(PTO2Runtime))); - if (!rt) return NULL; - - rt->ops = &s_runtime_ops; - rt->mode = mode; - rt->sm_handle = sm_handle; - rt->gm_heap = gm_heap; - rt->gm_heap_size = heap_size > 0 ? heap_size * PTO2_MAX_RING_DEPTH : 0; - rt->gm_heap_owned = false; - - if (!pto2_orchestrator_init(&rt->orchestrator, rt->sm_handle, rt->gm_heap, heap_size, dep_pool_capacity)) { - free(rt); - return NULL; - } - - // Initialize scheduler (heap_size = per-ring heap size) - if (!pto2_scheduler_init(&rt->scheduler, rt->sm_handle, rt->gm_heap, heap_size)) { - pto2_orchestrator_destroy(&rt->orchestrator); - free(rt); - return NULL; - } - - pto2_orchestrator_set_scheduler(&rt->orchestrator, &rt->scheduler); - - return rt; -} - -void pto2_runtime_destroy(PTO2Runtime *rt) { - if (!rt) return; - - pto2_scheduler_destroy(&rt->scheduler); - pto2_orchestrator_destroy(&rt->orchestrator); - - if (rt->gm_heap_owned && rt->gm_heap) { - free(rt->gm_heap); - } - - if (rt->sm_handle) { - pto2_sm_destroy(rt->sm_handle); - } - - free(rt); -} - -void pto2_runtime_set_mode(PTO2Runtime *rt, PTO2RuntimeMode mode) { - if (rt) { - rt->mode = mode; - } -} diff --git a/src/a2a3/runtime/aicpu_build_graph/runtime/pto_runtime2.h b/src/a2a3/runtime/aicpu_build_graph/runtime/pto_runtime2.h deleted file mode 100644 index cfc4e394d..000000000 --- a/src/a2a3/runtime/aicpu_build_graph/runtime/pto_runtime2.h +++ /dev/null @@ -1,281 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ - -/** - * PTO Runtime2 - Main Interface - * - * This is the main header for the PTO Runtime2 system. - * It provides a unified API for task graph construction and execution. - * - * Key Features: - * - Ring buffer based memory management (zero allocation overhead) - * - Explicit dependency management via add_dependency() - * - Scope-based buffer lifecycle management with batch publish - * - Per-task spinlocks for concurrent fanout updates - * - Orchestrator-Scheduler decoupling via shared memory - * - * Usage: - * 1. Create runtime: pto2_runtime_create() - * 2. Build task graph in orchestration function: - * - pto2_scope_begin() / pto2_scope_end() - * - pto2_submit_mixed_task() - * 3. Mark orchestration complete: pto2_orchestrator_done() - * 4.
Destroy runtime: pto2_runtime_destroy() - * - * Based on: docs/RUNTIME_LOGIC.md - */ - -#ifndef SRC_A2A3_RUNTIME_AICPU_BUILD_GRAPH_RUNTIME_PTO_RUNTIME2_H_ -#define SRC_A2A3_RUNTIME_AICPU_BUILD_GRAPH_RUNTIME_PTO_RUNTIME2_H_ - -#include "pto_orchestrator.h" -#include "pto_ring_buffer.h" -#include "pto_runtime2_types.h" -#include "pto_scheduler.h" -#include "pto_shared_memory.h" -#include "pto_submit_types.h" - -// ============================================================================= -// Runtime Context -// ============================================================================= - -/** - * Runtime execution mode - */ -enum PTO2RuntimeMode { - PTO2_MODE_EXECUTE = 0, // Execute tasks on workers - PTO2_MODE_SIMULATE = 1, // Simulate task execution with cycle counting - PTO2_MODE_GRAPH_ONLY = 2 // Build graph only, no execution -}; - -/** - * Function-pointer ops table for runtime operations. - * - * The orchestration .so calls runtime functions through this table - * (via pto_orchestration_api.h inline wrappers), so it has zero link - * dependencies on runtime .cpp files. - */ -typedef struct PTO2Runtime PTO2Runtime; // forward declare for ops signatures - -struct PTO2RuntimeOps { - SubmitResult (*submit_task)(PTO2Runtime *rt, const MixedKernels &mixed_kernels, const Arg &args); - void (*add_dependency)(PTO2Runtime *rt, PTO2TaskId producer, PTO2TaskId consumer); - void (*scope_begin)(PTO2Runtime *rt); - void (*scope_end)(PTO2Runtime *rt); - void (*orchestration_done)(PTO2Runtime *rt); - bool (*is_fatal)(PTO2Runtime *rt); - - // Logging (populated by runtime, called by orchestration) - void (*log_error)(const char *func, const char *fmt, ...); - void (*log_warn)(const char *func, const char *fmt, ...); - void (*log_info)(const char *func, const char *fmt, ...); - void (*log_debug)(const char *func, const char *fmt, ...); - void (*log_always)(const char *func, const char *fmt, ...); -}; - -/** - * PTO Runtime2 context - * - * Contains all state for orchestration and scheduling. - * In simulated mode, runs in single process with shared address space. - */ -struct PTO2Runtime { - // Ops table (first field — used by orchestration .so via function pointers) - const PTO2RuntimeOps *ops; - - // Components - PTO2SharedMemoryHandle *sm_handle; - PTO2OrchestratorState orchestrator; - PTO2SchedulerState scheduler; - - // GM Heap for output buffers - void *gm_heap; - uint64_t gm_heap_size; - bool gm_heap_owned; // True if we allocated it - - // Mode - PTO2RuntimeMode mode; - - // Statistics - int64_t total_cycles; -}; - -// ============================================================================= -// Runtime Lifecycle API -// ============================================================================= - -/** - * Create a new runtime instance - * - * @param mode Execution mode - * @return Runtime context, or NULL on failure - */ -PTO2Runtime *pto2_runtime_create(PTO2RuntimeMode mode); - -/** - * Create runtime with custom sizes - * - * @param mode Execution mode - * @param task_window_size Number of task slots - * @param heap_size Size of GM heap - * @return Runtime context, or NULL on failure - */ -PTO2Runtime *pto2_runtime_create_custom( - PTO2RuntimeMode mode, uint64_t task_window_size, uint64_t heap_size, - int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE -); - -/** - * Create runtime from existing shared memory and GM heap (e.g. on device). - * Does not allocate sm_handle or gm_heap; caller owns them. 
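 *
 * Illustrative device-side setup (editorial sketch; sm, heap, and heap_sz
 * are hypothetical names for the caller-owned resources):
 *
 *   PTO2Runtime *rt = pto2_runtime_create_from_sm(PTO2_MODE_EXECUTE, sm, heap, heap_sz);
 *   // ... run orchestration through rt->ops ...
 *   pto2_runtime_destroy(rt);  // gm_heap is not freed (gm_heap_owned == false)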
- * - * @param mode Execution mode - * @param sm_handle Pre-created shared memory handle (e.g. from pto2_sm_create_from_buffer) - * @param gm_heap GM heap base for output buffers (or NULL if not used) - * @param heap_size GM heap size in bytes - * @return Runtime context, or NULL on failure - */ -PTO2Runtime *pto2_runtime_create_from_sm( - PTO2RuntimeMode mode, PTO2SharedMemoryHandle *sm_handle, void *gm_heap, uint64_t heap_size, - int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE -); - -/** - * Destroy runtime and free all resources - */ -void pto2_runtime_destroy(PTO2Runtime *rt); - -/** - * Set execution mode - */ -void pto2_runtime_set_mode(PTO2Runtime *rt, PTO2RuntimeMode mode); - -// ============================================================================= -// Orchestration API (called by orchestration function) -// ============================================================================= - -/** - * Begin a new scope - * - * All tasks submitted within this scope will have their lifetime - * bounded by the scope. When scope_end() is called, the scope - * releases its reference to all enclosed tasks. - */ -void rt_scope_begin(PTO2Runtime *rt); - -/** - * End current scope - * - * Releases scope reference for all tasks submitted since scope_begin(). - * Tasks whose refcount reaches zero will have their buffers released. - */ -void rt_scope_end(PTO2Runtime *rt); - -/** - * Mark orchestration as complete - * - * Signals that no more tasks will be submitted. - */ -void rt_orchestration_done(PTO2Runtime *rt); - -/** - * Scope helper macros for C - * - * These macros provide scope management for C code. - * For C++, prefer using PTO2_SCOPE_GUARD or PTO2_SCOPE (see below). - * - * Usage (C): - * PTO2_SCOPE_BEGIN(rt); - * rt_submit_task(...); - * rt_submit_task(...); - * PTO2_SCOPE_END(rt); - */ -#define PTO2_SCOPE_BEGIN(rt) rt_scope_begin(rt) -#define PTO2_SCOPE_END(rt) rt_scope_end(rt) - -/** - * RAII Scope Guard for C++ - * - * PTO2ScopeGuard is a C++ RAII wrapper that automatically manages scope lifetime. - * It calls rt_scope_begin() on construction and rt_scope_end() on destruction, - * ensuring proper cleanup even in error paths. - * - * Usage Option 1 - Direct instantiation (recommended): - * PTO2ScopeGuard scope_guard(rt); - * rt_submit_task(...); - * rt_submit_task(...); - * // scope automatically ends here when scope_guard destructor is called - * - * Usage Option 2 - Macro for anonymous guard: - * PTO2_SCOPE_GUARD(rt); - * rt_submit_task(...); - * // scope automatically ends at end of current block - * - * Usage Option 3 - Scoped block with if statement: - * PTO2_SCOPE(rt) { - * rt_submit_task(...); - * rt_submit_task(...); - * } // scope automatically ends here - * - * Benefits: - * - Exception-safe: scope ends even if exceptions are thrown - * - Error-safe: no need to manually call PTO2_SCOPE_END in error paths - * - Cleaner code: less boilerplate, automatic cleanup - * - Less error-prone: impossible to forget scope cleanup - */ -class PTO2ScopeGuard { -public: - explicit PTO2ScopeGuard(PTO2Runtime *rt) : - rt_(rt) { - rt_scope_begin(rt_); - } - ~PTO2ScopeGuard() { rt_scope_end(rt_); } - -private: - PTO2Runtime *rt_; -}; - -/** - * Macro to create an anonymous scope guard with a unique name. - * The [[maybe_unused]] attribute suppresses warnings if the guard - * variable is not explicitly used. 
- * - * Example: - * PTO2_SCOPE_GUARD(rt); - * rt_submit_task(...); - */ -#define _PTO2_CONCATENATE_IMPL(x, y) x##y -#define _PTO2_CONCATENATE(x, y) _PTO2_CONCATENATE_IMPL(x, y) -#define PTO2_SCOPE_GUARD(rt) [[maybe_unused]] PTO2ScopeGuard _PTO2_CONCATENATE(scope_guard_, __COUNTER__)(rt) - -/** - * Macro to create a scoped block with automatic scope management. - * Uses if-statement initialization (C++17) to create guard and execute block. - * - * Example: - * PTO2_SCOPE(rt) { - * rt_submit_task(...); - * } // scope automatically ends here - */ -#define PTO2_SCOPE(rt) if (PTO2_SCOPE_GUARD(rt); true) - -/** - * Slim config struct exported by orchestration .so via aicpu_orchestration_config(). - * Shared definition with pto_orchestration_api.h (same layout, guarded). - */ -#ifndef PTO2_ORCHESTRATION_CONFIG_DEFINED -#define PTO2_ORCHESTRATION_CONFIG_DEFINED -struct PTO2OrchestrationConfig { - int expected_arg_count; -}; -#endif - -#endif // SRC_A2A3_RUNTIME_AICPU_BUILD_GRAPH_RUNTIME_PTO_RUNTIME2_H_ diff --git a/src/a2a3/runtime/aicpu_build_graph/runtime/pto_runtime2_types.h b/src/a2a3/runtime/aicpu_build_graph/runtime/pto_runtime2_types.h deleted file mode 100644 index b75834dfa..000000000 --- a/src/a2a3/runtime/aicpu_build_graph/runtime/pto_runtime2_types.h +++ /dev/null @@ -1,431 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. 
- * ----------------------------------------------------------------------------------------------------------- - */ - -/** - * PTO Runtime2 - Core Type Definitions - * - * This header defines all fundamental types used by the PTO Runtime2 system: - * - Configuration constants - * - Worker types and task states - * - Tensor regions and task parameters - * - Task descriptors with fanin/fanout tracking - * - Dependency list entries - * - * Based on: docs/RUNTIME_LOGIC.md - */ - -#ifndef SRC_A2A3_RUNTIME_AICPU_BUILD_GRAPH_RUNTIME_PTO_RUNTIME2_TYPES_H_ -#define SRC_A2A3_RUNTIME_AICPU_BUILD_GRAPH_RUNTIME_PTO_RUNTIME2_TYPES_H_ - -#include <cstddef> -#include <cstdint> -#include <cstring> - -#include <atomic> - -#include "pto_submit_types.h" -#include "pto_types.h" - -// ============================================================================= -// Profiling Configuration -// ============================================================================= - -#ifndef PTO2_PROFILING -#define PTO2_PROFILING 1 -#endif - -#ifndef PTO2_ORCH_PROFILING -#define PTO2_ORCH_PROFILING 0 -#endif - -#ifndef PTO2_SCHED_PROFILING -#define PTO2_SCHED_PROFILING 0 -#endif - -#if PTO2_ORCH_PROFILING && !PTO2_PROFILING -#error "PTO2_ORCH_PROFILING requires PTO2_PROFILING=1" -#endif - -#if PTO2_SCHED_PROFILING && !PTO2_PROFILING -#error "PTO2_SCHED_PROFILING requires PTO2_PROFILING=1" -#endif - -// ============================================================================= -// AICPU Error Codes (written to shared memory for Host-side diagnosis) -// ============================================================================= - -// Orchestrator errors (1-99): detected in orchestrator thread -#define PTO2_ERROR_NONE 0 -#define PTO2_ERROR_SCOPE_DEADLOCK 1 -#define PTO2_ERROR_HEAP_RING_DEADLOCK 2 -#define PTO2_ERROR_FLOW_CONTROL_DEADLOCK 3 -#define PTO2_ERROR_DEP_POOL_OVERFLOW 4 -#define PTO2_ERROR_INVALID_ARGS 5 // Arg construction error (invalid args) - -// Scheduler errors (100+): detected in scheduler threads -#define PTO2_ERROR_SCHEDULER_TIMEOUT 100 - -// ============================================================================= -// Configuration Constants -// ============================================================================= - -// Task management -// NOTE: PTO2_TASK_WINDOW_SIZE is now a per-ring default value. -// Actual window size is passed at runtime to pto2_runtime_create_threaded_custom(). -// Use pto2_task_slot(sched, task_id) for slot calculation.
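-// Worked example (illustrative): with the 16384-slot default below, the slot mask is -// 16383, so local_id 16385 lands in slot 1 once the ring wraps; likewise -// PTO2_ALIGN_UP(100, 64) == 128 (see the macro below).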
-#define PTO2_TASK_WINDOW_SIZE 16384 // Default per-ring task window size (power of 2) - -// Multi-ring: number of independent ring layers (HeapRing + TaskRing + DepPool per layer) -// Scope depth maps to ring index via: min(scope_depth, PTO2_MAX_RING_DEPTH - 1) -#define PTO2_MAX_RING_DEPTH 4 - -// Memory pools (per-ring defaults; total = value × PTO2_MAX_RING_DEPTH) -#define PTO2_HEAP_SIZE (256 * 1024 * 1024) // 256MB per ring (1GB total) -#define PTO2_DEP_LIST_POOL_SIZE 16384 // Per-ring dependency list pool entries - -// Scope management -#define PTO2_MAX_SCOPE_DEPTH 64 // Maximum nesting depth -#define PTO2_SCOPE_TASKS_INIT_CAP 65536 // Initial capacity for scope task buffer - -// Ready queue -#define PTO2_READY_QUEUE_SIZE 65536 // Per-shape queue size - -// Memory alignment -#define PTO2_ALIGN_SIZE 64 // Cache line alignment -#define PTO2_PACKED_OUTPUT_ALIGN 1024 // Each output in packed buffer aligned to 1024B; gap is padding -#define PTO2_ALIGN_UP(x, align) (((x) + (align) - 1) & ~((align) - 1)) - -// Dep pool cleanup interval -#define PTO2_DEP_POOL_CLEANUP_INTERVAL 64 // Cleanup every N retired tasks - -// ============================================================================= -// Multi-Ring task_id Encoding -// ============================================================================= - -/** - * TaskId: 64-bit encoding used across Runtime2. - * - * raw encoding: (ring_id << 32) | local_id - * - * ring_id: which ring layer (0..PTO2_MAX_RING_DEPTH-1) - * local_id: per-ring monotonic counter - */ -struct PTO2TaskId { - uint64_t raw; - - constexpr PTO2TaskId() : - raw(0) {} - constexpr explicit PTO2TaskId(uint64_t v) : - raw(v) {} - - constexpr uint8_t ring() const { return static_cast<uint8_t>(raw >> 32); } - constexpr uint32_t local() const { return static_cast<uint32_t>(raw & 0xFFFFFFFFu); } - - constexpr bool operator==(const PTO2TaskId &other) const { return raw == other.raw; } - constexpr bool operator!=(const PTO2TaskId &other) const { return raw != other.raw; } -}; - -static_assert(sizeof(PTO2TaskId) == 8, "PTO2TaskId must stay 8 bytes (shared memory ABI)"); - -static inline PTO2TaskId pto2_make_task_id(uint8_t ring_id, uint32_t local_id) { - return PTO2TaskId{(static_cast<uint64_t>(ring_id) << 32) | static_cast<uint64_t>(local_id)}; -} - -static inline uint8_t pto2_task_id_ring(PTO2TaskId task_id) { return task_id.ring(); } - -static inline uint32_t pto2_task_id_local(PTO2TaskId task_id) { return task_id.local(); } - -static inline uint64_t pto2_task_id_raw(PTO2TaskId task_id) { return task_id.raw; } - -/** - * SubmitResult — return value from pto2_submit_mixed_task. - * Bundles the task_id (for explicit dependencies) and the materialized - * output tensors (for referencing runtime-allocated outputs). - */ -struct SubmitResult { - PTO2TaskId task_id; - TaskOutputTensors outputs; -}; - -// ============================================================================= -// Worker Types -// ============================================================================= - -/** - * Worker type enumeration - * Each worker type has its own ready queue for load balancing - */ -typedef enum { - PTO2_WORKER_CUBE = 0, // AICore CUBE unit (matrix ops) - PTO2_WORKER_VECTOR = 1, // AICore VECTOR unit (element-wise ops) - PTO2_WORKER_AI_CPU = 2, // AI_CPU (scalar ops, control flow) - PTO2_WORKER_ACCELERATOR = 3, // Fixed-function accelerators (DMA, etc.)
- PTO2_NUM_WORKER_TYPES = 4 -} PTO2WorkerType; - -// ============================================================================= -// Task States -// ============================================================================= - -/** - * Task state enumeration - * - * State transitions: - * PENDING -> READY -> RUNNING -> COMPLETED -> CONSUMED - * - * Conditions: - * PENDING->READY: fanin_refcount == fanin_count - * COMPLETED->CONSUMED: fanout_refcount == fanout_count && state == COMPLETED - */ -typedef enum { - PTO2_TASK_PENDING = 0, // Waiting for dependencies (fanin_refcount < fanin_count) - PTO2_TASK_READY = 1, // All dependencies satisfied, waiting in ready queue - PTO2_TASK_RUNNING = 2, // Currently executing on a worker - PTO2_TASK_COMPLETED = 3, // Execution finished, output may still be in use - PTO2_TASK_CONSUMED = 4 // Output fully consumed, buffers can be released -} PTO2TaskState; - -// ============================================================================= -// Dependency List Entry -// ============================================================================= - -/** - * Dependency list entry (singly-linked list node) - * Stored in DepListPool ring buffer - * - * Used for both fanin_list and fanout_list - */ -struct PTO2TaskSlotState; // Forward declaration -struct PTO2DepListEntry { - PTO2TaskSlotState *slot_state; // Consumer slot state (direct pointer) - PTO2DepListEntry *next; // next entry -}; - -// ============================================================================= -// Task Descriptor -// ============================================================================= - -/** - * Task descriptor structure (shared memory) - * - * Stored in the TaskDescriptor ring buffer in shared memory. - * Contains static identification and buffer pointers only. - * Dynamic scheduling state (fanin/fanout/task_state) is in PTO2TaskSlotState. - * - * Fields set by Orchestrator at submission, read by Scheduler for dispatch. - */ -struct PTO2TaskDescriptor { - // Mixed-task identification (encodes ring_id in upper 32 bits) - PTO2TaskId task_id; // raw: (ring_id << 32) | local_id - - // Per-slot kernel IDs (INVALID_KERNEL_ID = inactive) - int32_t kernel_id[PTO2_SUBTASK_SLOT_COUNT]; - - // Packed output buffer (all outputs packed into single contiguous buffer) - void *packed_buffer_base; // Start of packed buffer in GM Heap - void *packed_buffer_end; // End of packed buffer (for heap reclamation) -}; - -// ============================================================================= -// Per-Slot Scheduling State -// ============================================================================= - -/** - * Task payload data (cold path - only accessed during orchestration and dispatch) - * - * Layout: metadata (counts, fanin pointers) packed in the first 3 cache lines, - * followed by bulk tensor and scalar data. This gives sequential write access - * during orchestration and groups scheduler-hot fields (fanin_actual_count + - * fanin_slot_states) together for on_task_release. 
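 - * - * Illustrative numbers: init() copies the scalars with a single memcpy rounded up - * to a cache line, so 5 scalars (40 bytes) are copied as 64 bytes; both the source - * and destination scalar arrays are 1024B, so the round-up cannot overrun.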
- */ -struct PTO2TaskPayload { - // === Cache line 0 (64B) — metadata === - int32_t tensor_count{0}; - int32_t scalar_count{0}; - int32_t fanin_actual_count{0}; // Actual fanin count (without the +1 redundancy) - int32_t _reserved{0}; // Reserved (dep_pool_mark moved to SlotState for local access) - PTO2TaskSlotState *fanin_slot_states[PTO2_MAX_INPUTS]; // Producer slot states (used by on_task_release) - // === Cache lines 3-34 (2048B) — tensors (alignas(64) forces alignment) === - Tensor tensors[MAX_TENSOR_ARGS]; - // === Cache lines 35-50 (1024B) — scalars === - uint64_t scalars[MAX_SCALAR_ARGS]; - - void init(const Arg &args, const TaskOutputTensors &materialized_outputs) { - tensor_count = args.tensor_count(); - scalar_count = args.scalar_count(); - int32_t out_idx = 0; - for (int32_t i = 0; i < args.tensor_count(); i++) { - const Tensor *src; - if (args.tag(i) == TensorArgType::OUTPUT) { - src = materialized_outputs.output_ptr(out_idx++); - } else { - src = args.tensor(i).ptr; - } - tensors[i].copy(*src); - } - // Round up to cache line boundary. Both arrays are 1024B so no overrun. - // Eliminates branches; extra bytes within the same CL have zero additional cost. - memcpy(scalars, args.scalar_data(), PTO2_ALIGN_UP(args.scalar_count() * sizeof(uint64_t), 64)); - } -}; - -/** - * Per-task slot scheduling state (scheduler-private, NOT in shared memory) - * - * Consolidates all hot-path scheduling fields into a single cache-friendly - * structure (64 bytes = one cache line). Accessing any field of a task's - * slot state brings all related fields into the same cache line. - * - * Concurrency notes: - * - fanout_head, fanout_count protected by fanout_lock (per-task spinlock) - * - fanin_count set once at submission, read-only after (hot path for ready check) - * - task_state, fanin_refcount, fanout_refcount updated atomically - */ -struct alignas(64) PTO2TaskSlotState { - // Fanout lock + list (accessed together under lock in on_task_complete) - std::atomic<int32_t> fanout_lock; // Per-task spinlock (0=unlocked, 1=locked) - int32_t fanout_count; // 1 (owning scope) + number of consumers - - PTO2DepListEntry *fanout_head; // Pointer to first fanout entry (nullptr = empty) - - // Task state (completion, consumed check, ready check) - std::atomic<PTO2TaskState> task_state; // PENDING/READY/RUNNING/COMPLETED/CONSUMED - - // Fanin (accessed together in release_fanin_and_check_ready) - std::atomic<int32_t> fanin_refcount; // Dynamic: counts completed producers - int32_t fanin_count; // Number of producer dependencies (set once) - - // Fanout refcount (accessed with fanout_count in check_and_handle_consumed) - std::atomic<int32_t> fanout_refcount; // Dynamic: counts released references - - PTO2TaskPayload *payload; - - PTO2TaskDescriptor *task; - - // Hot-path completion fields (moved from TaskDescriptor to avoid cross-struct access) - uint8_t active_mask; // Bitmask of active subtask slots (set once) - std::atomic<uint8_t> subtask_done_mask; // Each subtask sets its done bit on completion - uint8_t ring_id; // Ring layer this task belongs to (for per-ring reclamation) - int32_t dep_pool_mark{0}; // Dep pool top after this task's submission (orchestrator-only, local memory) -}; - -static_assert(sizeof(PTO2TaskSlotState) == 64); - -// ============================================================================= -// Cycle Cost Function Type -// ============================================================================= - -/** - * Cycle cost function pointer type - * Returns estimated cycle count for the InCore function - */ -typedef int64_t
(*PTO2CycleCostFunc)(void **args, int32_t num_args); - -// ============================================================================= -// InCore Function Type -// ============================================================================= - -/** - * InCore function signature - * All InCore functions must match this signature - */ -typedef void (*PTO2InCoreFunc)(void **args, int32_t num_args); - -// ============================================================================= -// Utility Macros -// ============================================================================= - -/** - * Memory barrier macros for different architectures - */ -#if defined(__aarch64__) -#define PTO2_MEMORY_BARRIER() __asm__ __volatile__("dmb sy" ::: "memory") -#elif defined(__x86_64__) -#define PTO2_MEMORY_BARRIER() __asm__ __volatile__("mfence" ::: "memory") -#else -#define PTO2_MEMORY_BARRIER() __sync_synchronize() -#endif - -// Spin-wait hint for AICPU threads. On real hardware the AICPU has dedicated -// ARM A55 cores — no OS yield is needed, so the hint is a no-op. In simulation -// all threads share host CPU cores, so we yield to prevent starvation. -// This header is also compiled into the Host .so (for struct definitions only), -// where the hint is never called — the fallback no-op keeps Host builds clean. -#if __has_include("spin_hint.h") -#include "spin_hint.h" -#else -#define SPIN_WAIT_HINT() ((void)0) -#endif - -// ============================================================================= -// Per-task fanout spinlock helpers -// -// Used by BOTH the orchestrator (pto_orchestrator.cpp) and the scheduler -// (aicpu_executor.cpp). Placing them here ensures both translation units use -// identical acquire/release semantics. -// -// The fanout_lock MUST be held whenever reading or writing fanout_head / -// fanout_count, because the orchestrator adds consumers concurrently with the -// scheduler traversing the list after task completion. 
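-// -// Usage pattern (sketch, mirroring the scheduler's on_mixed_task_complete): -// -// pto2_fanout_lock(slot_state); -// PTO2DepListEntry *head = slot_state.fanout_head; // snapshot under the lock -// pto2_fanout_unlock(slot_state); -// // ... traverse 'head' after unlocking ...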
-// ============================================================================= - -#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING -#include "aicpu/device_time.h" -#endif - -#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING -static inline void pto2_fanout_lock(PTO2TaskSlotState &slot_state, uint64_t &atomic_count, uint64_t &wait_cycle) { - uint64_t t0 = get_sys_cnt_aicpu(); - bool contended = false; - uint32_t atomic_ops = 0; - - for (;;) { - while (slot_state.fanout_lock.load(std::memory_order_acquire) != 0) { - contended = true; - atomic_ops++; // each load = 1 atomic - SPIN_WAIT_HINT(); - } - int32_t expected = 0; - if (slot_state.fanout_lock.compare_exchange_weak( - expected, 1, std::memory_order_acquire, std::memory_order_relaxed - )) { - atomic_ops++; // successful CAS = 1 atomic - atomic_count += atomic_ops; - if (contended) { - wait_cycle += (get_sys_cnt_aicpu() - t0); - } - return; - } - contended = true; - atomic_ops++; // failed CAS = 1 atomic - } -} -#endif - -static inline void pto2_fanout_lock(PTO2TaskSlotState &slot_state) { - for (;;) { - while (slot_state.fanout_lock.load(std::memory_order_acquire) != 0) { - SPIN_WAIT_HINT(); - } - int32_t expected = 0; - if (slot_state.fanout_lock.compare_exchange_weak( - expected, 1, std::memory_order_acquire, std::memory_order_relaxed - )) { - return; - } - } -} - -static inline void pto2_fanout_unlock(PTO2TaskSlotState &slot_state) { - slot_state.fanout_lock.store(0, std::memory_order_release); -} - -#endif // SRC_A2A3_RUNTIME_AICPU_BUILD_GRAPH_RUNTIME_PTO_RUNTIME2_TYPES_H_ diff --git a/src/a2a3/runtime/aicpu_build_graph/runtime/pto_scheduler.cpp b/src/a2a3/runtime/aicpu_build_graph/runtime/pto_scheduler.cpp deleted file mode 100644 index 38308ff81..000000000 --- a/src/a2a3/runtime/aicpu_build_graph/runtime/pto_scheduler.cpp +++ /dev/null @@ -1,241 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -/** - * PTO Runtime2 - Scheduler Implementation - * - * Implements scheduler state management, ready queues, and task lifecycle. 
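 - * - * Ready-queue capacities are expected to be powers of two, since - * pto2_ready_queue_init() computes mask = capacity - 1. Minimal sketch of the - * init/teardown pair defined here: - * - * PTO2ReadyQueue q; - * if (pto2_ready_queue_init(&q, PTO2_READY_QUEUE_SIZE)) { ... pto2_ready_queue_destroy(&q); }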
- * - * Based on: docs/RUNTIME_LOGIC.md - */ - -#include "pto_scheduler.h" -#include <cinttypes> -#include <cstdlib> -#include <new> -#include <utility> -#include "common/unified_log.h" - -// ============================================================================= -// Scheduler Profiling Counters -// ============================================================================= - -#if PTO2_SCHED_PROFILING -#include "common/platform_config.h" - -uint64_t g_sched_lock_cycle[PLATFORM_MAX_AICPU_THREADS] = {}; -uint64_t g_sched_fanout_cycle[PLATFORM_MAX_AICPU_THREADS] = {}; -uint64_t g_sched_fanin_cycle[PLATFORM_MAX_AICPU_THREADS] = {}; -uint64_t g_sched_self_consumed_cycle[PLATFORM_MAX_AICPU_THREADS] = {}; -uint64_t g_sched_lock_wait_cycle[PLATFORM_MAX_AICPU_THREADS] = {}; -uint64_t g_sched_push_wait_cycle[PLATFORM_MAX_AICPU_THREADS] = {}; -uint64_t g_sched_pop_wait_cycle[PLATFORM_MAX_AICPU_THREADS] = {}; -uint64_t g_sched_lock_atomic_count[PLATFORM_MAX_AICPU_THREADS] = {}; -uint64_t g_sched_fanout_atomic_count[PLATFORM_MAX_AICPU_THREADS] = {}; -uint64_t g_sched_fanin_atomic_count[PLATFORM_MAX_AICPU_THREADS] = {}; -uint64_t g_sched_self_atomic_count[PLATFORM_MAX_AICPU_THREADS] = {}; -uint64_t g_sched_pop_atomic_count[PLATFORM_MAX_AICPU_THREADS] = {}; -uint64_t g_sched_complete_count[PLATFORM_MAX_AICPU_THREADS] = {}; - -PTO2SchedProfilingData pto2_scheduler_get_profiling(int thread_idx) { - PTO2SchedProfilingData d; - d.lock_cycle = std::exchange(g_sched_lock_cycle[thread_idx], 0); - d.fanout_cycle = std::exchange(g_sched_fanout_cycle[thread_idx], 0); - d.fanin_cycle = std::exchange(g_sched_fanin_cycle[thread_idx], 0); - d.self_consumed_cycle = std::exchange(g_sched_self_consumed_cycle[thread_idx], 0); - d.lock_wait_cycle = std::exchange(g_sched_lock_wait_cycle[thread_idx], 0); - d.push_wait_cycle = std::exchange(g_sched_push_wait_cycle[thread_idx], 0); - d.pop_wait_cycle = std::exchange(g_sched_pop_wait_cycle[thread_idx], 0); - d.lock_atomic_count = std::exchange(g_sched_lock_atomic_count[thread_idx], 0); - d.fanout_atomic_count = std::exchange(g_sched_fanout_atomic_count[thread_idx], 0); - d.fanin_atomic_count = std::exchange(g_sched_fanin_atomic_count[thread_idx], 0); - d.self_atomic_count = std::exchange(g_sched_self_atomic_count[thread_idx], 0); - d.pop_atomic_count = std::exchange(g_sched_pop_atomic_count[thread_idx], 0); - d.complete_count = std::exchange(g_sched_complete_count[thread_idx], 0); - return d; -} -#endif - -// ============================================================================= -// Task State Names -// ============================================================================= - -const char *pto2_task_state_name(PTO2TaskState state) { - switch (state) { - case PTO2_TASK_PENDING: - return "PENDING"; - case PTO2_TASK_READY: - return "READY"; - case PTO2_TASK_RUNNING: - return "RUNNING"; - case PTO2_TASK_COMPLETED: - return "COMPLETED"; - case PTO2_TASK_CONSUMED: - return "CONSUMED"; - default: - return "UNKNOWN"; - } -} - -// ============================================================================= -// Ready Queue Implementation -// ============================================================================= - -bool pto2_ready_queue_init(PTO2ReadyQueue *queue, uint64_t capacity) { - queue->slots = (PTO2ReadyQueueSlot *)malloc(capacity * sizeof(PTO2ReadyQueueSlot)); - if (!queue->slots) { - return false; - } - - queue->capacity = capacity; - queue->mask = capacity - 1; - queue->enqueue_pos.store(0, std::memory_order_relaxed); - queue->dequeue_pos.store(0, std::memory_order_relaxed); - - for
(uint64_t i = 0; i < capacity; i++) { - queue->slots[i].sequence.store((int64_t)i, std::memory_order_relaxed); - queue->slots[i].slot_state = nullptr; - } - - return true; -} - -void pto2_ready_queue_destroy(PTO2ReadyQueue *queue) { - if (queue->slots) { - free(queue->slots); - queue->slots = NULL; - } -} - -// ============================================================================= -// Scheduler Initialization -// ============================================================================= - -bool PTO2SchedulerState::RingSchedState::init( - PTO2SharedMemoryHandle *sm_handle, int32_t ring_id, void *gm_heap_base, uint64_t per_ring_heap_size -) { - task_descriptors = sm_handle->task_descriptors[ring_id]; - heap_base = (char *)gm_heap_base + ring_id * per_ring_heap_size; - task_window_size = sm_handle->header->rings[ring_id].task_window_size; - task_window_mask = static_cast<int32_t>(task_window_size - 1); - last_task_alive = 0; - last_heap_consumed = 0; - heap_tail = 0; - slot_states = nullptr; - advance_lock.store(0, std::memory_order_relaxed); - - // Allocate per-task slot state array (dynamically sized based on runtime window_size) - slot_states = new (std::nothrow) PTO2TaskSlotState[task_window_size]; - if (!slot_states) { - return false; - } - - // Zero-initialize all per-task slot state fields. - for (uint64_t i = 0; i < task_window_size; i++) { - slot_states[i].fanout_lock.store(0, std::memory_order_relaxed); - slot_states[i].fanout_count = 0; - slot_states[i].fanout_head = nullptr; - slot_states[i].task_state.store(static_cast<PTO2TaskState>(0), std::memory_order_relaxed); - slot_states[i].fanin_refcount.store(0, std::memory_order_relaxed); - slot_states[i].fanin_count = 0; - slot_states[i].fanout_refcount.store(0, std::memory_order_relaxed); - slot_states[i].payload = nullptr; - slot_states[i].task = nullptr; - slot_states[i].active_mask = 0; - slot_states[i].subtask_done_mask.store(0, std::memory_order_relaxed); - slot_states[i].ring_id = 0; - } - - return true; -} - -void PTO2SchedulerState::RingSchedState::destroy() { - if (!slot_states) return; - delete[] slot_states; - slot_states = nullptr; -} - -bool pto2_scheduler_init( - PTO2SchedulerState *sched, PTO2SharedMemoryHandle *sm_handle, void *gm_heap_base, uint64_t per_ring_heap_size -) { - sched->sm_handle = sm_handle; -#if PTO2_SCHED_PROFILING - sched->tasks_completed.store(0, std::memory_order_relaxed); - sched->tasks_consumed.store(0, std::memory_order_relaxed); -#endif - - // Initialize per-ring state - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - if (!sched->ring_sched_states[r].init(sm_handle, r, gm_heap_base, per_ring_heap_size)) { - for (int j = 0; j < r; j++) { - sched->ring_sched_states[j].destroy(); - } - return false; - } - } - - // Initialize ready queues (one per resource shape, global) - for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) { - if (!pto2_ready_queue_init(&sched->ready_queues[i], PTO2_READY_QUEUE_SIZE)) { - // Cleanup on failure - for (int j = 0; j < i; j++) { - pto2_ready_queue_destroy(&sched->ready_queues[j]); - } - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - sched->ring_sched_states[r].destroy(); - } - return false; - } - } - - return true; -} - -void pto2_scheduler_destroy(PTO2SchedulerState *sched) { - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - sched->ring_sched_states[r].destroy(); - } - - for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) { - pto2_ready_queue_destroy(&sched->ready_queues[i]); - } -} - -// ============================================================================= -// Debug
Utilities -// ============================================================================= - -void pto2_scheduler_print_stats(PTO2SchedulerState *sched) { - LOG_INFO("=== Scheduler Statistics ==="); - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - if (sched->ring_sched_states[r].last_task_alive > 0 || sched->ring_sched_states[r].heap_tail > 0) { - LOG_INFO("Ring %d:", r); - LOG_INFO(" last_task_alive: %d", sched->ring_sched_states[r].last_task_alive); - LOG_INFO(" heap_tail: %" PRIu64, sched->ring_sched_states[r].heap_tail); - } - } -#if PTO2_SCHED_PROFILING - LOG_INFO("tasks_completed: %lld", (long long)sched->tasks_completed.load(std::memory_order_relaxed)); - LOG_INFO("tasks_consumed: %lld", (long long)sched->tasks_consumed.load(std::memory_order_relaxed)); -#endif - LOG_INFO("============================"); -} - -void pto2_scheduler_print_queues(PTO2SchedulerState *sched) { - LOG_INFO("=== Ready Queues ==="); - - const char *shape_names[] = {"AIC_ONLY", "AIV_X1", "AIV_X2", "AIC_AIV_X1", "AIC_AIV_X2"}; - - for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) { - LOG_INFO(" %s: count=%" PRIu64, shape_names[i], sched->ready_queues[i].size()); - } - - LOG_INFO("===================="); -} diff --git a/src/a2a3/runtime/aicpu_build_graph/runtime/pto_scheduler.h b/src/a2a3/runtime/aicpu_build_graph/runtime/pto_scheduler.h deleted file mode 100644 index 080e9e598..000000000 --- a/src/a2a3/runtime/aicpu_build_graph/runtime/pto_scheduler.h +++ /dev/null @@ -1,729 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -/** - * PTO Runtime2 - Scheduler Interface - * - * The Scheduler is responsible for: - * 1. Maintaining per-resource-shape ready queues - * 2. Tracking task state (PENDING -> READY -> RUNNING -> COMPLETED -> CONSUMED) - * 3. Managing fanin/fanout refcounts for dependency resolution - * 4. Advancing last_task_alive for heap reclamation - * 5. 
Two-stage mixed-task completion (subtask done bits → mixed-task complete) - * - * The Scheduler runs on Device AI_CPU and processes: - * - Task state transitions based on fanin_refcount - * - Buffer lifecycle based on fanout_refcount - * - Ring pointer advancement for flow control - * - * Based on: docs/RUNTIME_LOGIC.md - */ - -#ifndef PTO_SCHEDULER_H -#define PTO_SCHEDULER_H - -#include <atomic> - -#include "pto_runtime2_types.h" -#include "pto_shared_memory.h" -#include "pto_ring_buffer.h" - -#include "common/core_type.h" - -#if PTO2_SCHED_PROFILING -#include "aicpu/device_time.h" -#define PTO2_SCHED_CYCLE_START() uint64_t _st0 = get_sys_cnt_aicpu(), _st1 -#define PTO2_SCHED_CYCLE_LAP(acc) \ - do { \ - _st1 = get_sys_cnt_aicpu(); \ - acc += (_st1 - _st0); \ - _st0 = _st1; \ - } while (0) -#endif - -// ============================================================================= -// Ready Queue (Lock-free bounded MPMC — Vyukov design) -// ============================================================================= - -/** - * Per-slot entry: sequence counter for ABA safety + task payload - */ -struct PTO2ReadyQueueSlot { - std::atomic<int64_t> sequence; - PTO2TaskSlotState *slot_state; -}; - -/** - * Thread-local ready buffer for local-first dispatch optimization. - * - * Two buffers per scheduling thread, one per CoreType (AIC=0, AIV=1). - * Initialized once before the scheduling loop; must be empty at - * the start of each iteration (verified by always_assert). - * - * Phase 1 fills per-CoreType buffers via on_task_complete(). - * dispatch_ready_tasks_to_idle_cores drains them: local-first via - * get_ready_task, then remaining tasks pushed to global readyQ. - */ -// Number of CoreType values eligible for local dispatch (AIC=0, AIV=1) -static constexpr int PTO2_LOCAL_DISPATCH_TYPE_NUM = 2; - -struct PTO2LocalReadyBuffer { - PTO2TaskSlotState **slot_states = nullptr; - int count = 0; - int capacity = 0; - - void reset(PTO2TaskSlotState **buf, int cap) { - slot_states = buf; - count = 0; - capacity = cap; - } - - bool try_push(PTO2TaskSlotState *s) { - if (slot_states && count < capacity) { - slot_states[count++] = s; - return true; - } - return false; - } - - PTO2TaskSlotState *pop() { return (count > 0) ? slot_states[--count] : nullptr; } -}; - -/** - * Lock-free bounded MPMC queue (Dmitry Vyukov design) - * - * Key properties: - * - enqueue_pos and dequeue_pos on separate cache lines (no false sharing) - * - Per-slot sequence counter prevents ABA problem - * - Empty queue pop returns immediately (single atomic load, no lock) - * - CAS contention is split: producers only touch enqueue_pos, - * consumers only touch dequeue_pos - */ -struct alignas(64) PTO2ReadyQueue { - PTO2ReadyQueueSlot *slots; - uint64_t capacity; - uint64_t mask; // capacity - 1 - char _pad0[64 - 24]; // Pad to own cache line - - std::atomic<uint64_t> enqueue_pos; - char _pad1[64 - sizeof(std::atomic<uint64_t>)]; // Own cache line - - std::atomic<uint64_t> dequeue_pos; - char _pad2[64 - sizeof(std::atomic<uint64_t>)]; // Own cache line - - uint64_t size() { - uint64_t e = enqueue_pos.load(std::memory_order_relaxed); - uint64_t d = dequeue_pos.load(std::memory_order_relaxed); - return (e >= d) ?
(e - d) : 0; - } - - bool push(PTO2TaskSlotState *slot_state) { - uint64_t pos; - PTO2ReadyQueueSlot *slot; - while (true) { - pos = enqueue_pos.load(std::memory_order_relaxed); - slot = &slots[pos & mask]; - int64_t seq = slot->sequence.load(std::memory_order_acquire); - int64_t diff = seq - (int64_t)pos; - if (diff == 0) { - if (enqueue_pos.compare_exchange_weak( - pos, pos + 1, std::memory_order_relaxed, std::memory_order_relaxed - )) { - break; - } - } else if (diff < 0) { - return false; // Queue full - } - } - - slot->slot_state = slot_state; - slot->sequence.store((int64_t)(pos + 1), std::memory_order_release); - return true; - } - -#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING - bool push(PTO2TaskSlotState *slot_state, uint64_t &atomic_count, uint64_t &wait_cycle) { - uint64_t pos; - PTO2ReadyQueueSlot *slot; - uint64_t t0 = get_sys_cnt_aicpu(); - bool contended = false; - uint32_t atomic_ops = 0; - while (true) { - pos = enqueue_pos.load(std::memory_order_relaxed); - slot = &slots[pos & mask]; - int64_t seq = slot->sequence.load(std::memory_order_acquire); - int64_t diff = seq - (int64_t)pos; - atomic_ops += 2; // enqueue_pos.load + sequence.load - if (diff == 0) { - if (enqueue_pos.compare_exchange_weak( - pos, pos + 1, std::memory_order_relaxed, std::memory_order_relaxed - )) { - atomic_ops++; // successful CAS - break; - } - contended = true; - atomic_ops++; // failed CAS - } else if (diff < 0) { - return false; // Queue full - } else { - contended = true; // diff > 0: slot not yet released, spin - } - } - atomic_ops++; // final sequence.store - atomic_count += atomic_ops; - if (contended) { - wait_cycle += (get_sys_cnt_aicpu() - t0); - } - - slot->slot_state = slot_state; - slot->sequence.store((int64_t)(pos + 1), std::memory_order_release); - return true; - } -#endif - - PTO2TaskSlotState *pop() { - // Fast-path: skip slot load when queue is clearly empty - uint64_t d = dequeue_pos.load(std::memory_order_relaxed); - uint64_t e = enqueue_pos.load(std::memory_order_relaxed); - if (d >= e) { - return nullptr; - } - - uint64_t pos; - PTO2ReadyQueueSlot *slot; - while (true) { - pos = dequeue_pos.load(std::memory_order_relaxed); - slot = &slots[pos & mask]; - int64_t seq = slot->sequence.load(std::memory_order_acquire); - int64_t diff = seq - (int64_t)(pos + 1); - if (diff == 0) { - if (dequeue_pos.compare_exchange_weak( - pos, pos + 1, std::memory_order_relaxed, std::memory_order_relaxed - )) - break; - } else if (diff < 0) { - return nullptr; // Queue empty - } - } - - PTO2TaskSlotState *result = slot->slot_state; - slot->sequence.store((int64_t)(pos + mask + 1), std::memory_order_release); - return result; - } - -#if PTO2_SCHED_PROFILING - PTO2TaskSlotState *pop(uint64_t &atomic_count, uint64_t &wait_cycle) { - // Fast-path: skip slot load when queue is clearly empty - uint64_t d = dequeue_pos.load(std::memory_order_relaxed); - uint64_t e = enqueue_pos.load(std::memory_order_relaxed); - atomic_count += 2; // dequeue_pos.load + enqueue_pos.load - if (d >= e) { - return nullptr; - } - - uint64_t pos; - PTO2ReadyQueueSlot *slot; - uint64_t t0 = get_sys_cnt_aicpu(); - bool contended = false; - uint32_t atomic_ops = 0; - while (true) { - pos = dequeue_pos.load(std::memory_order_relaxed); - slot = &slots[pos & mask]; - int64_t seq = slot->sequence.load(std::memory_order_acquire); - int64_t diff = seq - (int64_t)(pos + 1); - atomic_ops += 2; // dequeue_pos.load + sequence.load - if (diff == 0) { - if (dequeue_pos.compare_exchange_weak( - pos, pos + 1, std::memory_order_relaxed, 
std::memory_order_relaxed - )) { - atomic_ops++; // successful CAS - break; - } - contended = true; - atomic_ops++; // failed CAS - } else if (diff < 0) { - atomic_count += atomic_ops; - return nullptr; // Queue empty - } else { - contended = true; - } - } - atomic_ops++; // final sequence.store - atomic_count += atomic_ops; - if (contended) { - wait_cycle += (get_sys_cnt_aicpu() - t0); - } - - PTO2TaskSlotState *result = slot->slot_state; - slot->sequence.store((int64_t)(pos + mask + 1), std::memory_order_release); - return result; - } -#endif -}; - -// Cold-path ready queue operations (defined in pto_scheduler.cpp) -bool pto2_ready_queue_init(PTO2ReadyQueue *queue, uint64_t capacity); -void pto2_ready_queue_destroy(PTO2ReadyQueue *queue); - -// ============================================================================= -// Scheduler State -// ============================================================================= - -/** - * Statistics returned by mixed-task completion processing - */ -struct PTO2CompletionStats { - int32_t fanout_edges; // Number of fanout edges traversed (notify consumers) - int32_t tasks_enqueued; // Number of consumers that became READY - int32_t fanin_edges; // Number of fanin edges traversed (release producers) - bool mixed_task_completed; // True only when this callback completed a mixed task -}; - -/** - * Scheduler state structure - * - * Contains dynamic state updated during task execution. - * Separated from shared memory for cache efficiency. - * Hot-path methods are defined inline (implicitly inline as member functions). - */ -struct PTO2SchedulerState { - // Shared memory access - PTO2SharedMemoryHandle *sm_handle; - - // Per-ring state - struct RingSchedState { - PTO2TaskDescriptor *task_descriptors; - PTO2TaskSlotState *slot_states; - int32_t last_task_alive; - int32_t last_heap_consumed; - uint64_t heap_tail; - void *heap_base; - int32_t task_window_mask; - uint64_t task_window_size; - // Try-lock used to advance this ring's pointers (CONSUMED scanning + heap tail update). 
- std::atomic<int32_t> advance_lock; - - bool init(PTO2SharedMemoryHandle *sm_handle, int32_t ring_id, void *gm_heap_base, uint64_t per_ring_heap_size); - void destroy(); - - PTO2TaskSlotState &get_slot_state_by_task_id(int32_t local_id) { - return slot_states[local_id & task_window_mask]; - } - PTO2TaskSlotState &get_slot_state_by_slot(int32_t slot) { return slot_states[slot]; } - - void sync_to_sm(PTO2SharedMemoryRingHeader &ring) { - ring.fc.last_task_alive.store(last_task_alive, std::memory_order_release); - ring.fc.heap_tail.store(heap_tail, std::memory_order_release); - } - - void advance_ring_pointers(PTO2SharedMemoryRingHeader &ring) { - int32_t current_task_index = ring.fc.current_task_index.load(std::memory_order_acquire); - - while (last_task_alive < current_task_index) { - PTO2TaskSlotState &slot_state = get_slot_state_by_task_id(last_task_alive); - if (slot_state.task_state.load(std::memory_order_acquire) != PTO2_TASK_CONSUMED) { - break; - } - last_task_alive++; - } - - if (last_task_alive > 0) { - int32_t last_consumed_id = last_task_alive - 1; - PTO2TaskSlotState &slot_state = get_slot_state_by_task_id(last_consumed_id); - PTO2TaskDescriptor &task = *slot_state.task; - if (task.packed_buffer_end != NULL) { - heap_tail = (uint64_t)((char *)task.packed_buffer_end - (char *)heap_base); - } - } - - sync_to_sm(ring); - } - } ring_sched_states[PTO2_MAX_RING_DEPTH]; - - // Ready queues remain global (scheduling is ring-agnostic) - PTO2ReadyQueue ready_queues[PTO2_NUM_RESOURCE_SHAPES]; - - // Statistics -#if PTO2_SCHED_PROFILING - std::atomic<int64_t> tasks_completed; - std::atomic<int64_t> tasks_consumed; -#endif - // ========================================================================= - // Inline hot-path methods - // ========================================================================= - PTO2TaskSlotState &get_slot_state(int32_t ring_id, int32_t local_id) { - return ring_sched_states[ring_id].get_slot_state_by_task_id(local_id); - } - PTO2TaskSlotState &get_slot_state_by_slot(int32_t ring_id, int32_t slot) { - return ring_sched_states[ring_id].get_slot_state_by_slot(slot); - } - - void check_and_handle_consumed(PTO2TaskSlotState &slot_state) { - if (slot_state.fanout_refcount.load(std::memory_order_acquire) != slot_state.fanout_count) return; - - PTO2TaskState expected = PTO2_TASK_COMPLETED; - if (!slot_state.task_state.compare_exchange_strong( - expected, PTO2_TASK_CONSUMED, std::memory_order_acq_rel, std::memory_order_acquire - )) { - return; - } - -#if PTO2_SCHED_PROFILING - tasks_consumed.fetch_add(1, std::memory_order_relaxed); -#endif - - int32_t ring_id = slot_state.ring_id; - // Try-lock — if another thread is advancing this ring, it will scan our CONSUMED task - int32_t expected_lock = 0; - if (ring_sched_states[ring_id].advance_lock.compare_exchange_strong( - expected_lock, 1, std::memory_order_acquire, std::memory_order_relaxed - )) { - ring_sched_states[ring_id].advance_ring_pointers(sm_handle->header->rings[ring_id]); - ring_sched_states[ring_id].advance_lock.store(0, std::memory_order_release); - } - } - -#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING - void check_and_handle_consumed(PTO2TaskSlotState &slot_state, uint64_t &atomic_count) { - int32_t fc = slot_state.fanout_count; - int32_t rc = slot_state.fanout_refcount.load(std::memory_order_acquire); - - atomic_count += 2; // fanout_count.load + fanout_refcount.load - - if (rc != fc) return; - - PTO2TaskState expected = PTO2_TASK_COMPLETED; - if (!slot_state.task_state.compare_exchange_strong( - expected, PTO2_TASK_CONSUMED,
std::memory_order_acq_rel, std::memory_order_acquire - )) { - atomic_count += 1; // failed CAS - return; - } - - atomic_count += 1; // successful CAS - -#if PTO2_SCHED_PROFILING - tasks_consumed.fetch_add(1, std::memory_order_relaxed); -#endif - - int32_t ring_id = slot_state.ring_id; - // Try-lock — if another thread is advancing this ring, it will scan our CONSUMED task - int32_t expected_lock = 0; - if (ring_sched_states[ring_id].advance_lock.compare_exchange_strong( - expected_lock, 1, std::memory_order_acquire, std::memory_order_relaxed - )) { - ring_sched_states[ring_id].advance_ring_pointers(sm_handle->header->rings[ring_id]); - ring_sched_states[ring_id].advance_lock.store(0, std::memory_order_release); - atomic_count += 2; // try-lock CAS + unlock store - } else { - atomic_count += 1; // failed try-lock CAS - } - } -#endif - - void release_producer(PTO2TaskSlotState &slot_state) { - slot_state.fanout_refcount.fetch_add(1, std::memory_order_acq_rel); - check_and_handle_consumed(slot_state); - } - -#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING - void release_producer(PTO2TaskSlotState &slot_state, uint64_t &atomic_count) { - slot_state.fanout_refcount.fetch_add(1, std::memory_order_acq_rel); - atomic_count += 1; // fanout_refcount.fetch_add - check_and_handle_consumed(slot_state, atomic_count); - } -#endif - - bool release_fanin_and_check_ready(PTO2TaskSlotState &slot_state, PTO2LocalReadyBuffer *local_bufs = nullptr) { - // Atomically increment fanin_refcount and check if all producers are done - // ACQ_REL on fanin_refcount already synchronizes with the orchestrator's - // init release, making fanin_count visible — plain load suffices. - int32_t new_refcount = slot_state.fanin_refcount.fetch_add(1, std::memory_order_acq_rel) + 1; - - if (new_refcount == slot_state.fanin_count) { - // Local-first: try per-CoreType thread-local buffer before global queue - // Route by active_mask: AIC-containing tasks → buf[0], AIV-only → buf[1] - PTO2ResourceShape shape = pto2_active_mask_to_shape(slot_state.active_mask); - bool pushed_local = false; - if (local_bufs) { - int32_t buf_idx = (slot_state.active_mask & 0x01) ? 0 : 1; - pushed_local = local_bufs[buf_idx].try_push(&slot_state); - } - if (!pushed_local) { - ready_queues[static_cast<int>(shape)].push(&slot_state); - } - return true; - } - return false; - } - -#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING - bool release_fanin_and_check_ready( - PTO2TaskSlotState &slot_state, uint64_t &atomic_count, uint64_t &push_wait, - PTO2LocalReadyBuffer *local_bufs = nullptr - ) { - int32_t new_refcount = slot_state.fanin_refcount.fetch_add(1, std::memory_order_acq_rel) + 1; - atomic_count += 1; // fanin_refcount.fetch_add - - if (new_refcount == slot_state.fanin_count) { - PTO2TaskState expected = PTO2_TASK_PENDING; - if (slot_state.task_state.compare_exchange_strong( - expected, PTO2_TASK_READY, std::memory_order_acq_rel, std::memory_order_acquire - )) { - atomic_count += 1; // CAS(task_state PENDING→READY) - // Local-first: try per-CoreType thread-local buffer before global queue - PTO2ResourceShape shape = pto2_active_mask_to_shape(slot_state.active_mask); - bool pushed_local = false; - if (local_bufs) { - int32_t buf_idx = (slot_state.active_mask & 0x01) ?
0 : 1; - pushed_local = local_bufs[buf_idx].try_push(&slot_state); - } - if (!pushed_local) { - ready_queues[static_cast<int>(shape)].push(&slot_state, atomic_count, push_wait); - } - return true; - } - } - return false; - } -#endif - - PTO2TaskSlotState *get_ready_task(PTO2ResourceShape shape) { - return ready_queues[static_cast<int>(shape)].pop(); - } - - template <CoreType CT> - PTO2TaskSlotState *get_ready_task(PTO2LocalReadyBuffer *local_bufs) { - constexpr int ct = static_cast<int>(CT); - if (local_bufs && local_bufs[ct].count > 0) { - return local_bufs[ct].pop(); - } - return ready_queues[ct].pop(); - } - -#if PTO2_SCHED_PROFILING - PTO2TaskSlotState *get_ready_task(PTO2ResourceShape shape, uint64_t &atomic_count, uint64_t &wait_cycle) { - return ready_queues[static_cast<int>(shape)].pop(atomic_count, wait_cycle); - } - - template <CoreType CT> - PTO2TaskSlotState *get_ready_task(PTO2LocalReadyBuffer *local_bufs, uint64_t &atomic_count, uint64_t &wait_cycle) { - constexpr int ct = static_cast<int>(CT); - if (local_bufs && local_bufs[ct].count > 0) { - return local_bufs[ct].pop(); - } - return ready_queues[ct].pop(atomic_count, wait_cycle); - } -#endif - - /** - * Requeue a ready task that could not be dispatched (no suitable cluster). - * Pushes the task back into its shape-based queue. - */ - void requeue_ready_task(PTO2TaskSlotState &slot_state) { - PTO2ResourceShape shape = pto2_active_mask_to_shape(slot_state.active_mask); - ready_queues[static_cast<int>(shape)].push(&slot_state); - } - - void on_scope_end(PTO2TaskSlotState **task_slot_states, int32_t count) { -#if PTO2_ORCH_PROFILING - extern uint64_t g_orch_scope_end_atomic_count; - for (int32_t i = 0; i < count; i++) { - release_producer(*task_slot_states[i], g_orch_scope_end_atomic_count); - } -#else - for (int32_t i = 0; i < count; i++) { - release_producer(*task_slot_states[i]); - } -#endif - } - - /** - * Two-stage completion: first stage. - * Called when a single subtask (AIC, AIV0, or AIV1) finishes. - * Sets the corresponding done bit in subtask_done_mask. - * - * @return true if this subtask was the last one, completing the mixed task. - */ - bool on_subtask_complete(PTO2TaskSlotState &slot_state, PTO2SubtaskSlot subslot) { - uint8_t done_bit = (1u << static_cast<int>(subslot)); - uint8_t prev_mask = slot_state.subtask_done_mask.fetch_or(done_bit, std::memory_order_acq_rel); - uint8_t new_mask = prev_mask | done_bit; - - return new_mask == slot_state.active_mask; - } - - /** - * Two-stage completion: second stage. - * Called exactly once when all subtasks of a mixed task are done - * (i.e., on_subtask_complete returned true). - * Handles fanout notification, fanin release, and self-consumption check.
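 - * - * Sketch of the two-stage sequence as seen from a completion handler - * (non-profiling build; 'slot' and 'subslot' are placeholders for the - * finished task's state and subtask slot): - * - * if (sched->on_subtask_complete(*slot, subslot)) { - * sched->on_mixed_task_complete(*slot, local_bufs); - * sched->on_task_release(*slot); - * }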
- */ -#if PTO2_SCHED_PROFILING - PTO2CompletionStats -#else - void -#endif - on_mixed_task_complete( - PTO2TaskSlotState &slot_state, -#if PTO2_SCHED_PROFILING - int thread_idx, -#endif - - PTO2LocalReadyBuffer *local_bufs = nullptr - ) { -#if PTO2_SCHED_PROFILING - PTO2CompletionStats stats = {0, 0, 0, true}; -#endif -#if PTO2_SCHED_PROFILING - extern uint64_t g_sched_lock_cycle[], g_sched_fanout_cycle[]; - extern uint64_t g_sched_lock_atomic_count[], g_sched_lock_wait_cycle[]; - extern uint64_t g_sched_fanout_atomic_count[], g_sched_push_wait_cycle[]; - uint64_t lock_atomics = 0, lock_wait = 0; - PTO2_SCHED_CYCLE_START(); -#endif - -#if PTO2_SCHED_PROFILING - pto2_fanout_lock(slot_state, lock_atomics, lock_wait); -#else - pto2_fanout_lock(slot_state); -#endif - slot_state.task_state.store(PTO2_TASK_COMPLETED, std::memory_order_release); - PTO2DepListEntry *current = slot_state.fanout_head; // Protected by fanout_lock - pto2_fanout_unlock(slot_state); - -#if PTO2_SCHED_PROFILING - lock_atomics += 2; // state.store + unlock.store - g_sched_lock_atomic_count[thread_idx] += lock_atomics; - g_sched_lock_wait_cycle[thread_idx] += lock_wait; - PTO2_SCHED_CYCLE_LAP(g_sched_lock_cycle[thread_idx]); -#endif - - // Fanout: notify consumers -#if PTO2_SCHED_PROFILING - uint64_t fanout_atomics = 0, push_wait = 0; -#endif - while (current != nullptr) { - PTO2TaskSlotState &consumer_slot = *current->slot_state; -#if PTO2_SCHED_PROFILING - stats.fanout_edges++; - if (release_fanin_and_check_ready(consumer_slot, fanout_atomics, push_wait, local_bufs)) { - stats.tasks_enqueued++; - } -#else - release_fanin_and_check_ready(consumer_slot, local_bufs); -#endif - current = current->next; - } - -#if PTO2_SCHED_PROFILING - g_sched_fanout_atomic_count[thread_idx] += fanout_atomics; - g_sched_push_wait_cycle[thread_idx] += push_wait; - PTO2_SCHED_CYCLE_LAP(g_sched_fanout_cycle[thread_idx]); - return stats; -#endif - } - - /** - * Cold path: release producers (fanin traversal) + check self for CONSUMED. - * Returns fanin edge count for profiling. 
- */ - -#if PTO2_SCHED_PROFILING - int32_t on_task_release(PTO2TaskSlotState &slot_state, int32_t thread_idx) { - PTO2_SCHED_CYCLE_START(); - extern uint64_t g_sched_fanin_cycle[], g_sched_fanin_atomic_count[]; - extern uint64_t g_sched_self_atomic_count[]; - extern uint64_t g_sched_self_consumed_cycle[]; - extern uint64_t g_sched_complete_count[]; - uint64_t fanin_atomics = 0; -#else - int32_t on_task_release(PTO2TaskSlotState &slot_state) { -#endif - PTO2TaskPayload *payload = slot_state.payload; - int32_t fanin_edges = payload->fanin_actual_count; - for (int32_t i = 0; i < fanin_edges; i++) { -#if PTO2_SCHED_PROFILING - release_producer(*payload->fanin_slot_states[i], fanin_atomics); -#else - release_producer(*payload->fanin_slot_states[i]); -#endif - } -#if PTO2_SCHED_PROFILING - g_sched_fanin_atomic_count[thread_idx] += fanin_atomics; - PTO2_SCHED_CYCLE_LAP(g_sched_fanin_cycle[thread_idx]); -#endif - - // Self consumed check -#if PTO2_SCHED_PROFILING - uint64_t self_atomics = 0; - check_and_handle_consumed(slot_state, self_atomics); - g_sched_self_atomic_count[thread_idx] += self_atomics; - PTO2_SCHED_CYCLE_LAP(g_sched_self_consumed_cycle[thread_idx]); - g_sched_complete_count[thread_idx]++; -#else - check_and_handle_consumed(slot_state); -#endif - return fanin_edges; - } -}; - -// ============================================================================= -// Scheduler API (cold path, defined in pto_scheduler.cpp) -// ============================================================================= - -bool pto2_scheduler_init( - PTO2SchedulerState *sched, PTO2SharedMemoryHandle *sm_handle, void *gm_heap_base, uint64_t per_ring_heap_size -); -void pto2_scheduler_destroy(PTO2SchedulerState *sched); - -// ============================================================================= -// Debug Utilities (cold path, defined in pto_scheduler.cpp) -// ============================================================================= - -void pto2_scheduler_print_stats(PTO2SchedulerState *sched); -void pto2_scheduler_print_queues(PTO2SchedulerState *sched); -const char *pto2_task_state_name(PTO2TaskState state); - -// ============================================================================= -// Scheduler Profiling Data -// ============================================================================= - -#if PTO2_SCHED_PROFILING -struct PTO2SchedProfilingData { - // Sub-phase cycle breakdown within on_mixed_task_complete - uint64_t lock_cycle; // pto2_fanout_lock + state store + unlock - uint64_t fanout_cycle; // fanout traversal - uint64_t fanin_cycle; // fanin traversal - uint64_t self_consumed_cycle; // self check_and_handle_consumed - - // Wait times - uint64_t lock_wait_cycle; // spin-wait in fanout_lock - uint64_t push_wait_cycle; // CAS contention in push() - uint64_t pop_wait_cycle; // CAS contention in pop() - - // Atomic counts per sub-phase - uint64_t lock_atomic_count; - uint64_t fanout_atomic_count; - uint64_t fanin_atomic_count; - uint64_t self_atomic_count; - uint64_t pop_atomic_count; - - int64_t complete_count; -}; - -/** - * Get and reset scheduler profiling data for a specific thread. - * Returns accumulated profiling data and resets counters. 
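 - * - * Aggregation sketch (PLATFORM_MAX_AICPU_THREADS comes from common/platform_config.h; - * total_lock_cycles is a placeholder name): - * - * uint64_t total_lock_cycles = 0; - * for (int t = 0; t < PLATFORM_MAX_AICPU_THREADS; t++) { - * total_lock_cycles += pto2_scheduler_get_profiling(t).lock_cycle; // read-and-reset - * }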
- */ -PTO2SchedProfilingData pto2_scheduler_get_profiling(int thread_idx); -#endif - -#endif // PTO_SCHEDULER_H diff --git a/src/a2a3/runtime/aicpu_build_graph/runtime/pto_shared_memory.cpp b/src/a2a3/runtime/aicpu_build_graph/runtime/pto_shared_memory.cpp deleted file mode 100644 index 4c511d0f8..000000000 --- a/src/a2a3/runtime/aicpu_build_graph/runtime/pto_shared_memory.cpp +++ /dev/null @@ -1,276 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -/** - * PTO Runtime2 - Shared Memory Implementation - * - * Implements shared memory allocation, initialization, and management - * for Orchestrator-Scheduler communication. - * - * Based on: docs/RUNTIME_LOGIC.md - */ - -#include "pto_shared_memory.h" -#include <cinttypes> -#include <cstdlib> -#include <cstring> -#include "common/unified_log.h" - -// ============================================================================= -// Size Calculation -// ============================================================================= - -uint64_t pto2_sm_calculate_size(uint64_t task_window_size) { - uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]; - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - task_window_sizes[r] = task_window_size; - } - return pto2_sm_calculate_size_per_ring(task_window_sizes); -} - -uint64_t pto2_sm_calculate_size_per_ring(const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]) { - uint64_t size = 0; - - // Header (aligned to cache line) - size += PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE); - - // Per-ring task descriptors and payloads - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - size += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE); - size += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE); - } - - return size; -} - -// ============================================================================= -// Creation and Destruction -// ============================================================================= - -static void -pto2_sm_setup_pointers_per_ring(PTO2SharedMemoryHandle *handle, const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]) { - char *ptr = (char *)handle->sm_base; - - // Header - handle->header = (PTO2SharedMemoryHeader *)ptr; - ptr += PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE); - - // Per-ring task descriptors and payloads - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - handle->task_descriptors[r] = (PTO2TaskDescriptor *)ptr; - ptr += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE); - - handle->task_payloads[r] = (PTO2TaskPayload *)ptr; - ptr += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE); - } -} - -static void pto2_sm_setup_pointers(PTO2SharedMemoryHandle *handle, uint64_t task_window_size) { - uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]; - for (int r =
0; r < PTO2_MAX_RING_DEPTH; r++) { - task_window_sizes[r] = task_window_size; - } - pto2_sm_setup_pointers_per_ring(handle, task_window_sizes); -} - -PTO2SharedMemoryHandle *pto2_sm_create(uint64_t task_window_size, uint64_t heap_size) { - // Allocate handle - PTO2SharedMemoryHandle *handle = (PTO2SharedMemoryHandle *)calloc(1, sizeof(PTO2SharedMemoryHandle)); - if (!handle) { - return NULL; - } - - // Calculate total size - uint64_t sm_size = pto2_sm_calculate_size(task_window_size); - -// Allocate shared memory (aligned for DMA efficiency) -#if defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE >= 200112L - if (posix_memalign(&handle->sm_base, PTO2_ALIGN_SIZE, static_cast<size_t>(sm_size)) != 0) { - free(handle); - return NULL; - } -#else - handle->sm_base = aligned_alloc(PTO2_ALIGN_SIZE, static_cast<size_t>(sm_size)); - if (!handle->sm_base) { - free(handle); - return NULL; - } -#endif - - handle->sm_size = sm_size; - handle->is_owner = true; - - // Initialize to zero - memset(handle->sm_base, 0, static_cast<size_t>(sm_size)); - - // Set up pointers - pto2_sm_setup_pointers(handle, task_window_size); - - // Initialize header - pto2_sm_init_header(handle, task_window_size, heap_size); - - return handle; -} - -PTO2SharedMemoryHandle *pto2_sm_create_default(void) { return pto2_sm_create(PTO2_TASK_WINDOW_SIZE, PTO2_HEAP_SIZE); } - -PTO2SharedMemoryHandle * -pto2_sm_create_from_buffer(void *sm_base, uint64_t sm_size, uint64_t task_window_size, uint64_t heap_size) { - if (!sm_base || sm_size == 0) return NULL; - - uint64_t required = pto2_sm_calculate_size(task_window_size); - if (sm_size < required) return NULL; - - PTO2SharedMemoryHandle *handle = (PTO2SharedMemoryHandle *)calloc(1, sizeof(PTO2SharedMemoryHandle)); - if (!handle) return NULL; - - handle->sm_base = sm_base; - handle->sm_size = sm_size; - handle->is_owner = false; - - pto2_sm_setup_pointers(handle, task_window_size); - pto2_sm_init_header(handle, task_window_size, heap_size); - - return handle; -} - -void pto2_sm_destroy(PTO2SharedMemoryHandle *handle) { - if (!handle) return; - - if (handle->is_owner && handle->sm_base) { - free(handle->sm_base); - } - - free(handle); -} - -// ============================================================================= -// Initialization -// ============================================================================= -// -// No need to init pool data here; pool entries are initialized when first used -void pto2_sm_init_header(PTO2SharedMemoryHandle *handle, uint64_t task_window_size, uint64_t heap_size) { - uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]; - uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]; - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - task_window_sizes[r] = task_window_size; - heap_sizes[r] = heap_size; - } - pto2_sm_init_header_per_ring(handle, task_window_sizes, heap_sizes); -} - -void pto2_sm_init_header_per_ring( - PTO2SharedMemoryHandle *handle, const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH], - const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH] -) { - PTO2SharedMemoryHeader *header = handle->header; - - // Per-ring flow control (start at 0) - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - header->rings[r].fc.init(); - } - - header->orchestrator_done.store(0, std::memory_order_relaxed); - - // Per-ring layout info - uint64_t offset = PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE); - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - header->rings[r].task_window_size = task_window_sizes[r]; - header->rings[r].heap_size = heap_sizes[r]; - header->rings[r].task_descriptors_offset = offset; - offset +=
PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE); - offset += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE); - } - - header->total_size = handle->sm_size; - header->graph_output_ptr.store(0, std::memory_order_relaxed); - header->graph_output_size.store(0, std::memory_order_relaxed); - - // Error reporting - header->orch_error_code.store(PTO2_ERROR_NONE, std::memory_order_relaxed); - header->sched_error_bitmap.store(0, std::memory_order_relaxed); - header->sched_error_code.store(PTO2_ERROR_NONE, std::memory_order_relaxed); - header->sched_error_thread.store(-1, std::memory_order_relaxed); -} - -// ============================================================================= -// Debug Utilities -// ============================================================================= - -void pto2_sm_print_layout(PTO2SharedMemoryHandle *handle) { - if (!handle || !handle->header) return; - - PTO2SharedMemoryHeader *h = handle->header; - - LOG_INFO("=== PTO2 Shared Memory Layout ==="); - LOG_INFO("Base address: %p", handle->sm_base); - LOG_INFO("Total size: %" PRIu64 " bytes", h->total_size); - LOG_INFO("Ring depth: %d", PTO2_MAX_RING_DEPTH); - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - LOG_INFO("Ring %d:", r); - LOG_INFO(" task_window_size: %" PRIu64, h->rings[r].task_window_size); - LOG_INFO(" heap_size: %" PRIu64 " bytes", h->rings[r].heap_size); - LOG_INFO( - " descriptors_off: %" PRIu64 " (0x%" PRIx64 ")", h->rings[r].task_descriptors_offset, - h->rings[r].task_descriptors_offset - ); - LOG_INFO(" heap_top: %" PRIu64, h->rings[r].fc.heap_top.load(std::memory_order_acquire)); - LOG_INFO(" heap_tail: %" PRIu64, h->rings[r].fc.heap_tail.load(std::memory_order_acquire)); - LOG_INFO(" current_task_idx: %d", h->rings[r].fc.current_task_index.load(std::memory_order_acquire)); - LOG_INFO(" last_task_alive: %d", h->rings[r].fc.last_task_alive.load(std::memory_order_acquire)); - } - LOG_INFO("orchestrator_done: %d", h->orchestrator_done.load(std::memory_order_acquire)); - LOG_INFO("Error state:"); - LOG_INFO(" orch_error_code: %d", h->orch_error_code.load(std::memory_order_relaxed)); - LOG_INFO(" sched_error_bitmap: 0x%x", h->sched_error_bitmap.load(std::memory_order_relaxed)); - LOG_INFO(" sched_error_code: %d", h->sched_error_code.load(std::memory_order_relaxed)); - LOG_INFO(" sched_error_thread: %d", h->sched_error_thread.load(std::memory_order_relaxed)); - LOG_INFO("================================"); -} - -bool pto2_sm_validate(PTO2SharedMemoryHandle *handle) { - if (!handle) return false; - if (!handle->sm_base) return false; - if (!handle->header) return false; - - PTO2SharedMemoryHeader *h = handle->header; - - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - if (!h->rings[r].fc.validate(handle, r)) return false; - } - - return true; -} - -bool PTO2RingFlowControl::validate(PTO2SharedMemoryHandle *handle, int32_t ring_id) const { - if (!handle) return false; - if (!handle->header) return false; - if (ring_id < 0 || ring_id >= PTO2_MAX_RING_DEPTH) return false; - - const PTO2SharedMemoryHeader *h = handle->header; - - // Check that offsets are within bounds - if (h->rings[ring_id].task_descriptors_offset >= h->total_size) return false; - - // Check pointer alignment - if ((uintptr_t)handle->task_descriptors[ring_id] % PTO2_ALIGN_SIZE != 0) return false; - - // Check flow control pointer sanity - int32_t current = current_task_index.load(std::memory_order_acquire); - int32_t last_alive = 
last_task_alive.load(std::memory_order_acquire); - uint64_t top = heap_top.load(std::memory_order_acquire); - uint64_t tail = heap_tail.load(std::memory_order_acquire); - if (current < 0) return false; - if (last_alive < 0) return false; - if (top > h->rings[ring_id].heap_size) return false; - if (tail > h->rings[ring_id].heap_size) return false; - - return true; -} diff --git a/src/a2a3/runtime/aicpu_build_graph/runtime/pto_shared_memory.h b/src/a2a3/runtime/aicpu_build_graph/runtime/pto_shared_memory.h deleted file mode 100644 index d7880f482..000000000 --- a/src/a2a3/runtime/aicpu_build_graph/runtime/pto_shared_memory.h +++ /dev/null @@ -1,233 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -/** - * PTO Runtime2 - Shared Memory Layout - * - * Defines the shared memory structure for Orchestrator-Scheduler communication. - * - * Memory Layout (per-ring sections repeat for each ring 0..PTO2_MAX_RING_DEPTH-1): - * +---------------------------+ - * | SharedMemoryHeader | (per-ring flow control + sync) - * +---------------------------+ - * | Ring 0: TaskDescriptor[] | - * | Ring 0: TaskPayload[] | - * +---------------------------+ - * | Ring 1: TaskDescriptor[] | - * | Ring 1: TaskPayload[] | - * +---------------------------+ - * | ... | - * +---------------------------+ - * - * Design principles: - * - Only data needed for Orchestrator<->Scheduler communication is here - * - Scope_stack, ready_queues, dep_pool are in private memory - * - Flow control via atomic counters/flags (no locks needed for single-word R/W) - * - * Based on: docs/RUNTIME_LOGIC.md - */ - -#ifndef PTO_SHARED_MEMORY_H -#define PTO_SHARED_MEMORY_H - -#include "pto_runtime2_types.h" - -#ifdef __cplusplus -extern "C" { -#endif - -// ============================================================================= -// Shared Memory Header -// ============================================================================= - -struct PTO2SharedMemoryHandle; - -/** - * Per-ring flow control state in shared memory. - * Written/read by Orchestrator and Scheduler for synchronization. 
- */ -struct PTO2RingFlowControl { - // Written by Orchestrator, Read by Scheduler - std::atomic<uint64_t> heap_top; // Heap ring allocation pointer - std::atomic<int32_t> current_task_index; // Task ring head (next to allocate) - int32_t _pad0; // Alignment padding - - // Written by Scheduler, Read by Orchestrator (for back-pressure) - std::atomic<uint64_t> heap_tail; // Heap ring free pointer - std::atomic<int32_t> last_task_alive; // Task ring tail (oldest active task) - int32_t _pad1; // Alignment padding - - void init() { - heap_top.store(0, std::memory_order_relaxed); - current_task_index.store(0, std::memory_order_relaxed); - heap_tail.store(0, std::memory_order_relaxed); - last_task_alive.store(0, std::memory_order_relaxed); - } - - bool validate(PTO2SharedMemoryHandle *handle, int32_t ring_id) const; -}; - -/** - * Per-ring shared memory header section. - * - * Groups flow-control and layout info for a single ring to avoid parallel arrays. - */ -struct PTO2SharedMemoryRingHeader { - PTO2RingFlowControl fc; - uint64_t task_window_size; - uint64_t heap_size; - uint64_t task_descriptors_offset; // Offset from SM base, in bytes -}; - -/** - * Shared memory header structure - * - * Contains per-ring flow control and global layout information. - */ -struct alignas(PTO2_ALIGN_SIZE) PTO2SharedMemoryHeader { - // === PER-RING FLOW CONTROL + LAYOUT INFO (set once at init) === - PTO2SharedMemoryRingHeader rings[PTO2_MAX_RING_DEPTH]; - - // === GLOBAL FIELDS === - std::atomic<int32_t> orchestrator_done; // Flag: orchestration complete - - // Total shared memory size (for validation) - uint64_t total_size; - - // Graph output for copy-back (set by orchestrator when using packed buffer) - // Host finalize copies from this address instead of dev_ptr when non-zero - std::atomic<uint64_t> graph_output_ptr; // Address where final output was written (packed buffer) - std::atomic<uint64_t> graph_output_size; // Size in bytes - - // === ERROR REPORTING === - - // Orchestrator fatal error code (Orchestrator → Scheduler, AICPU → Host) - // Non-zero signals fatal error. Written by orchestrator, read by scheduler and host. - std::atomic<int32_t> orch_error_code; - - // Scheduler error state (Scheduler → Host, independent of orchestrator) - // Written by scheduler threads on timeout; read by orchestrator and host.
- std::atomic<uint32_t> sched_error_bitmap; // Bit X set = thread X had error - std::atomic<int32_t> sched_error_code; // Last scheduler error code (last-writer-wins) - std::atomic<int32_t> sched_error_thread; // Thread index of last error writer -}; - -static_assert( - sizeof(PTO2SharedMemoryHeader) % PTO2_ALIGN_SIZE == 0, - "PTO2SharedMemoryHeader must be aligned to cache line (PTO2_ALIGN_SIZE)" -); - -// ============================================================================= -// Shared Memory Handle -// ============================================================================= - -/** - * Handle for shared memory access - * Provides both Orchestrator and Scheduler views of the same memory - */ -struct PTO2SharedMemoryHandle { - void *sm_base; // Base address of shared memory - uint64_t sm_size; // Total size of shared memory - - // Quick pointers into shared memory regions (per-ring) - PTO2SharedMemoryHeader *header; - PTO2TaskDescriptor *task_descriptors[PTO2_MAX_RING_DEPTH]; - PTO2TaskPayload *task_payloads[PTO2_MAX_RING_DEPTH]; - - // Ownership flag - bool is_owner; // True if this handle allocated the memory -}; - -// ============================================================================= -// Shared Memory API -// ============================================================================= - -/** - * Calculate required shared memory size - * - * @param task_window_size Number of task slots per ring - * @return Total bytes required - */ -uint64_t pto2_sm_calculate_size(uint64_t task_window_size); - -/** - * Calculate required shared memory size for per-ring task windows. - * - * @param task_window_sizes Array of window sizes per ring - * @return Total bytes required - */ -uint64_t pto2_sm_calculate_size_per_ring(const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]); - -/** - * Create shared memory for Orchestrator and Scheduler - * - * @param task_window_size Number of task slots per ring - * @param heap_size Heap size per ring for output buffers - * @return Handle with both views, or NULL on failure - */ -PTO2SharedMemoryHandle *pto2_sm_create(uint64_t task_window_size, uint64_t heap_size); - -/** - * Create shared memory with default sizes - */ -PTO2SharedMemoryHandle *pto2_sm_create_default(void); - -/** - * Wrap an existing buffer as shared memory (e.g. device GM buffer). - * Caller owns the buffer; handle will not free sm_base. - * - * @param sm_base Base address of pre-allocated buffer - * @param sm_size Total size in bytes - * @param task_window_size Number of task slots per ring (must match buffer layout) - * @param heap_size Heap size per ring (for layout; buffer has no heap region) - * @return Handle, or NULL on failure - */ -PTO2SharedMemoryHandle * -pto2_sm_create_from_buffer(void *sm_base, uint64_t sm_size, uint64_t task_window_size, uint64_t heap_size); - -/** - * Destroy shared memory and free resources - */ -void pto2_sm_destroy(PTO2SharedMemoryHandle *handle); - -/** - * Initialize shared memory header with layout information - * Called after memory is allocated - */ -void pto2_sm_init_header(PTO2SharedMemoryHandle *handle, uint64_t task_window_size, uint64_t heap_size); - -/** - * Initialize shared memory header with per-ring layout information.
- */ -void pto2_sm_init_header_per_ring( - PTO2SharedMemoryHandle *handle, const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH], - const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH] -); - -// ============================================================================= -// Debug Utilities -// ============================================================================= - -/** - * Print shared memory layout info - */ -void pto2_sm_print_layout(PTO2SharedMemoryHandle *handle); - -/** - * Validate shared memory integrity - * @return true if valid, false if corrupted - */ -bool pto2_sm_validate(PTO2SharedMemoryHandle *handle); - -#ifdef __cplusplus -} -#endif - -#endif // PTO_SHARED_MEMORY_H diff --git a/src/a2a3/runtime/aicpu_build_graph/runtime/pto_submit_types.h b/src/a2a3/runtime/aicpu_build_graph/runtime/pto_submit_types.h deleted file mode 100644 index d27decf3b..000000000 --- a/src/a2a3/runtime/aicpu_build_graph/runtime/pto_submit_types.h +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -/** - * PTO Submit Types - Shared submit-contract definitions - * - * Header-only definitions shared by orchestration-facing and runtime-facing - * headers. Keeps orchestration slim (no dependency on pto_runtime2_types.h). - */ - -#ifndef PTO_SUBMIT_TYPES_H -#define PTO_SUBMIT_TYPES_H - -#include <cstdint> - -inline constexpr int32_t INVALID_KERNEL_ID = -1; - -/** - * Subtask slot count: AIC, AIV0, AIV1 - */ -inline constexpr int32_t PTO2_SUBTASK_SLOT_COUNT = 3; - -/** - * Subtask slot indices - */ -enum class PTO2SubtaskSlot : uint8_t { - AIC = 0, - AIV0 = 1, - AIV1 = 2, -}; - -/** - * Subtask mask bits (for active_mask / subtask_done_mask) - */ -inline constexpr uint8_t PTO2_SUBTASK_MASK_AIC = (1u << 0); // 0x1 -inline constexpr uint8_t PTO2_SUBTASK_MASK_AIV0 = (1u << 1); // 0x2 -inline constexpr uint8_t PTO2_SUBTASK_MASK_AIV1 = (1u << 2); // 0x4 - -/** - * Test whether a subtask slot is active in a given mask - */ -static inline bool pto2_subtask_active(uint8_t mask, PTO2SubtaskSlot slot) { - return (mask & (1u << static_cast<uint8_t>(slot))) != 0; -} - -/** - * Mixed-task submit contract. - * - * Each field holds either a valid kernel ID or INVALID_KERNEL_ID (inactive). - * At least one slot must be valid. - */ -struct MixedKernels { - int32_t aic_kernel_id{INVALID_KERNEL_ID}; - int32_t aiv0_kernel_id{INVALID_KERNEL_ID}; - int32_t aiv1_kernel_id{INVALID_KERNEL_ID}; -}; - -/** - * Resource shape — classifies a MixedKernels into one of 5 queue buckets. - */ -enum class PTO2ResourceShape : uint8_t { - AIC_ONLY = 0, // AIC only - AIV_X1 = 1, // One AIV slot - AIV_X2 = 2, // Both AIV slots - AIC_AIV_X1 = 3, // AIC + one AIV - AIC_AIV_X2 = 4, // AIC + both AIV -}; - -inline constexpr int32_t PTO2_NUM_RESOURCE_SHAPES = 5; - -/** - * Derive resource shape from active_mask.
- * Caller must ensure active_mask is valid (at least one bit set). - */ -static inline PTO2ResourceShape pto2_active_mask_to_shape(uint8_t active_mask) { - bool has_aic = (active_mask & PTO2_SUBTASK_MASK_AIC) != 0; - int aiv_count = ((active_mask & PTO2_SUBTASK_MASK_AIV0) != 0) + ((active_mask & PTO2_SUBTASK_MASK_AIV1) != 0); - - if (has_aic) { - if (aiv_count == 0) return PTO2ResourceShape::AIC_ONLY; - if (aiv_count == 1) return PTO2ResourceShape::AIC_AIV_X1; - return PTO2ResourceShape::AIC_AIV_X2; - } - if (aiv_count == 1) return PTO2ResourceShape::AIV_X1; - return PTO2ResourceShape::AIV_X2; -} - -/** - * Compute active_mask from MixedKernels. - */ -static inline uint8_t pto2_mixed_kernels_to_active_mask(const MixedKernels &mk) { - uint8_t mask = 0; - if (mk.aic_kernel_id != INVALID_KERNEL_ID) mask |= PTO2_SUBTASK_MASK_AIC; - if (mk.aiv0_kernel_id != INVALID_KERNEL_ID) mask |= PTO2_SUBTASK_MASK_AIV0; - if (mk.aiv1_kernel_id != INVALID_KERNEL_ID) mask |= PTO2_SUBTASK_MASK_AIV1; - return mask; -} - -#endif // PTO_SUBMIT_TYPES_H diff --git a/src/a2a3/runtime/aicpu_build_graph/runtime/pto_types.h b/src/a2a3/runtime/aicpu_build_graph/runtime/pto_types.h deleted file mode 100644 index b4d9bb1cd..000000000 --- a/src/a2a3/runtime/aicpu_build_graph/runtime/pto_types.h +++ /dev/null @@ -1,279 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -/** - * Orchestration Build Graph Types - Data structures for orchestration runtime extensions - * - * Standalone header defining orchestration-specific types for: - * - TaskOutputTensors: Return value from submit containing materialized output Tensors - * - TensorRef: Tagged union for tensor slots (Tensor* or TensorCreateInfo) - * - SubmitResult: Combined return value (PTO2TaskId + TaskOutputTensors) - * - Arg: Aggregated argument container for pto_submit_task API - * - * Tensor descriptor types (Tensor, PTOBufferHandle, TensorCreateInfo) are - * defined in tensor.h. - * - * This header is independent of orch_build_graph_runtime.h to allow inclusion from runtime.h - * without type conflicts (Handshake, TensorPair, HostApi). 
- */ - -#ifndef SRC_A2A3_RUNTIME_AICPU_BUILD_GRAPH_RUNTIME_PTO_TYPES_H_ -#define SRC_A2A3_RUNTIME_AICPU_BUILD_GRAPH_RUNTIME_PTO_TYPES_H_ - -#include <cstdint> -#include <cstring> - -#if defined(__aarch64__) -#include <arm_neon.h> -#endif - -#include "task_args.h" -#include "tensor.h" -#include "tensor_arg.h" - -// Task arguments -#define MAX_TENSOR_ARGS 16 // Maximum tensor parameters per task -#define MAX_SCALAR_ARGS 32 // Maximum scalar parameters per task -#define PTO2_MAX_OUTPUTS 16 // Maximum outputs per task -#define PTO2_MAX_INPUTS 16 // Maximum inputs per task -#define PTO2_MAX_INOUTS 8 // Maximum in-out args per task - -// Forward declaration for SubmitResult -struct PTO2TaskId; - -// ============================================================================= -// Task Output Tensors (return value from submit) -// ============================================================================= - -/** - * TaskOutputTensors — returned by submit, holds materialized output Tensors. - * - * Only runtime-created outputs are stored here, indexed in add_output order. - * - * The underlying storage is uninitialized; only output_count elements are - * valid after submit returns. This avoids default-constructing Tensor[] - * on the hot path (2 KB of unnecessary zeroing per submit). - * - * Users must hold a named TaskOutputTensors variable and borrow via get_ref(); - * binding get_ref() on an rvalue is compile-time rejected to prevent dangling. - */ -class TaskOutputTensors { -public: - TaskOutputTensors() : - output_count_(0) {} - - bool empty() const { return output_count_ == 0; } - uint32_t size() const { return output_count_; } - - /// Borrow a materialized output tensor by index (lvalue only). - const Tensor &get_ref(uint32_t index) const & { - always_assert(index < output_count_); - return *reinterpret_cast<const Tensor *>(_storage + index * sizeof(Tensor)); - } - const Tensor &get_ref(uint32_t index) const && = delete; - - /// Runtime-internal: append one materialized output Tensor. - Tensor &materialize_output(const TensorCreateInfo &ci, void *addr, int32_t version) { - always_assert(output_count_ < PTO2_MAX_OUTPUTS); - Tensor *out = output_ptr(output_count_); - out->init_from_create_info(ci, addr, version); - output_count_++; - return *out; - } - - /// Runtime-internal: writable pointer for materialization. - Tensor *output_ptr(uint32_t index) { return reinterpret_cast<Tensor *>(_storage + index * sizeof(Tensor)); } - const Tensor *output_ptr(uint32_t index) const { - return reinterpret_cast<const Tensor *>(_storage + index * sizeof(Tensor)); - } - -private: - uint32_t output_count_; - alignas(Tensor) unsigned char _storage[PTO2_MAX_OUTPUTS * sizeof(Tensor)]; -}; - -// ============================================================================= -// Argument Types (for pto_submit_task API) -// ============================================================================= - -// TensorArgType is defined in tensor_arg.h (included above) - -/** - * Tagged union for a single Arg slot — either a Tensor* or a TensorCreateInfo value. - * The active member is determined by TensorArgType (OUTPUT → create_info, else → ptr). - */ -union TensorRef { - const Tensor *ptr; - TensorCreateInfo create_info; - TensorRef() : - ptr(nullptr) {} -}; - -/** - * Aggregated argument container for pto_submit_task - * - * Inherits storage from TaskArgsTpl. - * Each tensor slot stores a TensorRef union (Tensor* or TensorCreateInfo) - * discriminated by the corresponding tag(). - * Tensors are dispatched first in kernel args, followed by scalars.
- * - * Output arguments follow two distinct ownership models: - * - add_output(const TensorCreateInfo&): OUTPUT — runtime allocates buffer - * and materializes a new Tensor, returned via TaskOutputTensors. - * - add_inout(const Tensor&): INOUT — reuses an existing Tensor as the write target. - * - * Example: - * Tensor x = make_tensor_external(dev_a, shapes, 2); - * Arg args; - * args.add_input(x); - * args.add_output(TensorCreateInfo(shapes, 2)); - * args.add_scalar(some_value); - * SubmitResult r = rt_submit_aic_task(rt, kernel_id, args); - * const Tensor& y = r.outputs.get_ref(0); - */ -struct Arg : TaskArgsTpl { - bool has_error{false}; - const char *error_msg{nullptr}; - - void reset() { - clear(); - has_error = false; - error_msg = nullptr; - } - - void set_error(const char *msg) { - if (!has_error) { - has_error = true; - error_msg = msg; - } - } - - bool check_add_tensor_valid() { - if (scalar_count_ != 0) { - set_error( - "add_input/add_output/add_inout called after add_scalar: " - "all tensors must be added before any scalars" - ); - return false; - } - if (tensor_count_ >= MAX_TENSOR_ARGS) { - set_error("Too many tensor args (exceeds MAX_TENSOR_ARGS=16)"); - return false; - } - return true; - } - - void add_input(const Tensor &t) { - if (!check_add_tensor_valid()) { - return; - } - tensors_[tensor_count_].ptr = &t; - tags_[tensor_count_] = TensorArgType::INPUT; - tensor_count_++; - } - - /// Standard future-output path: runtime allocates buffer from heap, - /// materializes Tensor into TaskOutputTensors. - void add_output(const TensorCreateInfo &ci) { - if (!check_add_tensor_valid()) { - return; - } - tensors_[tensor_count_].create_info = ci; - tags_[tensor_count_] = TensorArgType::OUTPUT; - tensor_count_++; - } - - void add_inout(const Tensor &t) { - if (!check_add_tensor_valid()) { - return; - } - tensors_[tensor_count_].ptr = &t; - tags_[tensor_count_] = TensorArgType::INOUT; - tensor_count_++; - } - - /** - * Add a scalar value. Type is deduced from the argument; - * the value is bit-cast to uint64_t for storage. - * - * args.add_scalar(uint64_val); // existing usage unchanged - * args.add_scalar(3.14f); // float, auto bit-cast - * args.add_scalar(int32_t(42)); // int32, auto bit-cast - */ - template <typename T> - void add_scalar(T value) { - static_assert(is_supported_scalar_arg_v<T>, "add_scalar: type must be arithmetic or enum"); - if (scalar_count_ >= MAX_SCALAR_ARGS) { - set_error("Too many scalar args (exceeds MAX_SCALAR_ARGS=32)"); - return; - } - scalars_[scalar_count_++] = to_u64(value); - } - - void add_scalars(const uint64_t *values, int count) { - if (scalar_count_ + count > MAX_SCALAR_ARGS) { - set_error("Too many scalar args (exceeds MAX_SCALAR_ARGS=32)"); - return; - } - memcpy(&scalars_[scalar_count_], values, count * sizeof(uint64_t)); - scalar_count_ += count; - } - - /** - * Zero-extend int32 bit patterns into uint64 scalar slots. - * Negative values are treated as their unsigned 32-bit representation - * (e.g., -1 → 0x00000000FFFFFFFF, not 0xFFFFFFFFFFFFFFFF). - * Uses NEON to process 4 elements per iteration on aarch64.
- */ - void add_scalars_i32(const int32_t *values, int count) { - if (scalar_count_ + count > MAX_SCALAR_ARGS) { - set_error("Too many scalar args (exceeds MAX_SCALAR_ARGS=32)"); - return; - } - uint64_t *dst = &scalars_[scalar_count_]; -#if defined(__aarch64__) - int i = 0; - for (; i + 4 <= count; i += 4) { - uint32x4_t v = vld1q_u32(reinterpret_cast<const uint32_t *>(values + i)); - uint64x2_t lo = vmovl_u32(vget_low_u32(v)); - uint64x2_t hi = vmovl_u32(vget_high_u32(v)); - vst1q_u64(dst + i, lo); - vst1q_u64(dst + i + 2, hi); - } - for (; i < count; i++) { - dst[i] = static_cast<uint64_t>(static_cast<uint32_t>(values[i])); - } -#else - for (int i = 0; i < count; i++) { - dst[i] = static_cast<uint64_t>(static_cast<uint32_t>(values[i])); - } -#endif - scalar_count_ += count; - } - - /** - * Copy scalars from another Arg's scalar array. - * Useful when multiple tasks share the same scalar data (e.g., block indices). - */ - void copy_scalars_from(const Arg &src, int src_offset, int count) { - if (src_offset + count > src.scalar_count_) { - set_error("Source scalar range out of bounds in copy_scalars_from"); - return; - } - if (scalar_count_ + count > MAX_SCALAR_ARGS) { - set_error("Too many scalar args (exceeds MAX_SCALAR_ARGS=32)"); - return; - } - memcpy(&scalars_[scalar_count_], &src.scalars_[src_offset], count * sizeof(uint64_t)); - scalar_count_ += count; - } -}; - -#endif // SRC_A2A3_RUNTIME_AICPU_BUILD_GRAPH_RUNTIME_PTO_TYPES_H_ diff --git a/src/a2a3/runtime/aicpu_build_graph/runtime/runtime.cpp b/src/a2a3/runtime/aicpu_build_graph/runtime/runtime.cpp deleted file mode 100644 index 5d8886cdf..000000000 --- a/src/a2a3/runtime/aicpu_build_graph/runtime/runtime.cpp +++ /dev/null @@ -1,146 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -/** - * Runtime Class - Implementation - * - * Device execution and handshake control. - * Task graph construction is handled by PTO2Runtime. - */ - -#include "runtime.h" - -#include "common/unified_log.h" -#include "pto_runtime2_types.h" -#include "pto_shared_memory.h" - -// ============================================================================= -// Constructor -// ============================================================================= - -Runtime::Runtime() { - // NOTE: host_api is initialized in InitRuntime() (host-only code) - // because the CApi functions don't exist when compiled for device.
- - // Initialize handshake buffers - memset(workers, 0, sizeof(workers)); - worker_count = 0; - sche_cpu_num = 1; - ready_queue_shards = RUNTIME_DEFAULT_READY_QUEUE_SHARDS; - task_window_size = 0; - heap_size = 0; - dep_pool_size = 0; - orch_to_sched = false; - - // Initialize tensor pairs - tensor_pair_count = 0; - - // Initialize device orchestration state - orch_built_on_host_ = true; - gm_sm_ptr_ = nullptr; - gm_heap_ptr_ = nullptr; - slot_states_ptr_ = nullptr; - orch_args_storage_.clear(); - - // Initialize device orchestration SO binary - dev_orch_so_addr_ = 0; - dev_orch_so_size_ = 0; - has_new_orch_so_ = false; - - // Initialize kernel binary tracking - registered_kernel_count_ = 0; - - // Initialize function address mapping - for (int i = 0; i < RUNTIME_MAX_FUNC_ID; i++) { - func_id_to_addr_[i] = 0; - } -} - -// ============================================================================= -// Tensor Pair Management -// ============================================================================= - -void Runtime::record_tensor_pair(void *host_ptr, void *dev_ptr, size_t size) { - if (tensor_pair_count >= RUNTIME_MAX_TENSOR_PAIRS) { - LOG_ERROR("[Runtime] Tensor pairs full (max=%d)", RUNTIME_MAX_TENSOR_PAIRS); - return; - } - tensor_pairs[tensor_pair_count].host_ptr = host_ptr; - tensor_pairs[tensor_pair_count].dev_ptr = dev_ptr; - tensor_pairs[tensor_pair_count].size = size; - tensor_pair_count++; - LOG_INFO("Recorded tensor pair: host=%p dev=%p size=%zu", host_ptr, dev_ptr, size); -} - -TensorPair *Runtime::get_tensor_pairs() { return tensor_pairs; } - -int Runtime::get_tensor_pair_count() const { return tensor_pair_count; } - -void Runtime::clear_tensor_pairs() { tensor_pair_count = 0; } - -// ============================================================================= -// Device orchestration -// ============================================================================= - -bool Runtime::get_orch_built_on_host() const { return orch_built_on_host_; } -void *Runtime::get_gm_sm_ptr() const { return gm_sm_ptr_; } -void *Runtime::get_gm_heap_ptr() const { return gm_heap_ptr_; } -const ChipStorageTaskArgs &Runtime::get_orch_args() const { return orch_args_storage_; } -void Runtime::set_orch_built_on_host(bool v) { orch_built_on_host_ = v; } -void Runtime::set_gm_sm_ptr(void *p) { gm_sm_ptr_ = p; } -void Runtime::set_gm_heap(void *p) { gm_heap_ptr_ = p; } -void Runtime::set_slot_states_ptr(void *p) { slot_states_ptr_ = p; } -void Runtime::set_orch_args(const ChipStorageTaskArgs &args) { orch_args_storage_ = args; } - -// Device orchestration SO metadata (bytes live in a separate device buffer -// owned by DeviceRunner; only the address/size/dirty-flag travels in Runtime). 
-void Runtime::set_dev_orch_so(uint64_t dev_addr, uint64_t size, bool is_new) { - dev_orch_so_addr_ = dev_addr; - dev_orch_so_size_ = size; - has_new_orch_so_ = is_new; -} - -uint64_t Runtime::get_dev_orch_so_addr() const { return dev_orch_so_addr_; } - -uint64_t Runtime::get_dev_orch_so_size() const { return dev_orch_so_size_; } - -bool Runtime::has_new_orch_so() const { return has_new_orch_so_; } - -uint64_t Runtime::get_function_bin_addr(int func_id) const { - if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) return 0; - return func_id_to_addr_[func_id]; -} - -void Runtime::set_function_bin_addr(int func_id, uint64_t addr) { - if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) { - LOG_ERROR("[Runtime] func_id=%d is out of range [0, %d)", func_id, RUNTIME_MAX_FUNC_ID); - return; - } - if (addr != 0 && func_id_to_addr_[func_id] == 0) { - if (registered_kernel_count_ < RUNTIME_MAX_FUNC_ID) { - registered_kernel_func_ids_[registered_kernel_count_++] = func_id; - } else { - LOG_ERROR( - "[Runtime] Registration limit reached (%d). Cannot track func_id=%d for cleanup.", RUNTIME_MAX_FUNC_ID, - func_id - ); - } - } - func_id_to_addr_[func_id] = addr; -} - -int Runtime::get_registered_kernel_count() const { return registered_kernel_count_; } - -int Runtime::get_registered_kernel_func_id(int index) const { - if (index < 0 || index >= registered_kernel_count_) return -1; - return registered_kernel_func_ids_[index]; -} - -void Runtime::clear_registered_kernels() { registered_kernel_count_ = 0; } diff --git a/src/a2a3/runtime/aicpu_build_graph/runtime/runtime.h b/src/a2a3/runtime/aicpu_build_graph/runtime/runtime.h deleted file mode 100644 index 340c78ae0..000000000 --- a/src/a2a3/runtime/aicpu_build_graph/runtime/runtime.h +++ /dev/null @@ -1,293 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -/** - * Runtime Class - Device Execution and Handshake Control - * - * This class manages device-side execution through AICPU-AICore handshake - * protocol. Task graph construction is handled by PTO2Runtime; this class - * only handles: - * - Handshake buffers for AICPU-AICore communication - * - Execution parameters (block_dim, sche_cpu_num) - * - Tensor pair management for host-device memory tracking - * - Device orchestration state (gm_sm_ptr_, orch_args_) - * - Function address mapping (func_id_to_addr_) - * - * Task dispatch uses PTO2DispatchPayload from PTO2 shared memory. 
- */ - -#ifndef SRC_A2A3_RUNTIME_AICPU_BUILD_GRAPH_RUNTIME_RUNTIME_H_ -#define SRC_A2A3_RUNTIME_AICPU_BUILD_GRAPH_RUNTIME_RUNTIME_H_ - -#include <cstddef> -#include <cstdint> -#include <cstdio> // for fprintf, printf -#include <cstring> // for memset - -#include "common/core_type.h" -#include "common/l2_perf_profiling.h" -#include "common/platform_config.h" -#include "pto2_dispatch_payload.h" -#include "task_args.h" - -// ============================================================================= -// Configuration Macros -// ============================================================================= - -#define RUNTIME_MAX_ARGS 128 -#define RUNTIME_MAX_WORKER 72 // 24 AIC + 48 AIV cores -#define RUNTIME_MAX_TENSOR_PAIRS 64 -#define RUNTIME_MAX_FUNC_ID 1024 -#define RUNTIME_MAX_ORCH_SO_SIZE (4 * 1024 * 1024) // 4 MB max for orchestration SO - -// Default ready queue shards: one shard per worker thread (total minus orchestrator) -constexpr int RUNTIME_DEFAULT_READY_QUEUE_SHARDS = PLATFORM_MAX_AICPU_THREADS - 1; - -// ============================================================================= -// Data Structures -// ============================================================================= - -/** - * Handshake Structure - Shared between Host, AICPU, and AICore - * - * This structure facilitates communication and synchronization between - * AICPU and AICore during task execution. - * - * Protocol State Machine: - * 1. Initialization: AICPU sets aicpu_ready=1 - * 2. Acknowledgment: AICore sets aicore_done=core_id+1 - * 3. Task Dispatch: AICPU writes DATA_MAIN_BASE after updating the per-core task pointer - * 4. Task Execution: AICore reads the dispatched task and executes - * 5. Task Completion: AICore writes FIN to COND; AICPU observes completion - * 6. Shutdown: AICPU sets control=1, AICore exits - * - * Each AICore instance has its own handshake buffer to enable concurrent - * task execution across multiple cores. - */ - -/** - * Handshake buffer for AICPU-AICore communication - * - * Each AICore has its own handshake buffer for synchronization with AICPU. - * The structure is cache-line aligned (64 bytes) to prevent false sharing - * between cores and optimize cache coherency operations.
- * - * enable_profiling_flag bit definitions (umbrella bitmask — "profiling" - * is the umbrella, each bit is a parallel diagnostics sub-feature): - * - bit0: tensor dump enabled - * - bit1: L2 swimlane enabled - * - bit2: PMU enabled - * - * Field Access Patterns: - * - aicpu_ready: Written by AICPU, read by AICore - * - aicore_done: Written by AICore, read by AICPU - * - task: Written by AICPU, read by AICore (0 = no task, non-zero = PTO2DispatchPayload*) - * - core_type: Written by AICPU, read by AICore (CoreType::AIC or CoreType::AIV) - * - enable_profiling_flag: Written by host/AICPU init, read by AICore (bitmask) - */ -struct Handshake { - volatile uint32_t aicpu_ready; // AICPU ready signal: 0=not ready, 1=ready - volatile uint32_t aicore_done; // AICore ready signal: 0=not ready, core_id+1=ready - volatile uint64_t task; // Task pointer: 0=no task, non-zero=PTO2DispatchPayload* - volatile CoreType core_type; // Core type: CoreType::AIC or CoreType::AIV - volatile uint64_t l2_perf_records_addr; // Performance records address - volatile uint32_t physical_core_id; // Physical core ID - volatile uint32_t aicpu_regs_ready; // AICPU register init done: 0=pending, 1=done - volatile uint32_t aicore_regs_ready; // AICore ID reported: 0=pending, 1=done - volatile uint32_t - enable_profiling_flag; // Umbrella diagnostics bitmask; bit0=dump_tensor, bit1=l2_swimlane, bit2=pmu -} __attribute__((aligned(64))); - -/** - * Tensor pair for tracking host-device memory mappings. - * Used for copy-back during finalize. - */ -struct TensorPair { - void *host_ptr; - void *dev_ptr; - size_t size; -}; - -/** - * Host API function pointers for device memory operations. - * Allows runtime to use pluggable device memory backends. - */ -struct HostApi { - void *(*device_malloc)(size_t size); - void (*device_free)(void *dev_ptr); - int (*copy_to_device)(void *dev_ptr, const void *host_ptr, size_t size); - int (*copy_from_device)(void *host_ptr, const void *dev_ptr, size_t size); - uint64_t (*upload_kernel_binary)(int func_id, const uint8_t *bin_data, size_t bin_size); - void (*remove_kernel_binary)(int func_id); -}; - -/** - * Task structure - Compatibility stub for platform layer - * - * RT2 uses PTO2DispatchPayload instead of Task for task dispatch. - * This stub exists only for API compatibility with device_runner.cpp. - * Since get_task_count() returns 0, this struct is never actually used. - */ -struct Task { - int func_id; - uint64_t function_bin_addr; -}; - -// ============================================================================= -// Runtime Class -// ============================================================================= - -/** - * Runtime class for device execution and handshake control - * - * This class manages AICPU-AICore communication through handshake buffers. - * Task graph construction is handled by PTO2Runtime; this class only handles - * execution control and device orchestration state. 
- */ -class Runtime { -public: - // Handshake buffers for AICPU-AICore communication - Handshake workers[RUNTIME_MAX_WORKER]; // Worker (AICore) handshake buffers - int worker_count; // Number of active workers - - // Execution parameters for AICPU scheduling - int sche_cpu_num; // Number of AICPU threads for scheduling - int ready_queue_shards; // Number of ready queue shards (1..MAX_AICPU_THREADS, default MAX-1) - - // Ring buffer size overrides (0 = use compile-time defaults) - uint64_t task_window_size; - uint64_t heap_size; - uint64_t dep_pool_size; - - // PTO2 integration: kernel_id -> GM function_bin_addr mapping - // NOTE: Made public for direct access from aicore code - uint64_t func_id_to_addr_[RUNTIME_MAX_FUNC_ID]; - - // Orchestrator-to-scheduler transition control - // When true, orchestrator threads convert to scheduler threads after orchestration completes. - // When false (default), orchestrator threads exit after orchestration without dispatching tasks. - // Controlled via PTO2_ORCH_TO_SCHED environment variable. - bool orch_to_sched; - -private: - // Tensor pairs for host-device memory tracking - TensorPair tensor_pairs[RUNTIME_MAX_TENSOR_PAIRS]; - int tensor_pair_count; - - // Kernel binary tracking for cleanup - int registered_kernel_func_ids_[RUNTIME_MAX_FUNC_ID]; - int registered_kernel_count_; - - // Device orchestration: when false, orchestration runs on device (thread 3) - bool orch_built_on_host_; - void *gm_sm_ptr_; // GM pointer to PTO2 shared memory (device) - void *gm_heap_ptr_; // GM heap for orchestrator output buffers (device) - void *slot_states_ptr_; // Pointer to PTO2TaskSlotState array (scheduler-private, for profiling) - ChipStorageTaskArgs orch_args_storage_; // Copy of args for device - - // Device orchestration SO (for dlopen on AICPU thread 3). - // Bytes live in a separate device buffer owned by DeviceRunner; only the - // metadata travels in Runtime. `has_new_orch_so_` tells AICPU to reload. - uint64_t dev_orch_so_addr_; - uint64_t dev_orch_so_size_; - bool has_new_orch_so_; - -public: - /** - * Constructor - zero-initialize all arrays - */ - Runtime(); - - // ========================================================================= - // Tensor Pair Management - // ========================================================================= - - /** - * Record a host-device tensor pair for copy-back during finalize. - */ - void record_tensor_pair(void *host_ptr, void *dev_ptr, size_t size); - - /** - * Get pointer to tensor pairs array. - */ - TensorPair *get_tensor_pairs(); - - /** - * Get number of recorded tensor pairs. - */ - int get_tensor_pair_count() const; - - /** - * Clear all recorded tensor pairs. 
- */ - void clear_tensor_pairs(); - - // ========================================================================= - // Performance Profiling - // ========================================================================= - - // ========================================================================= - // Device orchestration (for AICPU thread 3) - // ========================================================================= - - bool get_orch_built_on_host() const; - void *get_gm_sm_ptr() const; - void *get_gm_heap_ptr() const; - const ChipStorageTaskArgs &get_orch_args() const; - void set_orch_built_on_host(bool v); - void set_gm_sm_ptr(void *p); - void set_gm_heap(void *p); - void set_slot_states_ptr(void *p); - void set_orch_args(const ChipStorageTaskArgs &args); - - // Device orchestration SO binary (for dlopen on AICPU thread 3) - void set_dev_orch_so(uint64_t dev_addr, uint64_t size, bool is_new); - uint64_t get_dev_orch_so_addr() const; - uint64_t get_dev_orch_so_size() const; - bool has_new_orch_so() const; - - uint64_t get_function_bin_addr(int func_id) const; - void set_function_bin_addr(int func_id, uint64_t addr); - - int get_registered_kernel_count() const; - int get_registered_kernel_func_id(int index) const; - void clear_registered_kernels(); - - // ========================================================================= - // Deprecated API (for platform compatibility, always returns 0/nullptr) - // Task graph is now managed by PTO2Runtime, not Runtime - // ========================================================================= - - /** @deprecated Task count is now in PTO2 shared memory */ - int get_task_count() const { return 0; } - - /** @deprecated RT2 uses PTO2DispatchPayload, not Task. Always returns nullptr. */ - Task *get_task(int) { return nullptr; } - - /** @deprecated Use PTO2 dispatch mode */ - bool get_use_pto2_dispatch() const { return true; } - - /** @deprecated Use PTO2 dispatch mode */ - void set_use_pto2_dispatch(bool) {} - - // ========================================================================= - // Host API (host-only, not copied to device) - // ========================================================================= - - // Host API function pointers for device memory operations - // NOTE: Placed at end of class to avoid affecting device memory layout - HostApi host_api; - - // Host-only staging for orchestration SO; consumed by DeviceRunner. - const void *pending_orch_so_data_{nullptr}; - size_t pending_orch_so_size_{0}; -}; - -#endif // SRC_A2A3_RUNTIME_AICPU_BUILD_GRAPH_RUNTIME_RUNTIME_H_ diff --git a/src/a2a3/runtime/aicpu_build_graph/runtime/tensor.h b/src/a2a3/runtime/aicpu_build_graph/runtime/tensor.h deleted file mode 100644 index 15af8992b..000000000 --- a/src/a2a3/runtime/aicpu_build_graph/runtime/tensor.h +++ /dev/null @@ -1,409 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. 
- * ----------------------------------------------------------------------------------------------------------- - */ - -#pragma once - -#include <cstddef> -#include <cstdint> - -#include <cstring> -#include <sstream> -#include <string> -#include <utility> - -#include "common.h" -#include "data_type.h" - -constexpr int RUNTIME_MAX_TENSOR_DIMS = 5; - -/** - * Buffer Handle - * - * Represents a device memory buffer with address and total size in bytes. - * This is the underlying memory allocation that a Tensor describes access patterns for. - */ -struct PTOBufferHandle { - uint64_t addr; // Device memory address (bytes) - uint64_t size; // Total buffer size in bytes -}; - -enum class OverlapStatus { - NO_OVERLAP, - COVERED, - OTHER, -}; - -struct Segment { - uint64_t begin; - uint64_t end; - - bool line_segment_intersection(const Segment &other) const { return end > other.begin && other.end > begin; } - bool contains(const Segment &other) const { return begin <= other.begin && other.end <= end; } -}; - -/** - * TensorCreateInfo — metadata for runtime-allocated output tensors. - * - * Captures shape, dtype, and buffer size without allocating memory. - * Passed by value to Arg::add_output(); the runtime allocates from the heap - * and materializes a full Tensor via Tensor::init_from_create_info(). - */ -struct TensorCreateInfo { - DataType dtype; - uint32_t ndims; - uint32_t raw_shapes[RUNTIME_MAX_TENSOR_DIMS]; - bool manual_dep; - bool has_initial_value; - uint64_t initial_value; - - TensorCreateInfo( - const uint32_t shapes[], uint32_t ndims, DataType dtype = DataType::FLOAT32, bool manual_dep = false - ) : - dtype(dtype), - ndims(ndims), - manual_dep(manual_dep), - has_initial_value(false), - initial_value(0) { - for (uint32_t i = 0; i < ndims; i++) { - raw_shapes[i] = shapes[i]; - } - } - - void set_initial_value(uint64_t value) { - has_initial_value = true; - initial_value = value; - } - - uint64_t buffer_size_bytes() const { - uint64_t total = 1; - for (uint32_t i = 0; i < ndims; i++) { - total *= raw_shapes[i]; - } - return total * get_element_size(dtype); - } -}; - -/** - * Tensor descriptor for Task input/output (128B = 2 cache lines) - * - * Describes a memory access pattern on Global Memory (GM) using - * raw_shapes (underlying buffer dimensions), shapes (current view dimensions), - * and offsets (multi-dimensional offset into the buffer). - * - * - `buffer` contains the underlying memory allocation (addr in bytes, size in bytes) - * - `raw_shapes[]`, `shapes[]`, `offsets[]` are in ELEMENTS - * - `dtype` specifies element type for interpreting buffer contents - * - * Fast-path flags (both on cache line 1): - * - is_all_offset_zero: when true, offsets[] are implicitly zero — skip offset read/write - * - is_raw_eq_shapes: when true, raw_shapes[] == shapes[] — skip raw_shapes read/write, - * use shapes[] wherever raw_shapes would be needed - * - * When BOTH flags are true, cache line 2 is never accessed. - * - * Layout: cache line 1 holds hot-path fields (buffer, start_offset, version, - * dtype, ndims, flags, shapes); cache line 2 holds warm-path fields (raw_shapes, offsets).
- */ -struct alignas(64) Tensor { - // === Cache line 1 (64B) — hot path === - PTOBufferHandle buffer; // Underlying memory buffer (addr in bytes, size in bytes) - uint64_t start_offset; // Cached 1D element offset (precomputed from raw_shapes + offsets); only computed before - // in-core execution, unused during orchestration - int32_t version; // Tensor version for overlap detection - DataType dtype; // Data type of tensor elements - uint32_t ndims; // Number of dimensions used - bool is_all_offset_zero; // True when all offsets[] are zero (skip offset read/write) - bool is_raw_eq_shapes; // True when raw_shapes[] == shapes[] (skip raw_shapes read/write) - bool manual_dep; // True when dependency is managed manually (skip tensormap lookup/insert) - uint32_t shapes[RUNTIME_MAX_TENSOR_DIMS]; // Current view shape per dimension - uint32_t __padding__; - - // === Cache line 2 (64B) — warm path === - uint32_t raw_shapes[RUNTIME_MAX_TENSOR_DIMS]; // Underlying buffer shape per dimension - uint32_t offsets[RUNTIME_MAX_TENSOR_DIMS]; // Multi-dimensional offset per dimension - - Tensor() = default; - Tensor(const Tensor &) = default; - Tensor &operator=(const Tensor &) = default; - Tensor(Tensor &&) = default; - Tensor &operator=(Tensor &&) = default; - ~Tensor() = default; - - /// Return the effective raw_shapes pointer (shapes[] when is_raw_eq_shapes). - /// Avoids cache line 2 access for the common case. - const uint32_t *get_raw_shapes() const { return is_raw_eq_shapes ? shapes : raw_shapes; } - - Tensor( - void *addr, uint64_t buffer_size_bytes, const uint32_t raw_shapes[], const uint32_t shapes[], - const uint32_t offsets[], uint32_t ndims, DataType dtype, int32_t version, bool is_all_offset_zero = false, - bool is_raw_eq_shapes = false, bool manual_dep = false - ) { - init( - addr, buffer_size_bytes, raw_shapes, shapes, offsets, ndims, dtype, version, is_all_offset_zero, - is_raw_eq_shapes, manual_dep - ); - } - - // --- Initialization --- - void init( - void *addr, uint64_t buffer_size_bytes, const uint32_t in_raw_shapes[], const uint32_t in_shapes[], - const uint32_t in_offsets[], uint32_t in_ndims, DataType in_dtype, int32_t in_version, - bool in_is_all_offset_zero = false, bool in_is_raw_eq_shapes = false, bool in_manual_dep = false - ) { - buffer = {reinterpret_cast<uint64_t>(addr), buffer_size_bytes}; - ndims = in_ndims; - dtype = in_dtype; - version = in_version; - is_all_offset_zero = in_is_all_offset_zero; - is_raw_eq_shapes = in_is_raw_eq_shapes; - manual_dep = in_manual_dep; - for (uint32_t i = 0; i < in_ndims; i++) { - shapes[i] = in_shapes[i]; - } - if (!in_is_raw_eq_shapes) { - for (uint32_t i = 0; i < in_ndims; i++) { - raw_shapes[i] = in_raw_shapes[i]; - } - } - if (!in_is_all_offset_zero) { - for (uint32_t i = 0; i < in_ndims; i++) { - offsets[i] = in_offsets[i]; - } - } - } - - void init(const Tensor &other) { - memcpy(this, &other, 64); // fast copy cache line 1 - if (!other.is_raw_eq_shapes) { - for (uint32_t i = 0; i < ndims; i++) { - raw_shapes[i] = other.raw_shapes[i]; - } - } - if (!other.is_all_offset_zero) { - for (uint32_t i = 0; i < ndims; i++) { - offsets[i] = other.offsets[i]; - } - } - } - - void init_with_view( - const Tensor &other, const uint32_t view_shapes[], const uint32_t view_offsets[], bool in_manual_dep = false - ) { - buffer = other.buffer; - ndims = other.ndims; - dtype = other.dtype; - version = other.version; - manual_dep = in_manual_dep; - // view always diverges shapes from raw_shapes, so is_raw_eq_shapes = false.
- // Read parent's effective raw_shapes (avoids parent cache line 2 when parent is_raw_eq_shapes). - is_raw_eq_shapes = false; - const uint32_t *parent_raw = other.get_raw_shapes(); - for (uint32_t i = 0; i < ndims; i++) { - raw_shapes[i] = parent_raw[i]; - shapes[i] = view_shapes[i]; - } - // Compute offsets and zero-flag - bool all_zero = true; - if (other.is_all_offset_zero) { - for (uint32_t i = 0; i < ndims; i++) { - if (view_offsets[i] != 0) { - all_zero = false; - break; - } - } - if (!all_zero) { - for (uint32_t i = 0; i < ndims; i++) { - offsets[i] = view_offsets[i]; - } - } - } else { - all_zero = false; - for (uint32_t i = 0; i < ndims; i++) { - offsets[i] = other.offsets[i] + view_offsets[i]; - } - } - is_all_offset_zero = all_zero; - } - - // --- Operations --- - void update_start_offset() { - if (is_all_offset_zero) { - start_offset = 0; - return; - } - const uint32_t *rs = get_raw_shapes(); - uint64_t result = 0; - uint64_t stride = 1; - for (int i = static_cast<int>(ndims) - 1; i >= 0; i--) { - result += offsets[i] * stride; - stride *= rs[i]; - } - start_offset = result; - } - - void copy(const Tensor &other) { init(other); } - - Tensor view(const uint32_t view_shapes[], const uint32_t view_offsets[], bool manual_dep = false) const { - Tensor result; - result.init_with_view(*this, view_shapes, view_offsets, manual_dep); - return result; - } - - bool is_contiguous() const { - if (is_raw_eq_shapes || ndims == 0) { - return true; - } - for (uint32_t i = 1; i < ndims; i++) { - if (shapes[i] != raw_shapes[i]) { - return false; - } - } - return true; - } - - bool valid_reshape(const uint32_t new_shapes[], uint32_t new_ndims) const { - uint64_t x = numel(); - uint64_t y = 1; - for (uint32_t i = 0; i < new_ndims; i++) { - y *= new_shapes[i]; - } - return x == y; - } - - Tensor reshape(const uint32_t new_shapes[], uint32_t new_ndims, bool manual_dep = false) const { - debug_assert(valid_reshape(new_shapes, new_ndims)); - always_assert(is_contiguous()); - Tensor result; - result.copy(*this); - result.ndims = new_ndims; - result.is_all_offset_zero = true; - result.is_raw_eq_shapes = true; - result.manual_dep = manual_dep; - for (uint32_t i = 0; i < new_ndims; i++) { - result.shapes[i] = new_shapes[i]; - } - return result; - } - - bool valid_transpose(uint32_t x, uint32_t y) const { return x < ndims && y < ndims; } - - Tensor transpose(uint32_t x, uint32_t y, bool manual_dep = false) const { - debug_assert(valid_transpose(x, y)); - Tensor result; - result.copy(*this); - result.manual_dep = manual_dep; - // transpose swaps the same dims in both arrays, so equality is preserved - std::swap(result.shapes[x], result.shapes[y]); - if (!result.is_raw_eq_shapes) { - std::swap(result.raw_shapes[x], result.raw_shapes[y]); - } - if (!result.is_all_offset_zero) { - std::swap(result.offsets[x], result.offsets[y]); - } - return result; - } - - uint64_t numel() const { - if (ndims == 0) { - return 0; - } - uint64_t total = 1; - for (uint32_t i = 0; i < ndims; i++) { - total *= shapes[i]; - } - return total; - } - - bool is_same_memref(const Tensor &other) const { return buffer.addr == other.buffer.addr; } - - /// Materialize a TensorCreateInfo into this Tensor (fresh contiguous output).
-    void init_from_create_info(const struct TensorCreateInfo &ci, void *addr, int32_t version_val) {
-        init(
-            addr, ci.buffer_size_bytes(), ci.raw_shapes, ci.raw_shapes, nullptr, ci.ndims, ci.dtype, version_val,
-            /*is_all_offset_zero=*/true,
-            /*is_raw_eq_shapes=*/true, ci.manual_dep
-        );
-    }
-
-    std::string dump() const {
-        std::stringstream ss;
-        std::string indent = " ";
-        ss << "{" << '\n';
-        ss << indent << "buffer.addr: " << buffer.addr << '\n';
-        ss << indent << "buffer.size: " << buffer.size << " bytes" << '\n';
-        ss << indent << "dtype: " << get_dtype_name(dtype) << '\n';
-        ss << indent << "ndims: " << ndims << '\n';
-        ss << indent << "version: " << version << '\n';
-
-        const uint32_t *rs = get_raw_shapes();
-        ss << indent << "raw_shapes: [";
-        for (uint32_t i = 0; i < ndims; i++) {
-            if (i > 0) {
-                ss << ", ";
-            }
-            ss << rs[i];
-        }
-        ss << "]" << '\n';
-        ss << indent << "shapes: [";
-        for (uint32_t i = 0; i < ndims; i++) {
-            if (i > 0) {
-                ss << ", ";
-            }
-            ss << shapes[i];
-        }
-        ss << "]" << '\n';
-        ss << indent << "offsets: [";
-        for (uint32_t i = 0; i < ndims; i++) {
-            if (i > 0) {
-                ss << ", ";
-            }
-            ss << (is_all_offset_zero ? 0u : offsets[i]);
-        }
-        ss << "]" << '\n';
-        ss << "}" << '\n';
-        return ss.str();
-    }
-};
-
-static_assert(sizeof(Tensor) == 128, "Tensor must be exactly 2 cache lines (128 bytes)");
-static_assert(offsetof(Tensor, raw_shapes) == 64);
-
-using TensorData = Tensor;
-
-// =============================================================================
-// Factory Helpers
-// =============================================================================
-/**
- * Create a Tensor for pre-allocated external memory.
- */
-static inline Tensor make_tensor_external(
-    void *addr, const uint32_t shapes[], uint32_t ndims, DataType dtype = DataType::FLOAT32, bool manual_dep = false,
-    int32_t version = 0
-) {
-    static uint32_t zero_offsets[RUNTIME_MAX_TENSOR_DIMS] = {};
-    uint64_t total = 1;
-    for (uint32_t i = 0; i < ndims; i++) {
-        total *= shapes[i];
-    }
-    return {
-        addr,
-        total * get_element_size(dtype),
-        shapes,
-        shapes,
-        zero_offsets,
-        ndims,
-        dtype,
-        version,
-        /*is_all_offset_zero=*/true,
-        /*is_raw_eq_shapes=*/true,
-        manual_dep
-    };
-}
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md
index a1d1f1540..8d6b97f13 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md
@@ -35,7 +35,7 @@ PTO2 (Parallel Task Orchestration v2) is a runtime system for executing task gra
 
 ## 1. Runtime Variants
 
-Three runtime backends exist under `src/runtime/`, each representing a different orchestration and scheduling strategy.
+Two runtime backends exist under `src/runtime/`, each representing a different orchestration and scheduling strategy.
 
 ### 1.1 host_build_graph
 
@@ -45,15 +45,7 @@ The host builds the complete task graph before launching device execution. The o
 - **Scheduling**: AICPU receives the pre-built graph and dispatches tasks by traversing dependencies
 - **Use case**: development and debugging; no device-side orchestration overhead
 
-### 1.2 aicpu_build_graph
-
-The orchestration runs on an AICPU thread, building the task graph on device. Supports concurrent build + schedule (`build_mode=1`).
- -- **Task storage**: same `Task[]` array as host_build_graph -- **AicpuBuildApi**: `add_task`, `add_successor_conditional`, `publish_task`, `device_malloc` -- **Use case**: reduced host→device data transfer; graph can depend on device-side data - -### 1.3 tensormap_and_ringbuffer (PTO2) +### 1.2 tensormap_and_ringbuffer (PTO2) The primary production runtime. Uses ring buffers for task slots and output memory, with a TensorMap for automatic dependency tracking. diff --git a/src/a5/platform/include/aicore/l2_perf_collector_aicore.h b/src/a5/platform/include/aicore/l2_perf_collector_aicore.h index dbc1aa512..00a30af1e 100644 --- a/src/a5/platform/include/aicore/l2_perf_collector_aicore.h +++ b/src/a5/platform/include/aicore/l2_perf_collector_aicore.h @@ -38,8 +38,8 @@ * Buffer management and final commit are handled by AICPU. * * AICore writes L2PerfRecord.task_id as the register dispatch token (low 32 bits, zero-extended). - * For multi-ring runtimes (tensormap_and_ringbuffer, aicpu_build_graph), AICPU overwrites - * with the full (ring_id << 32) | local_id encoding after handshake match. + * For tensormap_and_ringbuffer, AICPU overwrites with the full (ring_id << 32) | local_id + * encoding after handshake match. * * @param l2_perf_buf Performance buffer pointer * @param task_id Register dispatch id (DATA_MAIN_BASE), stored in task_id low 32 bits diff --git a/src/a5/platform/include/aicpu/l2_perf_collector_aicpu.h b/src/a5/platform/include/aicpu/l2_perf_collector_aicpu.h index 2eecb6a41..a6a5e6f68 100644 --- a/src/a5/platform/include/aicpu/l2_perf_collector_aicpu.h +++ b/src/a5/platform/include/aicpu/l2_perf_collector_aicpu.h @@ -109,7 +109,7 @@ void l2_perf_aicpu_init_phase_profiling(int num_sched_threads); * @param loop_iter Current loop iteration number * @param tasks_processed Number of tasks processed in this batch (scheduler phases), or * full PTO2 task_id encoding (ring_id << 32) | local_id (orchestrator - * phases in multi-ring runtimes: tensormap_and_ringbuffer, aicpu_build_graph) + * phases in tensormap_and_ringbuffer) */ void l2_perf_aicpu_record_phase( int thread_idx, AicpuPhaseId phase_id, uint64_t start_time, uint64_t end_time, uint32_t loop_iter, @@ -146,9 +146,8 @@ void l2_perf_aicpu_set_orch_thread_idx(int thread_idx); * @param start_time Phase start timestamp * @param end_time Phase end timestamp * @param submit_idx Task submission index (acts as loop_iter) - * @param task_id Task identifier. For multi-ring runtimes (tensormap_and_ringbuffer, aicpu_build_graph), this is the - * full PTO2 encoding: (ring_id << 32) | local_id, enabling cross-view correlation between orchestrator and scheduler - * swimlanes. + * @param task_id Task identifier. For tensormap_and_ringbuffer, this is the full PTO2 encoding: + * (ring_id << 32) | local_id, enabling cross-view correlation between orchestrator and scheduler swimlanes. */ void l2_perf_aicpu_record_orch_phase( AicpuPhaseId phase_id, uint64_t start_time, uint64_t end_time, uint32_t submit_idx, uint64_t task_id diff --git a/src/a5/platform/include/common/l2_perf_profiling.h b/src/a5/platform/include/common/l2_perf_profiling.h index 7c26d7c23..98168375a 100644 --- a/src/a5/platform/include/common/l2_perf_profiling.h +++ b/src/a5/platform/include/common/l2_perf_profiling.h @@ -61,8 +61,8 @@ struct L2PerfRecord { uint64_t finish_time; // AICPU timestamp: when AICPU observed task completion // AICore writes the register dispatch token (low 32 bits only) zero-extended into task_id. 
- // For multi-ring runtimes (tensormap_and_ringbuffer, aicpu_build_graph), AICPU overwrites - // with the full PTO2 encoding (ring_id << 32) | local_id after FIN/perf row match. + // For tensormap_and_ringbuffer, AICPU overwrites with the full PTO2 encoding + // (ring_id << 32) | local_id after FIN/perf row match. // For host_build_graph, task_id stays as the plain integer task index (ring_id = 0). uint64_t task_id; uint32_t func_id; // Kernel function identifier @@ -140,8 +140,8 @@ struct AicpuPhaseRecord { uint32_t loop_iter; // Loop iteration number AicpuPhaseId phase_id; // Phase type union { - uint64_t task_id; // Multi-ring runtimes (tensormap_and_ringbuffer, aicpu_build_graph): - // full PTO2 encoding (ring_id << 32) | local_id for cross-view correlation. + uint64_t task_id; // tensormap_and_ringbuffer: full PTO2 encoding + // (ring_id << 32) | local_id for cross-view correlation. uint64_t tasks_processed; // Scheduler phases: number of tasks processed in this batch }; }; diff --git a/src/a5/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md b/src/a5/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md index 4fd07bee2..1f151dfb5 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md +++ b/src/a5/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md @@ -35,7 +35,7 @@ PTO2 (Parallel Task Orchestration v2) is a runtime system for executing task gra ## 1. Runtime Variants -Three runtime backends exist under `src/runtime/`, each representing a different orchestration and scheduling strategy. +Two runtime backends exist under `src/runtime/`, each representing a different orchestration and scheduling strategy. ### 1.1 host_build_graph @@ -45,15 +45,7 @@ The host builds the complete task graph before launching device execution. The o - **Scheduling**: AICPU receives the pre-built graph and dispatches tasks by traversing dependencies - **Use case**: development and debugging; no device-side orchestration overhead -### 1.2 aicpu_build_graph - -The orchestration runs on an AICPU thread, building the task graph on device. Supports concurrent build + schedule (`build_mode=1`). - -- **Task storage**: same `Task[]` array as host_build_graph -- **AicpuBuildApi**: `add_task`, `add_successor_conditional`, `publish_task`, `device_malloc` -- **Use case**: reduced host→device data transfer; graph can depend on device-side data - -### 1.3 tensormap_and_ringbuffer (PTO2) +### 1.2 tensormap_and_ringbuffer (PTO2) The primary production runtime. Uses ring buffers for task slots and output memory, with a TensorMap for automatic dependency tracking. 
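[Reference note for reviewers: the deleted `tensor.h` earlier in this patch is the only self-contained description of the two-cache-line `Tensor` and its view arithmetic. The sketch below restates the offset linearization in plain C++ so the deleted examples further down remain readable; `linear_offset`, the tile constants, and the `main` harness are illustrative only, distilled from the removed `update_start_offset` and `init_with_view`, not an API that remains in the tree.]

```cpp
#include <cstdint>

// Row-major linearization as in the deleted Tensor::update_start_offset:
// walk dimensions right to left, scaling each offset by the product of the
// trailing *raw* (underlying buffer) extents, not the view extents.
static uint64_t linear_offset(const uint32_t offsets[], const uint32_t raw_shapes[], uint32_t ndims) {
    uint64_t result = 0;
    uint64_t stride = 1;
    for (int i = static_cast<int>(ndims) - 1; i >= 0; i--) {
        result += static_cast<uint64_t>(offsets[i]) * stride;
        stride *= raw_shapes[i];
    }
    return result;
}

int main() {
    // A 4x4 grid of 64x64 tiles stored tile-first, as in the deleted bgemm
    // example below: selecting tile (m=1, k=2) means offsets {1, 2, 0, 0}
    // against raw shape {4, 4, 64, 64}, i.e. (1*4 + 2) * 64 * 64 = 24576
    // elements. Nested views simply add their per-dimension offsets
    // (init_with_view) before this linearization runs.
    const uint32_t raw[4] = {4, 4, 64, 64};
    const uint32_t off[4] = {1, 2, 0, 0};
    return linear_offset(off, raw, 4) == 24576 ? 0 : 1;
}
```

The `is_all_offset_zero` and `is_raw_eq_shapes` flags in the struct exist precisely to skip this computation and the second cache line when a tensor is a trivial full view.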
diff --git a/tests/conftest.py b/tests/conftest.py index 6b2baa0e1..fd106ebe8 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -127,11 +127,6 @@ def pytest_collection_modifyitems(session, config, items): available_runtimes = discover_runtimes_for_arch(arch) for item in items: - # Skip aicpu_build_graph tests for architectures that don't have it - if "test_discovers_aicpu_build_graph" in item.nodeid: - if "aicpu_build_graph" not in available_runtimes: - item.add_marker(pytest.mark.skip(reason=f"aicpu_build_graph not available for {arch} architecture")) - # Skip tensormap_and_ringbuffer tests for architectures that don't have it if "tensormap_and_ringbuffer" in item.nodeid: if "tensormap_and_ringbuffer" not in available_runtimes: diff --git a/tests/st/a2a3/aicpu_build_graph/bgemm/README.md b/tests/st/a2a3/aicpu_build_graph/bgemm/README.md deleted file mode 100644 index 504c14c5c..000000000 --- a/tests/st/a2a3/aicpu_build_graph/bgemm/README.md +++ /dev/null @@ -1,86 +0,0 @@ -# BGEMM Example (AICPU Build Graph Runtime) - -Tiled matrix multiplication example demonstrating Cube (AIC) and Vector (AIV) core cooperation. - -## Computation - -```text -C = A @ B -``` - -Tiled computation with 4x4x4 grid: - -- Tile size: 64 x 64 -- Matrix A: 256 x 256 (4x4 tiles) -- Matrix B: 256 x 256 (4x4 tiles) -- Matrix C: 256 x 256 (4x4 tiles) - -## Task Graph - -For each output tile C[m,n]: - -```text -for k in [0, GRID_K): - P = A[m,k] @ B[k,n] (gemm_tile on Cube core) - C[m,n] = C[m,n] + P (tile_add on Vector core) -``` - -Dependencies: - -- gemm_tile → tile_add: P must be computed before accumulation -- tile_add[k] → gemm_tile[k+1]: K-dimension accumulation is sequential - -Total tasks: 128 (64 gemm + 64 add) - -## Kernels - -| Kernel | Core Type | Function | -| ------ | --------- | -------- | -| kernel_gemm_tile | AIC (Cube) | 64x64 matrix multiplication | -| kernel_tile_add | AIV (Vector) | 64x64 element-wise addition | - -## File Structure - -```text -bgemm/ -├── golden.py # Test specification -├── README.md # This file -└── kernels/ - ├── kernel_config.py # Kernel configuration - ├── orchestration/ - │ └── bgemm_orch.cpp # Task graph builder - ├── aic/ - │ └── kernel_gemm_tile.cpp # Cube core matmul kernel - └── aiv/ - └── kernel_tile_add.cpp # Vector core add kernel -``` - -## Technical Details - -### Memory Layout (Tile-First) - -```text -A: [BATCH, GRID_M, GRID_K, TILE_M, TILE_K] -B: [BATCH, GRID_K, GRID_N, TILE_K, TILE_N] -C: [BATCH, GRID_M, GRID_N, TILE_M, TILE_N] -``` - -### Runtime Characteristics - -- Task graph is built on AICPU -- Framework automatically manages I/O tensor device memory -- Orchestration function allocates intermediate buffers via AicpuBuildApi - -### Kernel Implementation - -Both kernels use PTO ISA tile operations: - -- **kernel_gemm_tile**: Uses `TileLeft`, `TileRight`, `TileAcc` types with `TLOAD`, `TMOV`, `TMATMUL`, `TSTORE` instructions -- **kernel_tile_add**: Uses `TileVec` type with `TLOAD`, `TADD`, `TSTORE` instructions - -### Pipeline Synchronization - -Kernels include proper pipeline synchronization: - -- `PIPE_MTE2` → `PIPE_M`/`PIPE_V`: After loads, before compute -- `PIPE_M`/`PIPE_V` → `PIPE_MTE3`: After compute, before store diff --git a/tests/st/a2a3/aicpu_build_graph/bgemm/kernels/aic/kernel_gemm_tile.cpp b/tests/st/a2a3/aicpu_build_graph/bgemm/kernels/aic/kernel_gemm_tile.cpp deleted file mode 100644 index 9682a5278..000000000 --- a/tests/st/a2a3/aicpu_build_graph/bgemm/kernels/aic/kernel_gemm_tile.cpp +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Copyright 
(c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -/** - * Tile-based Matrix Multiplication Kernel (Cube Core) - * - * Computes: output = input_a @ input_b (64x64 tile matmul) - * Uses TMATMUL instruction - * - * Args (Tensor*): - * args[0] = input_a (INPUT) - * args[1] = input_b (INPUT) - * args[2] = output (OUTPUT) - */ - -#include -#include -#include -#include - -#include "tensor.h" - -using namespace pto; - -#include "pipe_sync.h" - -#ifndef __gm__ -#define __gm__ -#endif - -#ifndef __aicore__ -#define __aicore__ [aicore] -#endif - -template -AICORE constexpr inline T CeilAlign(T num_1, T num_2) { - if (num_2 == 0) { - return 0; - } - return (num_1 + num_2 - 1) / num_2 * num_2; -} - -static __aicore__ void -gemm_tile_impl(__gm__ Tensor *input_a_tensor, __gm__ Tensor *input_b_tensor, __gm__ Tensor *output_tensor) { - __gm__ float *input_a = - reinterpret_cast<__gm__ float *>(input_a_tensor->buffer.addr) + input_a_tensor->start_offset; - __gm__ float *input_b = - reinterpret_cast<__gm__ float *>(input_b_tensor->buffer.addr) + input_b_tensor->start_offset; - __gm__ float *output = reinterpret_cast<__gm__ float *>(output_tensor->buffer.addr) + output_tensor->start_offset; - - constexpr int TILE = 64; - constexpr int blockAlign = C0_SIZE_BYTE / sizeof(float); - constexpr int M = CeilAlign(TILE, 16); - constexpr int K = CeilAlign(TILE, blockAlign); - constexpr int N = CeilAlign(TILE, blockAlign); - - using GlobalDataA = - GlobalTensor, Stride<1 * TILE * TILE, 1 * TILE * TILE, TILE * TILE, TILE, 1>>; - using GlobalDataB = - GlobalTensor, Stride<1 * TILE * TILE, 1 * TILE * TILE, TILE * TILE, TILE, 1>>; - using GlobalDataC = - GlobalTensor, Stride<1 * TILE * TILE, 1 * TILE * TILE, TILE * TILE, TILE, 1>>; - - GlobalDataA src0Global(input_a); - GlobalDataB src1Global(input_b); - GlobalDataC dstGlobal(output); - - using TileMatA = Tile; - using TileMatB = Tile; - - using LeftTile = TileLeft; - using RightTile = TileRight; - using AccTile = TileAcc; - - TileMatA aMatTile; - TileMatB bMatTile; - TASSIGN(aMatTile, 0x0); - TASSIGN(bMatTile, 0x20000); - - LeftTile aTile; - RightTile bTile; - AccTile cTile; - TASSIGN(aTile, 0x0); - TASSIGN(bTile, 0x0); - TASSIGN(cTile, 0x0); - - TLOAD(aMatTile, src0Global); - TLOAD(bMatTile, src1Global); - - set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); - wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); - - TMOV(aTile, aMatTile); - TMOV(bTile, bMatTile); - - set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); - wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); - - TMATMUL(cTile, aTile, bTile); - - set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); - wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); - - TSTORE(dstGlobal, cTile); - - pipe_sync(); -} - -extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { - __gm__ Tensor *input_a = reinterpret_cast<__gm__ Tensor *>(args[0]); - __gm__ Tensor *input_b = reinterpret_cast<__gm__ Tensor 
*>(args[1]); - __gm__ Tensor *output = reinterpret_cast<__gm__ Tensor *>(args[2]); - - gemm_tile_impl(input_a, input_b, output); -} diff --git a/tests/st/a2a3/aicpu_build_graph/bgemm/kernels/aiv/kernel_tile_add.cpp b/tests/st/a2a3/aicpu_build_graph/bgemm/kernels/aiv/kernel_tile_add.cpp deleted file mode 100644 index 123c1abc1..000000000 --- a/tests/st/a2a3/aicpu_build_graph/bgemm/kernels/aiv/kernel_tile_add.cpp +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -/** - * Tile-based Element-wise Addition Kernel (Vector Core) - INOUT Pattern - * - * Computes: C_tile = C_tile + P (64x64 tile accumulation) - * Uses TADD instruction - * - * Args (Tensor*): - * args[0] = C_tile (INOUT: read + write accumulator) - * args[1] = P (INPUT: matmul result to accumulate) - */ - -#include -#include -#include - -#include "tensor.h" - -using namespace pto; - -#include "pipe_sync.h" - -#ifndef __gm__ -#define __gm__ -#endif - -#ifndef __aicore__ -#define __aicore__ [aicore] -#endif - -extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { - __gm__ Tensor *c_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]); - __gm__ Tensor *p_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]); - - __gm__ float *c_ptr = reinterpret_cast<__gm__ float *>(c_tensor->buffer.addr) + c_tensor->start_offset; - __gm__ float *p_ptr = reinterpret_cast<__gm__ float *>(p_tensor->buffer.addr) + p_tensor->start_offset; - - constexpr int TILE = 64; - - using DynShapeDim5 = Shape<1, 1, 1, TILE, TILE>; - using DynStridDim5 = Stride<1, 1, 1, TILE, 1>; - using GlobalData = GlobalTensor; - using TileData = Tile; - - TileData cTile(TILE, TILE); - TileData pTile(TILE, TILE); - TileData outTile(TILE, TILE); - TASSIGN(cTile, 0x0); - TASSIGN(pTile, 0x10000); - TASSIGN(outTile, 0x20000); - - GlobalData cGlobal(c_ptr); - GlobalData pGlobal(p_ptr); - GlobalData outGlobal(c_ptr); // write back to same C location - - TLOAD(cTile, cGlobal); - TLOAD(pTile, pGlobal); - set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - TADD(outTile, cTile, pTile); - set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - TSTORE(outGlobal, outTile); - - pipe_sync(); -} diff --git a/tests/st/a2a3/aicpu_build_graph/bgemm/kernels/orchestration/bgemm_orch.cpp b/tests/st/a2a3/aicpu_build_graph/bgemm/kernels/orchestration/bgemm_orch.cpp deleted file mode 100644 index acae7cfa6..000000000 --- a/tests/st/a2a3/aicpu_build_graph/bgemm/kernels/orchestration/bgemm_orch.cpp +++ /dev/null @@ -1,137 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. 
You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -/** - * BGEMM Orchestration Function (aicpu_build_graph Runtime) - * - * Builds the task graph for tiled matrix multiplication: C = A @ B - * - * Configuration: - * - Tile size: 64 x 64 - * - Grid: 4 x 4 x 4 (GRID_M x GRID_K x GRID_N) - * - Batch: 1 - * - * Memory layout (tile-first, 5D flattened): - * A: [BATCH, GRID_M, GRID_K, TILE, TILE] - * B: [BATCH, GRID_K, GRID_N, TILE, TILE] - * C: [BATCH, GRID_M, GRID_N, TILE, TILE] - * - * Task graph per output tile C[batch, m, n]: - * for k in [0, GRID_K): - * P = A[m,k] @ B[k,n] (gemm_tile on Cube core, func_id=0) - * C[m,n] = C[m,n] + P (tile_add on Vector core, func_id=1) - * - * Dependencies are explicit via rt_add_dependency: - * - gemm(k) -> add(k): add reads P which gemm produces - * - add(k-1) -> add(k): add reads/writes C_view (K accumulation chain) - * - * Arg layout: [A, B, C] — shape/dtype/size in ContinuousTensor metadata - */ - -#include -#include - -#include "pto_orchestration_api.h" // NOLINT(build/include_subdir) - -#define FUNC_GEMM_TILE 0 -#define FUNC_TILE_ADD 1 - -static constexpr int TILE = 64; -static constexpr int GRID_M = 4; -static constexpr int GRID_K = 4; -static constexpr int GRID_N = 4; -static constexpr int BATCH = 1; - -static constexpr uint32_t TILE_ELEMS = TILE * TILE; -static constexpr uint64_t TILE_BYTES = TILE_ELEMS * sizeof(float); - -extern "C" { - -__attribute__((visibility("default"))) PTO2OrchestrationConfig -aicpu_orchestration_config(const ChipStorageTaskArgs &orch_args) { - (void)orch_args; // NOLINT(readability/casting) - return PTO2OrchestrationConfig{ - .expected_arg_count = 3, - }; -} - -__attribute__((visibility("default"))) void -aicpu_orchestration_entry(PTO2Runtime *rt, const ChipStorageTaskArgs &orch_args) { - Tensor ext_A = from_tensor_arg(orch_args.tensor(0)); - Tensor ext_B = from_tensor_arg(orch_args.tensor(1)); - Tensor ext_C = from_tensor_arg(orch_args.tensor(2)); - - LOG_INFO(rt, "[bgemm_orch] Grid: %dx%dx%d, Batch: %d, Tile: %d", GRID_M, GRID_K, GRID_N, BATCH, TILE); - - uint32_t tile_shapes[1] = {TILE_ELEMS}; - - for (int batch = 0; batch < BATCH; batch++) { - for (int m_idx = 0; m_idx < GRID_M; m_idx++) { - for (int n_idx = 0; n_idx < GRID_N; n_idx++) { - PTO2_SCOPE(rt) { - uint32_t c_elem_offset = (static_cast(batch) * GRID_M * GRID_N + - static_cast(m_idx) * GRID_N + static_cast(n_idx)) * - TILE_ELEMS; - uint32_t c_view_offsets[1] = {c_elem_offset}; - Tensor C_view = ext_C.view(tile_shapes, c_view_offsets); - - PTO2TaskId last_add_task = PTO2TaskId{}; - bool has_last_add = false; - - for (int k_idx = 0; k_idx < GRID_K; k_idx++) { - uint32_t a_elem_offset = - (static_cast(batch) * GRID_M * GRID_K + static_cast(m_idx) * GRID_K + - static_cast(k_idx)) * - TILE_ELEMS; - uint32_t b_elem_offset = - (static_cast(batch) * GRID_K * GRID_N + static_cast(k_idx) * GRID_N + - static_cast(n_idx)) * - TILE_ELEMS; - - uint32_t a_view_offsets[1] = {a_elem_offset}; - Tensor A_view = ext_A.view(tile_shapes, a_view_offsets); - uint32_t b_view_offsets[1] = {b_elem_offset}; - Tensor B_view = ext_B.view(tile_shapes, 
b_view_offsets); - - // P = A[m,k] @ B[k,n] - Arg args_gemm; - args_gemm.add_input(A_view); - args_gemm.add_input(B_view); - args_gemm.add_output(TensorCreateInfo(tile_shapes, 1, DataType::FLOAT32)); - SubmitResult r_gemm = rt_submit_aic_task(rt, FUNC_GEMM_TILE, args_gemm); - - // C[m,n] += P - Arg args_add; - args_add.add_inout(C_view); - args_add.add_input(r_gemm.outputs.get_ref(0)); - SubmitResult r_add = rt_submit_aiv_task(rt, FUNC_TILE_ADD, args_add); - - // gemm -> add: add reads P which gemm produces - rt_add_dependency(rt, r_gemm.task_id, r_add.task_id); - // K accumulation chain: previous add -> current add - if (has_last_add) { - rt_add_dependency(rt, last_add_task, r_add.task_id); - } - - last_add_task = r_add.task_id; - has_last_add = true; - } - } - } - } - } - - LOG_INFO( - rt, "[bgemm_orch] Submitted tasks for %d batches, %dx%d output tiles, %d K steps each", BATCH, GRID_M, GRID_N, - GRID_K - ); -} - -} // extern "C" diff --git a/tests/st/a2a3/aicpu_build_graph/bgemm/test_bgemm.py b/tests/st/a2a3/aicpu_build_graph/bgemm/test_bgemm.py deleted file mode 100644 index 9be0dc72b..000000000 --- a/tests/st/a2a3/aicpu_build_graph/bgemm/test_bgemm.py +++ /dev/null @@ -1,94 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) PyPTO Contributors. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. -# ----------------------------------------------------------------------------------------------------------- - -"""BGEMM — aicpu_build_graph runtime with tiled matrix multiplication. - -Computation: C = A @ B (4x4x4 grid, 64x64 tiles). -Tests AIC (Cube) + AIV (Vector) cooperation with tile-first memory layout. 
-""" - -import torch -from simpler.task_interface import ArgDirection as D - -from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test - -TILE_M = 64 -TILE_K = 64 -TILE_N = 64 -GRID_M = 4 -GRID_K = 4 -GRID_N = 4 -BATCH = 1 - - -@scene_test(level=2, runtime="aicpu_build_graph") -class TestBgemm(SceneTestCase): - """BGEMM: tiled C = A @ B with AIC gemm + AIV tile add.""" - - RTOL = 1e-3 - ATOL = 1e-3 - - CALLABLE = { - "orchestration": { - "source": "kernels/orchestration/bgemm_orch.cpp", - "function_name": "aicpu_orchestration_entry", - "signature": [D.IN, D.IN, D.OUT], - }, - "incores": [ - { - "func_id": 0, - "source": "kernels/aic/kernel_gemm_tile.cpp", - "core_type": "aic", - "signature": [D.IN, D.IN, D.OUT], - }, - { - "func_id": 1, - "source": "kernels/aiv/kernel_tile_add.cpp", - "core_type": "aiv", - "signature": [D.INOUT, D.IN], - }, - ], - } - - CASES = [ - { - "name": "default", - "platforms": ["a2a3sim", "a2a3"], - "config": {"aicpu_thread_num": 4, "block_dim": 3}, - "params": {}, - }, - ] - - def generate_args(self, params): - A = torch.randn(BATCH, GRID_M, GRID_K, TILE_M, TILE_K, dtype=torch.float32) * 0.01 - B = torch.randn(BATCH, GRID_K, GRID_N, TILE_K, TILE_N, dtype=torch.float32) * 0.01 - C = torch.zeros(BATCH, GRID_M, GRID_N, TILE_M, TILE_N, dtype=torch.float32) - - return TaskArgsBuilder( - Tensor("A", A.flatten()), - Tensor("B", B.flatten()), - Tensor("C", C.flatten()), - ) - - def compute_golden(self, args, params): - A = args.A.reshape(BATCH, GRID_M, GRID_K, TILE_M, TILE_K) - B = args.B.reshape(BATCH, GRID_K, GRID_N, TILE_K, TILE_N) - C = args.C.reshape(BATCH, GRID_M, GRID_N, TILE_M, TILE_N) - - C[:] = 0.0 - for batch in range(BATCH): - for m_idx in range(GRID_M): - for n_idx in range(GRID_N): - for k_idx in range(GRID_K): - C[batch, m_idx, n_idx] += torch.matmul(A[batch, m_idx, k_idx], B[batch, k_idx, n_idx]) - - -if __name__ == "__main__": - SceneTestCase.run_module(__name__) diff --git a/tests/st/a2a3/aicpu_build_graph/orch_so_cache/kernels/orchestration/example_orchestration.cpp b/tests/st/a2a3/aicpu_build_graph/orch_so_cache/kernels/orchestration/example_orchestration.cpp deleted file mode 100644 index babd6f685..000000000 --- a/tests/st/a2a3/aicpu_build_graph/orch_so_cache/kernels/orchestration/example_orchestration.cpp +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. 
- * ----------------------------------------------------------------------------------------------------------- - */ -/** - * Minimal orchestration for orch_so_cache test (a2a3 aicpu_build_graph) - * - * Computes: f = a + b (single AIV task) - * - * Args layout (3 args): - * [0] = a (INPUT) - 128 x 128 float32 - * [1] = b (INPUT) - 128 x 128 float32 - * [2] = f (OUTPUT) - 128 x 128 float32 - */ - -#include -#include - -#include "pto_orchestration_api.h" // NOLINT(build/include_subdir) - -#define FUNC_ADD 0 // kernel_add: args[0..2] -> f = a + b - -extern "C" { - -__attribute__((visibility("default"))) PTO2OrchestrationConfig -aicpu_orchestration_config(const ChipStorageTaskArgs &orch_args) { - (void)orch_args; - return PTO2OrchestrationConfig{ - .expected_arg_count = 3, - }; -} - -__attribute__((visibility("default"))) void -aicpu_orchestration_entry(PTO2Runtime *rt, const ChipStorageTaskArgs &orch_args) { - Tensor ext_a = from_tensor_arg(orch_args.tensor(0)); - Tensor ext_b = from_tensor_arg(orch_args.tensor(1)); - Tensor ext_f = from_tensor_arg(orch_args.tensor(2)); - - PTO2_SCOPE(rt) { - // f = a + b - Arg args; - args.add_input(ext_a); - args.add_input(ext_b); - args.add_inout(ext_f); - rt_submit_aiv_task(rt, FUNC_ADD, args); - } -} - -} // extern "C" diff --git a/tests/st/a2a3/aicpu_build_graph/orch_so_cache/test_orch_so_cache.py b/tests/st/a2a3/aicpu_build_graph/orch_so_cache/test_orch_so_cache.py deleted file mode 100644 index 7fa638e2c..000000000 --- a/tests/st/a2a3/aicpu_build_graph/orch_so_cache/test_orch_so_cache.py +++ /dev/null @@ -1,107 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) PyPTO Contributors. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. -# ----------------------------------------------------------------------------------------------------------- -"""End-to-end coverage for the orchestration SO host-side cache (a2a3 aicpu_build_graph). - -The host hashes the orchestration SO's GNU Build-ID, skips re-uploading bytes -that already live on device, and tells AICPU to reuse the cached `dlopen` -handle. The framework reuses one `Worker` (and therefore one `DeviceRunner`) -across cases inside a `SceneTestCase`, so running multiple cases against the -same `CALLABLE` exercises the cache-hit path on every case after the first. - -This test deliberately: - - Reuses the vector_example kernel_add (args[0..2] -> f = a + b). - - Spans three cases with different (a, b) inputs — proves cache hit doesn't - leak any per-run state across iterations. - - Uses the same tensor size (128*128) because the AIV kernel has a hardcoded - tile shape (128x128) and does not accept a runtime size. - - Runs on both sim and hardware (sim DeviceRunner uses the same code path, - just with `mem_alloc_` returning host memory). - -Verification is purely outcome-based: every case must produce the correct -result. A regression in cache logic (stale handle, wrong device buffer, -missing dlopen on first run) shows up as wrong output or a runtime failure. 
-""" - -import torch -from simpler.task_interface import ArgDirection as D - -from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test - -_VECTOR_KERNELS = "../vector_example/kernels" - - -@scene_test(level=2, runtime="aicpu_build_graph") -class TestOrchSoCache(SceneTestCase): - """Same callable, three cases — case 0 misses the cache, cases 1-2 hit it.""" - - CALLABLE = { - "orchestration": { - "source": "kernels/orchestration/example_orchestration.cpp", - "function_name": "aicpu_orchestration_entry", - "signature": [D.IN, D.IN, D.OUT], - }, - "incores": [ - { - "func_id": 0, - "source": f"{_VECTOR_KERNELS}/aiv/kernel_add.cpp", - "core_type": "aiv", - "signature": [D.IN, D.IN, D.OUT], - }, - ], - } - - # Three cases sharing one callable. The framework iterates them on a - # single Worker; cases after the first land on cache-hit. Different - # (a, b) values verify that no per-run state leaks across iterations. - _COMMON_CONFIG = {"aicpu_thread_num": 4, "block_dim": 3} - _PLATFORMS = ["a2a3sim", "a2a3"] - - # All cases use the same size (128*128) because the AIV kernel has a - # hardcoded tile shape (128x128) and does not read a runtime size - # argument — running with a smaller tensor would cause an out-of-bounds - # access. - CASES = [ - { - "name": "first_miss", - "platforms": _PLATFORMS, - "config": _COMMON_CONFIG, - "params": {"size": 128 * 128, "a": 2.0, "b": 3.0}, - }, - { - "name": "second_hit", - "platforms": _PLATFORMS, - "config": _COMMON_CONFIG, - "params": {"size": 128 * 128, "a": 1.0, "b": 4.0}, - }, - { - "name": "third_hit", - "platforms": _PLATFORMS, - "config": _COMMON_CONFIG, - "params": {"size": 128 * 128, "a": 0.5, "b": 0.5}, - }, - ] - - def generate_args(self, params): - size = params["size"] - a = params["a"] - b = params["b"] - return TaskArgsBuilder( - Tensor("a", torch.full((size,), a, dtype=torch.float32)), - Tensor("b", torch.full((size,), b, dtype=torch.float32)), - Tensor("f", torch.zeros(size, dtype=torch.float32)), - ) - - def compute_golden(self, args, params): - # f = a + b - args.f[:] = args.a + args.b - - -if __name__ == "__main__": - SceneTestCase.run_module(__name__) diff --git a/tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aic/aic_hub.cpp b/tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aic/aic_hub.cpp deleted file mode 100644 index 45f90aab3..000000000 --- a/tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aic/aic_hub.cpp +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. 
- * ----------------------------------------------------------------------------------------------------------- - */ -#include -#include - -using namespace pto; - -#ifndef __gm__ -#define __gm__ -#endif - -#ifndef __aicore__ -#define __aicore__ [aicore] -#endif - -constexpr int M = 16; -constexpr int K = 16; -constexpr int N = 16; - -extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {} diff --git a/tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp b/tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp deleted file mode 100644 index 04aa9b5f6..000000000 --- a/tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -// PV Matmul Kernel: pij(M, K) @ vj(K, N) -> oi_new(M, N) -// -// Supports two tile configurations via runtime dispatch: -// Case1: (16, 128) @ (128, 128) -> (16, 128) -// Case2: (64, 64) @ ( 64, 128) -> (64, 128) -// -// pij is bfloat16 (converted from fp32 in softmax_prepare via TCVT). -// vj is stored as (K, N) = (block_size, head_dim) in row-major (ND) layout. -// Standard non-transposed B pattern: ND GlobalB + ColMajor/RowMajor TileMatB. 
- -#include -#include - -#include "tensor.h" - -using namespace pto; - -#include "pipe_sync.h" - -#ifndef __gm__ -#define __gm__ -#endif - -#ifndef __aicore__ -#define __aicore__ [aicore] -#endif - -template -static __aicore__ void pv_matmul_impl(__gm__ Tensor *pij, __gm__ Tensor *vj, __gm__ Tensor *oi) { - __gm__ bfloat16_t *pij_addr = reinterpret_cast<__gm__ bfloat16_t *>(pij->buffer.addr); - __gm__ bfloat16_t *vj_addr = reinterpret_cast<__gm__ bfloat16_t *>(vj->buffer.addr); - __gm__ float *oi_addr = reinterpret_cast<__gm__ float *>(oi->buffer.addr); - - // pij (M, K) bf16, vj (K, N) bf16 in ND (row-major), oi_new (M, N) fp32 - using GlobalA = GlobalTensor, Stride>; - using GlobalB = GlobalTensor, Stride>; - using GlobalOut = GlobalTensor, Stride>; - - GlobalA pijGlobal(pij_addr + pij->start_offset); - GlobalB vjGlobal(vj_addr + vj->start_offset); - GlobalOut oiGlobal(oi_addr + oi->start_offset); - - // L1 Mat tiles: standard ND pattern for both A and B - using TileMatA = Tile; - using TileMatB = Tile; - - // L0 tiles - using LeftTile = TileLeft; - using RightTile = TileRight; - using AccTile = TileAcc; - - TileMatA aMatTile; - TileMatB bMatTile; - TASSIGN(aMatTile, 0x0); - TASSIGN(bMatTile, 0x20000); - - LeftTile aTile; - RightTile bTile; - AccTile cTile; - TASSIGN(aTile, 0x0); - TASSIGN(bTile, 0x0); - TASSIGN(cTile, 0x0); - - // Load pij and vj to L1 with separate events for pipeline overlap - TLOAD(aMatTile, pijGlobal); - set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); // A load done - TLOAD(bMatTile, vjGlobal); - set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1); // B load done - - // Move A to L0A as soon as A load completes (B may still be loading) - wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); - TMOV(aTile, aMatTile); - // Move B to L0B after B load completes - wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1); - TMOV(bTile, bMatTile); - - set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); - wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); - - // Single matmul: (M,K) x (K,N) -> (M,N) - TMATMUL(cTile, aTile, bTile); - - set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); - wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); - - TSTORE(oiGlobal, cTile); - - pipe_sync(); -} - -extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { - __gm__ Tensor *pij = reinterpret_cast<__gm__ Tensor *>(args[0]); - __gm__ Tensor *vj = reinterpret_cast<__gm__ Tensor *>(args[1]); - __gm__ Tensor *oi_new = reinterpret_cast<__gm__ Tensor *>(args[2]); - uint64_t q_tile_size = static_cast(pij->shapes[0]); - // args[4] = block_size, args[5] = head_dim - - if (q_tile_size == 16) { - pv_matmul_impl<16, 128, 128>(pij, vj, oi_new); - } else { - pv_matmul_impl<64, 64, 128>(pij, vj, oi_new); - } -} diff --git a/tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp b/tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp deleted file mode 100644 index f65656605..000000000 --- a/tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
- * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -// QK Matmul Kernel: qi(M, K) @ kj.T(K, N) -> sij(M, N) -// -// Supports two tile configurations via runtime dispatch: -// Case1: (16, 128) @ (128, 128).T -> (16, 128) -// Case2: (64, 128) @ (128, 64).T -> (64, 64) -// -// kj is stored as (N, K) = (block_size, head_dim) in row-major memory. -// This is equivalent to (K, N) in column-major (DN) layout. -// Using DN GlobalB + RowMajor/ColMajor TileMatB to handle the transposed B pattern. - -#include -#include - -#include "tensor.h" - -using namespace pto; - -#include "pipe_sync.h" - -#ifndef __gm__ -#define __gm__ -#endif - -#ifndef __aicore__ -#define __aicore__ [aicore] -#endif - -template -static __aicore__ void qk_matmul_impl(__gm__ Tensor *qi, __gm__ Tensor *kj, __gm__ Tensor *sij) { - __gm__ bfloat16_t *qi_addr = reinterpret_cast<__gm__ bfloat16_t *>(qi->buffer.addr); - __gm__ bfloat16_t *kj_addr = reinterpret_cast<__gm__ bfloat16_t *>(kj->buffer.addr); - __gm__ float *sij_addr = reinterpret_cast<__gm__ float *>(sij->buffer.addr); - - // qi (M, K) bf16 in ND (row-major) layout - using GlobalA = GlobalTensor, Stride>; - // kj stored as (N, K) row-major = (K, N) column-major -> DN layout - using GlobalB = GlobalTensor, Stride, Layout::DN>; - using GlobalOut = GlobalTensor, Stride>; - - GlobalA qiGlobal(qi_addr + qi->start_offset); - GlobalB kjGlobal(kj_addr + kj->start_offset); - GlobalOut sijGlobal(sij_addr + sij->start_offset); - - // L1 Mat tiles: A is standard ND, B uses transposed-B pattern (RowMajor/ColMajor) - using TileMatA = Tile; - using TileMatB = Tile; - - // L0 tiles - using LeftTile = TileLeft; - using RightTile = TileRight; - using AccTile = TileAcc; - - TileMatA aMatTile; - TileMatB bMatTile; - TASSIGN(aMatTile, 0x0); - TASSIGN(bMatTile, 0x20000); - - LeftTile aTile; - RightTile bTile; - AccTile cTile; - TASSIGN(aTile, 0x0); - TASSIGN(bTile, 0x0); - TASSIGN(cTile, 0x0); - - // Load A and B to L1 with separate events for pipeline overlap - TLOAD(aMatTile, qiGlobal); - set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); // A load done - TLOAD(bMatTile, kjGlobal); - set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1); // B load done - - // Move A to L0A as soon as A load completes (B may still be loading) - wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); - TMOV(aTile, aMatTile); - // Move B to L0B after B load completes - wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1); - TMOV(bTile, bMatTile); - - set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); - wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); - - // Matmul - TMATMUL(cTile, aTile, bTile); - - set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); - wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); - - TSTORE(sijGlobal, cTile); - - pipe_sync(); -} - -extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { - __gm__ Tensor *qi = reinterpret_cast<__gm__ Tensor *>(args[0]); - __gm__ Tensor *kj = reinterpret_cast<__gm__ Tensor *>(args[1]); - __gm__ Tensor *sij = reinterpret_cast<__gm__ Tensor *>(args[2]); - uint64_t q_tile_size = static_cast(qi->shapes[0]); - // args[4] = head_dim (128), args[5] = block_size - - if (q_tile_size == 16) { - qk_matmul_impl<16, 128, 128>(qi, kj, sij); - } else { - qk_matmul_impl<64, 128, 64>(qi, kj, sij); - } -} diff --git a/tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aiv/aiv_hub.cpp b/tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aiv/aiv_hub.cpp deleted file mode 100644 index 
45f90aab3..000000000 --- a/tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aiv/aiv_hub.cpp +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -#include -#include - -using namespace pto; - -#ifndef __gm__ -#define __gm__ -#endif - -#ifndef __aicore__ -#define __aicore__ [aicore] -#endif - -constexpr int M = 16; -constexpr int K = 16; -constexpr int N = 16; - -extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {} diff --git a/tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp b/tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp deleted file mode 100644 index bfdddc75e..000000000 --- a/tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp +++ /dev/null @@ -1,255 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -// Online Softmax Update + Normalize Kernel (AIV) -// -// Operates on full tiles where M=q_tile_size, N=head_dim (128): -// Case1: oi/oi_new are (16, 128), mij/lij/mi/li are 16-element vectors -// Case2: oi/oi_new are (64, 128), mij/lij/mi/li are 64-element vectors -// -// Scalar layout strategy using TRESHAPE (zero-copy UB reshape): -// Scalars loaded as DN ColMajor (M, 1) for TROWEXPANDMUL/TROWEXPANDDIV. -// For element-wise ops (TMAX, TSUB, TEXP, etc.), TRESHAPE to RowMajor (1, M). -// After arithmetic, TRESHAPE back to ColMajor (M, 1) for row-broadcast ops. -// This eliminates the GM round-trip (TSTORE ND → TLOAD DN) used in the original. 
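[Aside: stripped of the tile plumbing, the recurrence this deleted kernel implements is the standard streaming-softmax update. A hedged scalar restatement follows; the function name and flat-array signature are illustrative, not part of any runtime API.]

```cpp
#include <algorithm>
#include <cmath>

// One row's online-softmax merge: fold a new block's statistics
// (mij = block row-max, lij = block row-sum, oi_new = block PV output)
// into the running accumulators (mi, li, oi), mirroring the kernel's
// TMAX/TSUB/TEXP (rescale factors) and TROWEXPANDMUL/TADD (apply) steps.
void online_update_row(float mij, float lij, const float *oi_new,
                       float &mi, float &li, float *oi, int head_dim) {
    const float mi_new = std::max(mi, mij);
    const float alpha = std::exp(mi - mi_new);  // rescales the old accumulator
    const float beta = std::exp(mij - mi_new);  // rescales the new block
    for (int d = 0; d < head_dim; d++) {
        oi[d] = alpha * oi[d] + beta * oi_new[d];
    }
    li = alpha * li + beta * lij;
    mi = mi_new;
    // After the last block the kernel also normalizes: dst[d] = oi[d] / li
    // (TROWEXPANDDIV); the is_first path skips the merge and just copies.
}
```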
- -#include -#include - -#include "tensor.h" - -using namespace pto; - -#include "pipe_sync.h" - -#ifndef __gm__ -#define __gm__ -#endif - -#ifndef __aicore__ -#define __aicore__ [aicore] -#endif - -template -static __aicore__ void online_update_impl( - __gm__ Tensor *mij, __gm__ Tensor *lij, __gm__ Tensor *oi_new, __gm__ Tensor *mi, __gm__ Tensor *li, - __gm__ Tensor *oi, uint64_t is_first, uint64_t is_last, __gm__ Tensor *dst -) { - __gm__ float *mij_ptr = reinterpret_cast<__gm__ float *>(mij->buffer.addr); - __gm__ float *lij_ptr = reinterpret_cast<__gm__ float *>(lij->buffer.addr); - __gm__ float *oi_new_ptr = reinterpret_cast<__gm__ float *>(oi_new->buffer.addr); - __gm__ float *mi_ptr = reinterpret_cast<__gm__ float *>(mi->buffer.addr); - __gm__ float *li_ptr = reinterpret_cast<__gm__ float *>(li->buffer.addr); - __gm__ float *oi_ptr = reinterpret_cast<__gm__ float *>(oi->buffer.addr); - __gm__ float *dst_ptr = reinterpret_cast<__gm__ float *>(dst->buffer.addr); - - // Aligned rows for ColMajor DN tiles (32-byte alignment) - constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float)); - - // --- GlobalTensor types --- - - // Data (M, N) RowMajor - using GlobalDataMxN = GlobalTensor, Stride<1, 1, 1, N, 1>>; - - // Scalar DN: M contiguous floats as (kAlignedRows, 1) ColMajor for TROWEXPAND ops and loading - using GlobalScalarDN = GlobalTensor, Stride<1, 1, 1, 1, 1>, Layout::DN>; - - // Scalar ND: for storing mi_new and li_new back to GM - constexpr int kScalarCols = 32 / sizeof(float); - constexpr int kScalarRows = M / kScalarCols; - using GlobalScalarND = - GlobalTensor, Stride<1, 1, 1, kScalarCols, 1>>; - - // --- GlobalTensor instances --- - - GlobalDataMxN oiNewGlobal(oi_new_ptr + oi_new->start_offset); - GlobalDataMxN oiGlobal(oi_ptr + oi->start_offset); - GlobalDataMxN dstGlobal(dst_ptr + dst->start_offset); - - // DN globals for loading scalars as ColMajor - GlobalScalarDN mijGlobalDN(mij_ptr + mij->start_offset); - GlobalScalarDN lijGlobalDN(lij_ptr + lij->start_offset); - GlobalScalarDN miGlobalDN(mi_ptr + mi->start_offset); - GlobalScalarDN liGlobalDN(li_ptr + li->start_offset); - - // ND globals for storing scalar results - GlobalScalarND miGlobalND(mi_ptr + mi->start_offset); - GlobalScalarND liGlobalND(li_ptr + li->start_offset); - - // --- Tile types --- - - using TileDataMxN = Tile; - using TileScalarDN = Tile; - - // RowMajor (1, M) tiles for element-wise arithmetic via TRESHAPE - using TileScalarRow = Tile; - - // ND tile for storing back to GM - using TileScalarND = - Tile; - - // --- UB memory layout --- - - constexpr int kDataBytes = M * N * sizeof(float); - constexpr int kScalarDNBytes = kAlignedRows * sizeof(float); - - // Data tiles - TileDataMxN oiNewTile; - TileDataMxN oiTile; - - // Scalar DN tiles loaded from GM (ColMajor) - TileScalarDN mijDN, lijDN, miDN, liDN; - - // Temporary DN tiles for results - TileScalarDN miNewDN, alphaDN, betaDN, liNewDN, tmpDN; - - TASSIGN(oiNewTile, 0); - TASSIGN(oiTile, kDataBytes); - TASSIGN(mijDN, 2 * kDataBytes); - TASSIGN(lijDN, 2 * kDataBytes + kScalarDNBytes); - TASSIGN(miDN, 2 * kDataBytes + 2 * kScalarDNBytes); - TASSIGN(liDN, 2 * kDataBytes + 3 * kScalarDNBytes); - TASSIGN(miNewDN, 2 * kDataBytes + 4 * kScalarDNBytes); - TASSIGN(alphaDN, 2 * kDataBytes + 5 * kScalarDNBytes); - TASSIGN(betaDN, 2 * kDataBytes + 6 * kScalarDNBytes); - TASSIGN(liNewDN, 2 * kDataBytes + 7 * kScalarDNBytes); - TASSIGN(tmpDN, 2 * kDataBytes + 8 * kScalarDNBytes); - - if (is_first) { - // --- First block: copy inputs to 
accumulators --- - TLOAD(oiNewTile, oiNewGlobal); - TLOAD(mijDN, mijGlobalDN); - TLOAD(lijDN, lijGlobalDN); - set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - - // Store mi = mij, li = lij, oi = oi_new - // Alias ND tiles to the same UB as DN tiles for storing as ND format - TileScalarND mijND, lijND; - TASSIGN(mijND, 2 * kDataBytes); // alias same UB as mijDN - TASSIGN(lijND, 2 * kDataBytes + kScalarDNBytes); // alias same UB as lijDN - - set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - TSTORE(miGlobalND, mijND); // mi = mij - TSTORE(liGlobalND, lijND); // li = lij - TSTORE(oiGlobal, oiNewTile); // oi = oi_new - - if (is_last) { - // Single block: normalize dst = oi_new / lij - // lijDN already in ColMajor DN format, use directly for TROWEXPANDDIV - set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); - wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); - TROWEXPANDDIV(oiNewTile, oiNewTile, lijDN); - set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); - wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); - TSTORE(dstGlobal, oiNewTile); - } - } else { - // --- Subsequent blocks: accumulate --- - - // Load all inputs - TLOAD(oiNewTile, oiNewGlobal); - TLOAD(oiTile, oiGlobal); - TLOAD(mijDN, mijGlobalDN); - TLOAD(lijDN, lijGlobalDN); - TLOAD(miDN, miGlobalDN); - TLOAD(liDN, liGlobalDN); - set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - - // TRESHAPE: ColMajor(M,1) → RowMajor(1,M) for element-wise arithmetic - TileScalarRow miRow, mijRow, liRow, lijRow; - TRESHAPE(miRow, miDN); - TRESHAPE(mijRow, mijDN); - TRESHAPE(liRow, liDN); - TRESHAPE(lijRow, lijDN); - - // Scalar arithmetic in RowMajor (1, M) layout - TileScalarRow miNewRow, alphaRow, betaRow, liNewRow, tmpRow; - TASSIGN(miNewRow, 2 * kDataBytes + 4 * kScalarDNBytes); - TASSIGN(alphaRow, 2 * kDataBytes + 5 * kScalarDNBytes); - TASSIGN(betaRow, 2 * kDataBytes + 6 * kScalarDNBytes); - TASSIGN(liNewRow, 2 * kDataBytes + 7 * kScalarDNBytes); - TASSIGN(tmpRow, 2 * kDataBytes + 8 * kScalarDNBytes); - - TMAX(miNewRow, miRow, mijRow); // mi_new = max(mi, mij) - pipe_barrier(PIPE_V); - TSUB(alphaRow, miRow, miNewRow); // alpha_exp = mi - mi_new - pipe_barrier(PIPE_V); - TEXP(alphaRow, alphaRow); // alpha = exp(mi - mi_new) - pipe_barrier(PIPE_V); - TSUB(betaRow, mijRow, miNewRow); // beta_exp = mij - mi_new - pipe_barrier(PIPE_V); - TEXP(betaRow, betaRow); // beta = exp(mij - mi_new) - pipe_barrier(PIPE_V); - TMUL(tmpRow, alphaRow, liRow); // alpha * li - pipe_barrier(PIPE_V); - TMUL(liNewRow, betaRow, lijRow); // beta * lij - pipe_barrier(PIPE_V); - TADD(liNewRow, tmpRow, liNewRow); // li_new = alpha*li + beta*lij - - // TRESHAPE back: RowMajor(1,M) → ColMajor(M,1) for TROWEXPANDMUL - TRESHAPE(alphaDN, alphaRow); - TRESHAPE(betaDN, betaRow); - - // Scale data tiles using row-broadcast multiply - TROWEXPANDMUL(oiTile, oiTile, alphaDN); // oi *= alpha - TROWEXPANDMUL(oiNewTile, oiNewTile, betaDN); // oi_new *= beta - pipe_barrier(PIPE_V); - TADD(oiTile, oiTile, oiNewTile); // oi = alpha*oi + beta*oi_new - - // Store mi_new and li_new to GM (ND format) - // Alias ND tiles to the same UB locations as miNewRow and liNewRow - TileScalarND miNewND, liNewND; - TASSIGN(miNewND, 2 * kDataBytes + 4 * kScalarDNBytes); - TASSIGN(liNewND, 2 * kDataBytes + 7 * kScalarDNBytes); - - if (is_last) { - // Normalize and output: dst = oi / li_new - TRESHAPE(liNewDN, liNewRow); - pipe_barrier(PIPE_V); - TROWEXPANDDIV(oiTile, oiTile, liNewDN); - set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - wait_flag(PIPE_V, 
PIPE_MTE3, EVENT_ID0); - TSTORE(miGlobalND, miNewND); // persist mi_new - TSTORE(liGlobalND, liNewND); // persist li_new - TSTORE(dstGlobal, oiTile); - } else { - // Store updated accumulators - set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - TSTORE(miGlobalND, miNewND); // persist mi_new - TSTORE(liGlobalND, liNewND); // persist li_new - TSTORE(oiGlobal, oiTile); - } - } - pipe_sync(); -} - -extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { - __gm__ Tensor *mij = reinterpret_cast<__gm__ Tensor *>(args[0]); - __gm__ Tensor *lij = reinterpret_cast<__gm__ Tensor *>(args[1]); - __gm__ Tensor *oi_new = reinterpret_cast<__gm__ Tensor *>(args[2]); - __gm__ Tensor *mi = reinterpret_cast<__gm__ Tensor *>(args[3]); - __gm__ Tensor *li = reinterpret_cast<__gm__ Tensor *>(args[4]); - __gm__ Tensor *oi = reinterpret_cast<__gm__ Tensor *>(args[5]); - __gm__ Tensor *dst = reinterpret_cast<__gm__ Tensor *>(args[6]); - uint64_t is_first = static_cast(args[7]); - uint64_t is_last = static_cast(args[8]); - uint64_t q_tile_size = static_cast(mij->shapes[0]); - // args[10] = head_dim (128) - - if (q_tile_size == 16) { - online_update_impl<16, 128>(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst); - } else { - online_update_impl<64, 128>(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst); - } -} diff --git a/tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp b/tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp deleted file mode 100644 index 0669123c2..000000000 --- a/tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp +++ /dev/null @@ -1,154 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -// Softmax Preparation Kernel (AIV) with partial block masking -// -// Operates on (M, N) tile where M=q_tile_size, N=block_size: -// Case1: sij is (16, 128) -// Case2: sij is (64, 64) -// -// For partial blocks (valid_len < N), positions [valid_len, N) in sij are -// filled with -inf via TFILLPAD_INPLACE before softmax, ensuring exp(-inf)=0 -// so that invalid key positions contribute zero attention weight. 
-// -// Computes: -// sij_masked = TFILLPAD(sij, valid_len, pad=-inf) -// sij_scale = sij_masked * scale -// mij = row_max(sij_scale) -> (M, 1) -// pij = exp(sij_scale - mij) -> (M, N) -// lij = row_sum(pij) -> (M, 1) - -#include -#include - -#include "tensor.h" - -using namespace pto; - -#include "pipe_sync.h" - -#ifndef __gm__ -#define __gm__ -#endif - -#ifndef __aicore__ -#define __aicore__ [aicore] -#endif - -template -static __aicore__ void softmax_prepare_impl( - __gm__ Tensor *sij, float scale_value, __gm__ Tensor *pij, __gm__ Tensor *mij, __gm__ Tensor *lij -) { - uint64_t valid_len = static_cast(sij->shapes[1]); - __gm__ float *sij_addr = reinterpret_cast<__gm__ float *>(sij->buffer.addr); - __gm__ bfloat16_t *pij_addr = reinterpret_cast<__gm__ bfloat16_t *>(pij->buffer.addr); - __gm__ float *mij_addr = reinterpret_cast<__gm__ float *>(mij->buffer.addr); - __gm__ float *lij_addr = reinterpret_cast<__gm__ float *>(lij->buffer.addr); - - constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float)); - - using GlobalDataMxN = GlobalTensor, Stride<1, 1, 1, N, 1>>; - using GlobalDataMxN_bf16 = GlobalTensor, Stride<1, 1, 1, N, 1>>; - using GlobalScalarDN = GlobalTensor, Stride<1, 1, 1, 1, 1>, Layout::DN>; - - GlobalDataMxN sijGlobal(sij_addr + sij->start_offset); - GlobalDataMxN_bf16 pijGlobal(pij_addr + pij->start_offset); - GlobalScalarDN mijGlobal(mij_addr + mij->start_offset); - GlobalScalarDN lijGlobal(lij_addr + lij->start_offset); - - // Dynamic-cols tile: marks which columns are valid for TFILLPAD boundary - using TileSijDyn = Tile; - // Padded tile: TFILLPAD_INPLACE fills positions [valid_len, N) with -inf - using TileSijPad = Tile; - - using TileVecMxN = Tile; - using TileVecMxN_bf16 = Tile; - using TileScalarDN = Tile; - - TileVecMxN sijTile; - TileSijDyn sijDynTile(static_cast(valid_len)); - TileSijPad sijPadTile; - TileVecMxN pijTile; - TileVecMxN tmpTile; - TileScalarDN maxTile; - TileScalarDN sumTile; - TileVecMxN_bf16 pijBf16Tile; - - // All sij tiles share UB address 0x0 (in-place masking) - TASSIGN(sijTile, 0x0); - TASSIGN(sijDynTile, 0x0); - TASSIGN(sijPadTile, 0x0); - TASSIGN(pijTile, M * N * sizeof(float)); - TASSIGN(tmpTile, 2 * M * N * sizeof(float)); - TASSIGN(maxTile, 3 * M * N * sizeof(float)); - TASSIGN(sumTile, 3 * M * N * sizeof(float) + kAlignedRows * sizeof(float)); - TASSIGN(pijBf16Tile, 3 * M * N * sizeof(float) + 2 * kAlignedRows * sizeof(float)); - - // Load full sij (M, N) tile from GM - all N columns including garbage for partial blocks - // printf("sij addr incore %x\n", sij->buffer.addr); - TLOAD(sijTile, sijGlobal); - set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - - // Mask columns [valid_len, N) with -inf. sijDynTile provides the valid boundary, - // sijPadTile provides PadValue::Min as the fill value. No-op when valid_len == N. 
- TFILLPAD_INPLACE(sijPadTile, sijDynTile); - pipe_barrier(PIPE_V); - - TMULS(sijTile, sijTile, scale_value); - pipe_barrier(PIPE_V); - TROWMAX(maxTile, sijTile, tmpTile); - pipe_barrier(PIPE_V); - TROWEXPANDSUB(pijTile, sijTile, maxTile); - pipe_barrier(PIPE_V); - TEXP(pijTile, pijTile); - // Truncate pij to bf16 first - pipe_barrier(PIPE_V); - TCVT(pijBf16Tile, pijTile, RoundMode::CAST_ROUND); - set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); // pij bf16 ready, can store early - - // Continue computing: bf16 → f32 and rowsum while pij store proceeds in parallel - pipe_barrier(PIPE_V); - TCVT(pijTile, pijBf16Tile, RoundMode::CAST_ROUND); - pipe_barrier(PIPE_V); - TROWSUM(sumTile, pijTile, tmpTile); - set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); // sum ready - - // Store pij (overlaps with TCVT + TROWSUM above) - wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - TSTORE(pijGlobal, pijBf16Tile); - - // Store max and sum - TSTORE(mijGlobal, maxTile); - wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); - TSTORE(lijGlobal, sumTile); - - pipe_sync(); -} - -extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { - __gm__ Tensor *sij = reinterpret_cast<__gm__ Tensor *>(args[0]); - __gm__ Tensor *pij = reinterpret_cast<__gm__ Tensor *>(args[1]); - __gm__ Tensor *mij = reinterpret_cast<__gm__ Tensor *>(args[2]); - __gm__ Tensor *lij = reinterpret_cast<__gm__ Tensor *>(args[3]); - union { - uint64_t u; - float f; - } scale_conv; - scale_conv.u = static_cast(args[4]); - float scale_value = scale_conv.f; - uint64_t q_tile_size = static_cast(sij->shapes[0]); - - if (q_tile_size == 16) { - softmax_prepare_impl<16, 128>(sij, scale_value, pij, mij, lij); - } else { - softmax_prepare_impl<64, 64>(sij, scale_value, pij, mij, lij); - } -} diff --git a/tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp deleted file mode 100644 index 9ded96c08..000000000 --- a/tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp +++ /dev/null @@ -1,196 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -/** - * Paged Attention Orchestration — Per-Block Version - * (aicpu_build_graph variant: explicit add_dependency, no TensorMap) - * - * For each batch, for each head tile, for each KV block: - * 1. QK matmul: qi @ kj^T → sij (q_tile, block_size) - * 2. Softmax: sij → pij, mi, li - * 3. PV matmul: pij @ vj → oi_tmp (q_tile, head_dim) - * 4. 
Update: online softmax accumulation - * - * Dependency graph per block: - * QK → Softmax → PV → Update - * └──────────→ Update - * Update(prev block) ──→ Update(this block) - * Hub(init) ────────────→ Update(first block) - */ - -#include -#include -#include - -#include "pto_orchestration_api.h" // NOLINT(build/include_subdir) - -#define FUNC_QK_MATMUL 0 -#define FUNC_SOFTMAX_PREPARE 1 -#define FUNC_PV_MATMUL 2 -#define FUNC_ONLINE_UPDATE 3 -#define FUNC_AIC_HUB 4 -#define FUNC_AIV_HUB 5 - -extern "C" { - -__attribute__((visibility("default"))) PTO2OrchestrationConfig -aicpu_orchestration_config(const ChipStorageTaskArgs &orch_args) { - (void)orch_args; // NOLINT(readability/casting) - return PTO2OrchestrationConfig{ - .expected_arg_count = 7, - }; -} - -__attribute__((visibility("default"))) void -aicpu_orchestration_entry(PTO2Runtime *rt, const ChipStorageTaskArgs &orch_args) { - // Read dimensions from tensor metadata - // query: shape=[batch, num_heads, head_dim] - uint64_t batch = orch_args.tensor(0).shapes[0]; - uint64_t num_heads = orch_args.tensor(0).shapes[1]; - uint64_t head_dim = orch_args.tensor(0).shapes[2]; - DataType data_type = orch_args.tensor(0).dtype; - - // key_cache: shape=[total_blocks, block_size, kv_head_num, head_dim] - uint64_t block_size = orch_args.tensor(1).shapes[1]; - - // block_table: shape=[batch, max_num_blocks_per_req] - uint64_t block_num = orch_args.tensor(3).shapes[1]; - - // scale from scalar arg - uint64_t scale_value = orch_args.scalar(0); - - uint64_t q_head_num = num_heads; - uint64_t q_tile = std::min(num_heads, static_cast(128)); - uint64_t q_loop = (q_head_num + q_tile - 1) / q_tile; - - // Reshape tensors for kernel consumption (2D flattened) - void *query_ptr = orch_args.tensor(0).data_as(); - void *kc_ptr = orch_args.tensor(1).data_as(); - void *vc_ptr = orch_args.tensor(2).data_as(); - void *out_ptr = orch_args.tensor(5).data_as(); - - uint64_t total_blocks_count = orch_args.tensor(1).shapes[0]; - - uint32_t query_shapes[2] = {static_cast(batch * num_heads), static_cast(head_dim)}; - uint32_t key_cache_shapes[2] = { - static_cast(total_blocks_count * block_size), static_cast(head_dim) - }; - uint32_t value_cache_shapes[2] = { - static_cast(total_blocks_count * block_size), static_cast(head_dim) - }; - uint32_t out_shapes[2] = {static_cast(batch * num_heads), static_cast(head_dim)}; - Tensor query = make_tensor_external(query_ptr, query_shapes, 2, data_type, false); - Tensor key_cache = make_tensor_external(kc_ptr, key_cache_shapes, 2, data_type, false); - Tensor value_cache = make_tensor_external(vc_ptr, value_cache_shapes, 2, data_type, false); - Tensor out = make_tensor_external(out_ptr, out_shapes, 2, DataType::FLOAT32); - - int *host_block_table = orch_args.tensor(3).data_as(); - int *host_context_lens = orch_args.tensor(4).data_as(); - - for (uint64_t b_idx = 0; b_idx < batch; b_idx++) { - uint64_t cur_seq = host_context_lens[b_idx]; - uint64_t bn_this_batch = (cur_seq + block_size - 1) / block_size; - - for (uint64_t q_idx = 0; q_idx < q_loop; q_idx++) { - PTO2_SCOPE(rt) { - uint64_t cur_offset = b_idx * q_head_num + q_idx * q_tile; - - uint32_t oi_shapes[2] = {static_cast(q_tile), static_cast(head_dim)}; - uint32_t li_shapes[1] = {static_cast(q_tile)}; - uint32_t mi_shapes[1] = {static_cast(q_tile)}; - uint32_t qi_shapes[2] = {static_cast(q_tile), static_cast(head_dim)}; - uint32_t qi_offsets[2] = {static_cast(cur_offset), 0}; - Tensor qi = query.view(qi_shapes, qi_offsets); - uint32_t out_view_shapes[2] = {static_cast(q_tile), 
static_cast(head_dim)}; - uint32_t out_view_offsets[2] = {static_cast(cur_offset), 0}; - Tensor out_view = out.view(out_view_shapes, out_view_offsets); - - // Hub task: zero-initialize accumulators - Arg args_inplace; - args_inplace.add_output(TensorCreateInfo(oi_shapes, 2, DataType::FLOAT32)); - args_inplace.add_output(TensorCreateInfo(li_shapes, 1, DataType::FLOAT32)); - args_inplace.add_output(TensorCreateInfo(mi_shapes, 1, DataType::FLOAT32)); - SubmitResult r_hub = rt_submit_aiv_task(rt, FUNC_AIV_HUB, args_inplace); - const Tensor &oi = r_hub.outputs.get_ref(0); - const Tensor &li_update = r_hub.outputs.get_ref(1); - const Tensor &mi_update = r_hub.outputs.get_ref(2); - - PTO2TaskId prev_update_task = r_hub.task_id; - - for (uint64_t bn = 0; bn < bn_this_batch; bn++) { - uint64_t cur_block_idx = host_block_table[b_idx * block_num + bn]; - uint64_t valid_len = std::min(block_size, cur_seq - bn * block_size); - - // KV views for this block - uint32_t kv_shapes[2] = {static_cast(block_size), static_cast(head_dim)}; - uint32_t kv_offsets[2] = {static_cast(cur_block_idx * block_size), 0}; - Tensor kj = key_cache.view(kv_shapes, kv_offsets); - Tensor vj = value_cache.view(kv_shapes, kv_offsets); - - // === Task 1: QK matmul === - uint32_t sij_shapes[2] = {static_cast(q_tile), static_cast(block_size)}; - - Arg args_qk; - args_qk.add_input(qi); - args_qk.add_input(kj); - args_qk.add_output(TensorCreateInfo(sij_shapes, 2, DataType::FLOAT32)); - SubmitResult r_qk = rt_submit_aic_task(rt, FUNC_QK_MATMUL, args_qk); - - // === Task 2: Softmax === - uint32_t sij_valid_shapes[2] = {static_cast(q_tile), static_cast(valid_len)}; - uint32_t sij_valid_offsets[2] = {0, 0}; - Tensor sij_valid = r_qk.outputs.get_ref(0).view(sij_valid_shapes, sij_valid_offsets); - - Arg args_sf; - args_sf.add_input(sij_valid); - args_sf.add_output(TensorCreateInfo(sij_shapes, 2, data_type)); - args_sf.add_output(TensorCreateInfo(mi_shapes, 1, DataType::FLOAT32)); - args_sf.add_output(TensorCreateInfo(li_shapes, 1, DataType::FLOAT32)); - args_sf.add_scalar(scale_value); - SubmitResult r_sf = rt_submit_aiv_task(rt, FUNC_SOFTMAX_PREPARE, args_sf); - rt_add_dependency(rt, r_qk.task_id, r_sf.task_id); - - // === Task 3: PV matmul === - uint32_t oi_tmp_shapes[2] = {static_cast(q_tile), static_cast(head_dim)}; - - Arg args_pv; - args_pv.add_input(r_sf.outputs.get_ref(0)); - args_pv.add_input(vj); - args_pv.add_output(TensorCreateInfo(oi_tmp_shapes, 2, DataType::FLOAT32)); - SubmitResult r_pv = rt_submit_aic_task(rt, FUNC_PV_MATMUL, args_pv); - rt_add_dependency(rt, r_sf.task_id, r_pv.task_id); - - // === Task 4: Online update === - uint64_t is_first = (bn == 0) ? 1 : 0; - uint64_t is_last = (bn == bn_this_batch - 1) ? 
1 : 0; - - Arg args_up; - args_up.add_input(r_sf.outputs.get_ref(1)); - args_up.add_input(r_sf.outputs.get_ref(2)); - args_up.add_input(r_pv.outputs.get_ref(0)); - args_up.add_inout(mi_update); - args_up.add_inout(li_update); - args_up.add_inout(oi); - args_up.add_inout(out_view); - args_up.add_scalar(is_first); - args_up.add_scalar(is_last); - SubmitResult r_up = rt_submit_aiv_task(rt, FUNC_ONLINE_UPDATE, args_up); - rt_add_dependency(rt, r_sf.task_id, r_up.task_id); - rt_add_dependency(rt, r_pv.task_id, r_up.task_id); - rt_add_dependency(rt, prev_update_task, r_up.task_id); - - prev_update_task = r_up.task_id; - } - } - } - } -} - -} // extern "C" diff --git a/tests/st/a2a3/aicpu_build_graph/paged_attention/test_paged_attention.py b/tests/st/a2a3/aicpu_build_graph/paged_attention/test_paged_attention.py deleted file mode 100644 index b4ee7a376..000000000 --- a/tests/st/a2a3/aicpu_build_graph/paged_attention/test_paged_attention.py +++ /dev/null @@ -1,129 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) PyPTO Contributors. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. -# ----------------------------------------------------------------------------------------------------------- -"""Paged attention — aicpu_build_graph runtime (production scale, bfloat16). - -Tests aicpu_build_graph runtime with hub kernels (aic_hub, aiv_hub), -INOUT tensors, and AIC+AIV mixed execution. 
-""" - -import torch -from simpler.task_interface import ArgDirection as D - -from simpler_setup import Scalar, SceneTestCase, TaskArgsBuilder, Tensor, scene_test -from simpler_setup.goldens.paged_attention import compute_golden as _pa_compute_golden # noqa: PLC0415 -from simpler_setup.goldens.paged_attention import generate_inputs as _pa_generate_inputs # noqa: PLC0415 - - -@scene_test(level=2, runtime="aicpu_build_graph") -class TestPagedAttentionAicpuBuildGraph(SceneTestCase): - """Paged attention with aicpu_build_graph runtime and hub kernels.""" - - RTOL = 1e-3 - ATOL = 1e-3 - - CALLABLE = { - "orchestration": { - "source": "kernels/orchestration/paged_attention_orch.cpp", - "function_name": "aicpu_orchestration_entry", - "signature": [D.IN, D.IN, D.IN, D.IN, D.IN, D.OUT], - }, - "incores": [ - { - "func_id": 0, - "source": "kernels/aic/aic_qk_matmul.cpp", - "core_type": "aic", - "signature": [D.IN, D.IN, D.OUT], - }, - { - "func_id": 2, - "source": "kernels/aic/aic_pv_matmul.cpp", - "core_type": "aic", - "signature": [D.IN, D.IN, D.OUT], - }, - { - "func_id": 4, - "source": "kernels/aic/aic_hub.cpp", - "core_type": "aic", - "signature": [], - }, - { - "func_id": 1, - "source": "kernels/aiv/aiv_softmax_prepare.cpp", - "core_type": "aiv", - "signature": [D.IN, D.OUT, D.OUT, D.OUT], - }, - { - "func_id": 3, - "source": "kernels/aiv/aiv_online_update.cpp", - "core_type": "aiv", - "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT], - }, - { - "func_id": 5, - "source": "kernels/aiv/aiv_hub.cpp", - "core_type": "aiv", - "signature": [], - }, - ], - } - - CASES = [ - { - "name": "case1", - "platforms": ["a2a3"], - "config": {"aicpu_thread_num": 4, "block_dim": 24}, - "params": { - "batch": 256, - "num_heads": 16, - "kv_head_num": 1, - "head_dim": 128, - "block_size": 128, - "context_len": 8192, - "max_model_len": 32768, - "dtype": "bfloat16", - }, - }, - { - "name": "case2", - "platforms": ["a2a3"], - "config": {"aicpu_thread_num": 4, "block_dim": 24}, - "params": { - "batch": 64, - "num_heads": 64, - "kv_head_num": 1, - "head_dim": 128, - "block_size": 64, - "context_len": 8192, - "max_model_len": 32768, - "dtype": "bfloat16", - }, - }, - ] - - def generate_args(self, params): - inputs = _pa_generate_inputs(params) - specs = [] - for name, val in inputs: - if isinstance(val, torch.Tensor): - specs.append(Tensor(name, val)) - else: - specs.append(Scalar(name, val)) - return TaskArgsBuilder(*specs) - - def compute_golden(self, args, params): - tensors = {s.name: s.value for s in args.specs if isinstance(s, Tensor)} - _pa_compute_golden(tensors, params) - for s in args.specs: - if isinstance(s, Tensor) and s.name in tensors: - getattr(args, s.name)[:] = tensors[s.name] - - -if __name__ == "__main__": - SceneTestCase.run_module(__name__) diff --git a/tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/kernels/aic/aic_hub.cpp b/tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/kernels/aic/aic_hub.cpp deleted file mode 100644 index 45f90aab3..000000000 --- a/tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/kernels/aic/aic_hub.cpp +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. 
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -#include -#include - -using namespace pto; - -#ifndef __gm__ -#define __gm__ -#endif - -#ifndef __aicore__ -#define __aicore__ [aicore] -#endif - -constexpr int M = 16; -constexpr int K = 16; -constexpr int N = 16; - -extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {} diff --git a/tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/kernels/aic/aic_pv_matmul.cpp b/tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/kernels/aic/aic_pv_matmul.cpp deleted file mode 100644 index d06e1e06c..000000000 --- a/tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/kernels/aic/aic_pv_matmul.cpp +++ /dev/null @@ -1,152 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -// SplitK PV Matmul Kernel: Accumulated P @ V across n_blocks -// -// Processes n_blocks blocks using SplitK accumulation pattern: -// Block 0: TMATMUL(C, A, B) — initialize accumulator -// Block i: TMATMUL_ACC(C, C, A, B) — accumulate into same C -// -// Per-block pij addresses: contiguous slices of pij_buf (n_blocks * M * K) -// Per-block vj addresses: value_cache base + block_indices lookup -// Single output: oi_new (M, N) fp32 = sum of P_i @ V_i across all blocks -// -// Optimizations: -// - Double-buffered L1 tiles (ping/pong for A and B) -// - TLOAD(next pij+vj) overlaps with TMATMUL_ACC(current) via MTE2/PIPE_M parallelism -// -// Supports two tile configurations via runtime dispatch: -// Case1: (16, 128) @ (128, 128) -> (16, 128) -// Case2: (64, 64) @ ( 64, 128) -> (64, 128) -// -// pij is bfloat16 (from softmax_prepare TCVT). -// vj is stored as (K, N) = (block_size, head_dim) in row-major (ND) layout. 
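Stripped of the pipelining, the SplitK pattern is a plain sum of per-block matmuls; the double buffering changes when tiles are loaded, not what is computed. A NumPy sketch of the reference result (hypothetical helper; assumes the (K, N) row-major value blocks described above):

    import numpy as np

    def pv_matmul_ref(pij_buf, value_cache, block_table, n_blocks, M, K, N):
        # pij_buf: (n_blocks * M, K) weights; value_cache: (total_blocks * K, N).
        oi = np.zeros((M, N), dtype=np.float32)  # the accumulator lives in L0C on device
        for i in range(n_blocks):
            p_i = pij_buf[i * M:(i + 1) * M].astype(np.float32)  # contiguous pij slice
            v_i = value_cache[block_table[i] * K:(block_table[i] + 1) * K].astype(np.float32)
            oi += p_i @ v_i  # TMATMUL on i == 0, TMATMUL_ACC afterwards
        return oi

On device, the TMATMUL_ACC for block i overlaps with the TLOAD of block i+1 into the other ping/pong buffer, which is why the loop body above collapses to a single accumulate.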
- -#include -#include - -#include "tensor.h" - -using namespace pto; - -#include "pipe_sync.h" - -#ifndef __gm__ -#define __gm__ -#endif - -#ifndef __aicore__ -#define __aicore__ [aicore] -#endif - -template -static __aicore__ void pv_matmul_n_impl( - __gm__ bfloat16_t *pij_base, __gm__ bfloat16_t *val_base, __gm__ float *oi_base, uint64_t n_blocks, - __gm__ int32_t *block_table -) { - using GlobalA = GlobalTensor, Stride>; - using GlobalB = GlobalTensor, Stride>; - using GlobalOut = GlobalTensor, Stride>; - - using TileMatA = Tile; - using TileMatB = Tile; - - using LeftTile = TileLeft; - using RightTile = TileRight; - using AccTile = TileAcc; - - // Double-buffered L1 tiles (ping/pong) - TileMatA aMatTile_ping, aMatTile_pong; - TileMatB bMatTile_ping, bMatTile_pong; - TASSIGN(aMatTile_ping, 0x0); - TASSIGN(aMatTile_pong, 0x10000); - TASSIGN(bMatTile_ping, 0x20000); - TASSIGN(bMatTile_pong, 0x30000); - - LeftTile aTile; - RightTile bTile; - AccTile cTile; - TASSIGN(aTile, 0x0); - TASSIGN(bTile, 0x0); - TASSIGN(cTile, 0x0); - - GlobalOut oiGlobal(oi_base); - - // Pre-load first iteration's tiles into ping buffers - GlobalA pijGlobal_0(pij_base); - GlobalB vjGlobal_0(val_base + block_table[0] * K * N); - TLOAD(aMatTile_ping, pijGlobal_0); - TLOAD(bMatTile_ping, vjGlobal_0); - - for (uint64_t i = 0; i < n_blocks; i++) { - // Select current buffers based on iteration parity - TileMatA &curA = (i % 2 == 0) ? aMatTile_ping : aMatTile_pong; - TileMatB &curB = (i % 2 == 0) ? bMatTile_ping : bMatTile_pong; - - // Wait for current TLOAD to complete - set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); - wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); - - // Wait for previous matmul to complete (L0A/L0B safe to overwrite) - if (i > 0) { - wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); - } - - TMOV(aTile, curA); - TMOV(bTile, curB); - - set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); - wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); - - if (i == 0) { - TMATMUL(cTile, aTile, bTile); - } else { - TMATMUL_ACC(cTile, cTile, aTile, bTile); - } - - // Prefetch next iteration's data (MTE2 overlaps with matmul completion) - if (i + 1 < n_blocks) { - // Signal matmul completion for next iteration's TMOV guard - set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); - TileMatA &nxtA = (i % 2 == 0) ? aMatTile_pong : aMatTile_ping; - TileMatB &nxtB = (i % 2 == 0) ? 
bMatTile_pong : bMatTile_ping; - GlobalA pijGlobal_next(pij_base + (i + 1) * M * K); - GlobalB vjGlobal_next(val_base + block_table[i + 1] * K * N); - TLOAD(nxtA, pijGlobal_next); - TLOAD(nxtB, vjGlobal_next); - } - } - - set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); - wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); - TSTORE(oiGlobal, cTile); - - pipe_sync(); -} - -extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { - __gm__ TensorData *pij_buf = reinterpret_cast<__gm__ TensorData *>(args[0]); - __gm__ TensorData *value_cache = reinterpret_cast<__gm__ TensorData *>(args[1]); - __gm__ TensorData *oi_new = reinterpret_cast<__gm__ TensorData *>(args[2]); - uint64_t n_blocks = static_cast<uint64_t>(args[3]); - __gm__ int32_t *block_table = reinterpret_cast<__gm__ int32_t *>(args[4]); - - __gm__ bfloat16_t *pij_base = reinterpret_cast<__gm__ bfloat16_t *>(pij_buf->buffer.addr) + pij_buf->start_offset; - __gm__ bfloat16_t *val_base = reinterpret_cast<__gm__ bfloat16_t *>(value_cache->buffer.addr); - __gm__ float *oi_base = reinterpret_cast<__gm__ float *>(oi_new->buffer.addr) + oi_new->start_offset; - - uint64_t q_tile_size = static_cast<uint64_t>(pij_buf->shapes[0]); - - if (q_tile_size == 16) { - pv_matmul_n_impl<16, 128, 128>(pij_base, val_base, oi_base, n_blocks, block_table); - } else { - pv_matmul_n_impl<64, 64, 128>(pij_base, val_base, oi_base, n_blocks, block_table); - } -} diff --git a/tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/kernels/aic/aic_qk_matmul.cpp b/tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/kernels/aic/aic_qk_matmul.cpp deleted file mode 100644 index 5f38ee47f..000000000 --- a/tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/kernels/aic/aic_qk_matmul.cpp +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -// Multi-block QK Matmul Kernel: qi(M, K) @ kj.T(K, N) -> sij(M, N) for each block -// -// Processes n_blocks blocks in a single kernel invocation. -// Per-block kj addresses computed from key_cache base + block_indices lookup. -// qi is shared across all blocks (same query head against different key blocks). -// -// Output layout: n_blocks contiguous (M, N) tiles stacked vertically. -// Block i occupies sij[i*M : (i+1)*M, 0:N].
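The stacked layout lets downstream kernels treat sij_buf as one (n_blocks * M, N) matrix of contiguous per-block tiles. As reference math (hypothetical helper; kj blocks assumed (N, K) row-major, matching the per-block base address block_table[i] * N * K above):

    import numpy as np

    def qk_matmul_ref(qi, key_cache, block_table, n_blocks, M, K, N):
        # qi: (M, K) shared across blocks; key_cache: (total_blocks * N, K) row-major.
        sij_buf = np.empty((n_blocks * M, N), dtype=np.float32)
        for i in range(n_blocks):
            kj = key_cache[block_table[i] * N:(block_table[i] + 1) * N]  # (N, K) block
            sij_buf[i * M:(i + 1) * M] = qi.astype(np.float32) @ kj.astype(np.float32).T
        return sij_buf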
-// -// Optimizations: -// - qi TLOAD hoisted before the loop (constant across all iterations) -// -// Supports two tile configurations via runtime dispatch: -// Case1: (16, 128) @ (128, 128).T -> (16, 128) -// Case2: (64, 128) @ (128, 64).T -> (64, 64) -// -// Template: M=q_tile, K=head_dim, N=block_size - -#include -#include - -#include "tensor.h" - -using namespace pto; - -#include "pipe_sync.h" - -#ifndef __gm__ -#define __gm__ -#endif - -#ifndef __aicore__ -#define __aicore__ [aicore] -#endif - -template -static __aicore__ void qk_matmul_n_impl( - __gm__ bfloat16_t *qi_base, __gm__ bfloat16_t *key_base, __gm__ float *sij_base, uint64_t n_blocks, - __gm__ int32_t *block_table -) { - using GlobalA = GlobalTensor, Stride>; - using GlobalB = GlobalTensor, Stride, Layout::DN>; - using GlobalOut = GlobalTensor, Stride>; - - using TileMatA = Tile; - using TileMatB = Tile; - - using LeftTile = TileLeft; - using RightTile = TileRight; - using AccTile = TileAcc; - - TileMatA aMatTile; - TileMatB bMatTile; - TASSIGN(aMatTile, 0x0); - TASSIGN(bMatTile, 0x20000); - - LeftTile aTile; - RightTile bTile; - AccTile cTile; - TASSIGN(aTile, 0x0); - TASSIGN(bTile, 0x0); - TASSIGN(cTile, 0x0); - - // Hoist qi TLOAD before the loop (qi is constant across all blocks) - GlobalA qiGlobal(qi_base); - TLOAD(aMatTile, qiGlobal); - - for (uint64_t i = 0; i < n_blocks; i++) { - GlobalB kjGlobal(key_base + block_table[i] * N * K); - GlobalOut sijGlobal(sij_base + i * M * N); - - // Load only B each iteration (qi already in L1 from hoist) - TLOAD(bMatTile, kjGlobal); - - set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); - wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); - - // TMOV qi from L1→L0A (re-copy since TMATMUL consumed L0A) and kj from L1→L0B - TMOV(aTile, aMatTile); - TMOV(bTile, bMatTile); - - set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); - wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); - - TMATMUL(cTile, aTile, bTile); - - set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); - wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); - - TSTORE(sijGlobal, cTile); - - if (i + 1 < n_blocks) { - pipe_barrier(PIPE_ALL); - } - } - pipe_sync(); -} - -extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { - __gm__ TensorData *qi = reinterpret_cast<__gm__ TensorData *>(args[0]); - __gm__ TensorData *key_cache = reinterpret_cast<__gm__ TensorData *>(args[1]); - __gm__ TensorData *sij_buf = reinterpret_cast<__gm__ TensorData *>(args[2]); - uint64_t n_blocks = static_cast(args[3]); - __gm__ int32_t *block_table = reinterpret_cast<__gm__ int32_t *>(args[4]); - - __gm__ bfloat16_t *qi_base = reinterpret_cast<__gm__ bfloat16_t *>(qi->buffer.addr) + qi->start_offset; - __gm__ bfloat16_t *key_base = reinterpret_cast<__gm__ bfloat16_t *>(key_cache->buffer.addr); - __gm__ float *sij_base = reinterpret_cast<__gm__ float *>(sij_buf->buffer.addr) + sij_buf->start_offset; - - uint64_t q_tile_size = static_cast(qi->shapes[0]); - - if (q_tile_size == 16) { - qk_matmul_n_impl<16, 128, 128>(qi_base, key_base, sij_base, n_blocks, block_table); - } else { - qk_matmul_n_impl<64, 128, 64>(qi_base, key_base, sij_base, n_blocks, block_table); - } -} diff --git a/tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/kernels/aiv/aiv_hub.cpp b/tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/kernels/aiv/aiv_hub.cpp deleted file mode 100644 index 45f90aab3..000000000 --- a/tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/kernels/aiv/aiv_hub.cpp +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. 
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -#include -#include - -using namespace pto; - -#ifndef __gm__ -#define __gm__ -#endif - -#ifndef __aicore__ -#define __aicore__ [aicore] -#endif - -constexpr int M = 16; -constexpr int K = 16; -constexpr int N = 16; - -extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {} diff --git a/tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/kernels/aiv/aiv_online_update.cpp b/tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/kernels/aiv/aiv_online_update.cpp deleted file mode 100644 index a68908229..000000000 --- a/tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/kernels/aiv/aiv_online_update.cpp +++ /dev/null @@ -1,255 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -// Online Softmax Update + Normalize Kernel (AIV) -// -// Operates on full tiles where M=q_tile_size, N=head_dim (128): -// Case1: oi/oi_new are (16, 128), mij/lij/mi/li are 16-element vectors -// Case2: oi/oi_new are (64, 128), mij/lij/mi/li are 64-element vectors -// -// Scalar layout strategy using TRESHAPE (zero-copy UB reshape): -// Scalars loaded as DN ColMajor (M, 1) for TROWEXPANDMUL/TROWEXPANDDIV. -// For element-wise ops (TMAX, TSUB, TEXP, etc.), TRESHAPE to RowMajor (1, M). -// After arithmetic, TRESHAPE back to ColMajor (M, 1) for row-broadcast ops. -// This eliminates the GM round-trip (TSTORE ND → TLOAD DN) used in the original. 
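The accumulation itself is the standard online-softmax recurrence; the TRESHAPE trick only changes where the per-row scalars live, not the math. A hedged NumPy sketch of the recurrence (reference math, not the kernel's API; all row statistics are (M, 1) columns, oi/oi_new are (M, N)):

    import numpy as np

    def online_update_ref(mij, lij, oi_new, mi, li, oi, is_first, is_last):
        if is_first:                        # first block: copy straight through
            mi, li, oi = mij, lij, oi_new
        else:
            mi_new = np.maximum(mi, mij)    # TMAX
            alpha = np.exp(mi - mi_new)     # rescales the old accumulator
            beta = np.exp(mij - mi_new)     # rescales the incoming block
            li = alpha * li + beta * lij
            oi = alpha * oi + beta * oi_new  # TROWEXPANDMUL + TADD
            mi = mi_new
        dst = oi / li if is_last else None  # final normalization (TROWEXPANDDIV)
        return mi, li, oi, dst

Because alpha and beta depend only on row maxima, oi is never re-exponentiated; each block costs one rescale and one add.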
- -#include -#include - -#include "tensor.h" - -using namespace pto; - -#include "pipe_sync.h" - -#ifndef __gm__ -#define __gm__ -#endif - -#ifndef __aicore__ -#define __aicore__ [aicore] -#endif - -template -static __aicore__ void online_update_impl( - __gm__ TensorData *mij, __gm__ TensorData *lij, __gm__ TensorData *oi_new, __gm__ TensorData *mi, - __gm__ TensorData *li, __gm__ TensorData *oi, uint64_t is_first, uint64_t is_last, __gm__ TensorData *dst -) { - __gm__ float *mij_ptr = reinterpret_cast<__gm__ float *>(mij->buffer.addr); - __gm__ float *lij_ptr = reinterpret_cast<__gm__ float *>(lij->buffer.addr); - __gm__ float *oi_new_ptr = reinterpret_cast<__gm__ float *>(oi_new->buffer.addr); - __gm__ float *mi_ptr = reinterpret_cast<__gm__ float *>(mi->buffer.addr); - __gm__ float *li_ptr = reinterpret_cast<__gm__ float *>(li->buffer.addr); - __gm__ float *oi_ptr = reinterpret_cast<__gm__ float *>(oi->buffer.addr); - __gm__ float *dst_ptr = reinterpret_cast<__gm__ float *>(dst->buffer.addr); - - // Aligned rows for ColMajor DN tiles (32-byte alignment) - constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float)); - - // --- GlobalTensor types --- - - // Data (M, N) RowMajor - using GlobalDataMxN = GlobalTensor, Stride<1, 1, 1, N, 1>>; - - // Scalar DN: M contiguous floats as (kAlignedRows, 1) ColMajor for TROWEXPAND ops and loading - using GlobalScalarDN = GlobalTensor, Stride<1, 1, 1, 1, 1>, Layout::DN>; - - // Scalar ND: for storing mi_new and li_new back to GM - constexpr int kScalarCols = 32 / sizeof(float); - constexpr int kScalarRows = M / kScalarCols; - using GlobalScalarND = - GlobalTensor, Stride<1, 1, 1, kScalarCols, 1>>; - - // --- GlobalTensor instances --- - - GlobalDataMxN oiNewGlobal(oi_new_ptr + oi_new->start_offset); - GlobalDataMxN oiGlobal(oi_ptr + oi->start_offset); - GlobalDataMxN dstGlobal(dst_ptr + dst->start_offset); - - // DN globals for loading scalars as ColMajor - GlobalScalarDN mijGlobalDN(mij_ptr + mij->start_offset); - GlobalScalarDN lijGlobalDN(lij_ptr + lij->start_offset); - GlobalScalarDN miGlobalDN(mi_ptr + mi->start_offset); - GlobalScalarDN liGlobalDN(li_ptr + li->start_offset); - - // ND globals for storing scalar results - GlobalScalarND miGlobalND(mi_ptr + mi->start_offset); - GlobalScalarND liGlobalND(li_ptr + li->start_offset); - - // --- Tile types --- - - using TileDataMxN = Tile; - using TileScalarDN = Tile; - - // RowMajor (1, M) tiles for element-wise arithmetic via TRESHAPE - using TileScalarRow = Tile; - - // ND tile for storing back to GM - using TileScalarND = - Tile; - - // --- UB memory layout --- - - constexpr int kDataBytes = M * N * sizeof(float); - constexpr int kScalarDNBytes = kAlignedRows * sizeof(float); - - // Data tiles - TileDataMxN oiNewTile; - TileDataMxN oiTile; - - // Scalar DN tiles loaded from GM (ColMajor) - TileScalarDN mijDN, lijDN, miDN, liDN; - - // Temporary DN tiles for results - TileScalarDN miNewDN, alphaDN, betaDN, liNewDN, tmpDN; - - TASSIGN(oiNewTile, 0); - TASSIGN(oiTile, kDataBytes); - TASSIGN(mijDN, 2 * kDataBytes); - TASSIGN(lijDN, 2 * kDataBytes + kScalarDNBytes); - TASSIGN(miDN, 2 * kDataBytes + 2 * kScalarDNBytes); - TASSIGN(liDN, 2 * kDataBytes + 3 * kScalarDNBytes); - TASSIGN(miNewDN, 2 * kDataBytes + 4 * kScalarDNBytes); - TASSIGN(alphaDN, 2 * kDataBytes + 5 * kScalarDNBytes); - TASSIGN(betaDN, 2 * kDataBytes + 6 * kScalarDNBytes); - TASSIGN(liNewDN, 2 * kDataBytes + 7 * kScalarDNBytes); - TASSIGN(tmpDN, 2 * kDataBytes + 8 * kScalarDNBytes); - - if (is_first) { - // --- 
First block: copy inputs to accumulators --- - TLOAD(oiNewTile, oiNewGlobal); - TLOAD(mijDN, mijGlobalDN); - TLOAD(lijDN, lijGlobalDN); - set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - - // Store mi = mij, li = lij, oi = oi_new - // Alias ND tiles to same UB as DN tiles for ND-format store - TileScalarND mijND, lijND; - TASSIGN(mijND, 2 * kDataBytes); // alias same UB as mijDN - TASSIGN(lijND, 2 * kDataBytes + kScalarDNBytes); // alias same UB as lijDN - - set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - TSTORE(miGlobalND, mijND); // mi = mij - TSTORE(liGlobalND, lijND); // li = lij - TSTORE(oiGlobal, oiNewTile); // oi = oi_new - - if (is_last) { - // Single block: normalize dst = oi_new / lij - // lijDN already in ColMajor DN format, use directly for TROWEXPANDDIV - set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); - wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); - TROWEXPANDDIV(oiNewTile, oiNewTile, lijDN); - set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); - wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); - TSTORE(dstGlobal, oiNewTile); - } - } else { - // --- Subsequent blocks: accumulate --- - - // Load all inputs as DN (ColMajor) - TLOAD(oiNewTile, oiNewGlobal); - TLOAD(oiTile, oiGlobal); - TLOAD(mijDN, mijGlobalDN); - TLOAD(lijDN, lijGlobalDN); - TLOAD(miDN, miGlobalDN); - TLOAD(liDN, liGlobalDN); - set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - - // TRESHAPE: ColMajor(M,1) → RowMajor(1,M) for element-wise arithmetic - TileScalarRow miRow, mijRow, liRow, lijRow; - TRESHAPE(miRow, miDN); - TRESHAPE(mijRow, mijDN); - TRESHAPE(liRow, liDN); - TRESHAPE(lijRow, lijDN); - - // Scalar arithmetic in RowMajor (1, M) layout - TileScalarRow miNewRow, alphaRow, betaRow, liNewRow, tmpRow; - TASSIGN(miNewRow, 2 * kDataBytes + 4 * kScalarDNBytes); - TASSIGN(alphaRow, 2 * kDataBytes + 5 * kScalarDNBytes); - TASSIGN(betaRow, 2 * kDataBytes + 6 * kScalarDNBytes); - TASSIGN(liNewRow, 2 * kDataBytes + 7 * kScalarDNBytes); - TASSIGN(tmpRow, 2 * kDataBytes + 8 * kScalarDNBytes); - - TMAX(miNewRow, miRow, mijRow); // mi_new = max(mi, mij) - pipe_barrier(PIPE_V); - TSUB(alphaRow, miRow, miNewRow); // alpha_exp = mi - mi_new - pipe_barrier(PIPE_V); - TEXP(alphaRow, alphaRow); // alpha = exp(mi - mi_new) - pipe_barrier(PIPE_V); - TSUB(betaRow, mijRow, miNewRow); // beta_exp = mij - mi_new - pipe_barrier(PIPE_V); - TEXP(betaRow, betaRow); // beta = exp(mij - mi_new) - pipe_barrier(PIPE_V); - TMUL(tmpRow, alphaRow, liRow); // alpha * li - pipe_barrier(PIPE_V); - TMUL(liNewRow, betaRow, lijRow); // beta * lij - pipe_barrier(PIPE_V); - TADD(liNewRow, tmpRow, liNewRow); // li_new = alpha*li + beta*lij - - // TRESHAPE back: RowMajor(1,M) → ColMajor(M,1) for TROWEXPANDMUL - TRESHAPE(alphaDN, alphaRow); - TRESHAPE(betaDN, betaRow); - - // Scale data tiles using row-broadcast multiply - TROWEXPANDMUL(oiTile, oiTile, alphaDN); // oi *= alpha - TROWEXPANDMUL(oiNewTile, oiNewTile, betaDN); // oi_new *= beta - pipe_barrier(PIPE_V); - TADD(oiTile, oiTile, oiNewTile); // oi = alpha*oi + beta*oi_new - - // Store mi_new and li_new to GM (ND format) - // Alias ND tiles to the same UB locations as miNewRow and liNewRow - TileScalarND miNewND, liNewND; - TASSIGN(miNewND, 2 * kDataBytes + 4 * kScalarDNBytes); - TASSIGN(liNewND, 2 * kDataBytes + 7 * kScalarDNBytes); - - if (is_last) { - // Normalize and output: dst = oi / li_new - TRESHAPE(liNewDN, liNewRow); - pipe_barrier(PIPE_V); - TROWEXPANDDIV(oiTile, oiTile, liNewDN); - set_flag(PIPE_V, 
PIPE_MTE3, EVENT_ID0); - wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - TSTORE(miGlobalND, miNewND); // persist mi_new - TSTORE(liGlobalND, liNewND); // persist li_new - TSTORE(dstGlobal, oiTile); - } else { - // Store updated accumulators - set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - TSTORE(miGlobalND, miNewND); // persist mi_new - TSTORE(liGlobalND, liNewND); // persist li_new - TSTORE(oiGlobal, oiTile); - } - } - pipe_sync(); -} - -extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { - __gm__ TensorData *mij = reinterpret_cast<__gm__ TensorData *>(args[0]); - __gm__ TensorData *lij = reinterpret_cast<__gm__ TensorData *>(args[1]); - __gm__ TensorData *oi_new = reinterpret_cast<__gm__ TensorData *>(args[2]); - __gm__ TensorData *mi = reinterpret_cast<__gm__ TensorData *>(args[3]); - __gm__ TensorData *li = reinterpret_cast<__gm__ TensorData *>(args[4]); - __gm__ TensorData *oi = reinterpret_cast<__gm__ TensorData *>(args[5]); - __gm__ TensorData *dst = reinterpret_cast<__gm__ TensorData *>(args[6]); - uint64_t is_first = static_cast(args[7]); - uint64_t is_last = static_cast(args[8]); - uint64_t q_tile_size = static_cast(mij->shapes[0]); - // args[10] = head_dim (128) - - if (q_tile_size == 16) { - online_update_impl<16, 128>(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst); - } else { - online_update_impl<64, 128>(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst); - } -} diff --git a/tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/kernels/aiv/aiv_softmax_prepare.cpp b/tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/kernels/aiv/aiv_softmax_prepare.cpp deleted file mode 100644 index b484a0b8a..000000000 --- a/tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/kernels/aiv/aiv_softmax_prepare.cpp +++ /dev/null @@ -1,263 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -// Two-Pass Softmax Kernel (AIV) for n_blocks tiles -// -// Input: sij_buf (n_blocks * M, N) fp32 — QK results stacked vertically -// Output: pij_buf (n_blocks * M, N) bf16 — attention weights per block -// mij (M,) fp32 — global row max across all blocks -// lij (M,) fp32 — total row sum across all blocks -// -// Pass 1: Iterate over n_blocks tiles, apply scale, mask last block, -// find global m = max over all blocks of rowmax(S_i * scale) -// Uses TRESHAPE for DN↔Row conversion to keep globalMax in UB -// (eliminates 63 × 4 GM round-trip operations). -// Pass 2: Iterate again, compute P_i = exp(S_i * scale - m) -> bf16, -// accumulate l = sum over all blocks of rowsum(P_i) -// Uses double-buffered sij tiles to overlap TLOAD with computation. -// -// Two-pass ensures all P_i tiles share the same scale (global max), -// enabling direct TMATMUL_ACC accumulation in the PV kernel. 
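Equivalent reference math for the two passes (a NumPy sketch under the same shapes; the helper name is hypothetical, and the kernel's bf16 rounding of pij before the row sum is omitted):

    import numpy as np

    def two_pass_softmax_ref(sij_buf, scale, n_blocks, M, N, valid_len_last):
        s = sij_buf.reshape(n_blocks, M, N).astype(np.float32) * scale
        s[-1, :, valid_len_last:] = -np.inf  # mask the partial last block
        m = s.max(axis=(0, 2))               # Pass 1: one global max per query row
        pij = np.exp(s - m[None, :, None])   # Pass 2: every block shares the same max
        lij = pij.sum(axis=(0, 2))           # total row sum across all blocks
        return pij.reshape(n_blocks * M, N), m, lij

Sharing one max per row across all n_blocks tiles is what lets the PV kernel accumulate P_i @ V_i directly with TMATMUL_ACC and defer normalization to the online-update step.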
-// -// Supports two tile configurations via runtime dispatch: -// Case1: M=16, N=128 (q_tile=16, block_size=128) -// Case2: M=64, N=64 (q_tile=64, block_size=64) - -#include -#include - -#include "tensor.h" - -using namespace pto; - -#include "pipe_sync.h" - -#ifndef __gm__ -#define __gm__ -#endif - -#ifndef __aicore__ -#define __aicore__ [aicore] -#endif - -template -static __aicore__ void softmax_prepare_n_impl( - __gm__ float *sij_base, float scale_value, __gm__ bfloat16_t *pij_base, __gm__ float *mij_addr, - __gm__ float *lij_addr, uint64_t n_blocks, uint64_t valid_len_last -) { - constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float)); - constexpr int kScalarCols = 32 / sizeof(float); - constexpr int kScalarRows = M / kScalarCols; - - // --- GlobalTensor types --- - using GlobalDataMxN = GlobalTensor, Stride<1, 1, 1, N, 1>>; - using GlobalDataMxN_bf16 = GlobalTensor, Stride<1, 1, 1, N, 1>>; - using GlobalScalarDN = GlobalTensor, Stride<1, 1, 1, 1, 1>, Layout::DN>; - using GlobalScalarND = - GlobalTensor, Stride<1, 1, 1, kScalarCols, 1>>; - - // --- Tile types --- - using TileSijDyn = Tile; - using TileSijPad = Tile; - using TileVecMxN = Tile; - using TileVecMxN_bf16 = Tile; - using TileScalarDN = Tile; - using TileScalarND = - Tile; - // RowMajor (1, M) tile for element-wise arithmetic via TRESHAPE - using TileScalarRow = Tile; - - // --- UB memory layout (double-buffered sij) --- - constexpr int kDataBytes = M * N * sizeof(float); - constexpr int kScalarDNBytes = kAlignedRows * sizeof(float); - - // Double-buffered sij tiles - TileVecMxN sijTile_A; - TileSijPad sijPadTile_A; - TileVecMxN sijTile_B; - TileSijPad sijPadTile_B; - TileVecMxN pijTile; - TileVecMxN tmpTile; - TileVecMxN sumAccTile; - TileScalarDN localMaxDN; - TileScalarDN globalMaxDN; - TileScalarDN sumDN; - TileVecMxN_bf16 pijBf16Tile; - - // TRESHAPE aliases (same UB address as their DN counterparts) - TileScalarRow localMaxRow; - TileScalarRow globalMaxRow; - - // ND alias for storing globalMax to GM - TileScalarND globalMaxND; - - TASSIGN(sijTile_A, 0x0); - TASSIGN(sijPadTile_A, 0x0); - TASSIGN(sijTile_B, kDataBytes); - TASSIGN(sijPadTile_B, kDataBytes); - TASSIGN(pijTile, 2 * kDataBytes); - TASSIGN(tmpTile, 3 * kDataBytes); - TASSIGN(sumAccTile, 4 * kDataBytes); - int scalarBase = 5 * kDataBytes; - TASSIGN(localMaxDN, scalarBase); - TASSIGN(localMaxRow, scalarBase); // alias: same UB as localMaxDN - TASSIGN(globalMaxDN, scalarBase + kScalarDNBytes); - TASSIGN(globalMaxRow, scalarBase + kScalarDNBytes); // alias: same UB as globalMaxDN - TASSIGN(globalMaxND, scalarBase + kScalarDNBytes); // alias: same UB as globalMaxDN - TASSIGN(sumDN, scalarBase + 2 * kScalarDNBytes); - TASSIGN(pijBf16Tile, scalarBase + 3 * kScalarDNBytes); - - // GM aliases (mij/lij output buffers) - GlobalScalarND mijGlobalND(mij_addr); - GlobalScalarDN lijGlobalDN(lij_addr); - - // ======== Pass 1: Find global row max via TRESHAPE (no GM round-trip) ======== - for (uint64_t i = 0; i < n_blocks; i++) { - GlobalDataMxN sijGlobal(sij_base + i * M * N); - TLOAD(sijTile_A, sijGlobal); - set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - - if (i == n_blocks - 1 && valid_len_last < static_cast(N)) { - TileSijDyn sijDynTile(static_cast(valid_len_last)); - TASSIGN(sijDynTile, 0x0); - TFILLPAD_INPLACE(sijPadTile_A, sijDynTile); - } - - TMULS(sijTile_A, sijTile_A, scale_value); - pipe_barrier(PIPE_V); - TROWMAX(localMaxDN, sijTile_A, tmpTile); - - // TRESHAPE: ColMajor(M,1) → RowMajor(1,M) for 
element-wise TMAX - TRESHAPE(localMaxRow, localMaxDN); - if (i == 0) { - pipe_barrier(PIPE_V); - TMAX(globalMaxRow, localMaxRow, localMaxRow); - } else { - pipe_barrier(PIPE_V); - TMAX(globalMaxRow, globalMaxRow, localMaxRow); - } - } - - // TRESHAPE back: RowMajor(1,M) → ColMajor(M,1) for Pass 2's TROWEXPANDSUB - TRESHAPE(globalMaxDN, globalMaxRow); - - // Store final global max to mij for online_update to consume - set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - TSTORE(mijGlobalND, globalMaxND); - - // ======== Pass 2: Compute softmax with double-buffered sij ======== - // globalMaxDN is already in UB from TRESHAPE — no reload needed. - // Sync MTE3→MTE2 to ensure the mij TSTORE completed before first sij TLOAD. - set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); - wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); - - // Pre-load first sij tile into buffer A - GlobalDataMxN sijGlobal_0(sij_base); - TLOAD(sijTile_A, sijGlobal_0); - - for (uint64_t i = 0; i < n_blocks; i++) { - GlobalDataMxN_bf16 pijGlobal(pij_base + i * M * N); - - // Wait for current tile's TLOAD to complete - set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - - // TFILLPAD on current buffer if last block with partial valid length - if (i == n_blocks - 1 && valid_len_last < static_cast(N)) { - TileSijDyn curSijDyn(static_cast(valid_len_last)); - if (i % 2 == 0) { - TASSIGN(curSijDyn, 0x0); - TFILLPAD_INPLACE(sijPadTile_A, curSijDyn); - } else { - TASSIGN(curSijDyn, static_cast(kDataBytes)); - TFILLPAD_INPLACE(sijPadTile_B, curSijDyn); - } - } - - // Compute on current buffer (select A or B based on iteration parity) - if (i % 2 == 0) { - TMULS(sijTile_A, sijTile_A, scale_value); - pipe_barrier(PIPE_V); - TROWEXPANDSUB(pijTile, sijTile_A, globalMaxDN); - } else { - TMULS(sijTile_B, sijTile_B, scale_value); - pipe_barrier(PIPE_V); - TROWEXPANDSUB(pijTile, sijTile_B, globalMaxDN); - } - pipe_barrier(PIPE_V); - TEXP(pijTile, pijTile); - TCVT(pijBf16Tile, pijTile, RoundMode::CAST_ROUND); - TCVT(pijTile, pijBf16Tile, RoundMode::CAST_ROUND); - - if (i == 0) { - TMULS(sumAccTile, pijTile, 1.0f); - } else { - TADD(sumAccTile, sumAccTile, pijTile); - } - - // Store pij (must complete before next iteration's TCVT overwrites pijBf16Tile) - set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - TSTORE(pijGlobal, pijBf16Tile); - - // Prefetch next sij into alternate buffer (after TSTORE to avoid UB race) - if (i + 1 < n_blocks) { - set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); - wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); - GlobalDataMxN sijGlobal_next(sij_base + (i + 1) * M * N); - if (i % 2 == 0) { - TLOAD(sijTile_B, sijGlobal_next); - } else { - TLOAD(sijTile_A, sijGlobal_next); - } - } - } - - // Compute final row sum from accumulated pij values - pipe_barrier(PIPE_V); - TROWSUM(sumDN, sumAccTile, tmpTile); - - // Store lij (total sum). mij already stored after Pass 1. 
- set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - TSTORE(lijGlobalDN, sumDN); - - pipe_sync(); -} - -extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { - __gm__ TensorData *sij_buf = reinterpret_cast<__gm__ TensorData *>(args[0]); - __gm__ TensorData *pij_buf = reinterpret_cast<__gm__ TensorData *>(args[1]); - __gm__ TensorData *mij = reinterpret_cast<__gm__ TensorData *>(args[2]); - __gm__ TensorData *lij = reinterpret_cast<__gm__ TensorData *>(args[3]); - union { - uint64_t u; - float f; - } scale_conv; - scale_conv.u = static_cast(args[4]); - float scale_value = scale_conv.f; - uint64_t n_blocks = static_cast(args[5]); - uint64_t valid_len_last = static_cast(args[6]); - - __gm__ float *sij_base = reinterpret_cast<__gm__ float *>(sij_buf->buffer.addr) + sij_buf->start_offset; - __gm__ bfloat16_t *pij_base = reinterpret_cast<__gm__ bfloat16_t *>(pij_buf->buffer.addr) + pij_buf->start_offset; - __gm__ float *mij_addr = reinterpret_cast<__gm__ float *>(mij->buffer.addr) + mij->start_offset; - __gm__ float *lij_addr = reinterpret_cast<__gm__ float *>(lij->buffer.addr) + lij->start_offset; - - uint64_t q_tile_size = static_cast(sij_buf->shapes[0]); - - if (q_tile_size == 16) { - softmax_prepare_n_impl<16, 128>(sij_base, scale_value, pij_base, mij_addr, lij_addr, n_blocks, valid_len_last); - } else { - softmax_prepare_n_impl<64, 64>(sij_base, scale_value, pij_base, mij_addr, lij_addr, n_blocks, valid_len_last); - } -} diff --git a/tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp b/tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp deleted file mode 100644 index d1b8a7c1d..000000000 --- a/tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp +++ /dev/null @@ -1,370 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -/** - * Paged Attention Orchestration — N_UNROLL=64, 4 Tasks Per Group - * (aicpu_build_graph variant: explicit add_dependency, no TensorMap) - * - * Batches up to N_UNROLL blocks per group. Each group submits exactly 4 tasks: - * 1. QK matmul: qi @ K^T for n_blocks → sij_buf (q_tile, n_blocks * block_size) - * 2. Softmax: two-pass over sij_buf → pij_buf, mi, li - * 3. PV matmul: SplitK accumulated P @ V → oi_new (q_tile, head_dim) - * 4. 
Update: online softmax accumulation with group-level mi, li, oi_new - * - * Dependency graph per group: - * QK → Softmax → PV → Update - * └──────────→ Update - * Update(prev group) ──→ Update(this group) - * Hub(init) ────────────→ Update(first group) - */ - -#include -#include -#include - -#include "pto_orchestration_api.h" // NOLINT(build/include_subdir) - -#define N_UNROLL 64 - -#define FUNC_QK_MATMUL 0 -#define FUNC_SOFTMAX_PREPARE 1 -#define FUNC_PV_MATMUL 2 -#define FUNC_ONLINE_UPDATE 3 -#define FUNC_AIC_HUB 4 -#define FUNC_AIV_HUB 5 - -constexpr uint64_t PLATFORM_PROF_SYS_CNT_FREQ = 50000000; // 50 MHz - -inline double cycles_to_us(uint64_t cycles) { - return (static_cast(cycles) / PLATFORM_PROF_SYS_CNT_FREQ) * 1000000.0; -} - -inline uint64_t get_sys_cnt_aicpu() { - uint64_t ticks; - asm volatile("mrs %0, cntvct_el0" : "=r"(ticks)); - return ticks; -} - -#ifdef ENABLE_PROFILING -#define CYCLE_COUNT_START() uint64_t _t0 = get_sys_cnt_aicpu(), _t1 -#define CYCLE_COUNT_LAP(acc) \ - do { \ - _t1 = get_sys_cnt_aicpu(); \ - acc += (_t1 - _t0); \ - _t0 = _t1; \ - } while (0) -#else -#define CYCLE_COUNT_START() (void)0 -#define CYCLE_COUNT_LAP(acc) (void)0 -#endif - -extern "C" { -/** - * Orchestration config — the executor reads these values to set up - * shared memory and runtime before calling aicpu_orchestration_entry. - */ -__attribute__((visibility("default"))) PTO2OrchestrationConfig -aicpu_orchestration_config(const ChipStorageTaskArgs &orch_args) { - (void)orch_args; // NOLINT(readability/casting) - return PTO2OrchestrationConfig{ - .expected_arg_count = 7, - }; -} - -__attribute__((visibility("default"))) void -aicpu_orchestration_entry(PTO2Runtime *rt, const ChipStorageTaskArgs &orch_args) { -#ifdef ENABLE_PROFILING - uint64_t prof_param_extract = 0; - uint64_t prof_ext_tensor = 0; - uint64_t prof_make_tensor = 0; - uint64_t prof_tensor_view = 0; - uint64_t prof_param_setup = 0; - uint64_t prof_submit_task = 0; - uint64_t prof_scope_and_loop = 0; - int prof_submit_count = 0; - int prof_make_count = 0; - int prof_view_count = 0; -#endif - - CYCLE_COUNT_START(); - - // Read dimensions from tensor metadata - // query: shape=[batch, num_heads, head_dim] - uint64_t batch = orch_args.tensor(0).shapes[0]; - uint64_t num_heads = orch_args.tensor(0).shapes[1]; - uint64_t head_dim = orch_args.tensor(0).shapes[2]; - DataType data_type = orch_args.tensor(0).dtype; - - // key_cache: shape=[total_blocks, block_size, kv_head_num, head_dim] - uint64_t block_size = orch_args.tensor(1).shapes[1]; - - // block_table: shape=[batch, max_num_blocks_per_req] - uint64_t block_num = orch_args.tensor(3).shapes[1]; - - // scale from scalar arg - uint64_t scale_value = orch_args.scalar(0); - - uint64_t q_head_num = num_heads; - uint64_t q_tile = std::min(num_heads, static_cast(128)); - uint64_t q_loop = (q_head_num + q_tile - 1) / q_tile; - CYCLE_COUNT_LAP(prof_param_extract); - - // Reshape tensors for kernel consumption (2D flattened) - void *query_ptr = orch_args.tensor(0).data_as(); - void *kc_ptr = orch_args.tensor(1).data_as(); - void *vc_ptr = orch_args.tensor(2).data_as(); - void *out_ptr = orch_args.tensor(5).data_as(); - - uint64_t total_blocks_count = orch_args.tensor(1).shapes[0]; - - uint32_t query_shapes[2] = {static_cast(batch * num_heads), static_cast(head_dim)}; - uint32_t key_cache_shapes[2] = { - static_cast(total_blocks_count * block_size), static_cast(head_dim) - }; - uint32_t value_cache_shapes[2] = { - static_cast(total_blocks_count * block_size), static_cast(head_dim) - }; - 
uint32_t out_shapes[2] = {static_cast(batch * num_heads), static_cast(head_dim)}; - Tensor query = make_tensor_external(query_ptr, query_shapes, 2, data_type, false); - Tensor key_cache = make_tensor_external(kc_ptr, key_cache_shapes, 2, data_type, false); - Tensor value_cache = make_tensor_external(vc_ptr, value_cache_shapes, 2, data_type, false); - Tensor out = make_tensor_external(out_ptr, out_shapes, 2, DataType::FLOAT32); - - int *host_block_table = orch_args.tensor(3).data_as(); - int *host_context_lens = orch_args.tensor(4).data_as(); - -#ifdef ENABLE_PROFILING - CYCLE_COUNT_LAP(prof_ext_tensor); -#endif - - // Prefetch first batch's block table data into cache (4 cache lines = 256 bytes) - for (int cl = 0; cl < N_UNROLL * static_cast(sizeof(int)); cl += 64) { - __builtin_prefetch(reinterpret_cast(host_block_table) + cl, 0, 3); - } - __builtin_prefetch(&host_context_lens[0], 0, 3); - - for (uint64_t b_idx = 0; b_idx < batch; b_idx++) { - uint64_t cur_seq = host_context_lens[b_idx]; - uint64_t bn_this_batch = (cur_seq + block_size - 1) / block_size; - // Pre-compute block table base pointer for this batch - int *bt_base = host_block_table + b_idx * block_num; - - // Prefetch next batch's block table + context_lens while processing current batch - if (b_idx + 1 < batch) { - int *bt_next = host_block_table + (b_idx + 1) * block_num; - for (int cl = 0; cl < N_UNROLL * static_cast(sizeof(int)); cl += 64) { - __builtin_prefetch(reinterpret_cast(bt_next) + cl, 0, 3); - } - __builtin_prefetch(&host_context_lens[b_idx + 1], 0, 3); - } - for (uint64_t q_idx = 0; q_idx < q_loop; q_idx++) { - CYCLE_COUNT_LAP(prof_scope_and_loop); - PTO2_SCOPE(rt) { - uint64_t cur_offset = b_idx * q_head_num + q_idx * q_tile; - - uint32_t oi_shapes[2] = {static_cast(q_tile), static_cast(head_dim)}; - uint32_t li_shapes[1] = {static_cast(q_tile)}; - uint32_t mi_shapes[1] = {static_cast(q_tile)}; - -#ifdef ENABLE_PROFILING - prof_make_count += 3; - CYCLE_COUNT_LAP(prof_make_tensor); -#endif - - uint32_t qi_shapes[2] = {static_cast(q_tile), static_cast(head_dim)}; - uint32_t qi_offsets[2] = {static_cast(cur_offset), 0}; - Tensor qi = query.view(qi_shapes, qi_offsets); - uint32_t out_view_shapes[2] = {static_cast(q_tile), static_cast(head_dim)}; - uint32_t out_view_offsets[2] = {static_cast(cur_offset), 0}; - Tensor out_view = out.view(out_view_shapes, out_view_offsets); -#ifdef ENABLE_PROFILING - prof_view_count += 2; - CYCLE_COUNT_LAP(prof_tensor_view); -#endif - // Hub task: zero-initialize oi, li_update, mi_update - Arg args_inplace; - args_inplace.add_output(TensorCreateInfo(oi_shapes, 2, DataType::FLOAT32)); - args_inplace.add_output(TensorCreateInfo(li_shapes, 1, DataType::FLOAT32)); - args_inplace.add_output(TensorCreateInfo(mi_shapes, 1, DataType::FLOAT32)); - CYCLE_COUNT_LAP(prof_param_setup); - SubmitResult r_hub = rt_submit_aiv_task(rt, FUNC_AIV_HUB, args_inplace); - const Tensor &oi = r_hub.outputs.get_ref(0); - const Tensor &li_update = r_hub.outputs.get_ref(1); - const Tensor &mi_update = r_hub.outputs.get_ref(2); -#ifdef ENABLE_PROFILING - prof_submit_count++; - CYCLE_COUNT_LAP(prof_submit_task); -#endif - - // Reusable Arg objects — reset() before each use avoids - // repeated stack-frame construction in the inner loop. 
-                // Reusable Arg objects — reset() before each use avoids
-                // repeated stack-frame construction in the inner loop.
-                Arg args_qk, args_sf, args_pv, args_up;
-
-                PTO2TaskId prev_update_task = r_hub.task_id;
-
-                for (uint64_t bn = 0; bn < bn_this_batch; bn += N_UNROLL) {
-                    uint64_t n_blocks = std::min(static_cast<uint64_t>(N_UNROLL), bn_this_batch - bn);
-
-                    // Valid length for last block in this group
-                    uint64_t last_block_seq_start = (bn + n_blocks - 1) * block_size;
-                    uint64_t valid_len_last = std::min(block_size, cur_seq - last_block_seq_start);
-                    CYCLE_COUNT_LAP(prof_param_extract);
-
-                    // === Task 1: Batched QK matmul ===
-                    uint32_t sij_buf_shapes[2] = {
-                        static_cast<uint32_t>(q_tile), static_cast<uint32_t>(n_blocks * block_size)
-                    };
-
-#ifdef ENABLE_PROFILING
-                    prof_make_count += 1;
-                    CYCLE_COUNT_LAP(prof_make_tensor);
-#endif
-
-                    args_qk.reset();
-                    args_qk.add_input(qi);
-                    args_qk.add_input(key_cache);
-                    args_qk.add_output(TensorCreateInfo(sij_buf_shapes, 2, DataType::FLOAT32));
-                    args_qk.add_scalar(n_blocks);
-                    args_qk.add_scalar(reinterpret_cast<uint64_t>(bt_base + bn));
-                    CYCLE_COUNT_LAP(prof_param_setup);
-                    SubmitResult r_qk = rt_submit_aic_task(rt, FUNC_QK_MATMUL, args_qk);
-#ifdef ENABLE_PROFILING
-                    prof_submit_count++;
-                    CYCLE_COUNT_LAP(prof_submit_task);
-#endif
-
-                    // === Task 2: Two-pass softmax over all blocks in group ===
-                    uint32_t pij_buf_shapes[2] = {
-                        static_cast<uint32_t>(q_tile), static_cast<uint32_t>(n_blocks * block_size)
-                    };
-#ifdef ENABLE_PROFILING
-                    prof_make_count += 3;
-                    CYCLE_COUNT_LAP(prof_make_tensor);
-#endif
-
-                    args_sf.reset();
-                    args_sf.add_input(r_qk.outputs.get_ref(0));
-                    args_sf.add_output(TensorCreateInfo(pij_buf_shapes, 2, data_type));
-                    args_sf.add_output(TensorCreateInfo(mi_shapes, 1, DataType::FLOAT32));
-                    args_sf.add_output(TensorCreateInfo(li_shapes, 1, DataType::FLOAT32));
-                    args_sf.add_scalar(scale_value);
-                    args_sf.add_scalar(n_blocks);
-                    args_sf.add_scalar(valid_len_last);
-                    CYCLE_COUNT_LAP(prof_param_setup);
-                    SubmitResult r_sf = rt_submit_aiv_task(rt, FUNC_SOFTMAX_PREPARE, args_sf);
-                    // QK → Softmax (sij_buf)
-                    rt_add_dependency(rt, r_qk.task_id, r_sf.task_id);
-#ifdef ENABLE_PROFILING
-                    prof_submit_count++;
-                    CYCLE_COUNT_LAP(prof_submit_task);
-#endif
-
-                    // === Task 3: SplitK PV matmul (accumulated P @ V) ===
-                    uint32_t oi_new_shapes[2] = {static_cast<uint32_t>(q_tile), static_cast<uint32_t>(head_dim)};
-#ifdef ENABLE_PROFILING
-                    prof_make_count += 1;
-                    CYCLE_COUNT_LAP(prof_make_tensor);
-#endif
-
-                    args_pv.reset();
-                    args_pv.add_input(r_sf.outputs.get_ref(0));
-                    args_pv.add_input(value_cache);
-                    args_pv.add_output(TensorCreateInfo(oi_new_shapes, 2, DataType::FLOAT32));
-                    args_pv.add_scalar(n_blocks);
-                    args_pv.add_scalar(reinterpret_cast<uint64_t>(bt_base + bn));
-                    CYCLE_COUNT_LAP(prof_param_setup);
-                    SubmitResult r_pv = rt_submit_aic_task(rt, FUNC_PV_MATMUL, args_pv);
-                    // Softmax → PV (pij_buf)
-                    rt_add_dependency(rt, r_sf.task_id, r_pv.task_id);
-#ifdef ENABLE_PROFILING
-                    prof_submit_count++;
-                    CYCLE_COUNT_LAP(prof_submit_task);
-#endif
-
-                    // === Task 4: Online update (per-group) ===
-                    uint64_t is_first = (bn == 0) ? 1 : 0;
-                    uint64_t is_last = (bn + n_blocks >= bn_this_batch) ?
1 : 0; - - args_up.reset(); - args_up.add_input(r_sf.outputs.get_ref(1)); - args_up.add_input(r_sf.outputs.get_ref(2)); - args_up.add_input(r_pv.outputs.get_ref(0)); - args_up.add_inout(mi_update); - args_up.add_inout(li_update); - args_up.add_inout(oi); - args_up.add_inout(out_view); - args_up.add_scalar(is_first); - args_up.add_scalar(is_last); - CYCLE_COUNT_LAP(prof_param_setup); - SubmitResult r_up = rt_submit_aiv_task(rt, FUNC_ONLINE_UPDATE, args_up); - // Softmax → Update (mi, li) - rt_add_dependency(rt, r_sf.task_id, r_up.task_id); - // PV → Update (oi_new) - rt_add_dependency(rt, r_pv.task_id, r_up.task_id); - // Previous update → this update (mi_update, li_update, oi accumulation chain) - rt_add_dependency(rt, prev_update_task, r_up.task_id); -#ifdef ENABLE_PROFILING - prof_submit_count++; - CYCLE_COUNT_LAP(prof_submit_task); -#endif - prev_update_task = r_up.task_id; - } - } - CYCLE_COUNT_LAP(prof_scope_and_loop); - } - } - CYCLE_COUNT_LAP(prof_scope_and_loop); - -#ifdef ENABLE_PROFILING - uint64_t total = prof_param_extract + prof_ext_tensor + prof_make_tensor + prof_tensor_view + prof_param_setup + - prof_submit_task + prof_scope_and_loop; - LOG_ALWAYS( - rt, "=== PagedAttn Orch Profiling: %d submits, %d makes, %d views, total=%.3fus ===", prof_submit_count, - prof_make_count, prof_view_count, cycles_to_us(total) - ); - if (total > 0) { - LOG_ALWAYS( - rt, " param_extract : %7.3fus (%5.1f%%)", cycles_to_us(prof_param_extract), - prof_param_extract * 100.0 / total - ); - LOG_ALWAYS( - rt, " ext_tensor(x4) : %7.3fus (%5.1f%%)", cycles_to_us(prof_ext_tensor), prof_ext_tensor * 100.0 / total - ); - LOG_ALWAYS( - rt, " make_tensor(x%d) : %7.3fus (%5.1f%%) avg=%.3fus", prof_make_count, cycles_to_us(prof_make_tensor), - prof_make_tensor * 100.0 / total, - prof_make_count > 0 ? cycles_to_us(prof_make_tensor) / prof_make_count : 0.0 - ); - LOG_ALWAYS( - rt, " tensor_view(x%d) : %7.3fus (%5.1f%%) avg=%.3fus", prof_view_count, cycles_to_us(prof_tensor_view), - prof_tensor_view * 100.0 / total, - prof_view_count > 0 ? cycles_to_us(prof_tensor_view) / prof_view_count : 0.0 - ); - LOG_ALWAYS( - rt, " param_setup : %7.3fus (%5.1f%%)", cycles_to_us(prof_param_setup), - prof_param_setup * 100.0 / total - ); - LOG_ALWAYS( - rt, " submit_task(x%d) : %7.3fus (%5.1f%%) avg=%.3fus", prof_submit_count, cycles_to_us(prof_submit_task), - prof_submit_task * 100.0 / total, - prof_submit_count > 0 ? cycles_to_us(prof_submit_task) / prof_submit_count : 0.0 - ); - LOG_ALWAYS( - rt, " scope_and_loop : %7.3fus (%5.1f%%)", cycles_to_us(prof_scope_and_loop), - prof_scope_and_loop * 100.0 / total - ); - } -#endif - -#undef CYCLE_COUNT_START -#undef CYCLE_COUNT_LAP -} - -} // extern "C" diff --git a/tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/test_paged_attention_unroll.py b/tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/test_paged_attention_unroll.py deleted file mode 100644 index d0b982df0..000000000 --- a/tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/test_paged_attention_unroll.py +++ /dev/null @@ -1,152 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) PyPTO Contributors. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. 
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. -# ----------------------------------------------------------------------------------------------------------- -"""Paged attention unroll — aicpu_build_graph runtime (production scale, bfloat16). - -Tests aicpu_build_graph runtime with N_UNROLL=64, hub kernels (aic_hub, aiv_hub), -INOUT tensors, and AIC+AIV mixed execution. -""" - -import torch -from simpler.task_interface import ArgDirection as D - -from simpler_setup import Scalar, SceneTestCase, TaskArgsBuilder, Tensor, scene_test -from simpler_setup.goldens.paged_attention import compute_golden as _pa_compute_golden # noqa: PLC0415 -from simpler_setup.goldens.paged_attention import generate_inputs as _pa_generate_inputs # noqa: PLC0415 - - -@scene_test(level=2, runtime="aicpu_build_graph") -class TestPagedAttentionUnrollAicpuBuildGraph(SceneTestCase): - """Paged attention unroll with aicpu_build_graph runtime and hub kernels.""" - - RTOL = 1e-3 - ATOL = 1e-3 - - CALLABLE = { - "orchestration": { - "source": "kernels/orchestration/paged_attention_orch.cpp", - "function_name": "aicpu_orchestration_entry", - "signature": [D.IN, D.IN, D.IN, D.IN, D.IN, D.OUT], - }, - "incores": [ - { - "func_id": 0, - "name": "QK", - "source": "kernels/aic/aic_qk_matmul.cpp", - "core_type": "aic", - "signature": [D.IN, D.IN, D.OUT], - }, - { - "func_id": 2, - "name": "PV", - "source": "kernels/aic/aic_pv_matmul.cpp", - "core_type": "aic", - "signature": [D.IN, D.IN, D.OUT], - }, - { - "func_id": 4, - "name": "AIC_HUB", - "source": "kernels/aic/aic_hub.cpp", - "core_type": "aic", - "signature": [], - }, - { - "func_id": 1, - "name": "SF", - "source": "kernels/aiv/aiv_softmax_prepare.cpp", - "core_type": "aiv", - "signature": [D.IN, D.OUT, D.OUT, D.OUT], - }, - { - "func_id": 3, - "name": "UP", - "source": "kernels/aiv/aiv_online_update.cpp", - "core_type": "aiv", - "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT], - }, - { - "func_id": 5, - "name": "AIV_HUB", - "source": "kernels/aiv/aiv_hub.cpp", - "core_type": "aiv", - "signature": [], - }, - ], - } - - CASES = [ - { - "name": "Case1", - "platforms": ["a2a3"], - "config": {"aicpu_thread_num": 4, "block_dim": 24}, - "params": { - "batch": 256, - "num_heads": 16, - "kv_head_num": 1, - "head_dim": 128, - "block_size": 128, - "context_len": 8192, - "max_model_len": 32768, - "dtype": "bfloat16", - }, - }, - { - "name": "Case2", - "platforms": ["a2a3"], - "config": {"aicpu_thread_num": 4, "block_dim": 24}, - "manual": True, - "params": { - "batch": 64, - "num_heads": 64, - "kv_head_num": 1, - "head_dim": 128, - "block_size": 64, - "context_len": 8192, - "max_model_len": 32768, - "dtype": "bfloat16", - }, - }, - { - "name": "Case3", - "platforms": ["a2a3"], - "config": {"aicpu_thread_num": 4, "block_dim": 24}, - "manual": True, - "params": { - "batch": 64, - "num_heads": 64, - "kv_head_num": 1, - "head_dim": 256, - "block_size": 64, - "context_len": 8192, - "max_model_len": 32768, - "dtype": "bfloat16", - }, - }, - ] - - def generate_args(self, params): - inputs = _pa_generate_inputs(params) - specs = [] - for name, val in inputs: - if isinstance(val, torch.Tensor): - specs.append(Tensor(name, val)) - else: - specs.append(Scalar(name, val)) - return TaskArgsBuilder(*specs) - - def 
compute_golden(self, args, params): - tensors = {s.name: s.value for s in args.specs if isinstance(s, Tensor)} - _pa_compute_golden(tensors, params) - for s in args.specs: - if isinstance(s, Tensor) and s.name in tensors: - getattr(args, s.name)[:] = tensors[s.name] - - -if __name__ == "__main__": - SceneTestCase.run_module(__name__) diff --git a/tests/st/a2a3/aicpu_build_graph/vector_example/README.md b/tests/st/a2a3/aicpu_build_graph/vector_example/README.md deleted file mode 100644 index 13106bc68..000000000 --- a/tests/st/a2a3/aicpu_build_graph/vector_example/README.md +++ /dev/null @@ -1,21 +0,0 @@ -# PTO Runtime Example - AICPU Builds Graph (aicpu_build_graph) - -This example runs the same computation as `host_build_graph_example`, but the task graph is built on **AICPU** (1 builder thread) while scheduling/execution runs on **AICPU** (3 scheduler threads), for a total of **4** AICPU threads. - -## Run (simulation) - -```bash -python tests/st/a2a3/aicpu_build_graph/vector_example/test_vector_example.py -p a2a3sim - -# Or via pytest -pytest tests/st/a2a3/aicpu_build_graph/vector_example --platform a2a3sim -``` - -## Key difference vs host_build_graph/vector_example - -- The framework (`init_runtime_impl`) automatically manages I/O tensor device memory - using `arg_types`/`arg_sizes` and populates `runtime->orch_args[]`. -- `kernels/aicpu/orchestration.cpp` is compiled into a small AICPU-side plugin `.so`. - - The framework embeds the plugin bytes into `Runtime`. - - The AICPU runtime `dlopen()`s the embedded plugin and calls `orchestration(Runtime*)` on device. - - The orchestration allocates intermediate tensors via `api.device_malloc()` (HBM) and builds the task graph. diff --git a/tests/st/a2a3/aicpu_build_graph/vector_example/kernels/aiv/kernel_add.cpp b/tests/st/a2a3/aicpu_build_graph/vector_example/kernels/aiv/kernel_add.cpp deleted file mode 100644 index 50954fdf9..000000000 --- a/tests/st/a2a3/aicpu_build_graph/vector_example/kernels/aiv/kernel_add.cpp +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. 
- * -----------------------------------------------------------------------------------------------------------
- */
-/**
- * Element-wise Tensor Addition Kernel
- *
- * Implements: out[i] = src0[i] + src1[i]
- */
-
-#include <cstddef>
-#include <cstdint>
-
-#include "tensor.h"
-
-using namespace pto;
-
-#include "pipe_sync.h"
-
-#ifndef __gm__
-#define __gm__
-#endif
-
-#ifndef __aicore__
-#define __aicore__ [aicore]
-#endif
-
-extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) {
-    __gm__ Tensor *src0_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
-    __gm__ Tensor *src1_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]);
-    __gm__ Tensor *out_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]);
-    __gm__ float *src0 = reinterpret_cast<__gm__ float *>(src0_tensor->buffer.addr) + src0_tensor->start_offset;
-    __gm__ float *src1 = reinterpret_cast<__gm__ float *>(src1_tensor->buffer.addr) + src1_tensor->start_offset;
-    __gm__ float *out = reinterpret_cast<__gm__ float *>(out_tensor->buffer.addr) + out_tensor->start_offset;
-
-    constexpr int kTRows_ = 128;
-    constexpr int kTCols_ = 128;
-    constexpr int vRows = 128;
-    constexpr int vCols = 128;
-
-    using DynShapeDim5 = Shape<1, 1, 1, vRows, vCols>;
-    using DynStridDim5 = Stride<1, 1, 1, kTCols_, 1>;
-    using GlobalData = GlobalTensor<float, DynShapeDim5, DynStridDim5>;
-    using TileData = Tile<float, kTRows_, kTCols_>;
-
-    TileData src0Tile(vRows, vCols);
-    TileData src1Tile(vRows, vCols);
-    TileData dstTile(vRows, vCols);
-    TASSIGN(src0Tile, 0x0);
-    TASSIGN(src1Tile, 0x10000);
-    TASSIGN(dstTile, 0x20000);
-
-    GlobalData src0Global(src0);
-    GlobalData src1Global(src1);
-    GlobalData dstGlobal(out);
-
-    TLOAD(src0Tile, src0Global);
-    TLOAD(src1Tile, src1Global);
-    set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
-    wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
-    TADD(dstTile, src0Tile, src1Tile);
-    set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
-    wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
-    TSTORE(dstGlobal, dstTile);
-
-    pipe_sync();
-}
diff --git a/tests/st/a2a3/aicpu_build_graph/vector_example/kernels/aiv/kernel_add_scalar.cpp b/tests/st/a2a3/aicpu_build_graph/vector_example/kernels/aiv/kernel_add_scalar.cpp
deleted file mode 100644
index 72f1fbde4..000000000
--- a/tests/st/a2a3/aicpu_build_graph/vector_example/kernels/aiv/kernel_add_scalar.cpp
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-/**
- * Scalar Addition Kernel
- *
- * Implements: out[i] = src[i] + scalar
- */
-
-#include <cstddef>
-#include <cstdint>
-
-#include "tensor.h"
-
-using namespace pto;
-
-#include "pipe_sync.h"
-
-#ifndef __gm__
-#define __gm__
-#endif
-
-#ifndef __aicore__
-#define __aicore__ [aicore]
-#endif
-
-extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) {
-    __gm__ Tensor *src_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
-    __gm__ Tensor *out_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]);
-    __gm__ float *src = reinterpret_cast<__gm__ float *>(src_tensor->buffer.addr) + src_tensor->start_offset;
-    __gm__ float *out = reinterpret_cast<__gm__ float *>(out_tensor->buffer.addr) + out_tensor->start_offset;
-
-    union {
-        uint64_t u64;
-        float f32;
-    } converter;
-    converter.u64 = args[2];
-    float scalar = converter.f32;
-
-    constexpr int kTRows_ = 128;
-    constexpr int kTCols_ = 128;
-    constexpr int vRows = 128;
-    constexpr int vCols = 128;
-
-    using DynShapeDim5 = Shape<1, 1, 1, vRows, vCols>;
-    using DynStridDim5 = Stride<1, 1, 1, kTCols_, 1>;
-    using GlobalData = GlobalTensor<float, DynShapeDim5, DynStridDim5>;
-    using TileData = Tile<float, kTRows_, kTCols_>;
-
-    TileData srcTile(vRows, vCols);
-    TileData dstTile(vRows, vCols);
-    TASSIGN(srcTile, 0x0);
-    TASSIGN(dstTile, 0x10000);
-
-    GlobalData srcGlobal(src);
-    GlobalData dstGlobal(out);
-
-    TLOAD(srcTile, srcGlobal);
-    set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
-    wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
-    TADDS(dstTile, srcTile, scalar);
-    set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
-    wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
-    TSTORE(dstGlobal, dstTile);
-
-    pipe_sync();
-}
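// The kernel above receives its float through the int64 args array. The host
// side (the orchestration's add_scalar(1.0f)) is assumed to pack it the same
// way; a standalone sketch of the round trip (helper names are illustrative):
#include <cstdint>
#include <cstring>

inline int64_t pack_scalar(float v) {
    uint64_t bits = 0;
    std::memcpy(&bits, &v, sizeof(v));  // well-defined alternative to the union
    return static_cast<int64_t>(bits);
}

inline float unpack_scalar(int64_t arg) {
    union { uint64_t u64; float f32; } converter;  // same trick as the kernel
    converter.u64 = static_cast<uint64_t>(arg);
    return converter.f32;
}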
diff --git a/tests/st/a2a3/aicpu_build_graph/vector_example/kernels/aiv/kernel_mul.cpp b/tests/st/a2a3/aicpu_build_graph/vector_example/kernels/aiv/kernel_mul.cpp
deleted file mode 100644
index 6692257b4..000000000
--- a/tests/st/a2a3/aicpu_build_graph/vector_example/kernels/aiv/kernel_mul.cpp
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-/**
- * Element-wise Tensor Multiplication Kernel
- *
- * Implements: out[i] = src0[i] * src1[i]
- */
-
-#include <cstddef>
-#include <cstdint>
-
-#include "tensor.h"
-
-using namespace pto;
-
-#include "pipe_sync.h"
-
-#ifndef __gm__
-#define __gm__
-#endif
-
-#ifndef __aicore__
-#define __aicore__ [aicore]
-#endif
-
-extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) {
-    __gm__ Tensor *src0_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
-    __gm__ Tensor *src1_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]);
-    __gm__ Tensor *out_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]);
-    __gm__ float *src0 = reinterpret_cast<__gm__ float *>(src0_tensor->buffer.addr) + src0_tensor->start_offset;
-    __gm__ float *src1 = reinterpret_cast<__gm__ float *>(src1_tensor->buffer.addr) + src1_tensor->start_offset;
-    __gm__ float *out = reinterpret_cast<__gm__ float *>(out_tensor->buffer.addr) + out_tensor->start_offset;
-
-    constexpr int kTRows_ = 128;
-    constexpr int kTCols_ = 128;
-    constexpr int vRows = 128;
-    constexpr int vCols = 128;
-
-    using DynShapeDim5 = Shape<1, 1, 1, vRows, vCols>;
-    using DynStridDim5 = Stride<1, 1, 1, kTCols_, 1>;
-    using GlobalData = GlobalTensor<float, DynShapeDim5, DynStridDim5>;
-    using TileData = Tile<float, kTRows_, kTCols_>;
-
-    TileData src0Tile(vRows, vCols);
-    TileData src1Tile(vRows, vCols);
-    TileData dstTile(vRows, vCols);
-    TASSIGN(src0Tile, 0x0);
-    TASSIGN(src1Tile, 0x10000);
-    TASSIGN(dstTile, 0x20000);
-
-    GlobalData src0Global(src0);
-    GlobalData src1Global(src1);
-    GlobalData dstGlobal(out);
-
-    TLOAD(src0Tile, src0Global);
-    TLOAD(src1Tile, src1Global);
-    set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
-    wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
-    TMUL(dstTile, src0Tile, src1Tile);
-    set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
-    wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
-    TSTORE(dstGlobal, dstTile);
-
-    pipe_sync();
-}
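// The three kernels above combine into the DAG built by the orchestration that
// follows. A plain-C++ reference of the same computation, as a mental model for
// tasks t0..t3 (a host-side sketch, not runtime code):
#include <cstddef>
#include <vector>

std::vector<float> reference_f(const std::vector<float> &a, const std::vector<float> &b) {
    std::vector<float> f(a.size());
    for (std::size_t i = 0; i < a.size(); ++i) {
        float c = a[i] + b[i];  // t0 (kernel_add)
        float d = c + 1.0f;     // t1 (kernel_add_scalar)
        float e = c + 2.0f;     // t2 (kernel_add_scalar)
        f[i] = d * e;           // t3 (kernel_mul)
    }
    return f;  // with a=2.0, b=3.0: 6 * 7 = 42
}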
diff --git a/tests/st/a2a3/aicpu_build_graph/vector_example/kernels/orchestration/orchestration.cpp b/tests/st/a2a3/aicpu_build_graph/vector_example/kernels/orchestration/orchestration.cpp
deleted file mode 100644
index eeee70764..000000000
--- a/tests/st/a2a3/aicpu_build_graph/vector_example/kernels/orchestration/orchestration.cpp
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-/**
- * AICPU orchestration for the vector example.
- *
- * DAG structure for formula: f = (a + b + 1) * (a + b + 2)
- *   t0: c = a + b   (func_id=0, kernel_add)
- *   t1: d = c + 1   (func_id=1, kernel_add_scalar)
- *   t2: e = c + 2   (func_id=1, kernel_add_scalar)
- *   t3: f = d * e   (func_id=2, kernel_mul)
- * Dependencies: t0->t1, t0->t2, t1->t3, t2->t3
- *
- * Uses explicit add_dependency for all dependency edges (no TensorMap).
- * Tasks are batch-published at scope_end.
- */
-
-#include <cstddef>
-#include <cstdint>
-
-#include "pto_orchestration_api.h"  // NOLINT(build/include_subdir)
-
-extern "C" {
-
-__attribute__((visibility("default"))) PTO2OrchestrationConfig
-aicpu_orchestration_config(const ChipStorageTaskArgs &orch_args) {
-    (void)orch_args;
-    return PTO2OrchestrationConfig{
-        .expected_arg_count = 3,
-    };
-}
-
-__attribute__((visibility("default"))) void
-aicpu_orchestration_entry(PTO2Runtime *rt, const ChipStorageTaskArgs &orch_args) {
-    // golden shape = kernel shape, use from_tensor_arg() directly
-    Tensor ext_a = from_tensor_arg(orch_args.tensor(0));
-    Tensor ext_b = from_tensor_arg(orch_args.tensor(1));
-    Tensor ext_f = from_tensor_arg(orch_args.tensor(2));
-
-    uint32_t SIZE = orch_args.tensor(0).shapes[0];
-
-    uint32_t shapes[1] = {SIZE};
-
-    PTO2_SCOPE(rt) {
-        // t0: c = a + b
-        Arg args_t0;
-        args_t0.add_input(ext_a);
-        args_t0.add_input(ext_b);
-        args_t0.add_output(TensorCreateInfo(shapes, 1, DataType::FLOAT32));
-        SubmitResult r0 = rt_submit_aiv_task(rt, 0, args_t0);
-
-        // t1: d = c + 1.0
-        Arg args_t1;
-        args_t1.add_input(r0.outputs.get_ref(0));
-        args_t1.add_output(TensorCreateInfo(shapes, 1, DataType::FLOAT32));
-        args_t1.add_scalar(1.0f);
-        SubmitResult r1 = rt_submit_aiv_task(rt, 1, args_t1);
-        rt_add_dependency(rt, r0.task_id, r1.task_id);
-
-        // t2: e = c + 2.0
-        Arg args_t2;
-        args_t2.add_input(r0.outputs.get_ref(0));
-        args_t2.add_output(TensorCreateInfo(shapes, 1, DataType::FLOAT32));
-        args_t2.add_scalar(2.0f);
-        SubmitResult r2 = rt_submit_aiv_task(rt, 1, args_t2);
-        rt_add_dependency(rt, r0.task_id, r2.task_id);
-
-        // t3: f = d * e
-        Arg args_t3;
-        args_t3.add_input(r1.outputs.get_ref(0));
-        args_t3.add_input(r2.outputs.get_ref(0));
-        args_t3.add_inout(ext_f);
-        SubmitResult r3 = rt_submit_aiv_task(rt, 2, args_t3);
-        rt_add_dependency(rt, r1.task_id, r3.task_id);
-        rt_add_dependency(rt, r2.task_id, r3.task_id);
-    }  // scope_end: batch-publish all tasks
-}
-
-}  // extern "C"
diff --git a/tests/st/a2a3/aicpu_build_graph/vector_example/test_vector_example.py b/tests/st/a2a3/aicpu_build_graph/vector_example/test_vector_example.py
deleted file mode 100644
index 2e071c78f..000000000
--- a/tests/st/a2a3/aicpu_build_graph/vector_example/test_vector_example.py
+++ /dev/null
@@ -1,85 +0,0 @@
-#!/usr/bin/env python3
-# Copyright (c) PyPTO Contributors.
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
-# CANN Open Software License Agreement Version 2.0 (the "License").
-# Please refer to the License for details. You may not use this file except in compliance with the License.
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
-# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
-# See LICENSE in the root of the software repository for the full text of the License.
-# -----------------------------------------------------------------------------------------------------------
-"""Vector example — aicpu_build_graph runtime with device-side DAG building.
-
-Computation: f = (a + b + 1) * (a + b + 2), where a=2.0, b=3.0, so f=42.0.
-Tests aicpu_build_graph runtime with intermediate tensors allocated from HeapRing.
-""" - -import torch -from simpler.task_interface import ArgDirection as D - -from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test - - -@scene_test(level=2, runtime="aicpu_build_graph") -class TestVectorExample(SceneTestCase): - """Vector example: f = (a + b + 1) * (a + b + 2) via device-side DAG.""" - - RTOL = 1e-5 - ATOL = 1e-5 - - CALLABLE = { - "orchestration": { - "source": "kernels/orchestration/orchestration.cpp", - "function_name": "aicpu_orchestration_entry", - "signature": [D.IN, D.IN, D.OUT], - }, - "incores": [ - { - "func_id": 0, - "source": "kernels/aiv/kernel_add.cpp", - "core_type": "aiv", - "signature": [D.IN, D.IN, D.OUT], - }, - { - "func_id": 1, - "source": "kernels/aiv/kernel_add_scalar.cpp", - "core_type": "aiv", - "signature": [D.IN, D.OUT], - }, - { - "func_id": 2, - "source": "kernels/aiv/kernel_mul.cpp", - "core_type": "aiv", - "signature": [D.IN, D.IN, D.OUT], - }, - ], - } - - CASES = [ - { - "name": "default", - "platforms": ["a2a3sim", "a2a3"], - "config": {"aicpu_thread_num": 4, "block_dim": 3}, - "params": {}, - }, - ] - - def generate_args(self, params): - SIZE = 128 * 128 - a = torch.full((SIZE,), 2.0, dtype=torch.float32) - b = torch.full((SIZE,), 3.0, dtype=torch.float32) - f = torch.zeros(SIZE, dtype=torch.float32) - - return TaskArgsBuilder( - Tensor("a", a), - Tensor("b", b), - Tensor("f", f), - ) - - def compute_golden(self, args, params): - a = args.a - b = args.b - args.f[:] = (a + b + 1) * (a + b + 2) - - -if __name__ == "__main__": - SceneTestCase.run_module(__name__) diff --git a/tests/ut/py/test_runtime_builder.py b/tests/ut/py/test_runtime_builder.py index 6d5951dcd..122cf867b 100644 --- a/tests/ut/py/test_runtime_builder.py +++ b/tests/ut/py/test_runtime_builder.py @@ -28,14 +28,6 @@ def test_discovers_real_runtimes(self, default_test_platform): runtimes = builder.list_runtimes() assert "host_build_graph" in runtimes - def test_discovers_aicpu_build_graph(self, default_test_platform): - """RuntimeBuilder discovers aicpu_build_graph from the real project tree.""" - from simpler_setup.runtime_builder import RuntimeBuilder # noqa: PLC0415 - - builder = RuntimeBuilder(platform=default_test_platform) - runtimes = builder.list_runtimes() - assert "aicpu_build_graph" in runtimes - def test_runtime_dir_resolves_to_project_root(self, default_test_platform, test_arch): """runtime_dir resolves to src/{arch}/runtime/ under the project root.""" from simpler_setup.runtime_builder import RuntimeBuilder # noqa: PLC0415 diff --git a/tools/README.md b/tools/README.md index 53526d9f1..468f27a2a 100644 --- a/tools/README.md +++ b/tools/README.md @@ -22,12 +22,12 @@ elapsed time. ``` Requires `PTO2_PROFILING=1` in the runtime; device log must include the -`orch_*` / `sched_*` lines. The `EXAMPLE_CASES` maps at the top of the script -control which examples/cases are run per runtime. +`orch_*` / `sched_*` lines. The `TMR_EXAMPLE_CASES` map at the top of the +script controls which examples/cases are run. ## verify_packaging.sh -Exercises all 5 install paths × 4 entry points from a fully clean state. +Exercises all 5 install paths × 2 entry points from a fully clean state. CI calls this directly; see [docs/python-packaging.md](../docs/python-packaging.md). Must run from the repo root inside an activated venv. 
diff --git a/tools/benchmark_rounds.sh b/tools/benchmark_rounds.sh index d88a9ef3d..710b60108 100755 --- a/tools/benchmark_rounds.sh +++ b/tools/benchmark_rounds.sh @@ -13,7 +13,7 @@ # Usage: # ./tools/benchmark_rounds.sh [-p ] [-d ] [-n ] [-r ] # -# Edit the EXAMPLE_CASES maps below to control which examples and cases to run. +# Edit the EXAMPLE_CASES map below to control which examples and cases to run. set -euo pipefail @@ -44,14 +44,6 @@ TMR_EXAMPLE_ORDER=( spmd_paged_attention ) -# --- aicpu_build_graph --- -declare -A ABG_EXAMPLE_CASES=( - [paged_attention_unroll]="Case1,Case2" -) -ABG_EXAMPLE_ORDER=( - paged_attention_unroll -) - # --------------------------------------------------------------------------- # Parse arguments # --------------------------------------------------------------------------- @@ -95,7 +87,7 @@ Options: -p, --platform Platform to run on (default: a2a3) -d, --device Device ID (default: 0) -n, --rounds Override number of rounds for each example (default: 100) - -r, --runtime Runtime to benchmark: tensormap_and_ringbuffer (default), aicpu_build_graph + -r, --runtime Runtime to benchmark: tensormap_and_ringbuffer (default) -v, --verbose Save detailed test_*.py output to a timestamped log file -h, --help Show this help @@ -156,12 +148,8 @@ case "$RUNTIME" in declare -n EXAMPLE_CASES=TMR_EXAMPLE_CASES EXAMPLE_ORDER=("${TMR_EXAMPLE_ORDER[@]}") ;; - aicpu_build_graph) - declare -n EXAMPLE_CASES=ABG_EXAMPLE_CASES - EXAMPLE_ORDER=("${ABG_EXAMPLE_ORDER[@]}") - ;; *) - echo "ERROR: unknown runtime '$RUNTIME'. Use tensormap_and_ringbuffer or aicpu_build_graph." + echo "ERROR: unknown runtime '$RUNTIME'. Use tensormap_and_ringbuffer." exit 1 ;; esac diff --git a/tools/verify_packaging.sh b/tools/verify_packaging.sh index a56cf192c..3b06271fa 100755 --- a/tools/verify_packaging.sh +++ b/tools/verify_packaging.sh @@ -7,7 +7,7 @@ # INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. # See LICENSE in the root of the software repository for the full text of the License. # ----------------------------------------------------------------------------------------------------------- -# Verify all 5 install paths x 4 entry points are green. +# Verify all 5 install paths x 2 entry points are green. # # Each mode runs from a fully clean state (uninstall + wipe build artifacts) so # leftover binaries from a previous mode cannot mask a regression in the next. @@ -70,7 +70,7 @@ print('incore helpers OK:', inc_dirs) " echo "::endgroup::" echo "::group::[${mode}] standalone test_*.py --help" - python tests/st/a2a3/aicpu_build_graph/paged_attention/test_paged_attention.py --help >/dev/null + python tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/test_paged_attention_unroll.py --help >/dev/null echo "::endgroup::" echo "smoke[${mode}] OK" }
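# The two user-facing entry points the packaging matrix exercises per install
# mode (paths taken from the hunks above; the pytest line is a sketch of the
# equivalent invocation, with flags as used elsewhere in this repo):
pytest tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_unroll --platform a2a3sim
python tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/test_paged_attention_unroll.py --help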