From ff3b0e6108e021e0af9915371db48a867471fe8b Mon Sep 17 00:00:00 2001 From: wcwxy <26245345+ChaoWao@users.noreply.github.com> Date: Wed, 29 Apr 2026 15:31:37 +0800 Subject: [PATCH] Refactor: remove aicpu_build_graph runtime All aicpu_build_graph features have been merged into tensormap_and_ringbuffer; the standalone runtime is no longer needed. - Delete src/a2a3/runtime/aicpu_build_graph/ and the corresponding ST test tree under tests/st/a2a3/aicpu_build_graph/ - Drop the test_discovers_aicpu_build_graph discovery test and the matching skip clause in tests/conftest.py - Drop ABG_EXAMPLE_CASES and the runtime branch from tools/benchmark_rounds.sh; redirect the verify_packaging.sh smoke test to the tensormap_and_ringbuffer paged_attention_unroll case - Remove the runtime from issue templates, .claude rules/skills/commands, READMEs, the per-arch runtime docs, the tensor-dump and dynamic-linking docs, and the L2 perf header comments - Fix stale verification-matrix copy: tools/README.md, the verify_packaging.sh banner, and docs/python-packaging.md all said "5 install paths x 4 entry points" but only two user-facing entry points exist (pytest, standalone test_*.py); update to "x 2" - Update tools/README.md to refer to the single TMR_EXAMPLE_CASES map in benchmark_rounds.sh (was "EXAMPLE_CASES maps ... per runtime" back when ABG had its own map) - Widen the scene-test retry from rc==124 only to any non-zero rc in all four spots (st-sim-a2a3, st-sim-a5, st-onboard-a2a3, st-onboard-a5), so transient PTO-ISA git-clone failures (e.g. SSL_ERROR_SYSCALL) trigger the pinned-commit retry instead of failing the job outright --- .claude/commands/perf-runtime-device.md | 2 +- .claude/commands/test-runtime-device.md | 2 +- .claude/commands/test-runtime-sim.md | 2 +- .claude/rules/architecture.md | 2 +- .claude/skills/benchmark/SKILL.md | 14 +- .github/ISSUE_TEMPLATE/bug_report.yml | 1 - .github/ISSUE_TEMPLATE/performance_issue.yml | 1 - .github/workflows/ci.yml | 14 +- README.md | 3 +- docs/developer-guide.md | 2 - docs/dynamic-linking.md | 2 +- docs/python-packaging.md | 2 +- docs/tensor-dump.md | 17 +- docs/testing.md | 1 - examples/workers/README.md | 4 +- simpler_setup/kernel_compiler.py | 2 +- simpler_setup/tools/swimlane_converter.py | 2 +- src/a2a3/docs/runtimes.md | 34 +- .../include/aicore/l2_perf_collector_aicore.h | 4 +- .../include/aicpu/l2_perf_collector_aicpu.h | 7 +- .../include/common/l2_perf_profiling.h | 8 +- .../aicore/aicore_executor.cpp | 153 -- .../aicpu/aicpu_executor.cpp | 2341 ----------------- .../runtime/aicpu_build_graph/build_config.py | 30 - .../aicpu_build_graph/docs/RUNTIME_LOGIC.md | 31 - .../host/runtime_compile_info.cpp | 27 - .../aicpu_build_graph/host/runtime_maker.cpp | 379 --- .../orchestration/common.cpp | 166 -- .../orchestration/pto_orchestration_api.h | 194 -- .../aicpu_build_graph/runtime/common.h | 70 - .../runtime/pto2_dispatch_payload.h | 43 - .../runtime/pto_orchestrator.cpp | 608 ----- .../runtime/pto_orchestrator.h | 275 -- .../runtime/pto_ring_buffer.cpp | 116 - .../runtime/pto_ring_buffer.h | 619 ----- .../runtime/pto_runtime2.cpp | 183 -- .../aicpu_build_graph/runtime/pto_runtime2.h | 281 -- .../runtime/pto_runtime2_types.h | 431 --- .../runtime/pto_scheduler.cpp | 241 -- .../aicpu_build_graph/runtime/pto_scheduler.h | 729 ----- .../runtime/pto_shared_memory.cpp | 276 -- .../runtime/pto_shared_memory.h | 233 -- .../runtime/pto_submit_types.h | 106 - .../aicpu_build_graph/runtime/pto_types.h | 279 -- .../aicpu_build_graph/runtime/runtime.cpp | 146 - 
.../aicpu_build_graph/runtime/runtime.h | 293 --- .../aicpu_build_graph/runtime/tensor.h | 409 --- .../docs/RUNTIME_LOGIC.md | 12 +- .../include/aicore/l2_perf_collector_aicore.h | 4 +- .../include/aicpu/l2_perf_collector_aicpu.h | 7 +- .../include/common/l2_perf_profiling.h | 8 +- .../docs/RUNTIME_LOGIC.md | 12 +- tests/conftest.py | 5 - .../st/a2a3/aicpu_build_graph/bgemm/README.md | 86 - .../bgemm/kernels/aic/kernel_gemm_tile.cpp | 122 - .../bgemm/kernels/aiv/kernel_tile_add.cpp | 75 - .../kernels/orchestration/bgemm_orch.cpp | 137 - .../aicpu_build_graph/bgemm/test_bgemm.py | 94 - .../orchestration/example_orchestration.cpp | 55 - .../orch_so_cache/test_orch_so_cache.py | 107 - .../paged_attention/kernels/aic/aic_hub.cpp | 28 - .../kernels/aic/aic_pv_matmul.cpp | 113 - .../kernels/aic/aic_qk_matmul.cpp | 114 - .../paged_attention/kernels/aiv/aiv_hub.cpp | 28 - .../kernels/aiv/aiv_online_update.cpp | 255 -- .../kernels/aiv/aiv_softmax_prepare.cpp | 154 -- .../orchestration/paged_attention_orch.cpp | 196 -- .../paged_attention/test_paged_attention.py | 129 - .../kernels/aic/aic_hub.cpp | 28 - .../kernels/aic/aic_pv_matmul.cpp | 152 -- .../kernels/aic/aic_qk_matmul.cpp | 127 - .../kernels/aiv/aiv_hub.cpp | 28 - .../kernels/aiv/aiv_online_update.cpp | 255 -- .../kernels/aiv/aiv_softmax_prepare.cpp | 263 -- .../orchestration/paged_attention_orch.cpp | 370 --- .../test_paged_attention_unroll.py | 152 -- .../vector_example/README.md | 21 - .../vector_example/kernels/aiv/kernel_add.cpp | 73 - .../kernels/aiv/kernel_add_scalar.cpp | 74 - .../vector_example/kernels/aiv/kernel_mul.cpp | 73 - .../kernels/orchestration/orchestration.cpp | 86 - .../vector_example/test_vector_example.py | 85 - tests/ut/py/test_runtime_builder.py | 8 - tools/README.md | 6 +- tools/benchmark_rounds.sh | 18 +- tools/verify_packaging.sh | 4 +- 86 files changed, 69 insertions(+), 12280 deletions(-) delete mode 100644 src/a2a3/runtime/aicpu_build_graph/aicore/aicore_executor.cpp delete mode 100644 src/a2a3/runtime/aicpu_build_graph/aicpu/aicpu_executor.cpp delete mode 100644 src/a2a3/runtime/aicpu_build_graph/build_config.py delete mode 100644 src/a2a3/runtime/aicpu_build_graph/docs/RUNTIME_LOGIC.md delete mode 100644 src/a2a3/runtime/aicpu_build_graph/host/runtime_compile_info.cpp delete mode 100644 src/a2a3/runtime/aicpu_build_graph/host/runtime_maker.cpp delete mode 100644 src/a2a3/runtime/aicpu_build_graph/orchestration/common.cpp delete mode 100644 src/a2a3/runtime/aicpu_build_graph/orchestration/pto_orchestration_api.h delete mode 100644 src/a2a3/runtime/aicpu_build_graph/runtime/common.h delete mode 100644 src/a2a3/runtime/aicpu_build_graph/runtime/pto2_dispatch_payload.h delete mode 100644 src/a2a3/runtime/aicpu_build_graph/runtime/pto_orchestrator.cpp delete mode 100644 src/a2a3/runtime/aicpu_build_graph/runtime/pto_orchestrator.h delete mode 100644 src/a2a3/runtime/aicpu_build_graph/runtime/pto_ring_buffer.cpp delete mode 100644 src/a2a3/runtime/aicpu_build_graph/runtime/pto_ring_buffer.h delete mode 100644 src/a2a3/runtime/aicpu_build_graph/runtime/pto_runtime2.cpp delete mode 100644 src/a2a3/runtime/aicpu_build_graph/runtime/pto_runtime2.h delete mode 100644 src/a2a3/runtime/aicpu_build_graph/runtime/pto_runtime2_types.h delete mode 100644 src/a2a3/runtime/aicpu_build_graph/runtime/pto_scheduler.cpp delete mode 100644 src/a2a3/runtime/aicpu_build_graph/runtime/pto_scheduler.h delete mode 100644 src/a2a3/runtime/aicpu_build_graph/runtime/pto_shared_memory.cpp delete mode 100644 
src/a2a3/runtime/aicpu_build_graph/runtime/pto_shared_memory.h delete mode 100644 src/a2a3/runtime/aicpu_build_graph/runtime/pto_submit_types.h delete mode 100644 src/a2a3/runtime/aicpu_build_graph/runtime/pto_types.h delete mode 100644 src/a2a3/runtime/aicpu_build_graph/runtime/runtime.cpp delete mode 100644 src/a2a3/runtime/aicpu_build_graph/runtime/runtime.h delete mode 100644 src/a2a3/runtime/aicpu_build_graph/runtime/tensor.h delete mode 100644 tests/st/a2a3/aicpu_build_graph/bgemm/README.md delete mode 100644 tests/st/a2a3/aicpu_build_graph/bgemm/kernels/aic/kernel_gemm_tile.cpp delete mode 100644 tests/st/a2a3/aicpu_build_graph/bgemm/kernels/aiv/kernel_tile_add.cpp delete mode 100644 tests/st/a2a3/aicpu_build_graph/bgemm/kernels/orchestration/bgemm_orch.cpp delete mode 100644 tests/st/a2a3/aicpu_build_graph/bgemm/test_bgemm.py delete mode 100644 tests/st/a2a3/aicpu_build_graph/orch_so_cache/kernels/orchestration/example_orchestration.cpp delete mode 100644 tests/st/a2a3/aicpu_build_graph/orch_so_cache/test_orch_so_cache.py delete mode 100644 tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aic/aic_hub.cpp delete mode 100644 tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp delete mode 100644 tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp delete mode 100644 tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aiv/aiv_hub.cpp delete mode 100644 tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp delete mode 100644 tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp delete mode 100644 tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp delete mode 100644 tests/st/a2a3/aicpu_build_graph/paged_attention/test_paged_attention.py delete mode 100644 tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/kernels/aic/aic_hub.cpp delete mode 100644 tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/kernels/aic/aic_pv_matmul.cpp delete mode 100644 tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/kernels/aic/aic_qk_matmul.cpp delete mode 100644 tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/kernels/aiv/aiv_hub.cpp delete mode 100644 tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/kernels/aiv/aiv_online_update.cpp delete mode 100644 tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/kernels/aiv/aiv_softmax_prepare.cpp delete mode 100644 tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp delete mode 100644 tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/test_paged_attention_unroll.py delete mode 100644 tests/st/a2a3/aicpu_build_graph/vector_example/README.md delete mode 100644 tests/st/a2a3/aicpu_build_graph/vector_example/kernels/aiv/kernel_add.cpp delete mode 100644 tests/st/a2a3/aicpu_build_graph/vector_example/kernels/aiv/kernel_add_scalar.cpp delete mode 100644 tests/st/a2a3/aicpu_build_graph/vector_example/kernels/aiv/kernel_mul.cpp delete mode 100644 tests/st/a2a3/aicpu_build_graph/vector_example/kernels/orchestration/orchestration.cpp delete mode 100644 tests/st/a2a3/aicpu_build_graph/vector_example/test_vector_example.py diff --git a/.claude/commands/perf-runtime-device.md b/.claude/commands/perf-runtime-device.md index 4b4958103..6ca944a29 100644 --- a/.claude/commands/perf-runtime-device.md +++ b/.claude/commands/perf-runtime-device.md @@ -4,7 +4,7 @@ If `$ARGUMENTS` is provided, use it as the runtime name. 
Otherwise, default to `tensormap_and_ringbuffer`. Reference `tools/benchmark_rounds.sh` for the full implementation pattern (device log resolution, timing parsing, reporting format). -1. Validate the runtime is one of: `host_build_graph`, `aicpu_build_graph`, `tensormap_and_ringbuffer`. If not, list valid runtimes and stop. +1. Validate the runtime is one of: `host_build_graph`, `tensormap_and_ringbuffer`. If not, list valid runtimes and stop. 2. Check `command -v npu-smi` — if not found, tell the user this requires hardware and stop. 3. **Detect platform**: Run `npu-smi info` and parse the chip name. Map `910B`/`910C` → `a2a3`, `950` → `a5`. If unrecognized, warn and default to `a2a3`. 4. Find the lowest-ID idle device (HBM-Usage = 0) from the `npu-smi info` output. If none, stop. diff --git a/.claude/commands/test-runtime-device.md b/.claude/commands/test-runtime-device.md index 5551d1d12..889f67417 100644 --- a/.claude/commands/test-runtime-device.md +++ b/.claude/commands/test-runtime-device.md @@ -1,6 +1,6 @@ # Run hardware device tests for a single runtime specified by $ARGUMENTS -1. Validate that `$ARGUMENTS` is one of: `host_build_graph`, `aicpu_build_graph`, `tensormap_and_ringbuffer`. If not, list the valid runtimes and stop. +1. Validate that `$ARGUMENTS` is one of: `host_build_graph`, `tensormap_and_ringbuffer`. If not, list the valid runtimes and stop. 2. Check `command -v npu-smi` — if not found, tell the user to use `/test-runtime-sim` instead and stop. 3. **Detect platform**: Run `npu-smi info` and parse the chip name. Map `910B`/`910C` → `a2a3`, `950` → `a5`. If unrecognized, warn and default to `a2a3`. 4. Read `.github/workflows/ci.yml` to extract the current `--pto-isa-commit` and `--pto-session-timeout` values from the `st-onboard-` job's `pytest` invocation. diff --git a/.claude/commands/test-runtime-sim.md b/.claude/commands/test-runtime-sim.md index 59e2844c7..3f0a9e9da 100644 --- a/.claude/commands/test-runtime-sim.md +++ b/.claude/commands/test-runtime-sim.md @@ -1,6 +1,6 @@ # Run simulation tests for a single runtime specified by $ARGUMENTS -1. Validate that `$ARGUMENTS` is one of: `host_build_graph`, `aicpu_build_graph`, `tensormap_and_ringbuffer`. If not, list the valid runtimes and stop. +1. Validate that `$ARGUMENTS` is one of: `host_build_graph`, `tensormap_and_ringbuffer`. If not, list the valid runtimes and stop. 2. Read `.github/workflows/ci.yml` to extract the current `--pto-isa-commit` and `--pto-session-timeout` values from the `st-sim-*` jobs' `pytest` invocations. 3. **Detect platform**: If `npu-smi` is available, parse the chip name from `npu-smi info`. Map `910B`/`910C` → `a2a3sim`, `950` → `a5sim`. If `npu-smi` is not found, default to `a2a3sim`. 4. 
Run: diff --git a/.claude/rules/architecture.md b/.claude/rules/architecture.md index 905422467..7302f0a52 100644 --- a/.claude/rules/architecture.md +++ b/.claude/rules/architecture.md @@ -5,7 +5,7 @@ See [docs/chip-level-arch.md](../../docs/chip-level-arch.md) for the full diagra ## Key Concepts - **Three programs**: Host `.so`, AICPU `.so`, AICore `.o` — compiled independently, linked at runtime -- **Three runtimes** under `src/{arch}/runtime/`: `host_build_graph`, `aicpu_build_graph`, `tensormap_and_ringbuffer` +- **Two runtimes** under `src/{arch}/runtime/`: `host_build_graph`, `tensormap_and_ringbuffer` - **Two platform backends** under `src/{arch}/platform/`: `onboard/` (hardware), `sim/` (simulation) ## Python Package Layout diff --git a/.claude/skills/benchmark/SKILL.md b/.claude/skills/benchmark/SKILL.md index 39bd24a34..130f87ea3 100644 --- a/.claude/skills/benchmark/SKILL.md +++ b/.claude/skills/benchmark/SKILL.md @@ -45,20 +45,8 @@ The `-d` flag specifies NPU device IDs. `tools/benchmark_rounds.sh` supports `-r <runtime>`: - `tensormap_and_ringbuffer` (default) -- `aicpu_build_graph` -Each runtime has its own example list defined at the top of the script (`TMR_EXAMPLE_CASES` / `ABG_EXAMPLE_CASES`). - -**Auto-detection (compare mode only):** Always benchmark TMR. Also benchmark `aicpu_build_graph` if the diff touches its files: - -```bash -RUNTIMES_TO_BENCH=(tensormap_and_ringbuffer) -if git diff --name-only "$MERGE_BASE"...HEAD | grep -q 'aicpu_build_graph'; then - RUNTIMES_TO_BENCH+=(aicpu_build_graph) -fi -``` - -Run `benchmark_rounds.sh` once per runtime, with `-r <runtime>` appended. **Runtimes are always benchmarked serially** — finish all baseline+current runs for one runtime before starting the next. This ensures no device ever runs two benchmark processes concurrently. +The example list is defined at the top of the script (`TMR_EXAMPLE_CASES`). ## Step 1: Detect Mode diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index 81131c554..6a9532444 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -29,7 +29,6 @@ body: description: Which runtime variant is affected? options: - tensormap_and_ringbuffer - - aicpu_build_graph - host_build_graph - All / Unknown validations: diff --git a/.github/ISSUE_TEMPLATE/performance_issue.yml b/.github/ISSUE_TEMPLATE/performance_issue.yml index fffdfa5ea..918363032 100644 --- a/.github/ISSUE_TEMPLATE/performance_issue.yml +++ b/.github/ISSUE_TEMPLATE/performance_issue.yml @@ -29,7 +29,6 @@ body: description: Which runtime variant is affected? options: - tensormap_and_ringbuffer - - aicpu_build_graph - host_build_graph - All / Unknown validations: diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index fb32ca3f5..ddc403802 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -209,8 +209,8 @@ jobs: set +e pytest examples tests/st --platform a2a3sim --device 0-15 -v --pto-session-timeout 600 --clone-protocol https rc=$? - if [ $rc -eq 124 ]; then - echo "pytest timed out; retrying with pinned PTO-ISA commit" + if [ $rc -ne 0 ]; then + echo "pytest failed with rc=$rc; retrying with pinned PTO-ISA commit" pytest examples tests/st --platform a2a3sim --device 0-15 -v \ --pto-session-timeout 600 --pto-isa-commit ${{ env.PTO_ISA_COMMIT }} --clone-protocol https rc=$? @@ -267,8 +267,8 @@ jobs: set +e pytest examples tests/st --platform a5sim --device 0-15 -v --pto-session-timeout 600 --clone-protocol https rc=$? 
- if [ $rc -eq 124 ]; then - echo "pytest timed out; retrying with pinned PTO-ISA commit" + if [ $rc -ne 0 ]; then + echo "pytest failed with rc=$rc; retrying with pinned PTO-ISA commit" pytest examples tests/st --platform a5sim --device 0-15 -v \ --pto-session-timeout 600 --pto-isa-commit ${{ env.PTO_ISA_COMMIT }} --clone-protocol https rc=$? @@ -338,8 +338,8 @@ jobs: source .venv/bin/activate python -m pytest examples tests/st --platform a2a3 --device ${DEVICE_RANGE} -v --pto-session-timeout 600 --clone-protocol https rc=$? - if [ $rc -eq 124 ]; then - echo "pytest timed out; retrying with pinned PTO-ISA commit" + if [ $rc -ne 0 ]; then + echo "pytest failed with rc=$rc; retrying with pinned PTO-ISA commit" python -m pytest examples tests/st --platform a2a3 --device ${DEVICE_RANGE} -v \ --pto-session-timeout 600 --pto-isa-commit ${{ env.PTO_ISA_COMMIT }} --clone-protocol https rc=$? @@ -450,4 +450,4 @@ jobs: source .venv/bin/activate DEVICE_LIST=$(python -c "s,e='${DEVICE_RANGE}'.split('-'); print(','.join(str(i) for i in range(int(s),int(e)+1)))") PYTEST="python -m pytest examples tests/st --platform a5 --device ${DEVICE_RANGE} -v --clone-protocol https" - task-submit --timeout 1800 --max-time 1800 --device "$DEVICE_LIST" --run "set +e; $PYTEST --pto-session-timeout 1200; rc=\$?; if [ \$rc -eq 124 ]; then echo 'pytest timed out; retrying with pinned PTO-ISA commit'; $PYTEST --pto-session-timeout 1200 --pto-isa-commit ${{ env.PTO_ISA_COMMIT }} --clone-protocol https; rc=\$?; fi; exit \$rc" + task-submit --timeout 1800 --max-time 1800 --device "$DEVICE_LIST" --run "set +e; $PYTEST --pto-session-timeout 1200; rc=\$?; if [ \$rc -ne 0 ]; then echo \"pytest failed with rc=\$rc; retrying with pinned PTO-ISA commit\"; $PYTEST --pto-session-timeout 1200 --pto-isa-commit ${{ env.PTO_ISA_COMMIT }} --clone-protocol https; rc=\$?; fi; exit \$rc" diff --git a/README.md b/README.md index 8d7652d48..3dda00399 100644 --- a/README.md +++ b/README.md @@ -29,12 +29,11 @@ PTO ISA headers are automatically cloned on first run. See [Getting Started](doc ## Runtime Variants -Three runtimes under `src/{arch}/runtime/`, each with a different graph-building strategy: +Two runtimes under `src/{arch}/runtime/`, each with a different graph-building strategy: | Runtime | Graph built on | Use case | | ------- | -------------- | -------- | | `host_build_graph` | Host CPU | Development, debugging | -| `aicpu_build_graph` | AICPU (device) | Reduced host-device transfer | | `tensormap_and_ringbuffer` | AICPU (device) | Production workloads | See runtime docs per arch: [a2a3](src/a2a3/docs/runtimes.md), [a5](src/a5/docs/runtimes.md). 
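For reviewers, the retry shape now shared by the four scene-test jobs reduces to the sketch below. This is a minimal, hypothetical standalone form of the st-sim-a2a3 step above, not a file in the repo; `$PTO_ISA_COMMIT` stands in for the workflow's `${{ env.PTO_ISA_COMMIT }}`.

```bash
# Sketch of the widened retry pattern (assumed standalone form of the
# st-sim-a2a3 step; the real jobs inline this logic in ci.yml).
set +e
pytest examples tests/st --platform a2a3sim --device 0-15 -v \
  --pto-session-timeout 600 --clone-protocol https
rc=$?
if [ $rc -ne 0 ]; then
  # Previously only rc==124 (timeout) retried; now any failure does, so a
  # transient PTO-ISA clone error (e.g. SSL_ERROR_SYSCALL) also gets the
  # pinned-commit retry instead of failing the job outright.
  echo "pytest failed with rc=$rc; retrying with pinned PTO-ISA commit"
  pytest examples tests/st --platform a2a3sim --device 0-15 -v \
    --pto-session-timeout 600 --pto-isa-commit "$PTO_ISA_COMMIT" --clone-protocol https
  rc=$?
fi
exit $rc
```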
diff --git a/docs/developer-guide.md b/docs/developer-guide.md index b7722d989..e106f792f 100644 --- a/docs/developer-guide.md +++ b/docs/developer-guide.md @@ -22,7 +22,6 @@ pto-runtime/ │ └── runtime/ # Runtime implementations │ ├── common/ # Shared components across runtimes │ ├── host_build_graph/ # Host-built graph runtime -│ ├── aicpu_build_graph/ # AICPU-built graph runtime │ └── tensormap_and_ringbuffer/ # Advanced production runtime │ ├── python/ # Language bindings @@ -55,7 +54,6 @@ pto-runtime/ ├── examples/ # Working examples │ └── {arch}/ # Architecture-specific examples │ ├── host_build_graph/ -│ ├── aicpu_build_graph/ │ └── tensormap_and_ringbuffer/ │ ├── tests/ # Test suite diff --git a/docs/dynamic-linking.md b/docs/dynamic-linking.md index 546ce30ab..2c622cf3d 100644 --- a/docs/dynamic-linking.md +++ b/docs/dynamic-linking.md @@ -221,7 +221,7 @@ SchedulerContext owns its own teardown: (`initialized_`, `init_done_`, `init_failed_`, `finished_`, `thread_idx_`, `finished_count_`). -Applies to all 5 runtime executors: a2a3 (abg, hbg, tmr), a5 (hbg, tmr). +Applies to all 4 runtime executors: a2a3 (hbg, tmr), a5 (hbg, tmr). ## SO Handle Caching and Reuse diff --git a/docs/python-packaging.md b/docs/python-packaging.md index a64dc4c7a..05da8ec8b 100644 --- a/docs/python-packaging.md +++ b/docs/python-packaging.md @@ -96,7 +96,7 @@ Plus one build-time entry point invoked by CMake during `pip install`: ## Install modes -Five install paths × four entry points = the verification matrix. CI enforces the matrix on macOS and Ubuntu via `.github/workflows/ci.yml::packaging-matrix`. +Five install paths × two entry points = the verification matrix. CI enforces the matrix on macOS and Ubuntu via `.github/workflows/ci.yml::packaging-matrix`. ### Mode-by-mode diff --git a/docs/tensor-dump.md b/docs/tensor-dump.md index 0f44061e3..1a79fa40a 100644 --- a/docs/tensor-dump.md +++ b/docs/tensor-dump.md @@ -6,8 +6,8 @@ runtime observability feature: host pre-allocates buffers on device, AICPU writes records during execution, host collects data and exports JSON manifest + binary payload. -Supported on both architectures (`a2a3` / `a5`) and all three runtimes -(`host_build_graph`, `aicpu_build_graph`, `tensormap_and_ringbuffer`). +Supported on both architectures (`a2a3` / `a5`) and both runtimes +(`host_build_graph`, `tensormap_and_ringbuffer`). Opt-in via `--dump-tensor` — zero overhead when disabled. The **primary design** (a2a3) uses shared memory (`halHostRegister`) + @@ -250,8 +250,8 @@ all device-side writes were globally visible. AICPU only has device addresses and sizes — it does **not** know the logical shape / dtype / view geometry of each tensor unless the runtime -registers it. Each of the three runtimes exposes metadata through a -slightly different path, but they all converge on `TensorInfo` (see +registers it. Each runtime exposes metadata through a slightly different +path, but they all converge on `TensorInfo` (see [`tensor_info.h`](../src/a5/runtime/host_build_graph/runtime/tensor_info.h)): - **`host_build_graph`** — two orchestration-side APIs: @@ -261,11 +261,10 @@ slightly different path, but they all converge on `TensorInfo` (see See [`dump_tensor_orch.cpp`](../tests/st/a5/host_build_graph/dump_tensor_example/kernels/orchestration/dump_tensor_orch.cpp) for both styles in one file. -- **`aicpu_build_graph`** — runtime layer fills `TensorInfo` from - `PTO2TaskPayload::tensors[]` directly. No orchestration API needed. 
-- **`tensormap_and_ringbuffer`** — identical to `aicpu_build_graph`; - the ring buffer carries `PTO2TaskPayload` which already contains - shape/offset arrays. +- **`tensormap_and_ringbuffer`** — runtime layer fills `TensorInfo` + from `PTO2TaskPayload::tensors[]` directly. The ring buffer carries + `PTO2TaskPayload` which already contains shape/offset arrays, so no + orchestration API is needed. When metadata is missing or inconsistent, the task is **skipped for dump** and a single `LOG_WARN` is emitted (guarded by diff --git a/docs/testing.md b/docs/testing.md index 52a80e76b..d2a73ada0 100644 --- a/docs/testing.md +++ b/docs/testing.md @@ -420,7 +420,6 @@ pytest tests/ut --platform a2a3 Small, fast examples that run on both simulation and real hardware. Organized by runtime: - `host_build_graph/` — HBG examples -- `aicpu_build_graph/` — ABG examples - `tensormap_and_ringbuffer/` — TMR examples Each example has a `golden.py` with `generate_inputs()` and `compute_golden()` for result validation. diff --git a/examples/workers/README.md b/examples/workers/README.md index da193e1be..a7c5176d1 100644 --- a/examples/workers/README.md +++ b/examples/workers/README.md @@ -35,8 +35,8 @@ workers/ Why no `tensormap_and_ringbuffer/` layer? Because every example here hard-codes `runtime="tensormap_and_ringbuffer"` in its `Worker(...)` call — that is the -default user-facing runtime. Other runtimes (`host_build_graph`, -`aicpu_build_graph`) are covered by scene tests under `tests/st/`, not here. +default user-facing runtime. The other runtime (`host_build_graph`) is +covered by scene tests under `tests/st/`, not here. ## Prerequisites diff --git a/simpler_setup/kernel_compiler.py b/simpler_setup/kernel_compiler.py index 831d83adb..ef5f8be94 100644 --- a/simpler_setup/kernel_compiler.py +++ b/simpler_setup/kernel_compiler.py @@ -372,7 +372,7 @@ def compile_orchestration( Args: runtime_name: Name of the runtime (e.g., "host_build_graph", - "tensormap_and_ringbuffer", "aicpu_build_graph") + "tensormap_and_ringbuffer") source_path: Path to orchestration source file (.cpp) extra_include_dirs: Additional include directories (merged with the runtime/platform include dirs) diff --git a/simpler_setup/tools/swimlane_converter.py b/simpler_setup/tools/swimlane_converter.py index e30ad3b04..3d3c789c9 100644 --- a/simpler_setup/tools/swimlane_converter.py +++ b/simpler_setup/tools/swimlane_converter.py @@ -1015,7 +1015,7 @@ def generate_chrome_trace_json( # noqa: PLR0912, PLR0915 # Orchestrator → scheduler dispatch: # - Prefer orch_fanin end → dispatch (explicit deps / fanin path). - # - If no orch_fanin for this task (e.g. aicpu_build_graph without fanin records), use orch_params end → dispatch. + # - If no orch_fanin for this task, use orch_params end → dispatch. if orchestrator_phases and scheduler_phases: orch_fanin_by_task = {} orch_params_by_task = {} diff --git a/src/a2a3/docs/runtimes.md b/src/a2a3/docs/runtimes.md index de5ff380a..a7cd861ee 100644 --- a/src/a2a3/docs/runtimes.md +++ b/src/a2a3/docs/runtimes.md @@ -1,20 +1,20 @@ # Runtime Variants (a2a3) -Three runtime implementations live under `src/a2a3/runtime/`, each providing a different graph-building strategy. The `RUNTIME_CONFIG.runtime` field in `kernel_config.py` selects which runtime to use. +Two runtime implementations live under `src/a2a3/runtime/`, each providing a different graph-building strategy. The `RUNTIME_CONFIG.runtime` field in `kernel_config.py` selects which runtime to use. 
## Comparison -| Feature | host_build_graph | aicpu_build_graph | tensormap_and_ringbuffer | -| ------- | ---------------- | ----------------- | ------------------------ | -| Graph built on | Host CPU | AICPU (device) | AICPU (device) | -| Task storage | Fixed `Task[]` array | Fixed `Task[]` array | Ring buffer (`PTO2TaskDescriptor[]`) | -| Dependencies | Explicit edges | Explicit edges | Auto-derived via TensorMap | -| Memory management | Host-side | Host + device malloc | Ring buffer heap (GM) | -| Concurrent build+schedule | No | Optional (`build_mode=1`) | Yes (always) | -| Profiling support | Basic | Basic | Multi-level hierarchy | -| Batch/streaming | No | No | Yes (flow control, back-pressure) | -| Thread model | N scheduler threads | 1 builder + N schedulers | 1 orchestrator + 3 schedulers | -| Use case | Development, debugging | Reduced host-device transfer | Production workloads | +| Feature | host_build_graph | tensormap_and_ringbuffer | +| ------- | ---------------- | ------------------------ | +| Graph built on | Host CPU | AICPU (device) | +| Task storage | Fixed `Task[]` array | Ring buffer (`PTO2TaskDescriptor[]`) | +| Dependencies | Explicit edges | Auto-derived via TensorMap | +| Memory management | Host-side | Ring buffer heap (GM) | +| Concurrent build+schedule | No | Yes (always) | +| Profiling support | Basic | Multi-level hierarchy | +| Batch/streaming | No | Yes (flow control, back-pressure) | +| Thread model | N scheduler threads | 1 orchestrator + 3 schedulers | +| Use case | Development, debugging | Production workloads | ## host_build_graph @@ -26,16 +26,6 @@ The simplest runtime. The host CPU builds the complete task dependency graph bef See [host_build_graph/docs/RUNTIME_LOGIC.md](../runtime/host_build_graph/docs/RUNTIME_LOGIC.md) for details. -## aicpu_build_graph - -Orchestration runs on an AICPU thread, building the task graph on device. Supports concurrent build + schedule (`build_mode=1`). - -- Same task array as host_build_graph -- Device-side API: `add_task`, `add_successor_conditional`, `publish_task`, `device_malloc` -- Reduces host-device data transfer; graph can depend on device-side data - -See [aicpu_build_graph/docs/RUNTIME_LOGIC.md](../runtime/aicpu_build_graph/docs/RUNTIME_LOGIC.md) for details. - ## tensormap_and_ringbuffer (PTO2) The primary production runtime. Uses ring buffers for task slots and output memory, with a TensorMap for automatic dependency tracking. diff --git a/src/a2a3/platform/include/aicore/l2_perf_collector_aicore.h b/src/a2a3/platform/include/aicore/l2_perf_collector_aicore.h index 554e4af62..95c317117 100644 --- a/src/a2a3/platform/include/aicore/l2_perf_collector_aicore.h +++ b/src/a2a3/platform/include/aicore/l2_perf_collector_aicore.h @@ -38,8 +38,8 @@ * Buffer management and final commit are handled by AICPU. * * AICore writes L2PerfRecord.task_id as the register dispatch token (low 32 bits, zero-extended). - * For multi-ring runtimes (tensormap_and_ringbuffer, aicpu_build_graph), AICPU overwrites - * with the full (ring_id << 32) | local_id encoding after handshake match. + * For tensormap_and_ringbuffer, AICPU overwrites with the full (ring_id << 32) | local_id + * encoding after handshake match. 
* * @param l2_perf_buf Performance buffer pointer * @param task_id Register dispatch id (DATA_MAIN_BASE), stored in task_id low 32 bits diff --git a/src/a2a3/platform/include/aicpu/l2_perf_collector_aicpu.h b/src/a2a3/platform/include/aicpu/l2_perf_collector_aicpu.h index 80b62c88a..131420cbb 100644 --- a/src/a2a3/platform/include/aicpu/l2_perf_collector_aicpu.h +++ b/src/a2a3/platform/include/aicpu/l2_perf_collector_aicpu.h @@ -127,7 +127,7 @@ void l2_perf_aicpu_init_phase_profiling(Runtime *runtime, int num_sched_threads) * @param loop_iter Current loop iteration number * @param tasks_processed Number of tasks processed in this batch (scheduler phases), or * full PTO2 task_id encoding (ring_id << 32) | local_id (orchestrator - * phases in multi-ring runtimes: tensormap_and_ringbuffer, aicpu_build_graph) + * phases in tensormap_and_ringbuffer) */ void l2_perf_aicpu_record_phase( int thread_idx, AicpuPhaseId phase_id, uint64_t start_time, uint64_t end_time, uint32_t loop_iter, @@ -164,9 +164,8 @@ void l2_perf_aicpu_set_orch_thread_idx(int thread_idx); * @param start_time Phase start timestamp * @param end_time Phase end timestamp * @param submit_idx Task submission index (acts as loop_iter) - * @param task_id Task identifier. For multi-ring runtimes (tensormap_and_ringbuffer, aicpu_build_graph), this is the - * full PTO2 encoding: (ring_id << 32) | local_id, enabling cross-view correlation between orchestrator and scheduler - * swimlanes. + * @param task_id Task identifier. For tensormap_and_ringbuffer, this is the full PTO2 encoding: + * (ring_id << 32) | local_id, enabling cross-view correlation between orchestrator and scheduler swimlanes. */ void l2_perf_aicpu_record_orch_phase( AicpuPhaseId phase_id, uint64_t start_time, uint64_t end_time, uint32_t submit_idx, uint64_t task_id diff --git a/src/a2a3/platform/include/common/l2_perf_profiling.h b/src/a2a3/platform/include/common/l2_perf_profiling.h index d13bcd94a..3f82eeb4e 100644 --- a/src/a2a3/platform/include/common/l2_perf_profiling.h +++ b/src/a2a3/platform/include/common/l2_perf_profiling.h @@ -83,8 +83,8 @@ struct L2PerfRecord { uint64_t finish_time; // AICPU timestamp: when AICPU observed task completion // AICore writes the register dispatch token (low 32 bits only) zero-extended into task_id. - // For multi-ring runtimes (tensormap_and_ringbuffer, aicpu_build_graph), AICPU overwrites - // with the full PTO2 encoding (ring_id << 32) | local_id after FIN/perf row match. + // For tensormap_and_ringbuffer, AICPU overwrites with the full PTO2 encoding + // (ring_id << 32) | local_id after FIN/perf row match. // For host_build_graph, task_id stays as the plain integer task index (ring_id = 0). uint64_t task_id; uint32_t func_id; // Kernel function identifier @@ -273,8 +273,8 @@ struct AicpuPhaseRecord { uint32_t loop_iter; // Loop iteration number AicpuPhaseId phase_id; // Phase type union { - uint64_t task_id; // Multi-ring runtimes (tensormap_and_ringbuffer, aicpu_build_graph): - // full PTO2 encoding (ring_id << 32) | local_id for cross-view correlation. + uint64_t task_id; // tensormap_and_ringbuffer: full PTO2 encoding + // (ring_id << 32) | local_id for cross-view correlation. 
uint64_t tasks_processed; // Scheduler phases: number of tasks processed in this batch }; }; diff --git a/src/a2a3/runtime/aicpu_build_graph/aicore/aicore_executor.cpp b/src/a2a3/runtime/aicpu_build_graph/aicore/aicore_executor.cpp deleted file mode 100644 index 2a356485d..000000000 --- a/src/a2a3/runtime/aicpu_build_graph/aicore/aicore_executor.cpp +++ /dev/null @@ -1,153 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ - -#include "aicore/aicore.h" -#include "aicore/l2_perf_collector_aicore.h" -#include "aicore/pmu_collector_aicore.h" -#include "common/l2_perf_profiling.h" -#include "common/platform_config.h" // Register-based communication -#include "pto2_dispatch_payload.h" -#include "runtime.h" - -/** - * Unified function pointer type for kernel dispatch - * - * All kernels follow the same signature: void kernel(__gm__ int64_t* args) - * This enables simple, switch-free dispatch. - */ -typedef void (*UnifiedKernelFunc)(__gm__ int64_t *); - -/** - * Execute task from PTO2DispatchPayload. - * - * Reads function_bin_addr and args from the dispatch payload. - * - * @param payload Pointer to PTO2DispatchPayload in global memory - */ -__aicore__ __attribute__((always_inline)) static void execute_task(__gm__ PTO2DispatchPayload *payload) { - if (payload == nullptr || payload->function_bin_addr == 0) { - return; - } - - UnifiedKernelFunc kernel = (UnifiedKernelFunc)payload->function_bin_addr; - kernel(reinterpret_cast<__gm__ int64_t *>(payload->args)); - OUT_OF_ORDER_STORE_BARRIER(); -} - -/** - * AICore main execution loop - * - * Implements the AICPU-AICore register-based dispatch protocol: - * 1. Wait for AICPU ready signal via handshake buffer - * 2. Report physical core ID and core type, signal AICore ready - * 3. Poll DATA_MAIN_BASE register for task dispatch until exit signal - * - * Task dispatch reads PTO2DispatchPayload address from Handshake.task. - * Task ID is derived from the register value (task_id + 1 encoding). 
- * - * @param runtime Pointer to Runtime in global memory - * @param block_idx Block index (core ID) - * @param core_type Core type (AIC or AIV) - */ -__aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime *runtime, int block_idx, CoreType core_type) { - __gm__ Handshake *my_hank = (__gm__ Handshake *)(&runtime->workers[block_idx]); - - // Phase 1: Wait for AICPU initialization signal - while (my_hank->aicpu_ready == 0) { - dcci(my_hank, SINGLE_CACHE_LINE); - } - - // Phase 2: Report physical core ID, signal ready - my_hank->physical_core_id = get_physical_core_id(); - OUT_OF_ORDER_STORE_BARRIER(); - my_hank->aicore_regs_ready = 1; - dcci(&my_hank->aicore_regs_ready, SINGLE_CACHE_LINE, CACHELINE_OUT); - while (my_hank->aicpu_regs_ready == 0) { - dcci(&my_hank->aicpu_regs_ready, SINGLE_CACHE_LINE); - } - // Report initial idle status via register - write_reg(RegId::COND, AICORE_IDLE_VALUE); - - // Phase 3: Report core type, signal ready - my_hank->core_type = core_type; - OUT_OF_ORDER_STORE_BARRIER(); - my_hank->aicore_done = block_idx + 1; // Signal ready (use block_idx + 1 to avoid 0) - - dcci(my_hank, SINGLE_CACHE_LINE, CACHELINE_OUT); - - // Cache payload address (set once by AICPU during initialization, never changes) - __gm__ PTO2DispatchPayload *payload = reinterpret_cast<__gm__ PTO2DispatchPayload *>(my_hank->task); - - bool l2_perf_enabled = GET_PROFILING_FLAG(my_hank->enable_profiling_flag, PROFILING_FLAG_L2_SWIMLANE); - bool dump_tensor_enabled = GET_PROFILING_FLAG(my_hank->enable_profiling_flag, PROFILING_FLAG_DUMP_TENSOR); - bool pmu_enabled = GET_PROFILING_FLAG(my_hank->enable_profiling_flag, PROFILING_FLAG_PMU); - - // Phase 4: Main execution loop - poll register for tasks until exit signal - // Register encoding: AICPU_IDLE_TASK_ID=idle, task_id=task, AICORE_EXIT_SIGNAL=exit - uint32_t reg_val = AICPU_IDLE_TASK_ID; - uint32_t last_reg_val = AICPU_IDLE_TASK_ID; - - while (true) { - reg_val = static_cast(read_reg(RegId::DATA_MAIN_BASE)); - if (reg_val == AICORE_EXIT_SIGNAL) { - // Signal exit acknowledgment to AICPU - write_reg(RegId::COND, AICORE_EXITED_VALUE); - break; - } - - // Execute task if new (reg_val encoding: AICPU_IDLE_TASK_ID=idle, task_id=task) - if (reg_val == AICPU_IDLE_TASK_ID || reg_val == last_reg_val) { - SPIN_WAIT_HINT(); - continue; - } - - { - uint32_t task_id = reg_val; // Decode: register holds task_id directly - - // Invalidate payload buffer (AICPU updates its content each dispatch) - dcci(payload, ENTIRE_DATA_CACHE); - - write_reg(RegId::COND, MAKE_ACK_VALUE(task_id)); - - // Performance profiling: record start time - uint64_t start_time = get_sys_cnt_aicore(); - - if (pmu_enabled) { - pmu_aicore_begin(); - } - - // Execute the task - execute_task(payload); - - if (pmu_enabled) { - pmu_aicore_end(); - } - - if (dump_tensor_enabled) { - pipe_barrier(PIPE_ALL); - } - - // Performance profiling: record task execution - // (func_id and core_type are filled by AICPU at completion time) - if (l2_perf_enabled) { - uint64_t end_time = get_sys_cnt_aicore(); - __gm__ L2PerfBuffer *l2_perf_buf = (__gm__ L2PerfBuffer *)my_hank->l2_perf_records_addr; - l2_perf_aicore_record_task(l2_perf_buf, task_id, start_time, end_time); - } - - last_reg_val = reg_val; - write_reg(RegId::COND, MAKE_FIN_VALUE(task_id)); - } - } - - // Flush all dirty cache lines to HBM before kernel exit. 
- dcci(my_hank, SINGLE_CACHE_LINE, CACHELINE_OUT); -} diff --git a/src/a2a3/runtime/aicpu_build_graph/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/aicpu_build_graph/aicpu/aicpu_executor.cpp deleted file mode 100644 index 13cdd52f7..000000000 --- a/src/a2a3/runtime/aicpu_build_graph/aicpu/aicpu_executor.cpp +++ /dev/null @@ -1,2341 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#ifdef __linux__ -#include -#endif - -#include "aicpu/device_log.h" -#include "aicpu/device_time.h" -#include "aicpu/orch_so_file.h" -#include "pto2_dispatch_payload.h" -#include "runtime.h" -#include "spin_hint.h" - -// Runtime headers (full struct definition for create/destroy + PTO2_SCOPE) -#include "pto_runtime2.h" -#include "pto_runtime2_types.h" -#include "pto_shared_memory.h" - -// Performance profiling headers -#include "aicpu/l2_perf_collector_aicpu.h" -#include "aicpu/pmu_collector_aicpu.h" -#include "aicpu/tensor_dump_aicpu.h" -#include "common/memory_barrier.h" -#include "common/l2_perf_profiling.h" -#include "common/unified_log.h" - -// Register-based communication -#include "aicpu/platform_regs.h" -#include "common/platform_config.h" - -// Core type definitions -#include "common/core_type.h" - -// CoreCallable for resolved dispatch address -#include "callable.h" - -#if PTO2_PROFILING -// Accumulated nanoseconds per sub-step -#define CYCLE_COUNT_START() uint64_t _t0 = get_sys_cnt_aicpu(), _t1 -#define CYCLE_COUNT_LAP(acc) \ - do { \ - _t1 = get_sys_cnt_aicpu(); \ - acc += (_t1 - _t0); \ - _t0 = _t1; \ - } while (0) -#else -#define CYCLE_COUNT_START() -#define CYCLE_COUNT_LAP(acc) -#endif - -// Device orchestration function signature (loaded via dlopen). -// The orchestration .so receives a PTO2Runtime* (with ops table populated) -// instead of a raw shared-memory pointer. 
-typedef void (*DeviceOrchestrationFunc)(PTO2Runtime *rt, const ChipStorageTaskArgs &orch_args); - -// Config function exported by orchestration .so -typedef PTO2OrchestrationConfig (*DeviceOrchestrationConfigFunc)(const ChipStorageTaskArgs &orch_args); - -constexpr int32_t MAX_AICPU_THREADS = PLATFORM_MAX_AICPU_THREADS; -constexpr int32_t MAX_AIC_PER_THREAD = PLATFORM_MAX_AIC_PER_THREAD; -constexpr int32_t MAX_CORES_PER_THREAD = PLATFORM_MAX_CORES_PER_THREAD; - -constexpr int32_t MAX_IDLE_ITERATIONS = 800000; // ~20s idle then scheduler gives up (avoid long hang) -constexpr int32_t STALL_LOG_INTERVAL = 50000; // DEV_ALWAYS every N idle iters to debug hang -constexpr int32_t FATAL_ERROR_CHECK_INTERVAL = 1024; // Check orchestrator error every N idle iters -constexpr int32_t STALL_DUMP_READY_MAX = 8; -constexpr int32_t STALL_DUMP_WAIT_MAX = 4; -constexpr int32_t STALL_DUMP_CORE_MAX = 8; -constexpr int32_t PROGRESS_VERBOSE_THRESHOLD = 10; // log every completion for the first N tasks -constexpr int32_t PROGRESS_LOG_INTERVAL = 250; // log every N completions after threshold - -static PTO2Runtime *rt{nullptr}; - -// Per-core dispatch payload storage (one per physical core) -static PTO2DispatchPayload s_pto2_payload_per_core[RUNTIME_MAX_WORKER]; - -// Core information for discovery (with register address for fast dispatch) -struct CoreInfo { - int32_t worker_id; // Index in runtime.workers[] - uint32_t physical_core_id; // Hardware physical core ID (from AICore) - uint64_t reg_addr; // Cached register address for fast access - CoreType core_type; -}; - -struct CoreTypeTracker { - int32_t idle_count; - int32_t running_count; - int32_t idle[MAX_CORES_PER_THREAD]; - int32_t running[MAX_CORES_PER_THREAD]; - - void move_idle_to_running(int32_t idx) { - running[running_count++] = idle[idx]; - idle[idx] = idle[--idle_count]; - } - - void move_running_to_idle(int32_t idx) { - idle[idle_count++] = running[idx]; - running[idx] = running[--running_count]; - } - - int32_t find_idle_index(int32_t core_id) { - for (int32_t i = 0; i < idle_count; i++) { - if (idle[i] == core_id) return i; - } - return -1; - } -}; - -struct Cluster { - int32_t aic_core_id; - int32_t aiv_core_ids[2]; -}; - -struct CoreStateTracker { - CoreTypeTracker by_type[2]; // indexed by static_cast(CoreType) - Cluster clusters[MAX_AIC_PER_THREAD]; - int32_t cluster_count; - - CoreTypeTracker &aic() { return by_type[0]; } - CoreTypeTracker &aiv() { return by_type[1]; } - - template - CoreTypeTracker &get() { - return by_type[static_cast(CT)]; - } - - int32_t find_cluster_for_shape(PTO2ResourceShape shape, bool *core_idle) { - for (int32_t i = 0; i < cluster_count; i++) { - Cluster &c = clusters[i]; - switch (shape) { - case PTO2ResourceShape::AIC_ONLY: - if (core_idle[c.aic_core_id]) return i; - break; - case PTO2ResourceShape::AIV_X1: - if (core_idle[c.aiv_core_ids[0]] || core_idle[c.aiv_core_ids[1]]) return i; - break; - case PTO2ResourceShape::AIV_X2: - if (core_idle[c.aiv_core_ids[0]] && core_idle[c.aiv_core_ids[1]]) return i; - break; - case PTO2ResourceShape::AIC_AIV_X1: - if (core_idle[c.aic_core_id] && (core_idle[c.aiv_core_ids[0]] || core_idle[c.aiv_core_ids[1]])) - return i; - break; - case PTO2ResourceShape::AIC_AIV_X2: - if (core_idle[c.aic_core_id] && core_idle[c.aiv_core_ids[0]] && core_idle[c.aiv_core_ids[1]]) return i; - break; - } - } - return -1; - } -}; - -struct AicpuExecutor { - int32_t sched_thread_num_; - bool orch_to_sched_{false}; - - // ===== Thread management state ===== - std::atomic thread_idx_{0}; - 
std::atomic initialized_{false}; - std::atomic init_done_{false}; - std::atomic init_failed_{false}; - std::atomic finished_{false}; - - int32_t thread_num_{0}; - int32_t cores_total_num_{0}; - int32_t thread_cores_num_{0}; // Cores per scheduler thread (0 for orchestrator when thread_num_==4) - int32_t core_count_per_thread_[MAX_AICPU_THREADS]; // Actual core count per thread - int32_t core_assignments_[MAX_AICPU_THREADS][MAX_CORES_PER_THREAD]; - - // Core discovery arrays (with register addresses) - CoreInfo aic_cores_[MAX_CORES_PER_THREAD]; - CoreInfo aiv_cores_[MAX_CORES_PER_THREAD]; - int32_t aic_count_{0}; - int32_t aiv_count_{0}; - -#if PTO2_PROFILING - // Logical core_id -> hardware physical core id, collected during handshake. - // Handed to pmu_aicpu_init() so the platform can resolve per-core PMU MMIO - // bases. - uint32_t physical_core_ids_[RUNTIME_MAX_WORKER]; -#endif - - // Fast lookup: core_id -> reg_addr (for register-based dispatch) - uint64_t core_id_to_reg_addr_[MAX_CORES_PER_THREAD]; - - // Per-core monotonic dispatch counter for register protocol uniqueness. - // Multi-ring task_ids can collide in the lower 32 bits (e.g., ring 0 local 0 - // and ring 1 local 0 both truncate to 0), breaking the AICore's last_reg_val - // duplicate detection and causing false-positive COND completion. A per-core - // counter guarantees each dispatch writes a unique DATA_MAIN_BASE value. - uint32_t dispatch_seq_by_core_[RUNTIME_MAX_WORKER]{}; - - // Per-core subtask slot tracking (which PTO2SubtaskSlot is running on each core) - PTO2SubtaskSlot executing_subslot_by_core_[RUNTIME_MAX_WORKER]{}; - - // Per-core slot state tracking (PTO2TaskSlotState* for the running task on each core) - PTO2TaskSlotState *executing_slot_state_by_core_[RUNTIME_MAX_WORKER]{}; - - // Platform register base address array (set via get_platform_regs()) - uint64_t regs_{0}; - - // Track executing register task_id per core (AICPU_TASK_INVALID = idle). - // NOTE: this is NOT the task_id; it is the per-core dispatch id used by the - // register protocol (derived from dispatch_seq_by_core_ and masked by TASK_ID_MASK). - int32_t executing_reg_task_ids_[MAX_CORES_PER_THREAD]; - CoreStateTracker trackers_[MAX_AICPU_THREADS]; - bool core_idle_[MAX_CORES_PER_THREAD]; - - // ===== Task queue state (managed by scheduler ready queues) ===== - - // Task execution tracking - std::atomic completed_tasks_{0}; - int32_t total_tasks_{0}; - std::atomic finished_count_{0}; - // Device orchestration: set by last orchestrator when graph is built; schedulers poll it. - // volatile prevents the compiler from hoisting the load out of spin loops. 
- volatile bool orchestrator_done_{false}; - std::atomic pto2_init_done_{false}; - std::atomic runtime_init_ready_{false}; - std::atomic pto2_init_complete_{false}; // init block finished; others wait for this - - // ===== Dynamic core transition state ===== - std::atomic transition_requested_{false}; - std::atomic wait_reassign_{0}; - std::atomic reassigned_{false}; - std::atomic completed_{false}; - - // Orchestration SO handle - defer dlclose until all tasks complete - void *orch_so_handle_{nullptr}; - char orch_so_path_[256]{}; // Path to orchestration SO file for cleanup - - // Shared orchestration function pointer (loaded by first orch thread, used by all) - DeviceOrchestrationFunc orch_func_{nullptr}; - const ChipStorageTaskArgs *orch_args_cached_{nullptr}; - - // ===== Performance profiling state ===== - uint64_t dispatch_timestamps_[RUNTIME_MAX_WORKER]; // Per-core AICPU dispatch timestamp - uint32_t - core_dispatch_counts_[RUNTIME_MAX_WORKER]; // Per-core total dispatched task counter (for buffer management) - - uint64_t *func_id_to_addr_; - uint64_t get_function_bin_addr(int func_id) const { - if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) return 0; - return func_id_to_addr_[func_id]; - } - - // ===== Methods ===== - int32_t init(Runtime *runtime); - int32_t handshake_all_cores(Runtime *runtime); - void assign_cores_to_threads(); - void reassign_cores_for_all_threads(); - int32_t resolve_and_dispatch_pto2(Runtime *runtime, int32_t thread_idx); - int32_t shutdown_aicore(Runtime *runtime, int32_t thread_idx, const int32_t *cur_thread_cores, int32_t core_num); - int32_t run(Runtime *runtime); - void deinit(Runtime *runtime); - void emergency_shutdown(Runtime *runtime); - void diagnose_stuck_state( - Runtime *runtime, int32_t thread_idx, const int32_t *cur_thread_cores, int32_t core_num, Handshake *hank - ); - - // Build slim PTO2DispatchPayload: only function_bin_addr + args. - // Metadata (task_id, subslot, kernel_id, core_type) stays in TaskDescriptor. - // Dispatch order: tensor args first, then scalar args. 
- void build_pto2_payload(PTO2DispatchPayload &out, int32_t kernel_id, PTO2TaskPayload &task_pl) { - uint64_t callable_addr = get_function_bin_addr(kernel_id); - const CoreCallable *callable = reinterpret_cast(callable_addr); - out.function_bin_addr = callable->resolved_addr(); - int32_t n = 0; - for (int32_t i = 0; i < task_pl.tensor_count; i++) { - task_pl.tensors[i].update_start_offset(); - out.args[n++] = reinterpret_cast(&task_pl.tensors[i]); - } - for (int32_t i = 0; i < task_pl.scalar_count; i++) { - out.args[n++] = task_pl.scalars[i]; - } - } - - // Template methods for Phase 1 and Phase 2 - template - void check_running_cores_for_completion( - int32_t thread_idx, CoreTypeTracker &ct, Handshake *hank, int32_t &completed_this_turn, - int32_t &cur_thread_completed, bool &made_progress, PTO2TaskSlotState *deferred_release_slot_states[], - int32_t &deferred_release_count, PTO2LocalReadyBuffer *local_bufs -#if PTO2_PROFILING - , - bool l2_perf_enabled, uint32_t &phase_complete_count -#endif -#if PTO2_SCHED_PROFILING - , - uint64_t &complete_probe_count, uint64_t &complete_hit_count, uint64_t ¬ify_edges_total, - int32_t ¬ify_max_degree, uint64_t ¬ify_tasks_enqueued, uint64_t &fanin_edges_total, - int32_t &fanin_max_degree, uint64_t &sched_complete_perf_cycle -#endif - ) { - for (int32_t i = ct.running_count - 1; i >= 0; i--) { - int32_t core_id = ct.running[i]; - uint64_t reg_addr = core_id_to_reg_addr_[core_id]; - - int32_t expected_reg_task_id = executing_reg_task_ids_[core_id]; - uint64_t reg_val = read_reg(reg_addr, RegId::COND); - int32_t reg_task_id = EXTRACT_TASK_ID(reg_val); - int32_t reg_state = EXTRACT_TASK_STATE(reg_val); - bool done = reg_task_id == expected_reg_task_id && reg_state == TASK_FIN_STATE; -#if PTO2_SCHED_PROFILING - if (l2_perf_enabled) { - complete_probe_count++; - if (done) { - complete_hit_count++; - } - } -#endif - - if (done) { - executing_reg_task_ids_[core_id] = AICPU_TASK_INVALID; - PTO2SubtaskSlot subslot = executing_subslot_by_core_[core_id]; - PTO2TaskSlotState &slot_state = *executing_slot_state_by_core_[core_id]; - - // Two-stage completion: mark subtask done, then handle mixed-task completion - bool mixed_complete = rt->scheduler.on_subtask_complete(slot_state, subslot); - if (mixed_complete) { -#if PTO2_SCHED_PROFILING - PTO2CompletionStats cstats = - rt->scheduler.on_mixed_task_complete(slot_state, thread_idx, local_bufs); - notify_edges_total += cstats.fanout_edges; - if (cstats.fanout_edges > notify_max_degree) notify_max_degree = cstats.fanout_edges; - notify_tasks_enqueued += cstats.tasks_enqueued; - phase_complete_count++; -#else - rt->scheduler.on_mixed_task_complete(slot_state, local_bufs); -#if PTO2_PROFILING - phase_complete_count++; -#endif -#endif - if (deferred_release_count < 256) { - deferred_release_slot_states[deferred_release_count++] = &slot_state; - } else { - DEV_ALWAYS("Thread %d: release", thread_idx); - while (deferred_release_count > 0) { -#if PTO2_SCHED_PROFILING - int32_t fe = rt->scheduler.on_task_release( - *deferred_release_slot_states[--deferred_release_count], thread_idx - ); -#else - int32_t fe = - rt->scheduler.on_task_release(*deferred_release_slot_states[--deferred_release_count]); -#endif - (void)fe; -#if PTO2_SCHED_PROFILING - fanin_edges_total += fe; - if (fe > fanin_max_degree) fanin_max_degree = fe; -#endif - } - deferred_release_slot_states[deferred_release_count++] = &slot_state; - } - } - ct.move_running_to_idle(i); - core_idle_[core_id] = true; -#if PTO2_PROFILING - if (l2_perf_enabled) { -#if 
PTO2_SCHED_PROFILING - uint64_t t_perf_start = get_sys_cnt_aicpu(); -#endif - Handshake *h = &hank[core_id]; - uint64_t finish_ts = get_sys_cnt_aicpu(); - L2PerfBuffer *l2_perf_buf = reinterpret_cast(h->l2_perf_records_addr); - - // Pre-extract fanout (platform layer cannot depend on PTO2DepListEntry) - uint64_t fanout_arr[RUNTIME_MAX_FANOUT]; - int32_t fanout_n = 0; - PTO2DepListEntry *cur = slot_state.fanout_head; - while (cur != nullptr && fanout_n < RUNTIME_MAX_FANOUT) { - fanout_arr[fanout_n++] = cur->slot_state->task->task_id.raw; - cur = cur->next; - } - - int32_t perf_slot_idx = static_cast(executing_subslot_by_core_[core_id]); - if (l2_perf_aicpu_complete_record( - l2_perf_buf, static_cast(expected_reg_task_id), slot_state.task->task_id.raw, - slot_state.task->kernel_id[perf_slot_idx], CT, dispatch_timestamps_[core_id], finish_ts, - fanout_arr, fanout_n - ) != 0) { - DEV_ERROR( - "Core %d: l2_perf_aicpu_complete_record failed for task 0x%" PRIx64, core_id, - static_cast(slot_state.task->task_id.raw) - ); - } -#if PTO2_SCHED_PROFILING - sched_complete_perf_cycle += (get_sys_cnt_aicpu() - t_perf_start); -#endif - } -#endif - -#if PTO2_PROFILING - if (is_pmu_enabled()) { - pmu_aicpu_record_task( - core_id, thread_idx, slot_state.task->task_id.raw, - slot_state.task->kernel_id[static_cast(subslot)], hank[core_id].core_type - ); - } -#endif - - DEV_DEBUG( - "Thread %d: %s core %d completed PTO2 task %d (mixed_complete=%d)", thread_idx, - CT == CoreType::AIC ? "AIC" : "AIV", core_id, expected_reg_task_id, mixed_complete ? 1 : 0 - ); - cur_thread_completed++; - if (mixed_complete) { -#if PTO2_PROFILING - if (is_dump_tensor_enabled()) { - dump_tensors_for_task( - thread_idx, slot_state, TensorDumpStage::AFTER_COMPLETION, - [](uint8_t active_mask, uint8_t raw_subtask_id) { - return pto2_subtask_active(active_mask, static_cast(raw_subtask_id)); - }, - [this](int32_t func_id) { - return get_function_bin_addr(func_id); - } - ); - } -#endif - completed_this_turn++; - } - made_progress = true; - } - } - } - - static const char *shape_name(PTO2ResourceShape shape) { - switch (shape) { - case PTO2ResourceShape::AIC_ONLY: - return "AIC_ONLY"; - case PTO2ResourceShape::AIV_X1: - return "AIV_X1"; - case PTO2ResourceShape::AIV_X2: - return "AIV_X2"; - case PTO2ResourceShape::AIC_AIV_X1: - return "AIC_AIV_X1"; - case PTO2ResourceShape::AIC_AIV_X2: - return "AIC_AIV_X2"; - } - return "UNKNOWN"; - } - - struct ResourceCount { - int32_t aic; - int32_t aiv; - }; - - static constexpr ResourceCount shape_resource_count(PTO2ResourceShape shape) { - constexpr ResourceCount kTable[PTO2_NUM_RESOURCE_SHAPES] = { - {1, 0}, // AIC_ONLY = 0 - {0, 1}, // AIV_X1 = 1 - {0, 2}, // AIV_X2 = 2 - {1, 1}, // AIC_AIV_X1 = 3 - {1, 2}, // AIC_AIV_X2 = 4 - }; - return kTable[static_cast(shape)]; - } - - /** - * Returns the dispatch probe order for a given scheduler thread. - * Widest shapes first to avoid consuming cluster resources with narrow tasks. - * Even/odd threads use different fallback orders (AIC-first vs AIV-first) - * to reduce contention on the same ready queue across adjacent threads. 
- */ - static const PTO2ResourceShape *get_dispatch_order(int32_t thread_idx) { - // Even threads: AIC-first fallback after widest - static constexpr PTO2ResourceShape kEvenOrder[PTO2_NUM_RESOURCE_SHAPES] = { - PTO2ResourceShape::AIC_AIV_X2, PTO2ResourceShape::AIC_AIV_X1, PTO2ResourceShape::AIC_ONLY, - PTO2ResourceShape::AIV_X2, PTO2ResourceShape::AIV_X1, - }; - // Odd threads: AIV-first fallback after widest - static constexpr PTO2ResourceShape kOddOrder[PTO2_NUM_RESOURCE_SHAPES] = { - PTO2ResourceShape::AIC_AIV_X2, PTO2ResourceShape::AIV_X2, PTO2ResourceShape::AIC_AIV_X1, - PTO2ResourceShape::AIV_X1, PTO2ResourceShape::AIC_ONLY, - }; - return (thread_idx % 2 == 0) ? kEvenOrder : kOddOrder; - } - - PTO2TaskSlotState *pop_ready_task( - PTO2ResourceShape shape, int32_t thread_idx -#if PTO2_SCHED_PROFILING - , - uint64_t &pop_hit, uint64_t &pop_miss, uint64_t &sched_dispatch_pop_cycle -#endif - ) { - (void)thread_idx; -#if PTO2_SCHED_PROFILING - extern uint64_t g_sched_pop_atomic_count[], g_sched_pop_wait_cycle[]; - uint64_t t_pop_start = get_sys_cnt_aicpu(); - PTO2TaskSlotState *slot_state = rt->scheduler.get_ready_task( - shape, g_sched_pop_atomic_count[thread_idx], g_sched_pop_wait_cycle[thread_idx] - ); - sched_dispatch_pop_cycle += (get_sys_cnt_aicpu() - t_pop_start); -#else - PTO2TaskSlotState *slot_state = rt->scheduler.get_ready_task(shape); -#endif - if (slot_state) { -#if PTO2_SCHED_PROFILING - pop_hit++; -#endif - } else { -#if PTO2_SCHED_PROFILING - pop_miss++; -#endif - } - return slot_state; - } - - void dispatch_subtask_to_core( - Runtime *runtime, CoreStateTracker &tracker, int32_t core_id, CoreType core_type, PTO2TaskSlotState &slot_state, - PTO2SubtaskSlot subslot -#if PTO2_PROFILING - , - bool l2_perf_enabled -#endif -#if PTO2_PROFILING - , - int32_t thread_idx -#endif - ) { -#if !PTO2_PROFILING - (void)runtime; // NOLINT(readability/casting) -#endif - PTO2DispatchPayload &payload = s_pto2_payload_per_core[core_id]; - PTO2TaskDescriptor &task = *slot_state.task; - int32_t slot_idx = static_cast(subslot); - build_pto2_payload(payload, task.kernel_id[slot_idx], *slot_state.payload); - executing_subslot_by_core_[core_id] = subslot; - executing_slot_state_by_core_[core_id] = &slot_state; -#if PTO2_PROFILING - if (l2_perf_enabled) { - dispatch_timestamps_[core_id] = get_sys_cnt_aicpu(); - if (core_dispatch_counts_[core_id] >= PLATFORM_PROF_BUFFER_SIZE) { - l2_perf_aicpu_switch_buffer(runtime, core_id, thread_idx); - core_dispatch_counts_[core_id] = 0; - } - core_dispatch_counts_[core_id]++; - } -#endif - - // Per-core monotonic counter for register protocol uniqueness. - // PTO2 task_id encodes (ring_id << 32 | local_id); truncation to uint32 loses ring_id, - // so tasks from different rings with the same local_id would write identical DATA_MAIN_BASE - // values. The AICore uses last_reg_val to detect new dispatches and would skip the - // duplicate, while the stale COND register from the previous task (same local_id) would - // cause a false-positive completion. - dispatch_seq_by_core_[core_id]++; - uint32_t reg_task_id = dispatch_seq_by_core_[core_id] & TASK_ID_MASK; - // Skip reserved sentinel range [AICORE_EXIT_SIGNAL, 0x7FFFFFFF]: jump directly to 0. 
- if (reg_task_id >= AICORE_EXIT_SIGNAL) { - dispatch_seq_by_core_[core_id] += (TASK_ID_MASK - reg_task_id + 1); - reg_task_id = dispatch_seq_by_core_[core_id] & TASK_ID_MASK; - } - write_reg(core_id_to_reg_addr_[core_id], RegId::DATA_MAIN_BASE, static_cast(reg_task_id)); - - CoreTypeTracker &ct = tracker.by_type[static_cast(core_type)]; - int32_t idle_idx = ct.find_idle_index(core_id); - ct.move_idle_to_running(idle_idx); - core_idle_[core_id] = false; - executing_reg_task_ids_[core_id] = reg_task_id; - } -}; - -static AicpuExecutor g_aicpu_executor; - -// ===== AicpuExecutor Method Implementations ===== - -/** - * Handshake with all cores and discover their types - * Sets up register addresses for fast dispatch. - */ -int32_t AicpuExecutor::handshake_all_cores(Runtime *runtime) { - Handshake *all_handshakes = reinterpret_cast(runtime->workers); - cores_total_num_ = runtime->worker_count; - - // Validate cores_total_num_ before using as array index - if (cores_total_num_ == 0 || cores_total_num_ > MAX_CORES_PER_THREAD) { - DEV_ERROR("Invalid cores_total_num %d (expected 1-%d)", cores_total_num_, MAX_CORES_PER_THREAD); - return -1; - } - - aic_count_ = 0; - aiv_count_ = 0; - - DEV_INFO("Handshaking with %d cores", cores_total_num_); - - // Step 1: Write per-core payload addresses and send handshake signal - // OUT_OF_ORDER_STORE_BARRIER() ensures task is globally visible before - // aicpu_ready=1, so AICore reads the correct payload pointer after waking up. - for (int32_t i = 0; i < cores_total_num_; i++) { - all_handshakes[i].task = reinterpret_cast(&s_pto2_payload_per_core[i]); - OUT_OF_ORDER_STORE_BARRIER(); - all_handshakes[i].aicpu_ready = 1; - } - OUT_OF_ORDER_STORE_BARRIER(); - - // Get platform physical cores count for validation - uint32_t max_physical_cores_count = platform_get_physical_cores_count(); - - // Step 2: Wait for all cores to respond, collect core type and register addresses - bool handshake_failed = false; - for (int32_t i = 0; i < cores_total_num_; i++) { - Handshake *hank = &all_handshakes[i]; - - while (hank->aicore_regs_ready == 0) {} - - uint32_t physical_core_id = hank->physical_core_id; - - // Validate physical_core_id before using as array index - if (physical_core_id >= max_physical_cores_count) { - DEV_ERROR( - "Core %d reported invalid physical_core_id=%u (platform max=%u)", i, physical_core_id, - max_physical_cores_count - ); - handshake_failed = true; - continue; - } - - // Get register address using physical_core_id - uint64_t *regs = reinterpret_cast(regs_); - uint64_t reg_addr = regs[physical_core_id]; - - // Initialize AICore registers after discovery (first round) - platform_init_aicore_regs(reg_addr); - OUT_OF_ORDER_STORE_BARRIER(); - hank->aicpu_regs_ready = 1; - - OUT_OF_ORDER_STORE_BARRIER(); - - while (hank->aicore_done == 0) {} - - CoreType type = hank->core_type; - - if (type == CoreType::AIC) { - aic_cores_[aic_count_].worker_id = i; - aic_cores_[aic_count_].physical_core_id = physical_core_id; - aic_cores_[aic_count_].reg_addr = reg_addr; - aic_cores_[aic_count_].core_type = type; - aic_count_++; - DEV_INFO("Core %d: AIC, physical_id=%u, reg_addr=0x%lx", i, physical_core_id, reg_addr); - } else { - aiv_cores_[aiv_count_].worker_id = i; - aiv_cores_[aiv_count_].physical_core_id = physical_core_id; - aiv_cores_[aiv_count_].reg_addr = reg_addr; - aiv_cores_[aiv_count_].core_type = type; - aiv_count_++; - DEV_INFO("Core %d: AIV, physical_id=%u, reg_addr=0x%lx", i, physical_core_id, reg_addr); - } - - core_id_to_reg_addr_[i] = reg_addr; -#if 
PTO2_PROFILING - physical_core_ids_[i] = physical_core_id; -#endif - } - - if (handshake_failed) { - emergency_shutdown(runtime); - return -1; - } - - DEV_INFO("Core discovery complete: %d AIC, %d AIV", aic_count_, aiv_count_); - return 0; -} - -/** - * Assign discovered cores to scheduler threads - * (Aligned with host_build_graph mechanism) - */ -void AicpuExecutor::assign_cores_to_threads() { - // Cluster-aligned round-robin assignment: cluster ci -> sched thread ci % divisor. - // Each cluster = 1 AIC + 2 adjacent AIV; the triple is always kept together. - int32_t divisor = (sched_thread_num_ > 0) ? sched_thread_num_ : thread_num_; - int32_t cluster_count = aic_count_; - - DEV_INFO( - "Assigning cores (round-robin): %d clusters across %d sched threads (%d AIC, %d AIV)", cluster_count, divisor, - aic_count_, aiv_count_ - ); - - memset(core_idle_, true, sizeof(core_idle_)); - for (int32_t i = 0; i < MAX_CORES_PER_THREAD; i++) { - executing_reg_task_ids_[i] = AICPU_TASK_INVALID; - } - for (int32_t i = 0; i < thread_num_; i++) { - trackers_[i].aic().running_count = 0; - trackers_[i].aiv().running_count = 0; - trackers_[i].aic().idle_count = 0; - trackers_[i].aiv().idle_count = 0; - trackers_[i].cluster_count = 0; - core_count_per_thread_[i] = 0; - } - - // Per-sched-thread running core index used while filling core_assignments_. - int32_t core_idx[MAX_AICPU_THREADS] = {}; - - for (int32_t ci = 0; ci < cluster_count; ci++) { - int32_t t = ci % divisor; - CoreStateTracker &tracker = trackers_[t]; - int32_t &idx = core_idx[t]; - - int32_t aic_wid = aic_cores_[ci].worker_id; - int32_t aiv0_wid = aiv_cores_[2 * ci].worker_id; - int32_t aiv1_wid = aiv_cores_[2 * ci + 1].worker_id; - - tracker.clusters[tracker.cluster_count++] = {aic_wid, {aiv0_wid, aiv1_wid}}; - - core_assignments_[t][idx++] = aic_wid; - tracker.aic().idle[tracker.aic().idle_count++] = aic_wid; - - core_assignments_[t][idx++] = aiv0_wid; - core_assignments_[t][idx++] = aiv1_wid; - tracker.aiv().idle[tracker.aiv().idle_count++] = aiv0_wid; - tracker.aiv().idle[tracker.aiv().idle_count++] = aiv1_wid; - - DEV_INFO("Thread %d: cluster %d (AIC=%d, AIV0=%d, AIV1=%d)", t, ci, aic_wid, aiv0_wid, aiv1_wid); - } - - for (int32_t t = 0; t < divisor; t++) { - core_count_per_thread_[t] = core_idx[t]; - DEV_INFO("Thread %d: total %d cores (%d clusters)", t, core_idx[t], trackers_[t].cluster_count); - } - - // Max clusters any single sched thread can hold: ceil(cluster_count / divisor). - int32_t max_clusters_per_thread = (cluster_count + divisor - 1) / divisor; - thread_cores_num_ = max_clusters_per_thread * 3; -} - -/** - * Reassign all cores evenly across all threads (schedulers + orchestrators). - * Called by the last orchestrator thread when orchestration completes. - * Writes into new_core_assignments_ / new_core_count_per_thread_. 
- */ -void AicpuExecutor::reassign_cores_for_all_threads() { - DEV_INFO("Reassigning cores (cluster-aligned) for %d threads: %d AIC, %d AIV", thread_num_, aic_count_, aiv_count_); - - // Collect running/idle state from all threads before reassignment - bool running_cores[MAX_CORES_PER_THREAD]; - memset(running_cores, 0, sizeof(running_cores)); - - for (int32_t i = 0; i < thread_num_; i++) { - for (int32_t j = 0; j < trackers_[i].aic().running_count; j++) { - int32_t core_id = trackers_[i].aic().running[j]; - running_cores[core_id] = true; - } - for (int32_t j = 0; j < trackers_[i].aiv().running_count; j++) { - int32_t core_id = trackers_[i].aiv().running[j]; - running_cores[core_id] = true; - } - } - - // Reset all trackers - for (int32_t i = 0; i < thread_num_; i++) { - core_count_per_thread_[i] = 0; - trackers_[i].aic().running_count = 0; - trackers_[i].aic().idle_count = 0; - trackers_[i].aiv().running_count = 0; - trackers_[i].aiv().idle_count = 0; - trackers_[i].cluster_count = 0; - } - - // Restore a single core's running/idle state into its new thread's tracker - auto reassign_core = [&](int32_t worker_id, CoreTypeTracker &type_tracker, int32_t thread_idx) { - core_assignments_[thread_idx][core_count_per_thread_[thread_idx]++] = worker_id; - if (running_cores[worker_id]) { - type_tracker.running[type_tracker.running_count++] = worker_id; - } else { - type_tracker.idle[type_tracker.idle_count++] = worker_id; - } - }; - - // Assign whole clusters round-robin across all threads - for (int32_t ci = 0; ci < aic_count_; ci++) { - int32_t t = ci % thread_num_; - CoreStateTracker &tracker = trackers_[t]; - - int32_t aic_wid = aic_cores_[ci].worker_id; - int32_t aiv0_wid = aiv_cores_[2 * ci].worker_id; - int32_t aiv1_wid = aiv_cores_[2 * ci + 1].worker_id; - - tracker.clusters[tracker.cluster_count++] = {aic_wid, {aiv0_wid, aiv1_wid}}; - - reassign_core(aic_wid, tracker.aic(), t); - reassign_core(aiv0_wid, tracker.aiv(), t); - reassign_core(aiv1_wid, tracker.aiv(), t); - } - - // Log final distribution for verification - DEV_INFO("Core reassignment complete:"); - for (int32_t t = 0; t < thread_num_; t++) { - DEV_INFO( - " Thread %d: %d cores, %d clusters (AIC: running=%d idle=%d, AIV: running=%d idle=%d)", t, - core_count_per_thread_[t], trackers_[t].cluster_count, trackers_[t].aic().running_count, - trackers_[t].aic().idle_count, trackers_[t].aiv().running_count, trackers_[t].aiv().idle_count - ); - } -} - -int32_t AicpuExecutor::init(Runtime *runtime) { - bool expected = false; - if (!initialized_.compare_exchange_strong(expected, true, std::memory_order_acq_rel, std::memory_order_acquire)) { - return 0; - } - - DEV_INFO("AicpuExecutor: Initializing"); - - if (runtime == nullptr) { - DEV_ERROR("runtime is nullptr"); - init_failed_.store(true, std::memory_order_release); - return -1; - } - - func_id_to_addr_ = runtime->func_id_to_addr_; - - // Read execution parameters from runtime - thread_num_ = runtime->sche_cpu_num; - if (thread_num_ == 0) thread_num_ = 1; - sched_thread_num_ = thread_num_ - 1; - orch_to_sched_ = runtime->orch_to_sched; - - if (thread_num_ < 1 || thread_num_ > MAX_AICPU_THREADS) { - DEV_ERROR("Invalid thread_num: %d", thread_num_); - init_failed_.store(true, std::memory_order_release); - return -1; - } - - // Initialize core_id_to_reg_addr_ array to 0 before handshake - for (int32_t i = 0; i < MAX_CORES_PER_THREAD; i++) { - core_id_to_reg_addr_[i] = 0; - } - - // Use handshake mechanism to discover cores (aligned with host_build_graph) - int32_t rc = 
handshake_all_cores(runtime); - if (rc != 0) { - DEV_ERROR("handshake_all_cores failed"); - init_failed_.store(true, std::memory_order_release); - return -1; - } - - // Dynamically assign cores to threads - assign_cores_to_threads(); - - DEV_INFO("Config: threads=%d, cores=%d, cores_per_thread=%d", thread_num_, cores_total_num_, thread_cores_num_); - - // Initialize runtime execution state - // Task count comes from PTO2 shared memory - if (runtime->get_gm_sm_ptr()) { - auto *header = static_cast(runtime->get_gm_sm_ptr()); - int32_t pto2_count = 0; - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - pto2_count += header->rings[r].fc.current_task_index.load(std::memory_order_acquire); - } - total_tasks_ = pto2_count > 0 ? pto2_count : 0; - } else { - total_tasks_ = 0; - } - completed_tasks_.store(0, std::memory_order_release); - // Host orchestration: graph already built, no wait needed. Device orch: Thread 3 will set this. - bool orch_on_host = runtime->get_orch_built_on_host(); - DEV_INFO("Init: orch_built_on_host=%d", orch_on_host ? 1 : 0); - orchestrator_done_ = orch_on_host; - - // Initial ready tasks will be populated via scheduler ready queues - - // Reset per-core dispatch timestamps and task counters - for (int32_t i = 0; i < RUNTIME_MAX_WORKER; i++) { - dispatch_timestamps_[i] = 0; - core_dispatch_counts_[i] = 0; - } - - // Clear per-core dispatch payloads and subslot tracking - memset(s_pto2_payload_per_core, 0, sizeof(s_pto2_payload_per_core)); - memset(dispatch_seq_by_core_, 0, sizeof(dispatch_seq_by_core_)); - memset(executing_subslot_by_core_, 0, sizeof(executing_subslot_by_core_)); - memset(executing_slot_state_by_core_, 0, sizeof(executing_slot_state_by_core_)); - - DEV_INFO("Init: PTO2 mode, task count from shared memory"); - - finished_count_.store(0, std::memory_order_release); - - init_done_.store(true, std::memory_order_release); - DEV_INFO("AicpuExecutor: Init complete"); - return 0; -} - -/** - * Shutdown AICore - Send exit signal via registers to all AICore kernels - */ -int32_t AicpuExecutor::shutdown_aicore( - Runtime *runtime, int32_t thread_idx, const int32_t *cur_thread_cores, int32_t core_num -) { - (void)runtime; - if (core_num == 0) return 0; - - DEV_INFO("Thread %d: Shutting down %d cores", thread_idx, core_num); - - for (int32_t i = 0; i < core_num; i++) { - int32_t core_id = cur_thread_cores[i]; - uint64_t reg_addr = core_id_to_reg_addr_[core_id]; - if (reg_addr != 0) { - platform_deinit_aicore_regs(reg_addr); - } else { - DEV_ERROR("Thread %d: Core %d has invalid register address", thread_idx, core_id); - } - } - DEV_INFO("Thread %d: Shutdown complete", thread_idx); - return 0; -} - -int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t thread_idx) { - int32_t &core_num = core_count_per_thread_[thread_idx]; - CoreStateTracker &tracker = trackers_[thread_idx]; - DEV_INFO("Thread %d: resolve_and_dispatch_pto2 entry", thread_idx); - - void *sm_base = runtime->get_gm_sm_ptr(); - if (!sm_base) { - DEV_ERROR("PTO2 dispatch: sm_base is null"); - return -1; - } - DEV_INFO("Thread %d: sm_base=%p", thread_idx, sm_base); - - PTO2SharedMemoryHeader *header = static_cast(sm_base); - DEV_INFO( - "Thread %d: header=%p, task_desc_offset[0]=%lu, window_size=%lu", thread_idx, static_cast(header), - static_cast(header->rings[0].task_descriptors_offset), - static_cast(header->rings[0].task_window_size) - ); - - Handshake *hank = static_cast(runtime->workers); - DEV_INFO( - "Thread %d: hank=%p, window_size=%lu", thread_idx, static_cast(hank), - 
static_cast(header->rings[0].task_window_size) - ); - - // One-time init: assign perf buffers (one thread does it; others wait) - if (!pto2_init_done_.exchange(true, std::memory_order_acq_rel)) { - DEV_INFO("Thread %d: doing one-time init", thread_idx); - -#if PTO2_PROFILING - // Assign perf buffers to cores early so profiling captures all tasks - // (total_tasks written to header later when orchestrator completes) - if (is_l2_swimlane_enabled()) { - l2_perf_aicpu_init_profiling(runtime); - // Initialize phase profiling for scheduler threads + orchestrator threads - l2_perf_aicpu_init_phase_profiling(runtime, sched_thread_num_); - l2_perf_aicpu_set_orch_thread_idx(sched_thread_num_); - } -#endif -#if PTO2_PROFILING - if (is_dump_tensor_enabled()) { - dump_tensor_init(orch_to_sched_ ? thread_num_ : sched_thread_num_); - } -#endif - -#if PTO2_PROFILING - // Initialize PMU: program events, start counters, and pop initial buffers - if (is_pmu_enabled()) { - pmu_aicpu_init(physical_core_ids_, cores_total_num_); - DEV_INFO("PMU profiling started on %d cores", cores_total_num_); - } -#endif - - DEV_INFO("Thread %d: one-time init done", thread_idx); - pto2_init_complete_.store(true, std::memory_order_release); - } else { - while (!pto2_init_complete_.load(std::memory_order_acquire)) { - SPIN_WAIT_HINT(); - } - } - - DEV_INFO("Thread %d: PTO2 dispatch starting with %d cores", thread_idx, core_num); - int32_t cur_thread_completed = 0; - int32_t idle_iterations = 0; - int32_t last_progress_count = 0; -#if PTO2_PROFILING - bool l2_perf_enabled = is_l2_swimlane_enabled(); -#endif - - // Scheduler profiling counters -#if PTO2_PROFILING - uint64_t sched_scan_cycle = 0; - uint64_t sched_complete_cycle = 0; - uint64_t sched_dispatch_cycle = 0; - uint64_t sched_idle_cycle = 0; - uint64_t sched_loop_count = 0; - uint32_t phase_complete_count = 0; - uint32_t phase_dispatch_count = 0; -#if PTO2_SCHED_PROFILING - uint64_t complete_probe_count = 0; - uint64_t complete_hit_count = 0; - uint64_t notify_edges_total = 0; - int32_t notify_max_degree = 0; - uint64_t notify_tasks_enqueued = 0; - uint64_t fanin_edges_total = 0; - int32_t fanin_max_degree = 0; - uint64_t pop_hit = 0; - uint64_t pop_miss = 0; - uint64_t local_dispatch_count = 0; - uint64_t local_overflow_count = 0; - uint64_t sched_complete_perf_cycle = 0; - uint64_t sched_dispatch_pop_cycle = 0; - uint64_t sched_dispatch_setup_cycle = 0; -#endif -#endif - - // Local-first dispatch buffers (stack-allocated, one per CoreType per scheduling thread). - // Initialized once; must be empty at the start of each iteration. 
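
A minimal sketch of what such a local ready buffer can look like, assuming (as the reset/push/pop usage below suggests) a plain LIFO pointer stack over caller-provided storage; the real PTO2LocalReadyBuffer definition lived in the deleted runtime headers.

```cpp
#include <cstdint>

struct Task; // opaque stand-in for the runtime's slot-state type

// Fixed-capacity pointer stack: no allocation, no atomics, strictly
// thread-local, which is what makes "local-first" dispatch cheap.
struct LocalReadyBuffer {
    Task **slots = nullptr;
    int32_t capacity = 0;
    int32_t count = 0;

    void reset(Task **storage, int32_t cap) {
        slots = storage;
        capacity = cap;
        count = 0;
    }
    // Overflow is the caller's problem: spill to the global ready queue.
    bool push(Task *t) {
        if (count >= capacity) return false;
        slots[count++] = t;
        return true;
    }
    Task *pop() { return count > 0 ? slots[--count] : nullptr; }
};
```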
- constexpr int LOCAL_READY_CAP_PER_TYPE = 256; - PTO2TaskSlotState *local_aic_ptrs[LOCAL_READY_CAP_PER_TYPE]; - PTO2TaskSlotState *local_aiv_ptrs[LOCAL_READY_CAP_PER_TYPE]; - PTO2LocalReadyBuffer local_bufs[PTO2_LOCAL_DISPATCH_TYPE_NUM]; // [0]=AIC, [1]=AIV - local_bufs[0].reset(local_aic_ptrs, LOCAL_READY_CAP_PER_TYPE); - local_bufs[1].reset(local_aiv_ptrs, LOCAL_READY_CAP_PER_TYPE); - PTO2TaskSlotState *deferred_release_slot_states[256]; - int32_t deferred_release_count = 0; - - bool cores_released = false; - - while (true) { - bool made_progress = false; -#if PTO2_PROFILING - CYCLE_COUNT_START(); - sched_loop_count++; - uint64_t _t0_phase = _t0; -#endif - int32_t task_count = 0; - if (tracker.aic().running_count == 0 && tracker.aiv().running_count == 0) { - bool orch_done = orchestrator_done_; - if (orch_done) { - // Check for orchestrator fatal error — exit immediately - int32_t orch_err = header->orch_error_code.load(std::memory_order_acquire); - if (orch_err != PTO2_ERROR_NONE) { - DEV_ERROR( - "Thread %d: Fatal error (code=%d), sending EXIT_SIGNAL to all cores. " - "completed_tasks=%d, total_tasks=%d", - thread_idx, orch_err, completed_tasks_.load(std::memory_order_relaxed), total_tasks_ - ); - emergency_shutdown(runtime); - completed_.store(true, std::memory_order_release); - break; - } - - // Normal exit: all tasks complete - task_count = total_tasks_; - if (task_count > 0 && completed_tasks_.load(std::memory_order_relaxed) >= task_count) { - completed_.store(true, std::memory_order_release); - DEV_INFO( - "Thread %d: PTO2 completed tasks %d/%d", thread_idx, - completed_tasks_.load(std::memory_order_relaxed), task_count - ); - break; - } - } - } - - // Check for core transition request (execute once per thread) - if (!cores_released && orch_to_sched_ && transition_requested_.load(std::memory_order_acquire)) { - if (!reassigned_.load(std::memory_order_acquire)) { - wait_reassign_.fetch_add(1, std::memory_order_release); - while (!reassigned_.load(std::memory_order_acquire)) { - if (completed_.load(std::memory_order_acquire)) { - break; - } - SPIN_WAIT_HINT(); - } - if (completed_.load(std::memory_order_acquire)) { - break; - } - } - cores_released = true; - } - -#if PTO2_PROFILING - CYCLE_COUNT_LAP(sched_idle_cycle); -#endif - - // Process completed and dispatch FIRST to minimize Sched (dispatch→finish) latency. - // Sched time = finish_ts - dispatch_ts; recording finish_ts here at loop start reduces - // tail overhead (time from AICore done to AICPU recording finish). 
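
For orientation, a compilable skeleton of the loop ordering this comment describes; the three phase functions are stubs standing in for the real completion polling and dispatch work.

```cpp
#include <atomic>

// Stubs standing in for the real scheduler phases; each reports whether it
// made progress, mirroring made_progress in the loop below.
static bool poll_completions() { return false; }
static bool dispatch_local() { return false; }
static bool dispatch_global() { return false; }

// Completions are polled before dispatch so finish_ts is captured as close
// as possible to the AICore's actual completion in every iteration.
void scheduler_loop(std::atomic<bool> &done) {
    while (!done.load(std::memory_order_acquire)) {
        bool progress = poll_completions(); // Phase 1: record finish_ts first
        progress |= dispatch_local();       // Phase 2: thread-local buffers
        progress |= dispatch_global();      // Phase 3: shared ready queues
        if (!progress) {
            // idle path: batch deferred releases, stall diagnostics, spin hint
        }
    }
}
```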
- - // Phase 1: Check running cores for completion, process and move to idle - int32_t completed_this_turn = 0; - - // Check AIC running cores - bool try_completed = false; - always_assert( - local_bufs[0].count == 0 && local_bufs[1].count == 0 - ); // Invariant: previous iteration fully consumed - if (tracker.aic().running_count > 0) { - try_completed = true; - check_running_cores_for_completion( - thread_idx, tracker.aic(), hank, completed_this_turn, cur_thread_completed, made_progress, - deferred_release_slot_states, deferred_release_count, local_bufs -#if PTO2_PROFILING - , - l2_perf_enabled, phase_complete_count -#endif -#if PTO2_SCHED_PROFILING - , - complete_probe_count, complete_hit_count, notify_edges_total, notify_max_degree, notify_tasks_enqueued, - fanin_edges_total, fanin_max_degree, sched_complete_perf_cycle -#endif - ); - } - - // Check AIV running cores - if (tracker.aiv().running_count > 0) { - try_completed = true; - check_running_cores_for_completion( - thread_idx, tracker.aiv(), hank, completed_this_turn, cur_thread_completed, made_progress, - deferred_release_slot_states, deferred_release_count, local_bufs -#if PTO2_PROFILING - , - l2_perf_enabled, phase_complete_count -#endif -#if PTO2_SCHED_PROFILING - , - complete_probe_count, complete_hit_count, notify_edges_total, notify_max_degree, notify_tasks_enqueued, - fanin_edges_total, fanin_max_degree, sched_complete_perf_cycle -#endif - ); - } - if (completed_this_turn > 0) { -#if PTO2_SCHED_PROFILING - rt->scheduler.tasks_completed.fetch_add(completed_this_turn, std::memory_order_relaxed); -#endif - int32_t prev = completed_tasks_.fetch_add(completed_this_turn, std::memory_order_relaxed); - int32_t new_total = prev + completed_this_turn; - last_progress_count = new_total; - if (thread_idx == 0 && task_count > 0) { - if (new_total <= PROGRESS_VERBOSE_THRESHOLD || - new_total / PROGRESS_LOG_INTERVAL != prev / PROGRESS_LOG_INTERVAL || new_total >= task_count) { - DEV_ALWAYS( - "PTO2 progress: completed=%d total=%d (%.1f%%)", new_total, task_count, - 100.0 * new_total / task_count - ); - } - } - } - -#if PTO2_PROFILING - if (!try_completed) { - CYCLE_COUNT_LAP(sched_idle_cycle); - } else { - CYCLE_COUNT_LAP(sched_complete_cycle); - if (l2_perf_enabled && phase_complete_count > 0) { - l2_perf_aicpu_record_phase( - thread_idx, AicpuPhaseId::SCHED_COMPLETE, _t0_phase, _t1, sched_loop_count, phase_complete_count - ); - _t0_phase = _t1; - phase_complete_count = 0; - } - } -#endif - - // Phase 2: Local dispatch — drain local_bufs, match to idle clusters (zero MPMC operations) - // Phase 3: Global queue — push overflow to readyQ + fill remaining idle cores from readyQ - bool try_pushed = false; - - // Local dispatch: drain both per-CoreType local_bufs, match to idle clusters by shape - PTO2TaskSlotState *overflow_ptrs[LOCAL_READY_CAP_PER_TYPE * PTO2_LOCAL_DISPATCH_TYPE_NUM]; - int overflow_count = 0; - for (int bi = 0; bi < PTO2_LOCAL_DISPATCH_TYPE_NUM; bi++) { - while (local_bufs[bi].count > 0) { - PTO2TaskSlotState *slot_state = local_bufs[bi].pop(); - PTO2ResourceShape shape = pto2_active_mask_to_shape(slot_state->active_mask); - int32_t ci = tracker.find_cluster_for_shape(shape, core_idle_); - - if (ci >= 0) { - try_pushed = true; - Cluster &c = tracker.clusters[ci]; -#if PTO2_SCHED_PROFILING - uint64_t t_setup_start = get_sys_cnt_aicpu(); -#endif - ResourceCount rc = shape_resource_count(shape); -#if PTO2_PROFILING - if (is_dump_tensor_enabled()) { - dump_tensors_for_task( - thread_idx, *slot_state, 
TensorDumpStage::BEFORE_DISPATCH, - [](uint8_t active_mask, uint8_t raw_subtask_id) { - return pto2_subtask_active(active_mask, static_cast(raw_subtask_id)); - }, - [this](int32_t func_id) { - return get_function_bin_addr(func_id); - } - ); - } -#endif - if (rc.aic) { - dispatch_subtask_to_core( - runtime, tracker, c.aic_core_id, CoreType::AIC, *slot_state, PTO2SubtaskSlot::AIC -#if PTO2_PROFILING - , - l2_perf_enabled -#endif -#if PTO2_PROFILING - , - thread_idx -#endif - ); - } - if (rc.aiv >= 1) { - int32_t aiv0 = core_idle_[c.aiv_core_ids[0]] ? c.aiv_core_ids[0] : c.aiv_core_ids[1]; - dispatch_subtask_to_core( - runtime, tracker, aiv0, CoreType::AIV, *slot_state, PTO2SubtaskSlot::AIV0 -#if PTO2_PROFILING - , - l2_perf_enabled -#endif -#if PTO2_PROFILING - , - thread_idx -#endif - ); - } - if (rc.aiv >= 2) { - dispatch_subtask_to_core( - runtime, tracker, c.aiv_core_ids[1], CoreType::AIV, *slot_state, PTO2SubtaskSlot::AIV1 -#if PTO2_PROFILING - , - l2_perf_enabled -#endif -#if PTO2_PROFILING - , - thread_idx -#endif - ); - } -#if PTO2_PROFILING - phase_dispatch_count++; -#endif -#if PTO2_SCHED_PROFILING - pop_hit++; - local_dispatch_count++; - sched_dispatch_setup_cycle += (get_sys_cnt_aicpu() - t_setup_start); -#endif - made_progress = true; - DEV_DEBUG( - "Thread %d: Dispatching %s task %" PRId64 " to cluster %d (local)", thread_idx, - shape_name(shape), static_cast(slot_state->task->task_id.raw), ci - ); - } else { - overflow_ptrs[overflow_count++] = slot_state; -#if PTO2_SCHED_PROFILING - local_overflow_count++; -#endif - } - } - } - - // Push overflow to global readyQ (shape-based) - for (int i = 0; i < overflow_count; i++) { - rt->scheduler.requeue_ready_task(*overflow_ptrs[i]); - } - - // Phase 3: Global dispatch — fill remaining idle cores from global readyQ (cluster-based) - const PTO2ResourceShape *dispatch_order = get_dispatch_order(thread_idx); - - for (int32_t si = 0; si < PTO2_NUM_RESOURCE_SHAPES; si++) { - PTO2ResourceShape shape = dispatch_order[si]; - if (rt->scheduler.ready_queues[static_cast(shape)].size() == 0) continue; - - while (true) { - int32_t ci = tracker.find_cluster_for_shape(shape, core_idle_); - if (ci < 0) break; - - PTO2TaskSlotState *slot_state = pop_ready_task( - shape, thread_idx -#if PTO2_SCHED_PROFILING - , - pop_hit, pop_miss, sched_dispatch_pop_cycle -#endif - ); - if (!slot_state) break; - - try_pushed = true; -#if PTO2_PROFILING - phase_dispatch_count++; -#endif -#if PTO2_SCHED_PROFILING - uint64_t t_setup_start = get_sys_cnt_aicpu(); -#endif - Cluster &c = tracker.clusters[ci]; - ResourceCount rc = shape_resource_count(shape); -#if PTO2_PROFILING - if (is_dump_tensor_enabled()) { - dump_tensors_for_task( - thread_idx, *slot_state, TensorDumpStage::BEFORE_DISPATCH, - [](uint8_t active_mask, uint8_t raw_subtask_id) { - return pto2_subtask_active(active_mask, static_cast(raw_subtask_id)); - }, - [this](int32_t func_id) { - return get_function_bin_addr(func_id); - } - ); - } -#endif - if (rc.aic) { - dispatch_subtask_to_core( - runtime, tracker, c.aic_core_id, CoreType::AIC, *slot_state, PTO2SubtaskSlot::AIC -#if PTO2_PROFILING - , - l2_perf_enabled -#endif -#if PTO2_PROFILING - , - thread_idx -#endif - ); - } - if (rc.aiv >= 1) { - int32_t aiv_id = core_idle_[c.aiv_core_ids[0]] ? 
c.aiv_core_ids[0] : c.aiv_core_ids[1]; - dispatch_subtask_to_core( - runtime, tracker, aiv_id, CoreType::AIV, *slot_state, PTO2SubtaskSlot::AIV0 -#if PTO2_PROFILING - , - l2_perf_enabled -#endif -#if PTO2_PROFILING - , - thread_idx -#endif - ); - } - if (rc.aiv >= 2) { - dispatch_subtask_to_core( - runtime, tracker, c.aiv_core_ids[1], CoreType::AIV, *slot_state, PTO2SubtaskSlot::AIV1 -#if PTO2_PROFILING - , - l2_perf_enabled -#endif -#if PTO2_PROFILING - , - thread_idx -#endif - ); - } - made_progress = true; -#if PTO2_SCHED_PROFILING - sched_dispatch_setup_cycle += (get_sys_cnt_aicpu() - t_setup_start); -#endif - DEV_DEBUG( - "Thread %d: Dispatching %s task %" PRId64 " to cluster %d", thread_idx, shape_name(shape), - static_cast(slot_state->task->task_id.raw), ci - ); - } - } - -#if PTO2_PROFILING - if (!try_pushed) { - CYCLE_COUNT_LAP(sched_idle_cycle); - } else { - CYCLE_COUNT_LAP(sched_dispatch_cycle); - if (l2_perf_enabled && phase_dispatch_count > 0) { - l2_perf_aicpu_record_phase( - thread_idx, AicpuPhaseId::SCHED_DISPATCH, _t0_phase, _t1, sched_loop_count, phase_dispatch_count - ); - _t0_phase = _t1; - phase_dispatch_count = 0; - } -#endif - } - - if (made_progress) { - idle_iterations = 0; - } else { - // Batch deferred fanin releases during idle. - // Processing all pending releases at once advances the ring faster, - // freeing heap space for the orchestrator without blocking completion polling. - while (deferred_release_count > 0) { -#if PTO2_SCHED_PROFILING - int32_t fe = - rt->scheduler.on_task_release(*deferred_release_slot_states[--deferred_release_count], thread_idx); -#else - int32_t fe = rt->scheduler.on_task_release(*deferred_release_slot_states[--deferred_release_count]); -#endif - (void)fe; -#if PTO2_SCHED_PROFILING - fanin_edges_total += fe; - if (fe > fanin_max_degree) fanin_max_degree = fe; -#endif - } - idle_iterations++; - - // Check for orchestrator fatal error during idle (every 1024 iterations) - // orch_error_code is set in shared memory by the orchestrator's spin loop - // BEFORE orchestrator_done_ is set, so this catches errors earlier. 
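
The batch drain a few lines above reduces to this sketch; on_task_release is stubbed here, whereas the real scheduler hook returns the task's fanin edge count.

```cpp
#include <cstdint>

struct SlotState {}; // opaque stand-in

// Stub; the real hook recycles the task's ring slot and reports fanin edges.
static int32_t on_task_release(SlotState &) { return 0; }

// Idle-time batch drain: releases deferred while the scheduler was busy are
// processed LIFO once no progress is being made, so ring-slot recycling
// never delays completion polling.
static void drain_deferred(SlotState **deferred, int32_t &count) {
    while (count > 0) {
        (void)on_task_release(*deferred[--count]);
    }
}
```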
- if (idle_iterations % FATAL_ERROR_CHECK_INTERVAL == 0) { - int32_t orch_err = header->orch_error_code.load(std::memory_order_acquire); - if (orch_err != PTO2_ERROR_NONE) { - DEV_ERROR( - "Thread %d: Fatal error detected (code=%d), sending EXIT_SIGNAL to all cores", thread_idx, - orch_err - ); - emergency_shutdown(runtime); - completed_.store(true, std::memory_order_release); - break; - } - } - - if (thread_idx == 0 && task_count > 0 && idle_iterations % STALL_LOG_INTERVAL == 0) { - int32_t c = completed_tasks_.load(std::memory_order_relaxed); - DEV_ALWAYS( - "PTO2 stall: no progress for %d iterations, completed=%d total=%d (last progress at %d)", - idle_iterations, c, task_count, last_progress_count - ); - // Scan all task slots to find truly stuck tasks using scheduler state - PTO2SchedulerState *sched = &rt->scheduler; - PTO2SharedMemoryHeader *sm_header_diag = static_cast(sm_base); - int32_t cnt_ready = 0, cnt_waiting = 0, cnt_inflight = 0; - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - int32_t ring_task_count = - sm_header_diag->rings[r].fc.current_task_index.load(std::memory_order_relaxed); - for (int32_t si = 0; si < ring_task_count; si++) { - PTO2TaskSlotState &slot_state = sched->get_slot_state(r, si); - PTO2TaskState st = slot_state.task_state.load(std::memory_order_relaxed); - int32_t rc = slot_state.fanin_refcount.load(std::memory_order_relaxed); - int32_t fi = slot_state.fanin_count; - int32_t kid = slot_state.task->kernel_id[0]; - if (st >= PTO2_TASK_COMPLETED) continue; // Already done - if (st == PTO2_TASK_READY || st == PTO2_TASK_RUNNING) { - cnt_inflight++; - continue; - } - // PENDING - if (rc >= fi) { - // Ready (all deps satisfied) but not enqueued — this is the real bug - cnt_ready++; - if (cnt_ready <= STALL_DUMP_READY_MAX) { - DEV_ALWAYS( - " STUCK-READY ring=%d task_id=%" PRId64 - " kernel_id=%d refcount=%d fanin=%d state=%d", - r, static_cast(slot_state.task->task_id.raw), kid, rc, fi, - static_cast(st) - ); - } - } else { - cnt_waiting++; - if (cnt_waiting <= STALL_DUMP_WAIT_MAX) { - DEV_ALWAYS( - " STUCK-WAIT ring=%d task_id=%" PRId64 - " kernel_id=%d refcount=%d fanin=%d state=%d", - r, static_cast(slot_state.task->task_id.raw), kid, rc, fi, - static_cast(st) - ); - } - } - } - } - DEV_ALWAYS( - " scan result: stuck_ready=%d stuck_waiting=%d in_flight=%d", cnt_ready, cnt_waiting, cnt_inflight - ); - // Log this thread's dispatch state - int32_t total_idle = tracker.aic().idle_count + tracker.aiv().idle_count; - int32_t total_running = tracker.aic().running_count + tracker.aiv().running_count; - DEV_ALWAYS( - " thread=%d idle_cores=%d (AIC=%d AIV=%d) running_cores=%d (AIC=%d AIV=%d) core_num=%d", - thread_idx, total_idle, tracker.aic().idle_count, tracker.aiv().idle_count, total_running, - tracker.aic().running_count, tracker.aiv().running_count, core_num - ); - // Dump AIC running cores - for (int32_t ci = 0; ci < tracker.aic().running_count && ci < STALL_DUMP_CORE_MAX; ci++) { - int32_t cid = tracker.aic().running[ci]; - int32_t sw_tid = executing_reg_task_ids_[cid]; - int32_t hw_kernel = -1; - if (sw_tid >= 0 && executing_slot_state_by_core_[cid]) { - int32_t diag_slot = static_cast(executing_subslot_by_core_[cid]); - hw_kernel = executing_slot_state_by_core_[cid]->task->kernel_id[diag_slot]; - } - uint64_t cond_reg = read_reg(core_id_to_reg_addr_[cid], RegId::COND); - DEV_ALWAYS( - " core=%d cond=0x%x(state=%d,id=%d) exec_id=%d kernel=%d", cid, - static_cast(cond_reg), EXTRACT_TASK_STATE(cond_reg), EXTRACT_TASK_ID(cond_reg), - sw_tid, hw_kernel - ); - 
} - // Dump AIV running cores - for (int32_t ci = 0; ci < tracker.aiv().running_count && ci < STALL_DUMP_CORE_MAX; ci++) { - int32_t cid = tracker.aiv().running[ci]; - int32_t sw_tid = executing_reg_task_ids_[cid]; - int32_t hw_kernel = -1; - if (sw_tid >= 0 && executing_slot_state_by_core_[cid]) { - int32_t diag_slot = static_cast(executing_subslot_by_core_[cid]); - hw_kernel = executing_slot_state_by_core_[cid]->task->kernel_id[diag_slot]; - } - uint64_t cond_reg = read_reg(core_id_to_reg_addr_[cid], RegId::COND); - DEV_ALWAYS( - " core=%d cond=0x%x(state=%d,id=%d) exec_id=%d kernel=%d", cid, - static_cast(cond_reg), EXTRACT_TASK_STATE(cond_reg), EXTRACT_TASK_ID(cond_reg), - sw_tid, hw_kernel - ); - } - // Dump cluster state - for (int32_t cli = 0; cli < tracker.cluster_count && cli < STALL_DUMP_CORE_MAX; cli++) { - Cluster &cl = tracker.clusters[cli]; - DEV_ALWAYS( - " cluster[%d] aic=%d(%s) aiv0=%d(%s) aiv1=%d(%s)", cli, cl.aic_core_id, - core_idle_[cl.aic_core_id] ? "idle" : "busy", cl.aiv_core_ids[0], - core_idle_[cl.aiv_core_ids[0]] ? "idle" : "busy", cl.aiv_core_ids[1], - core_idle_[cl.aiv_core_ids[1]] ? "idle" : "busy" - ); - } - } - if (idle_iterations > MAX_IDLE_ITERATIONS) { - DEV_ERROR("Thread %d: PTO2 timeout after %d idle iterations", thread_idx, idle_iterations); - return -1; - } else { - SPIN_WAIT_HINT(); - } -#if PTO2_PROFILING - CYCLE_COUNT_LAP(sched_idle_cycle); - if (l2_perf_enabled) { - l2_perf_aicpu_record_phase( - thread_idx, AicpuPhaseId::SCHED_IDLE_WAIT, _t0_phase, _t1, sched_loop_count, 0 - ); - _t0_phase = _t1; - } -#endif - } - } - -#if PTO2_PROFILING - // Scheduler summary logging (always print when PTO2_PROFILING=1) - uint64_t sched_total = sched_complete_cycle + sched_scan_cycle + sched_dispatch_cycle + sched_idle_cycle; - if (sched_total == 0) sched_total = 1; // avoid div-by-zero - -#if PTO2_SCHED_PROFILING - // Two-level tree display: sub-phase breakdown within complete and dispatch - { - PTO2SchedProfilingData sp = pto2_scheduler_get_profiling(thread_idx); - uint64_t otc_total = sp.lock_cycle + sp.fanout_cycle + sp.fanin_cycle + sp.self_consumed_cycle; - uint64_t complete_poll = (sched_complete_cycle > otc_total + sched_complete_perf_cycle) ? - (sched_complete_cycle - otc_total - sched_complete_perf_cycle) : - 0; - uint64_t dispatch_poll = (sched_dispatch_cycle > sched_dispatch_pop_cycle + sched_dispatch_setup_cycle) ? - (sched_dispatch_cycle - sched_dispatch_pop_cycle - sched_dispatch_setup_cycle) : - 0; - - DEV_ALWAYS( - "Thread %d: === Scheduler Phase Breakdown: total=%.3fus, %d tasks ===", thread_idx, - cycles_to_us(sched_total), cur_thread_completed - ); - - // Level 1: complete - double notify_avg = - cur_thread_completed > 0 ? static_cast(notify_edges_total) / cur_thread_completed : 0.0; - double fanin_avg = - cur_thread_completed > 0 ? static_cast(fanin_edges_total) / cur_thread_completed : 0.0; - DEV_ALWAYS( - "Thread %d: complete : %.3fus (%.1f%%) [fanout: edges=%" PRIu64 - ", max_degree=%d, avg=%.1f] [fanin: " - "edges=%" PRIu64 ", max_degree=%d, avg=%.1f]", - thread_idx, cycles_to_us(sched_complete_cycle), sched_complete_cycle * 100.0 / sched_total, - static_cast(notify_edges_total), notify_max_degree, notify_avg, - static_cast(fanin_edges_total), fanin_max_degree, fanin_avg - ); - - // Level 2: complete sub-phases (percentage relative to complete) - uint64_t c_parent = sched_complete_cycle > 0 ? sched_complete_cycle : 1; - uint64_t complete_miss_count = - (complete_probe_count > complete_hit_count) ? 
(complete_probe_count - complete_hit_count) : 0;
-        double complete_hit_rate = complete_probe_count > 0 ? complete_hit_count * 100.0 / complete_probe_count : 0.0;
-        DEV_ALWAYS(
-            "Thread %d: poll : %.3fus (%.1f%%) hit=%" PRIu64 ", miss=%" PRIu64 ", hit_rate=%.1f%%",
-            thread_idx, cycles_to_us(complete_poll), complete_poll * 100.0 / c_parent,
-            static_cast<uint64_t>(complete_hit_count), static_cast<uint64_t>(complete_miss_count), complete_hit_rate
-        );
-        DEV_ALWAYS(
-            "Thread %d: otc_lock : %.3fus (%.1f%%) work=%.3fus wait=%.3fus atomics=%" PRIu64 "", thread_idx,
-            cycles_to_us(sp.lock_cycle), sp.lock_cycle * 100.0 / c_parent,
-            cycles_to_us(sp.lock_cycle - sp.lock_wait_cycle), cycles_to_us(sp.lock_wait_cycle),
-            static_cast<uint64_t>(sp.lock_atomic_count)
-        );
-        DEV_ALWAYS(
-            "Thread %d: otc_fanout : %.3fus (%.1f%%) work=%.3fus wait=%.3fus atomics=%" PRIu64 "", thread_idx,
-            cycles_to_us(sp.fanout_cycle), sp.fanout_cycle * 100.0 / c_parent,
-            cycles_to_us(sp.fanout_cycle - sp.push_wait_cycle), cycles_to_us(sp.push_wait_cycle),
-            static_cast<uint64_t>(sp.fanout_atomic_count)
-        );
-        DEV_ALWAYS(
-            "Thread %d: otc_fanin : %.3fus (%.1f%%) atomics=%" PRIu64 "", thread_idx,
-            cycles_to_us(sp.fanin_cycle), sp.fanin_cycle * 100.0 / c_parent,
-            static_cast<uint64_t>(sp.fanin_atomic_count)
-        );
-        DEV_ALWAYS(
-            "Thread %d: otc_self : %.3fus (%.1f%%) atomics=%" PRIu64 "", thread_idx,
-            cycles_to_us(sp.self_consumed_cycle), sp.self_consumed_cycle * 100.0 / c_parent,
-            static_cast<uint64_t>(sp.self_atomic_count)
-        );
-        DEV_ALWAYS(
-            "Thread %d: perf : %.3fus (%.1f%%)", thread_idx, cycles_to_us(sched_complete_perf_cycle),
-            sched_complete_perf_cycle * 100.0 / c_parent
-        );
-
-        // Level 1: dispatch
-        uint64_t pop_total = pop_hit + pop_miss;
-        double pop_hit_rate = pop_total > 0 ? pop_hit * 100.0 / pop_total : 0.0;
-        DEV_ALWAYS(
-            "Thread %d: dispatch : %.3fus (%.1f%%) [pop: hit=%" PRIu64 ", miss=%" PRIu64 ", hit_rate=%.1f%%]",
-            thread_idx, cycles_to_us(sched_dispatch_cycle), sched_dispatch_cycle * 100.0 / sched_total,
-            static_cast<uint64_t>(pop_hit), static_cast<uint64_t>(pop_miss), pop_hit_rate
-        );
-        uint64_t global_dispatch_count = pop_hit - local_dispatch_count;
-        uint64_t total_dispatched = local_dispatch_count + global_dispatch_count;
-        double local_hit_rate = total_dispatched > 0 ? local_dispatch_count * 100.0 / total_dispatched : 0.0;
-        DEV_ALWAYS(
-            "Thread %d: local_disp : local=%" PRIu64 ", global=%" PRIu64 ", overflow=%" PRIu64
-            ", local_rate=%.1f%%",
-            thread_idx, static_cast<uint64_t>(local_dispatch_count), static_cast<uint64_t>(global_dispatch_count),
-            static_cast<uint64_t>(local_overflow_count), local_hit_rate
-        );
-
-        // Level 2: dispatch sub-phases (percentage relative to dispatch)
-        uint64_t d_parent = sched_dispatch_cycle > 0 ?
sched_dispatch_cycle : 1; - DEV_ALWAYS( - "Thread %d: poll : %.3fus (%.1f%%)", thread_idx, cycles_to_us(dispatch_poll), - dispatch_poll * 100.0 / d_parent - ); - DEV_ALWAYS( - "Thread %d: pop : %.3fus (%.1f%%) work=%.3fus wait=%.3fus atomics=%" PRIu64 "", thread_idx, - cycles_to_us(sched_dispatch_pop_cycle), sched_dispatch_pop_cycle * 100.0 / d_parent, - cycles_to_us(sched_dispatch_pop_cycle - sp.pop_wait_cycle), cycles_to_us(sp.pop_wait_cycle), - static_cast(sp.pop_atomic_count) - ); - DEV_ALWAYS( - "Thread %d: setup : %.3fus (%.1f%%)", thread_idx, cycles_to_us(sched_dispatch_setup_cycle), - sched_dispatch_setup_cycle * 100.0 / d_parent - ); - - // Level 1: scan - DEV_ALWAYS( - "Thread %d: scan : %.3fus (%.1f%%)", thread_idx, cycles_to_us(sched_scan_cycle), - sched_scan_cycle * 100.0 / sched_total - ); - - // Level 1: idle - DEV_ALWAYS( - "Thread %d: idle : %.3fus (%.1f%%)", thread_idx, cycles_to_us(sched_idle_cycle), - sched_idle_cycle * 100.0 / sched_total - ); - - // Average per completion - if (cur_thread_completed > 0) { - DEV_ALWAYS( - "Thread %d: avg/complete : %.3fus", thread_idx, - cycles_to_us(sched_complete_cycle) / cur_thread_completed - ); - } - } -#endif - // Summary line (always print when PTO2_PROFILING=1) - DEV_ALWAYS( - "Thread %d: Scheduler summary: total_time=%.3fus, loops=%" PRIu64 ", tasks_scheduled=%d", thread_idx, - cycles_to_us(sched_total), static_cast(sched_loop_count), cur_thread_completed - ); -#endif - -#if PTO2_PROFILING - // Flush performance buffers for cores managed by this thread - if (l2_perf_enabled) { - l2_perf_aicpu_flush_buffers(thread_idx, core_assignments_[thread_idx], core_num); - l2_perf_aicpu_flush_phase_buffers(thread_idx); - } - if (is_pmu_enabled()) { - pmu_aicpu_flush_buffers(thread_idx, core_assignments_[thread_idx], core_num); - } -#endif -#if PTO2_PROFILING - if (is_dump_tensor_enabled()) { - dump_tensor_flush(thread_idx); - } -#endif - - return cur_thread_completed; -} - -int32_t AicpuExecutor::run(Runtime *runtime) { - int32_t thread_idx = thread_idx_++; - - DEV_ALWAYS("Thread %d: Start", thread_idx); - - // Orchestrator check - if (thread_idx >= sched_thread_num_) { - if (runtime->get_orch_built_on_host()) { - DEV_INFO("Thread %d: Host orchestration mode, no-op", thread_idx); - } else { - DEV_INFO("Thread %d: Orchestrator, loading SO via dlopen", thread_idx); - - const void *so_data = reinterpret_cast(runtime->get_dev_orch_so_addr()); - size_t so_size = runtime->get_dev_orch_so_size(); - - if (so_data == nullptr || so_size == 0) { - DEV_ERROR("Thread %d: Device orchestration SO not set", thread_idx); - // Unblock scheduler threads before returning so they don't spin forever. 
- runtime_init_ready_.store(true, std::memory_order_release); - return -1; - } - - // Try multiple paths that may allow execution on AICPU - char so_path[256]; - bool file_created = false; - const char *candidate_dirs[] = { - "/usr/lib64/aicpu_kernels/0/aicpu_kernels_device", "/usr/lib64", "/lib64", "/var/tmp", "/tmp" - }; - const int32_t num_candidates = sizeof(candidate_dirs) / sizeof(candidate_dirs[0]); - - for (int32_t i = 0; i < num_candidates && !file_created; i++) { - int32_t fd = create_orch_so_file(candidate_dirs[i], so_path, sizeof(so_path)); - if (fd < 0) { - DEV_INFO( - "Thread %d: Cannot create SO at %s (errno=%d), trying next path", thread_idx, so_path, errno - ); - continue; - } - ssize_t written = write(fd, so_data, so_size); - close(fd); - if (written != static_cast(so_size)) { - DEV_INFO( - "Thread %d: Cannot write SO to %s (errno=%d), trying next path", thread_idx, so_path, errno - ); - unlink(so_path); - continue; - } - file_created = true; - DEV_INFO("Thread %d: Created SO file at %s (%zu bytes)", thread_idx, so_path, so_size); - } - - if (!file_created) { - DEV_ERROR("Thread %d: Failed to create SO file in any candidate path", thread_idx); - // Unblock scheduler threads before returning so they don't spin forever. - runtime_init_ready_.store(true, std::memory_order_release); - return -1; - } - - dlerror(); - void *handle = dlopen(so_path, RTLD_LAZY | RTLD_LOCAL); - const char *dlopen_err = dlerror(); - if (handle == nullptr) { - DEV_ERROR("Thread %d: dlopen failed: %s", thread_idx, dlopen_err ? dlopen_err : "unknown"); - unlink(so_path); - // Unblock scheduler threads before returning so they don't spin forever. - runtime_init_ready_.store(true, std::memory_order_release); - return -1; - } - DEV_INFO("Thread %d: dlopen succeeded, handle=%p", thread_idx, handle); - - dlerror(); - auto config_func = - reinterpret_cast(dlsym(handle, "aicpu_orchestration_config")); - - dlerror(); - DeviceOrchestrationFunc orch_func = - reinterpret_cast(dlsym(handle, "aicpu_orchestration_entry")); - const char *dlsym_error = dlerror(); - if (dlsym_error != nullptr) { - DEV_ERROR("Thread %d: dlsym failed: %s", thread_idx, dlsym_error); - dlclose(handle); - unlink(so_path); - // Unblock scheduler threads before returning so they don't spin forever. - runtime_init_ready_.store(true, std::memory_order_release); - return -1; - } - if (orch_func == nullptr) { - DEV_ERROR("Thread %d: dlsym returned NULL for aicpu_orchestration_entry", thread_idx); - dlclose(handle); - unlink(so_path); - // Unblock scheduler threads before returning so they don't spin forever. 
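
The load sequence above (write the embedded image to the first writable directory, then dlopen and dlsym it with dlerror checks) follows a standard pattern. A sketch, with a shortened candidate list and POSIX mkstemp standing in for the runtime's own create_orch_so_file helper, whose internals were deleted with this runtime.

```cpp
#include <dlfcn.h>
#include <unistd.h>
#include <cstdio>
#include <cstdlib>

// Persist an in-memory .so image to the first writable candidate directory,
// then dlopen it. Returns the handle, or nullptr if every path fails.
void *load_embedded_so(const void *data, size_t size, char *path_out, size_t path_cap) {
    const char *dirs[] = {"/var/tmp", "/tmp"};
    for (const char *dir : dirs) {
        snprintf(path_out, path_cap, "%s/orch_soXXXXXX", dir);
        int fd = mkstemp(path_out); // creates the file, rewriting XXXXXX in place
        if (fd < 0) continue;
        ssize_t written = write(fd, data, size);
        close(fd);
        if (written != static_cast<ssize_t>(size)) {
            unlink(path_out);
            continue;
        }
        dlerror(); // clear any stale error before dlopen
        void *handle = dlopen(path_out, RTLD_LAZY | RTLD_LOCAL);
        if (handle == nullptr) {
            unlink(path_out);
            continue;
        }
        return handle; // caller dlsym()s the entry symbols, unlinks on teardown
    }
    return nullptr;
}
```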
- runtime_init_ready_.store(true, std::memory_order_release); - return -1; - } - - const ChipStorageTaskArgs &args = runtime->get_orch_args(); - int32_t arg_count = args.tensor_count() + args.scalar_count(); - DEV_INFO("Thread %d: sm_ptr=%p, arg_count=%d", thread_idx, runtime->get_gm_sm_ptr(), arg_count); - for (int32_t i = 0; i < args.tensor_count() && i < 20; i++) { - const ContinuousTensor &t = args.tensor(i); - DEV_INFO( - "Thread %d: orch_args[%d] = TENSOR(data=0x%lx, ndims=%u, dtype=%u)", thread_idx, i, - static_cast(t.data), t.ndims, static_cast(t.dtype) - ); - } - for (int32_t i = 0; i < args.scalar_count() && (args.tensor_count() + i) < 20; i++) { - DEV_INFO( - "Thread %d: orch_args[%d] = SCALAR(0x%lx)", thread_idx, args.tensor_count() + i, - static_cast(args.scalar(i)) - ); - } - - uint64_t task_window_size = PTO2_TASK_WINDOW_SIZE; - uint64_t heap_size = PTO2_HEAP_SIZE; - int32_t expected_arg_count = 0; - if (config_func) { - PTO2OrchestrationConfig cfg = config_func(args); - expected_arg_count = cfg.expected_arg_count; - DEV_INFO("Thread %d: Config: expected_args=%d", thread_idx, expected_arg_count); - } else { - DEV_INFO("Thread %d: No config function, using defaults", thread_idx); - } - - if (expected_arg_count > 0 && arg_count < expected_arg_count) { - DEV_ERROR("Thread %d: arg_count %d < expected %d", thread_idx, arg_count, expected_arg_count); - dlclose(handle); - unlink(so_path); - // Unblock scheduler threads before returning so they don't spin forever. - runtime_init_ready_.store(true, std::memory_order_release); - return -1; - } - - if (runtime->task_window_size > 0) { - task_window_size = runtime->task_window_size; - } - if (runtime->heap_size > 0) { - heap_size = runtime->heap_size; - } - int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE; - if (runtime->dep_pool_size > 0) { - dep_pool_capacity = static_cast(runtime->dep_pool_size); - } - DEV_INFO( - "Thread %d: Ring sizes: task_window=%lu, heap=%lu, dep_pool=%d", thread_idx, - static_cast(task_window_size), static_cast(heap_size), dep_pool_capacity - ); - - void *sm_ptr = runtime->get_gm_sm_ptr(); - void *gm_heap = runtime->get_gm_heap_ptr(); - - uint64_t sm_size = pto2_sm_calculate_size(task_window_size); - PTO2SharedMemoryHandle *sm_handle = - pto2_sm_create_from_buffer(sm_ptr, sm_size, task_window_size, heap_size); - if (!sm_handle) { - DEV_ERROR("Thread %d: Failed to create shared memory handle", thread_idx); - dlclose(handle); - unlink(so_path); - // Unblock scheduler threads before returning so they don't spin forever. - runtime_init_ready_.store(true, std::memory_order_release); - return -1; - } - - rt = pto2_runtime_create_from_sm(PTO2_MODE_EXECUTE, sm_handle, gm_heap, heap_size, dep_pool_capacity); - if (!rt) { - DEV_ERROR("Thread %d: Failed to create PTO2Runtime", thread_idx); - pto2_sm_destroy(sm_handle); - dlclose(handle); - unlink(so_path); - // Unblock scheduler threads before returning so they don't spin forever. - runtime_init_ready_.store(true, std::memory_order_release); - return -1; - } - -#if PTO2_PROFILING - rt->orchestrator.enable_l2_swimlane = is_l2_swimlane_enabled(); -#endif - - // With multi-ring, slot_states are per-ring inside the scheduler. 
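
The ring sizing above follows an override-if-set pattern: a zero-valued Runtime field means "keep the compiled-in PTO2_* default". A sketch with hypothetical default values:

```cpp
#include <cstdint>

// Hypothetical defaults; the real values are the PTO2_TASK_WINDOW_SIZE and
// PTO2_HEAP_SIZE constants from the deleted headers.
constexpr uint64_t kDefaultTaskWindow = 4096;
constexpr uint64_t kDefaultHeapSize = 64ull << 20;

struct RingSizing {
    uint64_t task_window_size;
    uint64_t heap_size;
};

// Zero means "not configured": fall back to the compiled-in default.
RingSizing resolve_sizing(uint64_t rt_window, uint64_t rt_heap) {
    return {
        rt_window > 0 ? rt_window : kDefaultTaskWindow,
        rt_heap > 0 ? rt_heap : kDefaultHeapSize,
    };
}
```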
- runtime->set_slot_states_ptr(nullptr); - - // Store shared state for orchestrator thread - orch_func_ = orch_func; - orch_args_cached_ = &args; - orch_so_handle_ = handle; - snprintf(orch_so_path_, sizeof(orch_so_path_), "%s", so_path); - - runtime_init_ready_.store(true, std::memory_order_release); - - // Wait for scheduler's one-time init to complete - while (!pto2_init_complete_.load(std::memory_order_acquire)) { - SPIN_WAIT_HINT(); - } - -#if PTO2_PROFILING - // Each orchestrator thread sets its own phase buffer index (thread-local) - if (is_l2_swimlane_enabled()) { - l2_perf_aicpu_set_orch_thread_idx(thread_idx); - } -#endif - - // Call orchestration function wrapped in an outer scope - DEV_ALWAYS("Thread %d: Calling aicpu_orchestration_entry from SO", thread_idx); -#if PTO2_PROFILING - uint64_t orch_cycle_start = get_sys_cnt_aicpu(); -#endif - PTO2_SCOPE(rt) { orch_func_(rt, *orch_args_cached_); } -#if PTO2_PROFILING - uint64_t orch_cycle_end = get_sys_cnt_aicpu(); - DEV_ALWAYS( - "Thread %d: orch_start=%" PRIu64 " orch_func_cost=%.3fus", thread_idx, - static_cast(orch_cycle_start), cycles_to_us(orch_cycle_end - orch_cycle_start) - ); -#endif - - // Print orchestrator profiling data -#if PTO2_ORCH_PROFILING - PTO2OrchProfilingData p = pto2_orchestrator_get_profiling(); - uint64_t total = p.alloc_cycle + p.args_cycle + p.heap_cycle + p.fanin_cycle; - if (total == 0) total = 1; // avoid div-by-zero - DEV_ALWAYS( - "Thread %d: === Orchestrator Profiling: %" PRId64 " tasks, total=%.3fus ===", thread_idx, - static_cast(p.submit_count), cycles_to_us(total) - ); - DEV_ALWAYS( - "Thread %d: task_ring_alloc: %.3fus (%.1f%%) work=%.3fus wait=%.3fus atomics=%" PRIu64 "", - thread_idx, cycles_to_us(p.alloc_cycle), p.alloc_cycle * 100.0 / total, - cycles_to_us(p.alloc_cycle - p.alloc_wait_cycle), cycles_to_us(p.alloc_wait_cycle), - static_cast(p.alloc_atomic_count) - ); - DEV_ALWAYS( - "Thread %d: heap_alloc : %.3fus (%.1f%%) work=%.3fus wait=%.3fus atomics=%" PRIu64 "", - thread_idx, cycles_to_us(p.heap_cycle), p.heap_cycle * 100.0 / total, - cycles_to_us(p.heap_cycle - p.heap_wait_cycle), cycles_to_us(p.heap_wait_cycle), - static_cast(p.heap_atomic_count) - ); - DEV_ALWAYS( - "Thread %d: param_copy : %.3fus (%.1f%%) atomics=%" PRIu64 "", thread_idx, - cycles_to_us(p.args_cycle), p.args_cycle * 100.0 / total, static_cast(p.args_atomic_count) - ); - DEV_ALWAYS( - "Thread %d: fanin+ready : %.3fus (%.1f%%) work=%.3fus wait=%.3fus atomics=%" PRIu64 "", - thread_idx, cycles_to_us(p.fanin_cycle), p.fanin_cycle * 100.0 / total, - cycles_to_us(p.fanin_cycle - p.fanin_wait_cycle), cycles_to_us(p.fanin_wait_cycle), - static_cast(p.fanin_atomic_count) - ); - DEV_ALWAYS( - "Thread %d: avg/task : %.3fus", thread_idx, - p.submit_count > 0 ? 
cycles_to_us(total) / p.submit_count : 0.0 - ); - -#if PTO2_PROFILING - // Write orchestrator summary to shared memory for host-side export (only if profiling enabled) - if (is_l2_swimlane_enabled()) { - AicpuOrchSummary orch_summary = {}; - orch_summary.start_time = orch_cycle_start; - orch_summary.end_time = orch_cycle_end; - orch_summary.sync_cycle = 0; - orch_summary.alloc_cycle = p.alloc_cycle; - orch_summary.args_cycle = p.args_cycle; - orch_summary.lookup_cycle = 0; - orch_summary.heap_cycle = p.heap_cycle; - orch_summary.insert_cycle = 0; - orch_summary.fanin_cycle = p.fanin_cycle; - orch_summary.scope_end_cycle = p.scope_end_cycle; - orch_summary.submit_count = p.submit_count; - l2_perf_aicpu_write_orch_summary(&orch_summary); - } -#endif -#endif - -#if PTO2_PROFILING - // Write core-to-thread mapping (one-time, after orchestration) - if (is_l2_swimlane_enabled()) { - l2_perf_aicpu_init_core_assignments(cores_total_num_); - for (int32_t t = 0; t < sched_thread_num_; t++) { - l2_perf_aicpu_write_core_assignments_for_thread(t, core_assignments_[t], core_count_per_thread_[t]); - } - // Flush orchestrator's phase record buffer - l2_perf_aicpu_flush_phase_buffers(thread_idx); - } -#endif - - // Signal completion and trigger core transition - rt_orchestration_done(rt); - - void *sm = runtime->get_gm_sm_ptr(); - PTO2SharedMemoryHeader *sm_header = static_cast(sm); - int32_t pto2_task_count = 0; - if (sm_header) { - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - pto2_task_count += sm_header->rings[r].fc.current_task_index.load(std::memory_order_acquire); - } - } -#if PTO2_PROFILING - DEV_ALWAYS( - "PTO2 total submitted tasks = %d, already executed %d tasks", pto2_task_count, - completed_tasks_.load(std::memory_order_acquire) - ); -#endif - total_tasks_ = pto2_task_count; - if (is_l2_swimlane_enabled() && pto2_task_count > 0) { - l2_perf_aicpu_update_total_tasks(static_cast(pto2_task_count)); - } - orchestrator_done_ = true; - { - int32_t orch_err = 0; - void *sm = runtime->get_gm_sm_ptr(); - if (sm) { - orch_err = - static_cast(sm)->orch_error_code.load(std::memory_order_relaxed); - } - - // Fatal error: shutdown AICore immediately before core transition. 
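
The task-count accounting above sums the submit index of every ring, since PTO2 distributes submissions across rings. In sketch form, with a hypothetical ring count standing in for PTO2_MAX_RING_DEPTH:

```cpp
#include <atomic>
#include <cstdint>

constexpr int kMaxRings = 4; // assumption; the real bound is PTO2_MAX_RING_DEPTH

struct RingCounters {
    std::atomic<int32_t> current_task_index{0};
};

// Grand total = sum of each ring's submit index, read with acquire ordering
// after the orchestrator has signalled completion.
int32_t total_submitted(const RingCounters (&rings)[kMaxRings]) {
    int32_t total = 0;
    for (const auto &r : rings) {
        total += r.current_task_index.load(std::memory_order_acquire);
    }
    return total;
}
```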
- if (orch_err != PTO2_ERROR_NONE) { - emergency_shutdown(runtime); - completed_.store(true, std::memory_order_release); - } - } - -#if PTO2_ORCH_PROFILING - uint64_t reassign_cycle_start = get_sys_cnt_aicpu(); -#endif - - // Skip core transition on fatal error — cores already shut down above - if (completed_.load(std::memory_order_acquire)) { - // Signal transition to unblock scheduler threads waiting at core transition - transition_requested_.store(true, std::memory_order_release); - reassigned_.store(true, std::memory_order_release); - } else if (orch_to_sched_) { - // Compute new core assignments for all threads and initialize donated slots - DEV_INFO("Thread %d: Set orchestrator_done=true, requesting core transition", thread_idx); -#if PTO2_PROFILING - uint64_t orch_stage_end_ts = get_sys_cnt_aicpu(); -#endif - transition_requested_.store(true, std::memory_order_release); -#if PTO2_PROFILING - DEV_ALWAYS( - "Thread %d: orch_stage_end=%" PRIu64 "", thread_idx, static_cast(orch_stage_end_ts) - ); -#endif - - // Wait for scheduler threads to acknowledge transition request - if (sched_thread_num_ > 0) { - while (wait_reassign_.load(std::memory_order_acquire) != sched_thread_num_) { - if (completed_.load(std::memory_order_acquire)) { - break; - } - SPIN_WAIT_HINT(); - } - } - if (!completed_.load(std::memory_order_acquire)) { - reassign_cores_for_all_threads(); - reassigned_.store(true, std::memory_order_release); - } - } - -#if PTO2_ORCH_PROFILING - uint64_t reassign_cycle_end = get_sys_cnt_aicpu(); - DEV_ALWAYS( - "Thread %d: reassign, cost %.3fus", thread_idx, cycles_to_us(reassign_cycle_end - reassign_cycle_start) - ); -#endif - } - DEV_INFO("Thread %d: Orchestrator completed", thread_idx); - } - - // Scheduler thread (orchestrator threads skip dispatch when orch_to_sched_ is false) - if (!completed_.load(std::memory_order_acquire) && (thread_idx < sched_thread_num_ || orch_to_sched_)) { - DEV_ALWAYS("Thread %d: Starting PTO2 dispatch", thread_idx); - // Device orchestration: wait for primary orchestrator to initialize SM header - if (!runtime->get_orch_built_on_host()) { - while (!runtime_init_ready_.load(std::memory_order_acquire)) { - SPIN_WAIT_HINT(); - } - } - if (rt == nullptr) { - DEV_ERROR("Thread %d: rt is null after orchestrator error, skipping dispatch", thread_idx); - } else { - int32_t completed = resolve_and_dispatch_pto2(runtime, thread_idx); - DEV_INFO("Thread %d: Executed %d tasks from runtime", thread_idx, completed); - } - } - - // Always shutdown AICore — even if completed_ was already true. - // platform_deinit_aicore_regs is idempotent; orchestrator threads have - // core_count_per_thread_ == 0 so they skip the loop harmlessly. 
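
The core-transition protocol above is a three-flag handshake between the orchestrator and the scheduler threads. A sketch of both sides; the real code additionally bails out whenever completed_ becomes true, omitted here for brevity.

```cpp
#include <atomic>

struct TransitionFlags {
    std::atomic<bool> requested{false};
    std::atomic<int> parked{0};
    std::atomic<bool> reassigned{false};
};

// Orchestrator: raise the request, wait until every scheduler thread has
// parked, recompute assignments while they are quiescent, then release them.
void orchestrator_side(TransitionFlags &f, int sched_threads, void (*reassign)()) {
    f.requested.store(true, std::memory_order_release);
    while (f.parked.load(std::memory_order_acquire) != sched_threads) { /* spin */ }
    reassign(); // safe: no scheduler is touching the trackers now
    f.reassigned.store(true, std::memory_order_release);
}

// Scheduler: acknowledge the request by parking, then wait for the new
// assignments before resuming dispatch.
void scheduler_side(TransitionFlags &f) {
    if (!f.requested.load(std::memory_order_acquire)) return;
    f.parked.fetch_add(1, std::memory_order_release);
    while (!f.reassigned.load(std::memory_order_acquire)) { /* spin */ }
}
```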
-    {
-        const int32_t *shutdown_cores = core_assignments_[thread_idx];
-        int32_t shutdown_count = core_count_per_thread_[thread_idx];
-#if PTO2_PROFILING
-        if (shutdown_count > 0) {
-            uint64_t sched_end_ts = get_sys_cnt_aicpu();
-            DEV_ALWAYS("Thread %d: sched_end=%" PRIu64 "", thread_idx, static_cast<uint64_t>(sched_end_ts));
-        }
-#endif
-        if (shutdown_count > 0) {
-#if PTO2_PROFILING
-            // Restore PMU CTRL registers for this thread's cores before AICore shutdown
-            if (is_pmu_enabled()) {
-                pmu_aicpu_finalize(shutdown_cores, shutdown_count);
-            }
-#endif
-            auto rc = shutdown_aicore(runtime, thread_idx, shutdown_cores, shutdown_count);
-            if (rc != 0) {
-                return rc;
-            }
-        }
-    }
-
-    DEV_INFO("Thread %d: Completed", thread_idx);
-
-    // Check if this is the last thread to finish
-    int32_t prev_finished = finished_count_.fetch_add(1, std::memory_order_acq_rel);
-    if (prev_finished + 1 == thread_num_) {
-        finished_.store(true, std::memory_order_release);
-        // Destroy PTO2 runtime and close orchestration SO (moved from orchestrator path)
-        if (!runtime->get_orch_built_on_host() && orch_so_handle_ != nullptr) {
-            pto2_runtime_destroy(rt);
-        }
-        DEV_ALWAYS("Thread %d: Last thread, marking executor finished", thread_idx);
-    }
-
-    return 0;
-}
-
-void AicpuExecutor::deinit(Runtime *runtime) {
-    // 1. Invalidate AICPU cache for Runtime address range.
-    //    Next round's Host DMA (rtMemcpy) writes fresh Runtime to HBM but
-    //    bypasses this cache. Invalidating now ensures next round reads from HBM.
-    cache_invalidate_range(runtime, sizeof(Runtime));
-
-    // Reset per-core dispatch timestamps and task counters
-    for (int32_t i = 0; i < RUNTIME_MAX_WORKER; i++) {
-        dispatch_timestamps_[i] = 0;
-        core_dispatch_counts_[i] = 0;
-    }
-
-    // Clear per-core dispatch payloads and subslot tracking
-    memset(s_pto2_payload_per_core, 0, sizeof(s_pto2_payload_per_core));
-    memset(dispatch_seq_by_core_, 0, sizeof(dispatch_seq_by_core_));
-    memset(executing_subslot_by_core_, 0, sizeof(executing_subslot_by_core_));
-    memset(executing_slot_state_by_core_, 0, sizeof(executing_slot_state_by_core_));
-
-    completed_tasks_.store(0, std::memory_order_release);
-    total_tasks_ = 0;
-    finished_count_.store(0, std::memory_order_release);
-    orchestrator_done_ = false;
-    pto2_init_done_.store(false, std::memory_order_release);
-    pto2_init_complete_.store(false, std::memory_order_release);
-    runtime_init_ready_.store(false, std::memory_order_release);
-
-    // Reset core transition state
-    transition_requested_.store(false, std::memory_order_release);
-    wait_reassign_.store(0, std::memory_order_release);
-    reassigned_.store(false, std::memory_order_release);
-    completed_.store(false, std::memory_order_release);
-
-    // Reset core discovery and assignment state
-    aic_count_ = 0;
-    aiv_count_ = 0;
-    cores_total_num_ = 0;
-    thread_num_ = 0;
-    sched_thread_num_ = 0;
-    thread_cores_num_ = 0;
-    orch_to_sched_ = false;
-    memset(trackers_, 0, sizeof(trackers_));
-    memset(core_idle_, 0, sizeof(core_idle_));
-    memset(core_assignments_, 0, sizeof(core_assignments_));
-    memset(core_count_per_thread_, 0, sizeof(core_count_per_thread_));
-
-    // Reset orchestration SO state (handle freed by last thread before deinit)
-    orch_func_ = nullptr;
-    orch_args_cached_ = nullptr;
-    if (orch_so_handle_ != nullptr) {
-        dlclose(orch_so_handle_);
-    }
-    if (orch_so_path_[0] != '\0') {
-        unlink(orch_so_path_);
-    }
-    orch_so_handle_ = nullptr;
-    orch_so_path_[0] = '\0';
-
-    // Reset register-related state
-    for (int32_t i = 0; i < MAX_CORES_PER_THREAD; i++) {
-        core_id_to_reg_addr_[i] = 0;
-        executing_reg_task_ids_[i] = AICPU_TASK_INVALID;
-    }
-    regs_ = 0;
-
-    // Clear file-scope PTO2Runtime pointer (freed by orchestrator thread before deinit)
-    rt = nullptr;
-
-    DEV_INFO("DeInit: Runtime execution state reset");
-
-    initialized_.store(false, std::memory_order_release);
-    init_done_.store(false, std::memory_order_release);
-    init_failed_.store(false, std::memory_order_release);
-    thread_idx_.store(0, std::memory_order_release);
-    finished_.store(false, std::memory_order_release);
-
-    DEV_INFO("DeInit: AicpuExecutor reset complete");
-}
-
-void AicpuExecutor::emergency_shutdown(Runtime *runtime) {
-    DEV_WARN("Emergency shutdown: sending exit signal to all initialized cores");
-    Handshake *all_handshakes = reinterpret_cast<Handshake *>(runtime->workers);
-    for (int32_t i = 0; i < cores_total_num_; i++) {
-        Handshake *hank = &all_handshakes[i];
-        OUT_OF_ORDER_STORE_BARRIER();
-        hank->aicpu_regs_ready = 1;
-        if (core_id_to_reg_addr_[i] != 0) {
-            platform_deinit_aicore_regs(core_id_to_reg_addr_[i]);
-        }
-    }
-
-    DEV_WARN("Emergency shutdown complete");
-}
-
-void AicpuExecutor::diagnose_stuck_state(
-    Runtime *runtime, int32_t thread_idx, const int32_t *cur_thread_cores, int32_t core_num, Handshake *hank
-) {
-    (void)runtime;
-    PTO2SchedulerState *sched = &rt->scheduler;
-    DEV_ALWAYS("========== DIAGNOSTIC REPORT: Thread %d ==========", thread_idx);
-
-    int32_t completed = completed_tasks_.load(std::memory_order_acquire);
-    int32_t total = total_tasks_;
-    DEV_ALWAYS("Progress: %d/%d tasks (%.1f%%)", completed, total, total > 0 ? completed * 100.0 / total : 0.0);
-
-    uint64_t aic_ready = 0, aiv_ready = 0, aiv_x2_ready = 0, mixed_x1_ready = 0, mixed_x2_ready = 0;
-    if (rt) {
-        aic_ready = sched->ready_queues[static_cast<int32_t>(PTO2ResourceShape::AIC_ONLY)].size();
-        aiv_ready = sched->ready_queues[static_cast<int32_t>(PTO2ResourceShape::AIV_X1)].size();
-        aiv_x2_ready = sched->ready_queues[static_cast<int32_t>(PTO2ResourceShape::AIV_X2)].size();
-        mixed_x1_ready = sched->ready_queues[static_cast<int32_t>(PTO2ResourceShape::AIC_AIV_X1)].size();
-        mixed_x2_ready = sched->ready_queues[static_cast<int32_t>(PTO2ResourceShape::AIC_AIV_X2)].size();
-    }
-    DEV_ALWAYS(
-        "Ready Queues: AIC=%lu, AIV=%lu, AIV_X2=%lu, AIC_AIV_X1=%lu, AIC_AIV_X2=%lu", aic_ready, aiv_ready,
-        aiv_x2_ready, mixed_x1_ready, mixed_x2_ready
-    );
-
-    int32_t busy_cores = 0;
-    int32_t idle_cores = 0;
-
-    DEV_ALWAYS("Core Status:");
-    for (int32_t i = 0; i < core_num; i++) {
-        int32_t core_id = cur_thread_cores[i];
-        Handshake *h = &hank[core_id];
-        const char *core_type_str = core_type_to_string(h->core_type);
-
-        uint64_t reg_addr = core_id_to_reg_addr_[core_id];
-        uint64_t reg_val = read_reg(reg_addr, RegId::COND);
-        int32_t reg_task_id = EXTRACT_TASK_ID(reg_val);
-        int32_t reg_state = EXTRACT_TASK_STATE(reg_val);
-        int32_t task_id = executing_reg_task_ids_[core_id];
-
-        if (reg_state != TASK_FIN_STATE || task_id >= 0) {
-            busy_cores++;
-            if (task_id >= 0) {
-                int32_t kernel_id = -1;
-                if (rt && rt->sm_handle && executing_slot_state_by_core_[core_id]) {
-                    int32_t diag_slot = static_cast<int32_t>(executing_subslot_by_core_[core_id]);
-                    kernel_id = executing_slot_state_by_core_[core_id]->task->kernel_id[diag_slot];
-                }
-                DEV_ALWAYS(
-                    "  Core %d [%s, BUSY]: COND=0x%lx (reg_task_id=%d, reg_state=%s), executing_reg_task_id=%d, "
-                    "kernel_id=%d",
-                    core_id, core_type_str, reg_val, reg_task_id, reg_state == TASK_FIN_STATE ? "FIN" : "ACK", task_id,
-                    kernel_id
-                );
-            } else {
-                DEV_ALWAYS(
-                    "  Core %d [%s, BUSY]: COND=0x%lx (reg_task_id=%d, reg_state=%s) but task_id not tracked", core_id,
-                    core_type_str, reg_val, reg_task_id, reg_state == TASK_FIN_STATE ? "FIN" : "ACK"
-                );
-            }
-        } else {
-            idle_cores++;
-        }
-    }
-
-    DEV_ALWAYS("Summary: %d busy, %d idle", busy_cores, idle_cores);
-
-    // Diagnose deadlock vs livelock
-    if (busy_cores == 0 && aic_ready == 0 && aiv_ready == 0 && completed < total) {
-        DEV_ALWAYS("*** DEADLOCK DETECTED ***");
-        DEV_ALWAYS("All cores idle, no ready tasks, but %d tasks incomplete", total - completed);
-        DEV_ALWAYS("Check PTO2 shared memory for task dependency state");
-    } else if (busy_cores > 0) {
-        DEV_ALWAYS("*** LIVELOCK / HUNG TASK ***");
-        DEV_ALWAYS("%d cores executing but no progress", busy_cores);
-    }
-
-    DEV_ALWAYS("========== END DIAGNOSTIC ==========");
-}
-
-// ===== Public Entry Point =====
-
-/**
- * aicpu_execute - Main AICPU kernel execution entry point
- *
- * This is called by DynTileFwkBackendKernelServer in kernel.cpp.
- * Orchestrates the complete task runtime execution:
- *   1. Initialize executor (thread-safe, first thread only)
- *   2. Wait for initialization to complete
- *   3. Execute tasks on managed cores
- *   4. Cleanup when last thread finishes
- *
- * @param runtime Pointer to Runtime structure
- * @return 0 on success, non-zero on error
- */
-extern "C" int32_t aicpu_execute(Runtime *runtime) {
-    if (runtime == nullptr) {
-        DEV_ERROR("%s", "Invalid argument: null Runtime pointer");
-        return -1;
-    }
-
-    DEV_INFO("%s", "aicpu_execute: Starting AICPU kernel execution");
-
-    // Get platform register addresses from platform-level global
-    g_aicpu_executor.regs_ = get_platform_regs();
-
-    g_aicpu_executor.init(runtime);
-
-    while (!g_aicpu_executor.init_done_.load(std::memory_order_acquire)) {
-        if (g_aicpu_executor.init_failed_.load(std::memory_order_acquire)) {
-            DEV_ERROR("%s", "aicpu_execute: Initialization failed, aborting execution");
-            return -1;
-        }
-    }
-
-    int32_t rc = g_aicpu_executor.run(runtime);
-    if (rc != 0) {
-        DEV_ERROR("aicpu_execute: Thread execution failed with rc=%d", rc);
-        return rc;
-    }
-
-    // Last thread cleans up
-    if (g_aicpu_executor.finished_.load(std::memory_order_acquire)) {
-        DEV_INFO("aicpu_execute: Last thread finished, cleaning up");
-        g_aicpu_executor.deinit(runtime);
-    }
-
-    DEV_INFO("%s", "aicpu_execute: Kernel execution completed successfully");
-    return 0;
-}
diff --git a/src/a2a3/runtime/aicpu_build_graph/build_config.py b/src/a2a3/runtime/aicpu_build_graph/build_config.py
deleted file mode 100644
index 17569e35f..000000000
--- a/src/a2a3/runtime/aicpu_build_graph/build_config.py
+++ /dev/null
@@ -1,30 +0,0 @@
-# AICPU Build Graph Runtime build configuration
-# All paths are relative to this file's directory (src/runtime/aicpu_build_graph/)
-#
-# This is a device-orchestration runtime where:
-#   - AICPU thread 3 runs the orchestrator (builds task graph on device)
-#   - AICPU threads 0/1/2 run schedulers (dispatch tasks to AICore)
-#   - AICore executes tasks via PTO2DispatchPayload
-#
-# The "orchestration" directory contains source files compiled into both
-# runtime targets AND the orchestration .so (e.g., tensor methods needed
-# by the Tensor constructor's validation logic).
-
-BUILD_CONFIG = {
-    "aicore": {
-        "include_dirs": ["runtime"],
-        "source_dirs": ["aicore", "orchestration"]
-    },
-    "aicpu": {
-        "include_dirs": ["runtime"],
-        "source_dirs": ["aicpu", "runtime", "orchestration"]
-    },
-    "host": {
-        "include_dirs": ["runtime"],
-        "source_dirs": ["host", "runtime", "orchestration"]
-    },
-    "orchestration": {
-        "include_dirs": ["runtime", "orchestration"],
-        "source_dirs": ["orchestration"]
-    }
-}
diff --git a/src/a2a3/runtime/aicpu_build_graph/docs/RUNTIME_LOGIC.md b/src/a2a3/runtime/aicpu_build_graph/docs/RUNTIME_LOGIC.md
deleted file mode 100644
index 561b1bb73..000000000
--- a/src/a2a3/runtime/aicpu_build_graph/docs/RUNTIME_LOGIC.md
+++ /dev/null
@@ -1,31 +0,0 @@
-# Runtime Logic: aicpu_build_graph
-
-## Overview
-The aicpu_build_graph runtime builds the task graph on AICPU using a small orchestration plugin. A dedicated builder thread runs the plugin and emits tasks into the shared Runtime object, while scheduler threads dispatch published tasks to AICore. This enables concurrent build and schedule on device.
-
-## Core Data Structures
-- `Runtime` stores task state, orchestration arguments, the kernel address table, and the embedded orchestration plugin. See `src/runtime/aicpu_build_graph/runtime/runtime.h`.
-- `Task` adds two concurrency flags, `published` and `completed`, so tasks can be made visible to schedulers only when fully defined.
-- `AicpuBuildApi` is a device-side function table used by orchestration plugins to add tasks, add edges, and publish tasks without linking against runtime symbols.
-- `HostApi` provides device memory ops used during host-side initialization.
-
-## Host Init Flow
-1. `init_runtime_impl` registers kernel binaries and fills `Runtime::kernel_addrs[]` so AICPU-side builders can resolve `func_id` to `function_bin_addr`. See `src/runtime/aicpu_build_graph/host/runtime_maker.cpp`.
-2. The host marshals orchestration arguments. Pointer args are allocated on device and copied; scalars are passed directly. Output and inout buffers are recorded with `runtime->record_tensor_pair`.
-3. The orchestration plugin SO is embedded into `Runtime` (`try_set_aicpu_orch_so`), and the entry symbol name is stored in `Runtime::aicpu_orch_func_name`.
-4. The build mode is set from `PTO_AICPU_BUILD_GRAPH_BUILD_MODE` (0 = sequential build then schedule, 1 = concurrent build and schedule).
-
-## Device Build And Schedule Flow
-1. AICPU thread 0 loads the embedded orchestration plugin via `dlopen` and calls its entry function. See `src/runtime/aicpu_build_graph/aicpu/aicpu_executor.cpp`.
-2. The plugin uses `Runtime::aicpu_build_api` to build the graph. The typical per-task sequence is `add_task`, `add_successor_conditional`, then `publish_task`.
-3. In concurrent mode, scheduler threads start immediately and only see tasks that have been published. In sequential mode, schedulers wait for the builder to finish.
-4. When a task completes, the scheduler decrements fanin counters and pushes newly-ready tasks to the ready queues.
-5. Tasks are dispatched to AICore using the same per-core handshake protocol as host_build_graph.
-
-## Finalize And Cleanup
-`validate_runtime_impl` copies recorded output tensors back to the host and frees any recorded device allocations. It also clears `tensor_pairs` and `device_allocs` for reuse. See `src/runtime/aicpu_build_graph/host/runtime_maker.cpp`.
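The per-task build sequence in the flow above is compact enough to show end to end. A hypothetical plugin entry, assuming `AicpuBuildApi` exposes `add_task`, `add_successor_conditional`, and `publish_task` roughly as the doc describes; the exact signatures lived in the deleted runtime.h and are paraphrased here:

```cpp
// Hypothetical orchestration plugin for the (now removed) aicpu_build_graph
// runtime. All signatures are paraphrased from the doc above, not authoritative.
extern "C" void example_orch_entry(Runtime *runtime) {
    AicpuBuildApi *api = &runtime->aicpu_build_api;

    // Build two tasks: producer (func_id 0) feeds consumer (func_id 1).
    int32_t producer = api->add_task(/*func_id=*/0, /*args=*/nullptr, /*arg_count=*/0);
    int32_t consumer = api->add_task(/*func_id=*/1, /*args=*/nullptr, /*arg_count=*/0);

    // Wire the edge before publishing so schedulers never see a half-built task.
    api->add_successor_conditional(producer, consumer, /*condition=*/0);

    // publish_task flips the `published` flag; in concurrent build mode the
    // schedulers may dispatch a task immediately after this call.
    api->publish_task(producer);
    api->publish_task(consumer);
}
```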
-
-## Key Files
-- `src/runtime/aicpu_build_graph/runtime/runtime.h`
-- `src/runtime/aicpu_build_graph/host/runtime_maker.cpp`
-- `src/runtime/aicpu_build_graph/aicpu/aicpu_executor.cpp`
diff --git a/src/a2a3/runtime/aicpu_build_graph/host/runtime_compile_info.cpp b/src/a2a3/runtime/aicpu_build_graph/host/runtime_compile_info.cpp
deleted file mode 100644
index 5dc3cf69d..000000000
--- a/src/a2a3/runtime/aicpu_build_graph/host/runtime_compile_info.cpp
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-#include "host/platform_compile_info.h"
-#include "host/runtime_compile_info.h"
-#include <string.h>
-
-extern "C" {
-
-ToolchainType get_incore_compiler(void) {
-    if (strcmp(get_platform(), "a2a3") == 0) return TOOLCHAIN_CCEC;
-    return TOOLCHAIN_HOST_GXX_15;
-}
-
-ToolchainType get_orchestration_compiler(void) {
-    // aicpu_build_graph: a2a3 needs aarch64 cross-compile (AICPU is aarch64)
-    if (strcmp(get_platform(), "a2a3") == 0) return TOOLCHAIN_AARCH64_GXX;
-    return TOOLCHAIN_HOST_GXX;
-}
-}
diff --git a/src/a2a3/runtime/aicpu_build_graph/host/runtime_maker.cpp b/src/a2a3/runtime/aicpu_build_graph/host/runtime_maker.cpp
deleted file mode 100644
index 32072f707..000000000
--- a/src/a2a3/runtime/aicpu_build_graph/host/runtime_maker.cpp
+++ /dev/null
@@ -1,379 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-/**
- * Runtime Builder - rt2 Implementation (Device Orchestration)
- *
- * Provides init_runtime_impl and validate_runtime_impl functions for the rt2 runtime.
- * Supports device orchestration where AICPU thread 3 runs the orchestrator.
- *
- * init_runtime_impl:
- *   - Converts host tensor pointers to device pointers (all tensors copied both directions)
- *   - Copies orchestration SO to device memory
- *   - Sets up runtime state for device orchestration
- *
- * validate_runtime_impl:
- *   - Copies recorded tensors back from device to host
- *   - Frees device memory
- */
-
-#include <cerrno>
-#include <cinttypes>
-#include <sys/time.h>
-
-#include <cstddef>
-#include <cstdint>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include <string>
-
-#include "../runtime/pto_shared_memory.h"
-#include "../runtime/runtime.h"
-#include "callable.h"
-#include "common/platform_config.h"
-#include "common/unified_log.h"
-
-// Helper: return current time in milliseconds
-static int64_t _now_ms() {
-    struct timeval tv;
-    gettimeofday(&tv, nullptr);
-    return static_cast<int64_t>(tv.tv_sec) * 1000 + tv.tv_usec / 1000;
-}
-
-/**
- * Parse an environment variable as uint64_t with optional power-of-2 constraint.
- * Returns the parsed value on success, or 0 if unset or validation fails.
- */
-static uint64_t parse_env_uint64(const char *name, uint64_t min_val, bool require_power_of_2) {
-    const char *env = std::getenv(name);
-    if (!env) return 0;
-    char *endptr;
-    errno = 0;
-    uint64_t val = strtoull(env, &endptr, 10);
-    if (errno == ERANGE || endptr == env || *endptr != '\0' || val < min_val) {
-        LOG_WARN("%s=%s invalid (must be a valid integer >= %" PRIu64 "), ignored", name, env, min_val);
-        return 0;
-    }
-    if (require_power_of_2 && (val & (val - 1)) != 0) {
-        LOG_WARN("%s=%s invalid (must be a power of 2, >= %" PRIu64 "), ignored", name, env, min_val);
-        return 0;
-    }
-    return static_cast<uint64_t>(val);
-}
-
-/**
- * Initialize a pre-allocated runtime for device orchestration.
- *
- * For rt2 runtime, orchestration runs on AICPU thread 3 (device-side).
- * This function:
- *   - Converts host pointers to device pointers
- *   - Copies all tensor data to device
- *   - Records all tensors for copy-back
- *   - Copies orchestration SO to device memory
- *   - Sets up runtime state for device orchestration
- *
- * @param runtime Pointer to pre-constructed Runtime
- * @param callable ChipCallable containing orch binary, func_name, and child kernels
- * @param orch_args Separated tensor/scalar arguments
- * @return 0 on success, -1 on failure
- */
-extern "C" int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const ChipStorageTaskArgs *orch_args) {
-    // Validate inputs
-    if (runtime == nullptr) {
-        LOG_ERROR("Runtime pointer is null");
-        return -1;
-    }
-
-    // Register kernel binaries from ChipCallable children
-    if (callable->child_count() > 0) {
-        LOG_INFO("Registering %d kernel(s) in init_runtime_impl", callable->child_count());
-        for (int32_t i = 0; i < callable->child_count(); i++) {
-            int func_id = callable->child_func_id(i);
-            if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) {
-                LOG_ERROR("func_id=%d is out of range [0, %d)", func_id, RUNTIME_MAX_FUNC_ID);
-                return -1;
-            }
-            const auto &kernel = callable->child(i);
-            uint64_t addr = runtime->host_api.upload_kernel_binary(
-                func_id, reinterpret_cast<const void *>(&kernel),
-                CoreCallable::binary_data_offset() + kernel.binary_size()
-            );
-            if (addr == 0) {
-                LOG_ERROR("Failed to upload kernel binary for func_id=%d", func_id);
-                return -1;
-            }
-            runtime->set_function_bin_addr(func_id, addr);
-        }
-    }
-
-    const uint8_t *orch_so_binary = static_cast<const uint8_t *>(callable->binary_data());
-    size_t orch_so_size = callable->binary_size();
-
-    if (orch_so_binary == nullptr || orch_so_size == 0) {
-        LOG_ERROR("Orchestration SO binary is required for device orchestration");
-        return -1;
-    }
-
-    if (orch_args == nullptr) {
-        LOG_ERROR("orch_args pointer is null");
-        return -1;
-    }
-
-    int tensor_count = orch_args->tensor_count();
-    int scalar_count = orch_args->scalar_count();
-    LOG_INFO("RT2 init: %d tensors + %d scalars, device orchestration mode", tensor_count, scalar_count);
-
-    int64_t t_total_start = _now_ms();
-
-    // Build device args: copy from input, replace host tensor pointers with device pointers
-    ChipStorageTaskArgs device_args;
-
-    int64_t t_args_start = _now_ms();
-    for (int i = 0; i < tensor_count; i++) {
-        ContinuousTensor t = orch_args->tensor(i);
-
-        if (t.is_child_memory()) {
-            LOG_INFO("  Tensor %d: child memory, pass-through (0x%" PRIx64 ")", i, t.data);
-            device_args.add_tensor(t);
-            continue;
-        }
-
-        void *host_ptr = reinterpret_cast<void *>(static_cast<uintptr_t>(t.data));
-        size_t size = static_cast<size_t>(t.nbytes());
-
-        void *dev_ptr = runtime->host_api.device_malloc(size);
-        if (dev_ptr == nullptr) {
-            LOG_ERROR("Failed to allocate device memory for tensor %d", i);
-            return -1;
-        }
-
-        int rc = runtime->host_api.copy_to_device(dev_ptr, host_ptr, size);
-        if (rc != 0) {
-            LOG_ERROR("Failed to copy tensor %d to device", i);
-            runtime->host_api.device_free(dev_ptr);
-            return -1;
-        }
-        runtime->record_tensor_pair(host_ptr, dev_ptr, size);
-        LOG_INFO("  Tensor %d: %zu bytes at %p", i, size, dev_ptr);
-
-        t.data = reinterpret_cast<uint64_t>(dev_ptr);
-        device_args.add_tensor(t);
-    }
-    for (int i = 0; i < scalar_count; i++) {
-        device_args.add_scalar(orch_args->scalar(i));
-    }
-    int64_t t_args_end = _now_ms();
-
-    // Stage the orchestration SO for DeviceRunner::prepare_orch_so to consume.
-    int64_t t_so_start = _now_ms();
-    runtime->pending_orch_so_data_ = orch_so_binary;
-    runtime->pending_orch_so_size_ = orch_so_size;
-    LOG_INFO("Orchestration SO: %zu bytes staged (host-only)", orch_so_size);
-    int64_t t_so_end = _now_ms();
-
-    // Read ready queue shard count from environment for AICPU scheduler
-    {
-        const char *env_shards = std::getenv("PTO2_READY_QUEUE_SHARDS");
-        if (env_shards) {
-            char *endptr;
-            int64_t val = strtol(env_shards, &endptr, 10);
-            if (endptr != env_shards && *endptr == '\0' && val >= 1 && val <= PLATFORM_MAX_AICPU_THREADS) {
-                runtime->ready_queue_shards = static_cast<int32_t>(val);
-            } else {
-                LOG_WARN(
-                    "PTO2_READY_QUEUE_SHARDS=%s is invalid or out of range [1,%d], using default %d", env_shards,
-                    PLATFORM_MAX_AICPU_THREADS, RUNTIME_DEFAULT_READY_QUEUE_SHARDS
-                );
-                runtime->ready_queue_shards = RUNTIME_DEFAULT_READY_QUEUE_SHARDS;
-            }
-        }
-        LOG_INFO("Ready queue shards: %d", runtime->ready_queue_shards);
-    }
-
-    // Read orchestrator-to-scheduler transition flag from environment
-    {
-        const char *env_val = std::getenv("PTO2_ORCH_TO_SCHED");
-        if (env_val && (env_val[0] == '1' || env_val[0] == 't' || env_val[0] == 'T')) {
-            runtime->orch_to_sched = true;
-        }
-        LOG_INFO("Orchestrator-to-scheduler transition: %s", runtime->orch_to_sched ? "enabled" : "disabled");
-    }
-
-    // Read ring buffer size overrides from environment
-    {
-        runtime->task_window_size = parse_env_uint64("PTO2_RING_TASK_WINDOW", 4, true);
-        runtime->heap_size = parse_env_uint64("PTO2_RING_HEAP", 1024, true);
-        runtime->dep_pool_size = parse_env_uint64("PTO2_RING_DEP_POOL", 4, false);
-        if (runtime->task_window_size || runtime->heap_size || runtime->dep_pool_size) {
-            LOG_INFO(
-                "Ring buffer overrides: task_window=%" PRIu64 " heap=%" PRIu64 " dep_pool=%" PRIu64,
-                static_cast<uint64_t>(runtime->task_window_size ? runtime->task_window_size : PTO2_TASK_WINDOW_SIZE),
-                static_cast<uint64_t>(runtime->heap_size ? runtime->heap_size : PTO2_HEAP_SIZE),
-                static_cast<uint64_t>(runtime->dep_pool_size ? runtime->dep_pool_size : PTO2_DEP_LIST_POOL_SIZE)
-            );
-        }
-    }
-
-    // Resolve effective sizes (env override or compile-time default)
-    uint64_t eff_heap_size = runtime->heap_size ? runtime->heap_size : PTO2_HEAP_SIZE;
-    uint64_t eff_task_window_size = runtime->task_window_size ? runtime->task_window_size : PTO2_TASK_WINDOW_SIZE;
-
-    // Allocate GM heap for orchestrator output buffers (all rings combined)
-    uint64_t total_heap_size = eff_heap_size * PTO2_MAX_RING_DEPTH;
-    int64_t t_heap_start = _now_ms();
-    void *gm_heap = runtime->host_api.device_malloc(total_heap_size);
-    int64_t t_heap_end = _now_ms();
-    if (gm_heap == nullptr) {
-        LOG_ERROR("Failed to allocate GM heap");
-        return -1;
-    }
-    runtime->record_tensor_pair(nullptr, gm_heap, total_heap_size);
-    runtime->set_gm_heap(gm_heap);
-
-    // Allocate PTO2 shared memory
-    int64_t t_sm_start = _now_ms();
-    uint64_t sm_size = pto2_sm_calculate_size(eff_task_window_size);
-    void *sm_ptr = runtime->host_api.device_malloc(sm_size);
-    int64_t t_sm_end = _now_ms();
-    if (sm_ptr == nullptr) {
-        LOG_ERROR("Failed to allocate PTO2 shared memory");
-        return -1;
-    }
-    runtime->set_gm_sm_ptr(sm_ptr);
-    runtime->record_tensor_pair(nullptr, sm_ptr, static_cast<size_t>(sm_size));
-
-    // Set up device orchestration state
-    runtime->set_orch_built_on_host(false);
-    runtime->set_orch_args(device_args);
-
-    LOG_INFO("Device orchestration ready: %d tensors + %d scalars", tensor_count, scalar_count);
-
-    int64_t t_total_end = _now_ms();
-    LOG_INFO("TIMING: args_malloc_copy = %" PRId64 "ms", t_args_end - t_args_start);
-    LOG_INFO("TIMING: orch_so_copy = %" PRId64 "ms", t_so_end - t_so_start);
-    LOG_INFO("TIMING: gm_heap_alloc(1GB) = %" PRId64 "ms", t_heap_end - t_heap_start);
-    LOG_INFO("TIMING: shared_mem_alloc = %" PRId64 "ms", t_sm_end - t_sm_start);
-    LOG_INFO("TIMING: total_init_runtime_impl = %" PRId64 "ms", t_total_end - t_total_start);
-
-    return 0;
-}
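`init_runtime_impl` reads all of its tuning knobs from the environment, so a host test could configure the ring dimensions before constructing the runtime. A hypothetical setup using only the variable names read above; the values are illustrative and must respect the power-of-2 / minimum constraints that `parse_env_uint64` enforces:

```cpp
#include <cstdlib>

// Illustrative values only: PTO2_RING_TASK_WINDOW and PTO2_RING_HEAP must be
// powers of 2 (>= 4 and >= 1024 respectively); PTO2_RING_DEP_POOL just >= 4.
void configure_rt2_env_for_test() {
    setenv("PTO2_RING_TASK_WINDOW", "256", /*overwrite=*/1);
    setenv("PTO2_RING_HEAP", "1048576", 1);     // 1 MiB per ring
    setenv("PTO2_RING_DEP_POOL", "4096", 1);
    setenv("PTO2_READY_QUEUE_SHARDS", "2", 1);  // clamped to [1, PLATFORM_MAX_AICPU_THREADS]
    setenv("PTO2_ORCH_TO_SCHED", "1", 1);       // '1'/'t'/'T' enables the transition
}
```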
-
-/**
- * Validate runtime results and cleanup.
- *
- * This function:
- *   1. Copies recorded tensors from device back to host
- *   2. Frees device memory for recorded tensors
- *   3. Clears tensor pair state
- *
- * @param runtime Pointer to Runtime
- * @return 0 on success, -1 on failure
- */
-extern "C" int validate_runtime_impl(Runtime *runtime) {
-    if (runtime == nullptr) {
-        LOG_ERROR("Runtime pointer is null");
-        return -1;
-    }
-
-    int rc = 0;
-
-    LOG_INFO("=== Copying Results Back to Host ===");
-
-    // Copy all recorded tensors from device back to host
-    TensorPair *tensor_pairs = runtime->get_tensor_pairs();
-    int tensor_pair_count = runtime->get_tensor_pair_count();
-
-    LOG_INFO("Tensor pairs to process: %d", tensor_pair_count);
-
-    // PTO2 (device orchestration): graph output may be in packed buffer
-    void *pto2_sm = runtime->get_gm_sm_ptr();
-    uint64_t graph_out_ptr = 0;
-    uint64_t graph_out_size = 0;
-
-    if (pto2_sm != nullptr) {
-        // Copy header from device to host to read graph_output_ptr/size
-        PTO2SharedMemoryHeader host_header;
-        int hdr_rc = runtime->host_api.copy_from_device(&host_header, pto2_sm, sizeof(PTO2SharedMemoryHeader));
-        if (hdr_rc == 0) {
-            graph_out_ptr = host_header.graph_output_ptr;
-            graph_out_size = host_header.graph_output_size;
-            if (graph_out_ptr != 0) {
-                LOG_INFO("Graph output buffer: ptr=0x%" PRIx64 ", size=%" PRIu64, graph_out_ptr, graph_out_size);
-            }
-        } else {
-            LOG_WARN("Failed to copy PTO2 header from device");
-        }
-    }
-
-    bool first_output_tensor = true;
-    for (int i = 0; i < tensor_pair_count; i++) {
-        const TensorPair &pair = tensor_pairs[i];
-
-        // Skip if device pointer is null
-        if (pair.dev_ptr == nullptr) {
-            LOG_WARN("Tensor %d has null device pointer, skipping", i);
-            continue;
-        }
-
-        // If host pointer is null, this is a device-only allocation (no copy-back)
-        if (pair.host_ptr == nullptr) {
-            LOG_INFO("Tensor %d: device-only allocation (no copy-back)", i);
-            continue;
-        }
-
-        void *src_ptr = pair.dev_ptr;
-        size_t copy_size = pair.size;
-
-        // Use graph_output_ptr for the first output tensor if available
-        if (first_output_tensor && graph_out_ptr != 0 && graph_out_size > 0) {
-            src_ptr = reinterpret_cast<void *>(static_cast<uintptr_t>(graph_out_ptr));
-            copy_size = static_cast<size_t>(graph_out_size);
-            LOG_INFO("Using packed output buffer for tensor %d", i);
-            first_output_tensor = false;
-        }
-
-        int copy_rc = runtime->host_api.copy_from_device(pair.host_ptr, src_ptr, copy_size);
-        if (copy_rc != 0) {
-            LOG_ERROR("Failed to copy tensor %d from device: %d", i, copy_rc);
-            rc = copy_rc;
-        } else {
-            LOG_INFO("Tensor %d: %zu bytes copied to host", i, pair.size);
-        }
-    }
-
-    // Cleanup device tensors
-    LOG_INFO("=== Cleaning Up ===");
-    for (int i = 0; i < tensor_pair_count; i++) {
-        if (tensor_pairs[i].dev_ptr != nullptr) {
-            runtime->host_api.device_free(tensor_pairs[i].dev_ptr);
-        }
-    }
-    LOG_INFO("Freed %d device allocations", tensor_pair_count);
-
-    // Cleanup kernel binaries
-    int kernel_count = runtime->get_registered_kernel_count();
-    for (int i = 0; i < kernel_count; i++) {
-        int func_id = runtime->get_registered_kernel_func_id(i);
-        runtime->host_api.remove_kernel_binary(func_id);
-        runtime->set_function_bin_addr(func_id, 0);
-    }
-    if (kernel_count > 0) {
-        LOG_INFO("Freed %d kernel binaries", kernel_count);
-    }
-    runtime->clear_registered_kernels();
-
-    // Clear tensor pairs
-    runtime->clear_tensor_pairs();
-
-    LOG_INFO("=== Finalize Complete ===");
-
-    return rc;
-}
diff --git a/src/a2a3/runtime/aicpu_build_graph/orchestration/common.cpp b/src/a2a3/runtime/aicpu_build_graph/orchestration/common.cpp
deleted file mode 100644
index 8ac00ea30..000000000
--- a/src/a2a3/runtime/aicpu_build_graph/orchestration/common.cpp
+++ /dev/null
@@ -1,166 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-#include "common.h"
-
-#ifdef __linux__
-#include <cxxabi.h>
-#include <dlfcn.h>
-#include <execinfo.h>
-#include <stdio.h>
-
-#include <array>
-#include <string>
-#include <vector>
-#endif
-
-/**
- * Use addr2line to convert an address to file:line information.
- * Uses the -i flag to expand inlines; returns the first line (innermost actual code location).
- * If inlining is present, also returns the outer call chain via inline_chain.
- */
-#ifdef __linux__
-static std::string addr_to_line(const char *executable, void *addr, std::string *inline_chain = nullptr) {
-    char cmd[512];
-    snprintf(cmd, sizeof(cmd), "addr2line -e %s -f -C -p -i %p 2>/dev/null", executable, addr);
-
-    std::array<char, 256> buffer;
-    std::string raw_output;
-
-    FILE *pipe = popen(cmd, "r");
-    if (pipe) {
-        while (fgets(buffer.data(), buffer.size(), pipe) != nullptr) {
-            raw_output += buffer.data();
-        }
-        pclose(pipe);
-    }
-
-    if (raw_output.empty() || raw_output.find("??") != std::string::npos) {
-        return "";
-    }
-
-    // Split by lines
-    std::vector<std::string> lines;
-    size_t pos = 0;
-    while (pos < raw_output.size()) {
-        size_t nl = raw_output.find('\n', pos);
-        if (nl == std::string::npos) nl = raw_output.size();
-        std::string line = raw_output.substr(pos, nl - pos);
-        while (!line.empty() && line.back() == '\r')
-            line.pop_back();
-        if (!line.empty()) lines.push_back(line);
-        pos = nl + 1;
-    }
-
-    if (lines.empty()) return "";
-
-    // First line is the innermost actual code location; subsequent lines are outer inline callers
-    if (inline_chain && lines.size() > 1) {
-        *inline_chain = "";
-        for (size_t j = 1; j < lines.size(); j++) {
-            *inline_chain += "    [inlined by] " + lines[j] + "\n";
-        }
-    }
-
-    return lines.front();
-}
-#endif
-
-/**
- * Get current stack trace information (including file paths and line numbers).
- * Uses dladdr to locate the shared library for each stack frame, then calls addr2line with relative addresses.
- */
-std::string get_stacktrace(int skip_frames) {
-    (void)skip_frames; // May be unused on non-Linux platforms
-    std::string result;
-#ifdef __linux__
-    const int max_frames = 64;
-    void *buffer[max_frames];
-    int nframes = backtrace(buffer, max_frames);
-    char **symbols = backtrace_symbols(buffer, nframes);
-
-    if (symbols) {
-        result = "Stack trace:\n";
-        for (int i = skip_frames; i < nframes; i++) {
-            std::string frame_info;
-
-            void *addr = (void *)((char *)buffer[i] - 1);
-
-            Dl_info dl_info;
-            std::string inline_chain;
-            if (dladdr(addr, &dl_info) && dl_info.dli_fname) {
-                void *rel_addr = (void *)((char *)addr - (char *)dl_info.dli_fbase);
-                std::string addr2line_result = addr_to_line(dl_info.dli_fname, rel_addr, &inline_chain);
-
-                if (addr2line_result.empty()) {
-                    addr2line_result = addr_to_line(dl_info.dli_fname, addr, &inline_chain);
-                }
-
-                if (!addr2line_result.empty()) {
-                    frame_info = std::string(dl_info.dli_fname) + ": " + addr2line_result;
-                }
-            }
-
-            if (frame_info.empty()) {
-                std::string frame(symbols[i]);
-
-                size_t start = frame.find('(');
-                size_t end = frame.find('+', start);
-                if (start != std::string::npos && end != std::string::npos) {
-                    std::string mangled = frame.substr(start + 1, end - start - 1);
-                    int status;
-                    char *demangled = abi::__cxa_demangle(mangled.c_str(), nullptr, nullptr, &status);
-                    if (status == 0 && demangled) {
-                        frame = frame.substr(0, start + 1) + demangled + frame.substr(end);
-                        free(demangled);
-                    }
-                }
-                frame_info = frame;
-            }
-
-            char buf[16];
-            snprintf(buf, sizeof(buf), "  #%d ", i - skip_frames);
-            result += buf + frame_info + "\n";
-            if (!inline_chain.empty()) {
-                result += inline_chain;
-            }
-        }
-        free(symbols);
-    }
-#else
-    result = "(Stack trace is only available on Linux)\n";
-#endif
-    return result;
-}
-
-// AssertionError constructor
-static std::string build_assert_message(const char *condition, const char *file, int line) {
-    std::string msg = "Assertion failed: " + std::string(condition) + "\n";
-    msg += "  Location: " + std::string(file) + ":" + std::to_string(line) + "\n";
-    msg += get_stacktrace(3);
-    return msg;
-}
-
-AssertionError::AssertionError(const char *condition, const char *file, int line) :
-    std::runtime_error(build_assert_message(condition, file, line)),
-    condition_(condition),
-    file_(file),
-    line_(line) {}
-
-[[noreturn]] void assert_impl(const char *condition, const char *file, int line) {
-    fprintf(stderr, "\n========================================\n");
-    fprintf(stderr, "Assertion failed: %s\n", condition);
-    fprintf(stderr, "Location: %s:%d\n", file, line);
-    fprintf(stderr, "%s", get_stacktrace(2).c_str());
-    fprintf(stderr, "========================================\n\n");
-    fflush(stderr);
-
-    throw AssertionError(condition, file, line);
-}
diff --git a/src/a2a3/runtime/aicpu_build_graph/orchestration/pto_orchestration_api.h b/src/a2a3/runtime/aicpu_build_graph/orchestration/pto_orchestration_api.h
deleted file mode 100644
index 25c6cbc23..000000000
--- a/src/a2a3/runtime/aicpu_build_graph/orchestration/pto_orchestration_api.h
+++ /dev/null
@@ -1,194 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-/**
- * PTO Orchestration API - Slim header for orchestration .so files
- *
- * This header provides everything an orchestration source needs without
- * pulling in runtime implementation headers. The orchestration .so has
- * zero link dependencies on runtime .cpp files; all runtime calls go
- * through the PTO2RuntimeOps function-pointer table embedded in
- * PTO2Runtime.
- *
- * Orchestration sources include ONLY this header:
- *     #include "pto_orchestration_api.h"
- *
- * Runtime sources continue to use pto_runtime2.h (which defines the
- * full PTO2Runtime struct with all internal fields).
- */
-
-#ifndef SRC_A2A3_RUNTIME_AICPU_BUILD_GRAPH_ORCHESTRATION_PTO_ORCHESTRATION_API_H_
-#define SRC_A2A3_RUNTIME_AICPU_BUILD_GRAPH_ORCHESTRATION_PTO_ORCHESTRATION_API_H_
-
-#include <cstdarg>
-#include <cstddef>
-#include <cstdint>
-
-// Type headers needed by orchestration
-#include "pto_runtime2_types.h" // PTO2TaskId
-#include "pto_submit_types.h"   // MixedKernels, INVALID_KERNEL_ID, subtask slots
-#include "pto_types.h"          // Arg, PTOTensorEntry, TensorArgType
-#include "task_args.h"          // ChipStorageTaskArgs, ContinuousTensor
-#include "tensor.h"             // Tensor, TensorCreateInfo, make_tensor_external
-
-// Convert ContinuousTensor to Tensor (needs make_tensor_external from tensor.h)
-static_assert(
-    CONTINUOUS_TENSOR_MAX_DIMS == RUNTIME_MAX_TENSOR_DIMS, "ContinuousTensor and runtime max dims must match"
-);
-inline Tensor from_tensor_arg(const ContinuousTensor &t, bool manual_dep = false, int32_t version = 0) {
-    return make_tensor_external(
-        reinterpret_cast<void *>(static_cast<uintptr_t>(t.data)), t.shapes, t.ndims, t.dtype, manual_dep, version
-    );
-}
-
-// =============================================================================
-// Ops Table and Opaque Runtime
-// =============================================================================
-
-/**
- * Forward declaration — the orchestration sees PTO2Runtime as a partial
- * struct whose first field is the ops pointer. The full definition
- * lives in pto_runtime2.h (used only by runtime .cpp files).
- */
-typedef struct PTO2Runtime PTO2Runtime;
-
-/**
- * Function-pointer table for runtime operations.
- * Populated by the runtime; called by orchestration through inline wrappers.
- */
-typedef struct PTO2RuntimeOps {
-    SubmitResult (*submit_task)(PTO2Runtime *rt, const MixedKernels &mixed_kernels, const Arg &args);
-    void (*add_dependency)(PTO2Runtime *rt, PTO2TaskId producer, PTO2TaskId consumer);
-    void (*scope_begin)(PTO2Runtime *rt);
-    void (*scope_end)(PTO2Runtime *rt);
-    void (*orchestration_done)(PTO2Runtime *rt);
-    bool (*is_fatal)(PTO2Runtime *rt);
-
-    // Logging (populated by runtime, called by orchestration)
-    void (*log_error)(const char *func, const char *fmt, ...);
-    void (*log_warn)(const char *func, const char *fmt, ...);
-    void (*log_info)(const char *func, const char *fmt, ...);
-    void (*log_debug)(const char *func, const char *fmt, ...);
-    void (*log_always)(const char *func, const char *fmt, ...);
-} PTO2RuntimeOps;
-
-/**
- * Partial PTO2Runtime definition for orchestration.
- *
- * Only the ops pointer is visible. The real struct (in pto_runtime2.h)
- * has the same first field, so accessing rt->ops through this definition
- * is well-defined (C struct layout guarantee).
- */
-struct PTO2Runtime {
-    const PTO2RuntimeOps *ops;
-};
-
-// =============================================================================
-// Inline Convenience Wrappers (call through ops table)
-// =============================================================================
-
-static inline SubmitResult rt_submit_task(PTO2Runtime *rt, const MixedKernels &mixed_kernels, const Arg &args) {
-    return rt->ops->submit_task(rt, mixed_kernels, args);
-}
-
-/**
- * Convenience wrapper: submit an AIC-only task.
- */
-static inline SubmitResult rt_submit_aic_task(PTO2Runtime *rt, int32_t kernel_id, const Arg &args) {
-    MixedKernels mk;
-    mk.aic_kernel_id = kernel_id;
-    return rt->ops->submit_task(rt, mk, args);
-}
-
-/**
- * Convenience wrapper: submit an AIV-only task (uses AIV0 slot).
- */
-static inline SubmitResult rt_submit_aiv_task(PTO2Runtime *rt, int32_t kernel_id, const Arg &args) {
-    MixedKernels mk;
-    mk.aiv0_kernel_id = kernel_id;
-    return rt->ops->submit_task(rt, mk, args);
-}
-
-/**
- * Add an explicit dependency: consumer waits for producer to complete.
- */
-static inline void rt_add_dependency(PTO2Runtime *rt, PTO2TaskId producer, PTO2TaskId consumer) {
-    rt->ops->add_dependency(rt, producer, consumer);
-}
-
-static inline void rt_scope_begin(PTO2Runtime *rt) { rt->ops->scope_begin(rt); }
-
-static inline void rt_scope_end(PTO2Runtime *rt) { rt->ops->scope_end(rt); }
-
-static inline void rt_orchestration_done(PTO2Runtime *rt) { rt->ops->orchestration_done(rt); }
-
-static inline bool rt_is_fatal(PTO2Runtime *rt) { return rt->ops->is_fatal(rt); }
-
-// =============================================================================
-// Logging Macros for Orchestration (call through ops table)
-// =============================================================================
-
-#define LOG_ERROR(rt, fmt, ...) (rt)->ops->log_error(__FUNCTION__, fmt, ##__VA_ARGS__)
-#define LOG_WARN(rt, fmt, ...) (rt)->ops->log_warn(__FUNCTION__, fmt, ##__VA_ARGS__)
-#define LOG_INFO(rt, fmt, ...) (rt)->ops->log_info(__FUNCTION__, fmt, ##__VA_ARGS__)
-#define LOG_DEBUG(rt, fmt, ...) (rt)->ops->log_debug(__FUNCTION__, fmt, ##__VA_ARGS__)
-#define LOG_ALWAYS(rt, fmt, ...) (rt)->ops->log_always(__FUNCTION__, fmt, ##__VA_ARGS__)
-
-// =============================================================================
-// C++ Scope Guards and Macros
-// =============================================================================
-
-/**
- * RAII Scope Guard (calls through ops table)
- */
-class PTO2ScopeGuard {
-public:
-    explicit PTO2ScopeGuard(PTO2Runtime *rt) :
-        rt_(rt) {
-        rt_->ops->scope_begin(rt_);
-    }
-    ~PTO2ScopeGuard() { rt_->ops->scope_end(rt_); }
-
-private:
-    PTO2Runtime *rt_;
-};
-
-#define _PTO2_CONCATENATE_IMPL(x, y) x##y
-#define _PTO2_CONCATENATE(x, y) _PTO2_CONCATENATE_IMPL(x, y)
-
-#define PTO2_SCOPE_GUARD(rt) [[maybe_unused]] PTO2ScopeGuard _PTO2_CONCATENATE(scope_guard_, __COUNTER__)(rt)
-
-/**
- * Scoped block macro:
- *     PTO2_SCOPE(rt) {
- *         rt_submit_task(rt, ...);
- *     }
- */
-#define PTO2_SCOPE(rt) if (PTO2_SCOPE_GUARD(rt); true)
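Putting the wrappers and scope macro together, an orchestration source for this runtime reduced to a few calls. A sketch of a plausible entry function; the kernel ids, the entry signature, and the default-constructed `Arg` are illustrative, while the `rt_*` helpers and `PTO2_SCOPE` come from the header above:

```cpp
#include "pto_orchestration_api.h"

// Hypothetical kernel ids registered by the host; illustrative only.
constexpr int32_t KID_TILE_MATMUL = 0;
constexpr int32_t KID_TILE_ADD = 1;

extern "C" void example_orchestration(PTO2Runtime *rt) {
    PTO2_SCOPE(rt) {
        // Submit an AIC producer and an AIV consumer, then wire the edge
        // explicitly: this runtime has no TensorMap to infer dependencies.
        Arg no_args{}; // real orchestrations build Arg from tensors/scalars
        SubmitResult a = rt_submit_aic_task(rt, KID_TILE_MATMUL, no_args);
        SubmitResult b = rt_submit_aiv_task(rt, KID_TILE_ADD, no_args);
        rt_add_dependency(rt, a.task_id, b.task_id);

        if (rt_is_fatal(rt)) return; // submission may trip flow-control errors
    } // scope_end batch-publishes both tasks to the schedulers

    rt_orchestration_done(rt);
}
```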
-
-// =============================================================================
-// Orchestration Config
-// =============================================================================
-
-/**
- * Configuration exported by orchestration .so via aicpu_orchestration_config().
- * The executor reads these values to set up shared memory and runtime.
- *
- * This struct is defined identically in pto_runtime2.h (with an include
- * guard) so the executor can use the same type without including this header.
- */
-#ifndef PTO2_ORCHESTRATION_CONFIG_DEFINED
-#define PTO2_ORCHESTRATION_CONFIG_DEFINED
-struct PTO2OrchestrationConfig {
-    int expected_arg_count;
-};
-#endif
-
-#endif // SRC_A2A3_RUNTIME_AICPU_BUILD_GRAPH_ORCHESTRATION_PTO_ORCHESTRATION_API_H_
diff --git a/src/a2a3/runtime/aicpu_build_graph/runtime/common.h b/src/a2a3/runtime/aicpu_build_graph/runtime/common.h
deleted file mode 100644
index 1cb9647ce..000000000
--- a/src/a2a3/runtime/aicpu_build_graph/runtime/common.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-#pragma once
-
-#include <cstdint>
-#include <cstdio>
-#include <stdexcept>
-#include <string>
-
-/**
- * Get current stack trace information (including file paths and line numbers).
- * Implemented in common.cpp.
- */
-std::string get_stacktrace(int skip_frames = 1);
-
-/**
- * Assertion failure exception, containing file, line number, condition, and stack trace information.
- */
-class AssertionError : public std::runtime_error {
-public:
-    AssertionError(const char *condition, const char *file, int line);
-
-    const char *condition() const { return condition_; }
-    const char *file() const { return file_; }
-    int line() const { return line_; }
-
-private:
-    const char *condition_;
-    const char *file_;
-    int line_;
-};
-
-/**
- * Handler function for assertion failures.
- * Implemented in common.cpp.
- */
-[[noreturn]] void assert_impl(const char *condition, const char *file, int line);
-
-/**
- * debug_assert macro - checks condition in debug mode; throws exception and prints stack trace on failure.
- * No-op in release mode (NDEBUG).
- */
-#ifdef NDEBUG
-#define debug_assert(cond) ((void)0)
-#else
-#define debug_assert(cond) \
-    do { \
-        if (!(cond)) { \
-            assert_impl(#cond, __FILE__, __LINE__); \
-        } \
-    } while (0)
-#endif
-
-/**
- * always_assert macro - checks condition in both debug and release modes.
- */
-#define always_assert(cond) \
-    do { \
-        if (!(cond)) { \
-            assert_impl(#cond, __FILE__, __LINE__); \
-        } \
-    } while (0)
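The two macros above differ only in whether NDEBUG compiles the check out; both funnel into `assert_impl`, which prints the trace and then throws. A small usage sketch (the function and buffer names are illustrative):

```cpp
#include <cstdint>
#include "common.h"

void consume_slot(int32_t *ring, int32_t capacity, int32_t index) {
    // Checked in debug builds only; compiled out under NDEBUG.
    debug_assert(ring != nullptr);

    // Checked in every build; a violation prints the stack trace from
    // get_stacktrace() to stderr and throws AssertionError.
    always_assert(index >= 0 && index < capacity);

    ring[index] = 0;
}
```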
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -/** - * @file pto2_dispatch_payload.h - * @brief Minimal dispatch payload for AICore kernel execution - * - * Shared between AICPU (builds in-place) and AICore (reads to run kernel). - * Handshake.task points to PTO2DispatchPayload embedded in PTO2TaskPayload. - * - * Only contains fields AICore needs to execute: function address + arguments. - * Metadata (task_id, kernel_id, core_type) lives in PTO2TaskDescriptor and - * is accessed by AICPU when needed (profiling, diagnostics). - */ - -#ifndef RT2_PTO2_DISPATCH_PAYLOAD_H_ -#define RT2_PTO2_DISPATCH_PAYLOAD_H_ - -#include - -/** Max arguments per task; must match RUNTIME_MAX_ARGS and PTO2_MAX_OUTPUTS */ -#ifndef PTO2_DISPATCH_MAX_ARGS -#define PTO2_DISPATCH_MAX_ARGS 128 -#endif - -/** - * Dispatch payload: minimal execution interface for AICore. - * Layout: function_bin_addr followed by args[]. - * AICore reads function_bin_addr, casts to UnifiedKernelFunc, calls with args. - */ -struct PTO2DispatchPayload { - uint64_t function_bin_addr; /**< Kernel entry in GM: (UnifiedKernelFunc)function_bin_addr */ - uint64_t args[PTO2_DISPATCH_MAX_ARGS]; /**< Kernel arguments (GM pointers + scalars) */ -}; - -#endif // RT2_PTO2_DISPATCH_PAYLOAD_H_ diff --git a/src/a2a3/runtime/aicpu_build_graph/runtime/pto_orchestrator.cpp b/src/a2a3/runtime/aicpu_build_graph/runtime/pto_orchestrator.cpp deleted file mode 100644 index adabc68e0..000000000 --- a/src/a2a3/runtime/aicpu_build_graph/runtime/pto_orchestrator.cpp +++ /dev/null @@ -1,608 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ - -/** - * PTO Runtime2 - Orchestrator Implementation (Explicit Dependency Variant) - * - * Implements orchestrator state management, scope handling, task submission - * with explicit dependencies, and scope-end batch publish. 
diff --git a/src/a2a3/runtime/aicpu_build_graph/runtime/pto_orchestrator.cpp b/src/a2a3/runtime/aicpu_build_graph/runtime/pto_orchestrator.cpp
deleted file mode 100644
index adabc68e0..000000000
--- a/src/a2a3/runtime/aicpu_build_graph/runtime/pto_orchestrator.cpp
+++ /dev/null
@@ -1,608 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-
-/**
- * PTO Runtime2 - Orchestrator Implementation (Explicit Dependency Variant)
- *
- * Implements orchestrator state management, scope handling, task submission
- * with explicit dependencies, and scope-end batch publish.
- *
- * Key differences from tensormap_and_ringbuffer:
- *   - No TensorMap: submit_task is a 3-step process (alloc, heap, write)
- *   - add_dependency: explicitly wires producer -> consumer edges
- *   - scope_end: batch-publishes all tasks (releases the +1 fanin redundancy)
- */
-
-#include "pto_orchestrator.h"
-
-#include <atomic>
-#include <cassert>
-#include <cstdint>
-#include <cstdlib>
-#include <cstring>
-
-#include "common/unified_log.h"
-#include "pto_runtime2_types.h"
-#include "pto_shared_memory.h"
-#include "pto_types.h"
-#include "tensor.h"
-
-// =============================================================================
-// Orchestrator Profiling (compile-time toggle)
-// =============================================================================
-#if PTO2_ORCH_PROFILING
-#include "aicpu/device_time.h"
-#include "aicpu/l2_perf_collector_aicpu.h"
-// Weak fallback for builds that don't link device_time.cpp (e.g. host).
-__attribute__((weak, visibility("hidden"))) uint64_t get_sys_cnt_aicpu() { return 0; }
-__attribute__((weak, visibility("hidden"))) void
-l2_perf_aicpu_record_orch_phase(AicpuPhaseId, uint64_t, uint64_t, uint32_t, uint64_t) {}
-static uint64_t g_orch_alloc_cycle = 0;
-static uint64_t g_orch_args_cycle = 0;
-static uint64_t g_orch_heap_cycle = 0;
-static uint64_t g_orch_fanin_cycle = 0;
-static uint64_t g_orch_scope_end_cycle = 0;
-static int64_t g_orch_submit_count = 0;
-static uint32_t g_orch_submit_idx = 0;
-uint64_t g_orch_alloc_wait_cycle = 0;
-uint64_t g_orch_heap_wait_cycle = 0;
-uint64_t g_orch_fanin_wait_cycle = 0;
-uint64_t g_orch_alloc_atomic_count = 0;
-uint64_t g_orch_args_atomic_count = 0;
-uint64_t g_orch_heap_atomic_count = 0;
-uint64_t g_orch_fanin_atomic_count = 0;
-uint64_t g_orch_scope_end_atomic_count = 0;
-#define CYCLE_COUNT_START() uint64_t _t0 = get_sys_cnt_aicpu(), _t1
-#define CYCLE_COUNT_LAP(acc) \
-    do { \
-        _t1 = get_sys_cnt_aicpu(); \
-        acc += (_t1 - _t0); \
-        _t0 = _t1; \
-    } while (0)
-#define CYCLE_COUNT_LAP_RECORD(acc, phase_id, tid) \
-    do { \
-        _t1 = get_sys_cnt_aicpu(); \
-        acc += (_t1 - _t0); \
-        l2_perf_aicpu_record_orch_phase((phase_id), _t0, _t1, g_orch_submit_idx, (tid)); \
-        _t0 = _t1; \
-    } while (0)
-#elif PTO2_PROFILING
-#include "aicpu/device_time.h"
-#include "aicpu/l2_perf_collector_aicpu.h"
-__attribute__((weak, visibility("hidden"))) uint64_t get_sys_cnt_aicpu() { return 0; }
-__attribute__((weak, visibility("hidden"))) void
-l2_perf_aicpu_record_orch_phase(AicpuPhaseId, uint64_t, uint64_t, uint32_t, uint64_t) {}
-static uint32_t g_orch_submit_idx = 0;
-#define CYCLE_COUNT_START() \
-    bool _prof_active = orch->enable_l2_swimlane; \
-    uint64_t _t0 = _prof_active ? get_sys_cnt_aicpu() : 0, _t1 = 0
-#define CYCLE_COUNT_LAP(acc) \
-    do { \
-    } while (0)
-#define CYCLE_COUNT_LAP_RECORD(acc, phase_id, tid) \
-    do { \
-        if (_prof_active) { \
-            _t1 = get_sys_cnt_aicpu(); \
-            l2_perf_aicpu_record_orch_phase((phase_id), _t0, _t1, g_orch_submit_idx, (tid)); \
-            _t0 = _t1; \
-        } \
-    } while (0)
-#else
-#define CYCLE_COUNT_START()
-#define CYCLE_COUNT_LAP(acc)
-#define CYCLE_COUNT_LAP_RECORD(acc, phase_id, tid)
-#endif
-
-// =============================================================================
-// Orchestrator Initialization
-// =============================================================================
-
-bool pto2_orchestrator_init(
-    PTO2OrchestratorState *orch, PTO2SharedMemoryHandle *sm_handle, void *gm_heap, uint64_t heap_size,
-    int32_t dep_pool_capacity
-) {
-    *orch = PTO2OrchestratorState{};
-
-    orch->sm_handle = sm_handle;
-    orch->gm_heap_base = gm_heap;
-    orch->gm_heap_size = heap_size * PTO2_MAX_RING_DEPTH;
-    orch->fatal = false;
-
-    // Initialize per-ring resources
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        void *ring_heap_base = reinterpret_cast<uint8_t *>(gm_heap) + r * heap_size;
-        auto &fc = sm_handle->header->rings[r].fc;
-
-        pto2_heap_ring_init(&orch->rings[r].heap_ring, ring_heap_base, heap_size, &fc.heap_tail, &fc.heap_top);
-        orch->rings[r].heap_ring.error_code_ptr = &sm_handle->header->orch_error_code;
-
-        pto2_task_ring_init(
-            &orch->rings[r].task_ring, sm_handle->task_descriptors[r], sm_handle->header->rings[r].task_window_size,
-            &fc.last_task_alive, &fc.current_task_index
-        );
-        orch->rings[r].task_ring.error_code_ptr = &sm_handle->header->orch_error_code;
-
-        PTO2DepListEntry *dep_entries =
-            reinterpret_cast<PTO2DepListEntry *>(calloc(dep_pool_capacity, sizeof(PTO2DepListEntry)));
-        if (!dep_entries) {
-            for (int j = 0; j < r; j++) {
-                free(orch->rings[j].dep_pool.base);
-            }
-            return false;
-        }
-        orch->rings[r].dep_pool.init(dep_entries, dep_pool_capacity, &sm_handle->header->orch_error_code);
-    }
-
-    // Initialize scope stack
-    uint64_t max_depth = PTO2_MAX_SCOPE_DEPTH;
-    int32_t init_cap = PTO2_SCOPE_TASKS_INIT_CAP;
-    orch->scope_tasks = reinterpret_cast<PTO2TaskSlotState **>(malloc(init_cap * sizeof(PTO2TaskSlotState *)));
-    orch->scope_begins = reinterpret_cast<int32_t *>(malloc(max_depth * sizeof(int32_t)));
-    if (!orch->scope_tasks || !orch->scope_begins) {
-        free(orch->scope_tasks);
-        free(orch->scope_begins);
-        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-            free(orch->rings[r].dep_pool.base);
-        }
-        return false;
-    }
-    orch->scope_tasks_size = 0;
-    orch->scope_tasks_capacity = init_cap;
-    orch->scope_stack_top = -1;
-    orch->scope_stack_capacity = max_depth;
-
-    return true;
-}
-
-void pto2_orchestrator_destroy(PTO2OrchestratorState *orch) {
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        free(orch->rings[r].dep_pool.base);
-        orch->rings[r].dep_pool.base = NULL;
-    }
-
-    free(orch->scope_tasks);
-    orch->scope_tasks = NULL;
-    free(orch->scope_begins);
-    orch->scope_begins = NULL;
-}
-
-void pto2_orchestrator_set_scheduler(PTO2OrchestratorState *orch, PTO2SchedulerState *scheduler) {
-    orch->scheduler = scheduler;
-}
-
-// =============================================================================
-// Scope Management
-// =============================================================================
-
-static void scope_tasks_push(PTO2OrchestratorState *orch, PTO2TaskSlotState *task_slot_state) {
-    if (orch->scope_tasks_size >= orch->scope_tasks_capacity) {
-        int32_t new_cap = orch->scope_tasks_capacity * 2;
-        PTO2TaskSlotState **new_buf =
-            reinterpret_cast<PTO2TaskSlotState **>(realloc(orch->scope_tasks, new_cap * sizeof(PTO2TaskSlotState *)));
-        assert(new_buf && "Failed to grow scope task buffer");
-        orch->scope_tasks = new_buf;
-        orch->scope_tasks_capacity = new_cap;
-    }
-    orch->scope_tasks[orch->scope_tasks_size++] = task_slot_state;
-}
-
-void pto2_scope_begin(PTO2OrchestratorState *orch) {
-    if (orch->fatal) {
-        return;
-    }
-    assert(orch->scope_stack_top < static_cast<int32_t>(orch->scope_stack_capacity - 1) && "Scope stack overflow");
-
-    ++orch->scope_stack_top;
-    orch->scope_begins[orch->scope_stack_top] = orch->scope_tasks_size;
-}
-
-void pto2_scope_end(PTO2OrchestratorState *orch) {
-    if (orch->fatal) {
-        return;
-    }
-    assert(orch->scope_stack_top >= 0 && "Scope stack underflow");
-
-#if PTO2_ORCH_PROFILING
-    uint64_t _se0 = get_sys_cnt_aicpu();
-#endif
-
-    int32_t begin = orch->scope_begins[orch->scope_stack_top--];
-    int32_t count = orch->scope_tasks_size - begin;
-
-    if (orch->scheduler && count > 0) {
-        PTO2TaskSlotState **tasks = &orch->scope_tasks[begin];
-
-        // Batch publish: release the "+1 redundancy" in fanin for each task.
-        // Tasks whose fanin is fully satisfied become READY and are pushed
-        // to the scheduler's ready queues.
-        for (int32_t i = 0; i < count; i++) {
-            PTO2TaskSlotState *slot = tasks[i];
-            if (!slot) continue;
-
-            // task_state is already PENDING from submit_task (defensive store)
-            slot->task_state.store(PTO2_TASK_PENDING, std::memory_order_relaxed);
-
-            // Release the +1 fanin redundancy
-            int32_t new_rc = slot->fanin_refcount.fetch_add(1, std::memory_order_acq_rel) + 1;
-            if (new_rc >= slot->fanin_count) {
-                PTO2ResourceShape shape = pto2_active_mask_to_shape(slot->active_mask);
-                orch->scheduler->ready_queues[static_cast<int32_t>(shape)].push(slot);
-            }
-        }
-
-        // Release the scope's fanout reference on each task (enables CONSUMED transition)
-        orch->scheduler->on_scope_end(tasks, count);
-    }
-
-    // Rewind the task buffer
-    orch->scope_tasks_size = begin;
-
-#if PTO2_ORCH_PROFILING
-    uint64_t _se1 = get_sys_cnt_aicpu();
-    g_orch_scope_end_cycle += (_se1 - _se0);
-#endif
-}
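The "+1 redundancy" that scope_end releases is the subtle part of this design: every task is born with `fanin_count = 1` so it can never become ready before its scope closes, even if all of its real producers finish first. A self-contained model of the counter arithmetic; the names are illustrative, and the real state lives in `PTO2TaskSlotState`:

```cpp
#include <atomic>
#include <cstdio>

// Minimal model of the publish gate: fanin_count = 1 (scope hold) + one per
// explicit dependency; the task is ready once fanin_refcount catches up.
struct SlotModel {
    int fanin_count = 1;                // the +1 scope redundancy at submit
    std::atomic<int> fanin_refcount{0};

    void add_dependency() { fanin_count += 1; }
    bool producer_finished() { // called per completed producer (or early finish)
        return fanin_refcount.fetch_add(1) + 1 >= fanin_count;
    }
    bool scope_end_release() { // releases the +1; may be the readiness trigger
        return fanin_refcount.fetch_add(1) + 1 >= fanin_count;
    }
};

int main() {
    SlotModel t;
    t.add_dependency();                 // one real producer -> fanin_count == 2
    bool ready = t.producer_finished(); // refcount 1 of 2: not ready yet
    ready = t.scope_end_release();      // refcount 2 of 2: ready now
    std::printf("ready=%d\n", ready);   // prints ready=1
}
```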
args.error_msg : "(unknown)"); - LOG_ERROR(" tensor_count: %d, scalar_count: %d", args.tensor_count(), args.scalar_count()); - LOG_ERROR("========================================"); - orch->sm_handle->header->orch_error_code.store(PTO2_ERROR_INVALID_ARGS, std::memory_order_release); - orch->fatal = true; - return result; - } - - uint8_t ring_id = orch->current_ring_id(); - auto &task_ring = orch->rings[ring_id].task_ring; - PTO2SchedulerState *sched = orch->scheduler; - - // Validate submit inputs - uint8_t active_mask = pto2_mixed_kernels_to_active_mask(mixed_kernels); - always_assert(active_mask != 0 && "MixedKernels must have at least one active slot"); - - // Normalize single-AIV tasks - MixedKernels normalized = mixed_kernels; - bool has_aiv0 = (active_mask & PTO2_SUBTASK_MASK_AIV0) != 0; - bool has_aiv1 = (active_mask & PTO2_SUBTASK_MASK_AIV1) != 0; - if (has_aiv1 && !has_aiv0) { - normalized.aiv0_kernel_id = normalized.aiv1_kernel_id; - normalized.aiv1_kernel_id = INVALID_KERNEL_ID; - active_mask = pto2_mixed_kernels_to_active_mask(normalized); - } - - always_assert(orch->scope_stack_top >= 0 && "Cannot submit task outside a scope"); - - // Scope deadlock pre-check - { - int32_t scope_task_count = orch->scope_tasks_size - orch->scope_begins[orch->scope_stack_top]; - if (scope_task_count >= task_ring.window_size - 1) { - int32_t total_submitted = task_ring.current_index_ptr->load(std::memory_order_acquire); - int32_t last_alive = task_ring.last_alive_ptr->load(std::memory_order_acquire); - int32_t active_count = total_submitted - last_alive; - - LOG_ERROR("========================================"); - LOG_ERROR("FATAL: Scope Deadlock Detected! (ring %d)", ring_id); - LOG_ERROR("========================================"); - LOG_ERROR("Tasks in current scope (%d) >= task_window_size (%d).", scope_task_count, task_ring.window_size); - LOG_ERROR(" scope_depth: %d", orch->scope_stack_top + 1); - LOG_ERROR(" ring_id: %d", ring_id); - LOG_ERROR(" scope_task_count: %d", scope_task_count); - LOG_ERROR(" total_submitted: %d", total_submitted); - LOG_ERROR(" last_task_alive: %d", last_alive); - LOG_ERROR(" active_tasks: %d / %d", active_count, task_ring.window_size); - LOG_ERROR("========================================"); - orch->sm_handle->header->orch_error_code.store(PTO2_ERROR_SCOPE_DEADLOCK, std::memory_order_release); - orch->fatal = true; - return result; - } - } - - // === STEP 1: Allocate task slot from Task Ring === - int32_t local_id = task_ring.pto2_task_ring_alloc(); - if (local_id < 0) { - orch->fatal = true; - return result; - } - int32_t slot = task_ring.get_task_slot(local_id); - PTO2TaskId task_id = pto2_make_task_id(ring_id, static_cast(local_id)); - - PTO2TaskDescriptor &task = task_ring.get_task_by_slot(slot); - PTO2TaskPayload *payload = &orch->sm_handle->task_payloads[ring_id][slot]; - - // Prefetch payload cache lines for write - for (int32_t i = 0; i < args.tensor_count(); i++) { - __builtin_prefetch(&payload->tensors[i], 1, 3); - __builtin_prefetch(reinterpret_cast(&payload->tensors[i]) + 64, 1, 3); - } - for (int32_t i = 0; i < args.scalar_count(); i += 8) { - __builtin_prefetch(&payload->scalars[i], 1, 3); - } - __builtin_prefetch(payload, 1, 3); - __builtin_prefetch(reinterpret_cast(payload) + 64, 1, 3); - __builtin_prefetch(reinterpret_cast(payload) + 128, 1, 3); - - // Initialize slot state - if (sched) { - auto &rs = sched->ring_sched_states[ring_id]; - PTO2TaskSlotState &slot_state = rs.get_slot_state_by_slot(slot); - // fanin_count starts at 1: the "+1 
redundance" released at scope_end - slot_state.fanin_count = 1; - slot_state.fanout_head = nullptr; - slot_state.fanout_lock.store(0, std::memory_order_relaxed); - // fanout_count = 1 (owning scope holds one reference) - slot_state.fanout_count = 1; - slot_state.fanout_refcount.store(0, std::memory_order_release); - slot_state.fanin_refcount.store(0, std::memory_order_release); - slot_state.payload = payload; - slot_state.task = &task; - slot_state.active_mask = active_mask; - slot_state.subtask_done_mask.store(0, std::memory_order_relaxed); - slot_state.ring_id = ring_id; - // Reset task_state so add_dependency doesn't see stale COMPLETED/CONSUMED - // from a previously-reused slot. The scheduler won't act on PENDING tasks - // until they're pushed to a ready queue at scope_end. - slot_state.task_state.store(PTO2_TASK_PENDING, std::memory_order_relaxed); - scope_tasks_push(orch, &slot_state); - } else { - scope_tasks_push(orch, nullptr); - } - - CYCLE_COUNT_LAP_RECORD(g_orch_alloc_cycle, AicpuPhaseId::ORCH_ALLOC, task_id.raw); - - // === STEP 2: Heap allocation for OUTPUT tensors === - int32_t total_output_size = 0; - for (int i = 0; i < args.tensor_count(); i++) { - if (args.tag(i) == TensorArgType::OUTPUT) { - total_output_size += - PTO2_ALIGN_UP(args.tensor(i).create_info.buffer_size_bytes(), PTO2_PACKED_OUTPUT_ALIGN); - } - } - - void *local_packed_base = nullptr; - void *local_packed_end = nullptr; - if (total_output_size > 0) { - local_packed_base = orch->pto2_alloc_packed_buffer(total_output_size); - if (!local_packed_base) { - orch->fatal = true; - return result; - } - local_packed_end = reinterpret_cast(local_packed_base) + total_output_size; - } - - // Materialize OUTPUT tensors into TaskOutputTensors - int32_t offset = 0; - for (int i = 0; i < args.tensor_count(); i++) { - if (args.tag(i) == TensorArgType::OUTPUT) { - const TensorCreateInfo &ci = args.tensor(i).create_info; - uint64_t buffer_size = ci.buffer_size_bytes(); - uint64_t alloc_addr = reinterpret_cast(reinterpret_cast(local_packed_base) + offset); - offset += PTO2_ALIGN_UP(buffer_size, PTO2_PACKED_OUTPUT_ALIGN); - result.outputs.materialize_output(ci, reinterpret_cast(alloc_addr), /*version=*/0); - } - } - - CYCLE_COUNT_LAP_RECORD(g_orch_heap_cycle, AicpuPhaseId::ORCH_HEAP, task_id.raw); - - // Periodically reclaim dep_pool entries from retired tasks - if (sched) { - int32_t sm_last_task_alive = task_ring.last_alive_ptr->load(std::memory_order_acquire); - orch->rings[ring_id].dep_pool.reclaim(*sched, ring_id, sm_last_task_alive); - } - - // === STEP 3: Write task descriptor and payload === - __builtin_prefetch(&task, 1, 1); - task.task_id = task_id; - task.kernel_id[static_cast(PTO2SubtaskSlot::AIC)] = normalized.aic_kernel_id; - task.kernel_id[static_cast(PTO2SubtaskSlot::AIV0)] = normalized.aiv0_kernel_id; - task.kernel_id[static_cast(PTO2SubtaskSlot::AIV1)] = normalized.aiv1_kernel_id; - task.packed_buffer_base = local_packed_base; - task.packed_buffer_end = local_packed_end; - - payload->fanin_actual_count = 0; - payload->init(args, result.outputs); - - CYCLE_COUNT_LAP_RECORD(g_orch_args_cycle, AicpuPhaseId::ORCH_PARAMS, task_id.raw); - - // Record dep pool watermark - if (sched) { - auto &rs = sched->ring_sched_states[ring_id]; - PTO2TaskSlotState &slot_state = rs.get_slot_state_by_slot(slot); - slot_state.dep_pool_mark = orch->rings[ring_id].dep_pool.top; - } - -#if PTO2_PROFILING - orch->tasks_submitted++; -#if PTO2_ORCH_PROFILING - g_orch_submit_count++; -#endif - g_orch_submit_idx++; -#endif - - 
result.task_id = task_id; - return result; -} - -// ============================================================================= -// Explicit Dependency Management -// ============================================================================= - -void pto2_add_dependency(PTO2OrchestratorState *orch, PTO2TaskId producer_id, PTO2TaskId consumer_id) { - if (orch->fatal) return; - - PTO2SchedulerState *sched = orch->scheduler; - if (!sched) return; - - uint8_t prod_ring = producer_id.ring(); - uint32_t prod_local = producer_id.local(); - uint8_t cons_ring = consumer_id.ring(); - uint32_t cons_local = consumer_id.local(); - - auto &prod_rs = sched->ring_sched_states[prod_ring]; - auto &cons_rs = sched->ring_sched_states[cons_ring]; - - PTO2TaskSlotState &prod_state = prod_rs.get_slot_state_by_task_id(prod_local); - PTO2TaskSlotState &cons_state = cons_rs.get_slot_state_by_task_id(cons_local); - - // Increment consumer's fanin_count (+1 for this dependency) - cons_state.fanin_count += 1; - - // Record producer in consumer's payload for DFX/debugging - PTO2TaskPayload *cons_payload = cons_state.payload; - if (cons_payload->fanin_actual_count < PTO2_MAX_INPUTS) { - cons_payload->fanin_slot_states[cons_payload->fanin_actual_count] = &prod_state; - cons_payload->fanin_actual_count++; - } - - // Wire the fanout edge from producer to consumer. - // Always use fanout_lock: the producer may be from a previous scope - // and already visible to the scheduler. - auto &dep_pool = orch->rings[cons_ring].dep_pool; - -#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING - pto2_fanout_lock(prod_state, g_orch_fanin_atomic_count, g_orch_fanin_wait_cycle); -#else - pto2_fanout_lock(prod_state); -#endif - - prod_state.fanout_count += 1; - int32_t prod_task_state = prod_state.task_state.load(std::memory_order_acquire); - - if (prod_task_state >= PTO2_TASK_COMPLETED) { - // Producer already completed — count as early finish - cons_state.fanin_refcount.fetch_add(1, std::memory_order_relaxed); - } else { - // Producer not yet completed — add consumer to producer's fanout list - prod_state.fanout_head = dep_pool.prepend(prod_state.fanout_head, &cons_state); - } - - pto2_fanout_unlock(prod_state); - -#if PTO2_ORCH_PROFILING - g_orch_fanin_atomic_count += 3; // lock CAS + load(task_state) + unlock store -#endif -} - -// ============================================================================= -// Flow Control -// ============================================================================= - -void pto2_orchestrator_done(PTO2OrchestratorState *orch) { - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - int32_t total_tasks = orch->rings[r].task_ring.current_index_ptr->load(std::memory_order_acquire); - if (total_tasks > 0) { - LOG_INFO("=== [Orchestrator] ring %d: total_tasks=%d ===", r, total_tasks); - } - auto &pool = orch->rings[r].dep_pool; - if (pool.top > 0) { - LOG_INFO( - "=== [DepPool %d] top=%d tail=%d used=%d high_water=%d capacity=%d ===", r, pool.top, pool.tail, - pool.top - pool.tail, pool.high_water, pool.capacity - ); - } - } - orch->sm_handle->header->orchestrator_done.store(1, std::memory_order_release); -#if !PTO2_ORCH_PROFILING && PTO2_PROFILING - g_orch_submit_idx = 0; -#endif -} - -// ============================================================================= -// Debug Utilities -// ============================================================================= - -void pto2_orchestrator_print_stats(PTO2OrchestratorState *orch) { - LOG_INFO("=== Orchestrator Statistics ==="); -#if PTO2_PROFILING - 
LOG_INFO("Tasks submitted: %" PRId64, orch->tasks_submitted); - LOG_INFO("Buffers allocated: %" PRId64, orch->buffers_allocated); - LOG_INFO("Bytes allocated: %" PRId64, orch->bytes_allocated); -#endif - LOG_INFO("Current scope depth: %d", orch->scope_stack_top + 1); - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - int32_t active = pto2_task_ring_active_count(&orch->rings[r].task_ring); - if (active > 0) { - LOG_INFO("Ring %d task active: %d", r, active); - LOG_INFO( - "Ring %d heap used: %" PRIu64 " / %" PRIu64, r, - orch->rings[r].heap_ring.top_ptr->load(std::memory_order_relaxed), orch->rings[r].heap_ring.size - ); - LOG_INFO( - "Ring %d dep pool: %d / %d", r, orch->rings[r].dep_pool.used(), orch->rings[r].dep_pool.capacity - ); - } - } - LOG_INFO("==============================="); -} - -void pto2_orchestrator_print_scope_stack(PTO2OrchestratorState *orch) { - LOG_INFO("=== Scope Stack ==="); - LOG_INFO("Depth: %d", orch->scope_stack_top + 1); - - for (int i = 0; i <= orch->scope_stack_top; i++) { - int32_t begin = orch->scope_begins[i]; - int32_t end = (i < orch->scope_stack_top) ? orch->scope_begins[i + 1] : orch->scope_tasks_size; - LOG_INFO(" [%d] tasks_owned = %d", i, end - begin); - } - - LOG_INFO("=================="); -} - -#if PTO2_ORCH_PROFILING -PTO2OrchProfilingData pto2_orchestrator_get_profiling() { - PTO2OrchProfilingData d; - d.alloc_cycle = g_orch_alloc_cycle; - d.args_cycle = g_orch_args_cycle; - d.heap_cycle = g_orch_heap_cycle; - d.fanin_cycle = g_orch_fanin_cycle; - d.scope_end_cycle = g_orch_scope_end_cycle; - d.submit_count = g_orch_submit_count; - d.alloc_wait_cycle = g_orch_alloc_wait_cycle; - d.heap_wait_cycle = g_orch_heap_wait_cycle; - d.fanin_wait_cycle = g_orch_fanin_wait_cycle; - d.alloc_atomic_count = g_orch_alloc_atomic_count; - d.args_atomic_count = g_orch_args_atomic_count; - d.heap_atomic_count = g_orch_heap_atomic_count; - d.fanin_atomic_count = g_orch_fanin_atomic_count; - d.scope_end_atomic_count = g_orch_scope_end_atomic_count; - - // Reset - g_orch_alloc_cycle = g_orch_args_cycle = 0; - g_orch_heap_cycle = g_orch_fanin_cycle = 0; - g_orch_scope_end_cycle = 0; - g_orch_submit_count = 0; - g_orch_submit_idx = 0; - g_orch_alloc_wait_cycle = 0; - g_orch_heap_wait_cycle = 0; - g_orch_fanin_wait_cycle = 0; - g_orch_alloc_atomic_count = 0; - g_orch_args_atomic_count = 0; - g_orch_heap_atomic_count = 0; - g_orch_fanin_atomic_count = 0; - g_orch_scope_end_atomic_count = 0; - return d; -} -#endif diff --git a/src/a2a3/runtime/aicpu_build_graph/runtime/pto_orchestrator.h b/src/a2a3/runtime/aicpu_build_graph/runtime/pto_orchestrator.h deleted file mode 100644 index 1e1ce5e2d..000000000 --- a/src/a2a3/runtime/aicpu_build_graph/runtime/pto_orchestrator.h +++ /dev/null @@ -1,275 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. 
- * ----------------------------------------------------------------------------------------------------------- - */ - -/** - * PTO Runtime2 - Orchestrator Interface (Explicit Dependency Variant) - * - * The Orchestrator is responsible for: - * 1. Executing the orchestration function (Turing-complete control flow) - * 2. Allocating intermediate buffers from the heap - * 3. Submitting tasks via async InCore function calls - * 4. Building the dependency graph via explicit add_dependency calls - * 5. Managing buffer scopes for lifecycle control - * - * Key differences from the tensormap_and_ringbuffer variant: - * - No TensorMap: dependencies are explicitly specified by orchestration code - * - Scope-end batch publish: tasks are invisible to the scheduler until scope_end - * - submit_task returns a SubmitResult whose task_id is used in add_dependency calls - */ - -#ifndef SRC_A2A3_RUNTIME_AICPU_BUILD_GRAPH_RUNTIME_PTO_ORCHESTRATOR_H_ -#define SRC_A2A3_RUNTIME_AICPU_BUILD_GRAPH_RUNTIME_PTO_ORCHESTRATOR_H_ - -#include "pto_ring_buffer.h" -#include "pto_runtime2_types.h" -#include "pto_scheduler.h" -#include "pto_shared_memory.h" -#include "pto_submit_types.h" -#include "pto_types.h" - -// ============================================================================= -// Orchestrator State -// ============================================================================= - -/** - * Orchestrator state structure (private to Orchestrator) - * - * Contains all state needed for task graph construction and buffer management. - * No TensorMap — dependencies are added explicitly via pto2_add_dependency(). - */ -struct PTO2OrchestratorState { - // === SHARED MEMORY ACCESS === - PTO2SharedMemoryHandle *sm_handle; - - // === PER-RING RESOURCES === - PTO2RingSet rings[PTO2_MAX_RING_DEPTH]; - - // === SCOPE STACK (Private) === - // Single contiguous buffer of task slot pointers, partitioned by scope level. - // scope_begins[i] is the index into scope_tasks where scope i starts. - // Tasks for the top scope occupy [scope_begins[top], scope_tasks_size). - PTO2TaskSlotState **scope_tasks; // Flat buffer of task slot state pointers (all scopes concatenated) - int32_t scope_tasks_size; // Number of task pointers currently in the buffer - int32_t scope_tasks_capacity; // Allocated capacity of scope_tasks - int32_t *scope_begins; // scope_begins[i] = start index of scope i in scope_tasks - int32_t scope_stack_top; // Current top of stack (-1 = no scope open) - uint64_t scope_stack_capacity; // Max nesting depth (PTO2_MAX_SCOPE_DEPTH) - - // === SCHEDULER REFERENCE === - // Note: In simulated mode, orchestrator and scheduler share address space - // In real mode, they communicate via shared memory only - PTO2SchedulerState *scheduler; // For simulated mode only -#if PTO2_PROFILING - // Runtime profiling switch copied from Runtime::enable_l2_swimlane. - bool enable_l2_swimlane; -#endif - - // === GM HEAP (for output buffers) === - void *gm_heap_base; // Base address of GM heap - uint64_t gm_heap_size; // Total size of GM heap (all rings) - - // === FATAL ERROR === - // Fatal error flag (single-thread access by orchestrator, no atomic needed) - // Cross-thread notification uses shared memory orch_error_code (atomic) - bool fatal; - - // === STATISTICS === -#if PTO2_PROFILING - int64_t tasks_submitted; - int64_t buffers_allocated; - int64_t bytes_allocated; -#endif - - /** - * Get current ring index from scope depth.
- * Maps scope depth to ring_id: min(scope_depth, PTO2_MAX_RING_DEPTH - 1) - */ - uint8_t current_ring_id() const { - int32_t depth = scope_stack_top; - if (depth < 0) depth = 0; - return depth < PTO2_MAX_RING_DEPTH ? static_cast<uint8_t>(depth) : PTO2_MAX_RING_DEPTH - 1; - } - - /** - * Allocate packed output buffer from current ring's heap - */ - void *pto2_alloc_packed_buffer(int32_t total_size) { - if (total_size <= 0) { - return NULL; - } - - uint8_t rid = current_ring_id(); - void *buffer = rings[rid].heap_ring.pto2_heap_ring_alloc(total_size); - -#if PTO2_PROFILING - buffers_allocated++; - bytes_allocated += total_size; -#endif - - return buffer; - } -}; - -// ============================================================================= -// Orchestrator API -// ============================================================================= - -/** - * Initialize orchestrator state - * - * @param orch Orchestrator state to initialize - * @param sm_handle Shared memory handle - * @param gm_heap GM heap memory for output buffers - * @param heap_size Size of GM heap - * @return true on success - */ -bool pto2_orchestrator_init( - PTO2OrchestratorState *orch, PTO2SharedMemoryHandle *sm_handle, void *gm_heap, uint64_t heap_size, - int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE -); - -/** - * Destroy orchestrator state and free resources - */ -void pto2_orchestrator_destroy(PTO2OrchestratorState *orch); - -/** - * Set scheduler reference (for simulated mode) - */ -void pto2_orchestrator_set_scheduler(PTO2OrchestratorState *orch, PTO2SchedulerState *scheduler); - -// ============================================================================= -// Scope Management -// ============================================================================= - -/** - * Begin a new scope - * - * Pushes a new empty task list onto the scope stack. - * Tasks submitted while this scope is at the top of the stack are - * owned by it and have their fanout_count initialized to 1. - */ -void pto2_scope_begin(PTO2OrchestratorState *orch); - -/** - * End current scope - * - * Batch-publishes all tasks in the scope: - * 1. For each task, releases the "+1 redundancy" in fanin_refcount - * 2. Tasks with all deps satisfied are pushed to the ready queue - * 3. Releases the scope's fanout reference (enables CONSUMED transition) - * - * This is the scope-end batch publish mechanism: tasks are invisible - * to the scheduler until this point. - */ -void pto2_scope_end(PTO2OrchestratorState *orch); - -// ============================================================================= -// Task Submission -// ============================================================================= - -/** - * Submit a task with InCore function and parameters - * - * Simplified flow (no TensorMap): - * 1. Allocates task slot from TaskRing (blocks until available) - * 2. Allocates packed output buffer from HeapRing (blocks until available) - * 3. Writes task descriptor and payload - * 4. Initializes fanin with +1 redundancy (released at scope_end) - * - * The task is NOT visible to the scheduler until scope_end. - * Dependencies must be added via pto2_add_dependency() before scope_end.
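 *
 * Illustrative call (editorial sketch; orch, kernels, and args are
 * hypothetical placeholders, not names from this codebase):
 *
 *   SubmitResult r = pto2_submit_mixed_task(orch, kernels, args);
 *   // r.task_id feeds pto2_add_dependency(); r.outputs carries the
 *   // materialized OUTPUT tensors.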
- * - * @param orch Orchestrator state - * @param mixed_kernels Kernel IDs for AIC/AIV0/AIV1 slots - * @param args Aggregated tensor and scalar parameters - * @return SubmitResult; its task_id field is passed to pto2_add_dependency() - */ -SubmitResult pto2_submit_mixed_task(PTO2OrchestratorState *orch, const MixedKernels &mixed_kernels, const Arg &args); - -// ============================================================================= -// Explicit Dependency Management -// ============================================================================= - -/** - * Add a dependency edge: producer -> consumer - * - * The consumer task will not become ready until the producer completes. - * Both tasks must have been created via pto2_submit_mixed_task(). - * - * For cross-scope dependencies (producer from a previous scope that is - * already visible to the scheduler), this uses the fanout_lock for - * thread safety and handles the case where the producer has already - * completed (early-finish optimization). - * - * @param orch Orchestrator state - * @param producer Producer task ID (must complete before consumer starts) - * @param consumer Consumer task ID (depends on producer) - */ -void pto2_add_dependency(PTO2OrchestratorState *orch, PTO2TaskId producer, PTO2TaskId consumer); - -// ============================================================================= -// Flow Control -// ============================================================================= - -/** - * Mark orchestration as complete - * - * Signals to scheduler that no more tasks will be submitted. - */ -void pto2_orchestrator_done(PTO2OrchestratorState *orch); - -// ============================================================================= -// Debug Utilities -// ============================================================================= - -/** - * Print orchestrator statistics - */ -void pto2_orchestrator_print_stats(PTO2OrchestratorState *orch); - -/** - * Print scope stack state - */ -void pto2_orchestrator_print_scope_stack(PTO2OrchestratorState *orch); - -// ============================================================================= -// Orchestrator Profiling Data -// ============================================================================= - -#if PTO2_ORCH_PROFILING -struct PTO2OrchProfilingData { - uint64_t alloc_cycle; - uint64_t args_cycle; - uint64_t heap_cycle; - uint64_t fanin_cycle; - uint64_t scope_end_cycle; - int64_t submit_count; - // Wait time tracking for blocking phases - uint64_t alloc_wait_cycle; // Cycles spent waiting in task_ring_alloc - uint64_t heap_wait_cycle; // Cycles spent waiting in heap_ring_alloc - uint64_t fanin_wait_cycle; // Cycles spent waiting in fanout_lock - // Atomic operation counts per phase - uint64_t alloc_atomic_count; - uint64_t args_atomic_count; - uint64_t heap_atomic_count; - uint64_t fanin_atomic_count; - uint64_t scope_end_atomic_count; -}; - -/** - * Get and reset orchestrator profiling data. - * Returns accumulated profiling data and resets counters. - */ -PTO2OrchProfilingData pto2_orchestrator_get_profiling(); -#endif - -#endif // SRC_A2A3_RUNTIME_AICPU_BUILD_GRAPH_RUNTIME_PTO_ORCHESTRATOR_H_ diff --git a/src/a2a3/runtime/aicpu_build_graph/runtime/pto_ring_buffer.cpp b/src/a2a3/runtime/aicpu_build_graph/runtime/pto_ring_buffer.cpp deleted file mode 100644 index 3ac6c8e31..000000000 --- a/src/a2a3/runtime/aicpu_build_graph/runtime/pto_ring_buffer.cpp +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -/** - * PTO Runtime2 - Ring Buffer Implementation - * - * Implements HeapRing, TaskRing, and DepListPool ring buffers - * for zero-overhead memory management. - * - * Based on: docs/RUNTIME_LOGIC.md - */ - -#include "pto_ring_buffer.h" -#include <atomic> -#include <cstdint> -#include <cstdlib> // for exit() -#include "common/unified_log.h" -#include "pto_scheduler.h" - -// ============================================================================= -// Heap Ring Buffer Implementation -// ============================================================================= - -void pto2_heap_ring_init( - PTO2HeapRing *ring, void *base, uint64_t size, std::atomic<uint64_t> *tail_ptr, std::atomic<uint64_t> *top_ptr - ) { - ring->base = base; - ring->size = size; - ring->top_ptr = top_ptr; - ring->tail_ptr = tail_ptr; -} - -// ============================================================================= -// Task Ring Buffer Implementation -// ============================================================================= - -void pto2_task_ring_init( - PTO2TaskRing *ring, PTO2TaskDescriptor *descriptors, int32_t window_size, std::atomic<int32_t> *last_alive_ptr, - std::atomic<int32_t> *current_index_ptr -) { - ring->descriptors = descriptors; - ring->window_size = window_size; - ring->current_index_ptr = current_index_ptr; - ring->last_alive_ptr = last_alive_ptr; -} - -// ============================================================================= -// Dependency List Pool Implementation -// ============================================================================= -void PTO2DepListPool::reclaim(PTO2SchedulerState &sched, uint8_t ring_id, int32_t sm_last_task_alive) { - if (sm_last_task_alive >= last_reclaimed + PTO2_DEP_POOL_CLEANUP_INTERVAL && sm_last_task_alive > 0) { - int32_t mark = sched.ring_sched_states[ring_id].get_slot_state_by_task_id(sm_last_task_alive - 1).dep_pool_mark; - if (mark > 0) { - advance_tail(mark); - } - last_reclaimed = sm_last_task_alive; - } -} - -void PTO2DepListPool::ensure_space( - PTO2SchedulerState &sched, PTO2RingFlowControl &fc, uint8_t ring_id, int32_t needed -) { - if (available() >= needed) return; - - int spin_count = 0; - int32_t prev_last_alive = fc.last_task_alive.load(std::memory_order_acquire); - while (available() < needed) { - reclaim(sched, ring_id, prev_last_alive); - if (available() >= needed) return; - - spin_count++; - - // Progress detection: reset spin counter if last_task_alive advances - int32_t cur_last_alive = fc.last_task_alive.load(std::memory_order_acquire); - if (cur_last_alive > prev_last_alive) { - spin_count = 0; - prev_last_alive = cur_last_alive; - } - - if (spin_count >= PTO2_DEP_POOL_SPIN_LIMIT) { - int32_t current = fc.current_task_index.load(std::memory_order_acquire); - LOG_ERROR("========================================"); - LOG_ERROR("FATAL: Dependency Pool Deadlock Detected!
(ring %d)", ring_id); - LOG_ERROR("========================================"); - LOG_ERROR("DepListPool cannot reclaim space after %d spins (no progress).", spin_count); - LOG_ERROR( - " - Pool used: %d / %d (%.1f%%)", used(), capacity, - (capacity > 0) ? (100.0 * used() / capacity) : 0.0 - ); - LOG_ERROR(" - Pool top: %d (linear)", top); - LOG_ERROR(" - Pool tail: %d (linear)", tail); - LOG_ERROR(" - High water: %d", high_water); - LOG_ERROR(" - Needed: %d entries", needed); - LOG_ERROR(" - last_task_alive: %d (stuck here)", cur_last_alive); - LOG_ERROR(" - current_task: %d", current); - LOG_ERROR(" - In-flight tasks: %d", current - cur_last_alive); - LOG_ERROR("Diagnosis:"); - LOG_ERROR(" last_task_alive is not advancing, so dep pool tail"); - LOG_ERROR(" cannot reclaim. Check TaskRing diagnostics for root cause."); - LOG_ERROR("Solution:"); - LOG_ERROR(" Increase dep pool capacity (current: %d, recommended: %d)", capacity, high_water * 2); - LOG_ERROR(" Compile-time: PTO2_DEP_LIST_POOL_SIZE in pto_runtime2_types.h"); - LOG_ERROR(" Runtime env: PTO2_RING_DEP_POOL=%d", high_water * 2); - LOG_ERROR("========================================"); - exit(1); - } - SPIN_WAIT_HINT(); - } -} diff --git a/src/a2a3/runtime/aicpu_build_graph/runtime/pto_ring_buffer.h b/src/a2a3/runtime/aicpu_build_graph/runtime/pto_ring_buffer.h deleted file mode 100644 index cc0c1bd56..000000000 --- a/src/a2a3/runtime/aicpu_build_graph/runtime/pto_ring_buffer.h +++ /dev/null @@ -1,619 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -/** - * PTO Runtime2 - Ring Buffer Data Structures - * - * Implements ring buffer designs for zero-overhead memory management: - * - * 1. HeapRing - Output buffer allocation from GM Heap - * - O(1) bump allocation - * - Wrap-around at end, skip to beginning if buffer doesn't fit - * - Implicit reclamation via heap_tail advancement - * - Back-pressure: stalls when no space available - * - * 2. TaskRing - Task slot allocation - * - Fixed window size (TASK_WINDOW_SIZE) - * - Wrap-around modulo window size - * - Implicit reclamation via last_task_alive advancement - * - Back-pressure: stalls when window is full - * - * 3. DepListPool - Dependency list entry allocation - * - Ring buffer for linked list entries - * - O(1) prepend operation - * - Implicit reclamation with task ring - * - * Based on: docs/RUNTIME_LOGIC.md - */ - -#ifndef PTO_RING_BUFFER_H -#define PTO_RING_BUFFER_H - -#include - -#include "pto_runtime2_types.h" -#include "pto_shared_memory.h" -#include "common/unified_log.h" - -struct PTO2SchedulerState; // Forward declaration for dep_pool reclaim - -// Set to 1 to enable periodic BLOCKED/Unblocked messages during spin-wait. 
-#ifndef PTO2_SPIN_VERBOSE_LOGGING -#define PTO2_SPIN_VERBOSE_LOGGING 1 -#endif - -// Block notification interval (in spin counts) -#define PTO2_BLOCK_NOTIFY_INTERVAL 10000 -// Heap ring spin limit - after this, report deadlock and exit -#define PTO2_HEAP_SPIN_LIMIT 100000 - -// Flow control spin limit - if exceeded, likely deadlock due to scope/fanout_count -#define PTO2_FLOW_CONTROL_SPIN_LIMIT 100000 - -// Dep pool spin limit - if exceeded, dep pool capacity too small for workload -#define PTO2_DEP_POOL_SPIN_LIMIT 100000 - -// ============================================================================= -// Heap Ring Buffer -// ============================================================================= - -/** - * Heap ring buffer structure - * - * Allocates output buffers from a contiguous GM Heap. - * Wrap-around design with implicit reclamation. - */ -struct PTO2HeapRing { - void *base; // GM_Heap_Base pointer - uint64_t size; // GM_Heap_Size (total heap size in bytes) - std::atomic<uint64_t> *top_ptr; // Allocation pointer (shared atomic in SM header) - - // Reference to shared memory tail (for back-pressure) - std::atomic<uint64_t> *tail_ptr; // Points to header->heap_tail - - // Error code pointer for fatal error reporting (→ sm_header->orch_error_code) - std::atomic<int32_t> *error_code_ptr = nullptr; - - /** - * Allocate memory from heap ring - * - * O(1) bump allocation with wrap-around. - * May STALL (spin-wait) if insufficient space (back-pressure). - * Never splits a buffer across the wrap-around boundary. - * - * @param size Requested size in bytes - * @return Pointer to allocated memory, or nullptr on fatal error - */ - void *pto2_heap_ring_alloc(uint64_t size) { - // Align size for DMA efficiency - size = PTO2_ALIGN_UP(size, PTO2_ALIGN_SIZE); - - // Spin-wait if insufficient space (back-pressure from Scheduler) - int spin_count = 0; - uint64_t prev_tail = tail_ptr->load(std::memory_order_acquire); -#if PTO2_SPIN_VERBOSE_LOGGING - bool notified = false; -#endif -#if PTO2_ORCH_PROFILING - uint64_t wait_start = 0; - bool waiting = false; -#endif - - while (1) { - void *ptr = pto2_heap_ring_try_alloc(size); - if (ptr != NULL) { -#if PTO2_SPIN_VERBOSE_LOGGING - if (notified) { - LOG_INFO("[HeapRing] Unblocked after %d spins", spin_count); - } -#endif -#if PTO2_ORCH_PROFILING - if (waiting) { - extern uint64_t g_orch_heap_wait_cycle; - g_orch_heap_wait_cycle += (get_sys_cnt_aicpu() - wait_start); - } - { - extern uint64_t g_orch_heap_atomic_count; - g_orch_heap_atomic_count += - spin_count + 1; // spin_count retries + 1 success (each try_alloc = 1 load) - } -#endif - return ptr; - } - - // No space available, spin-wait - spin_count++; -#if PTO2_ORCH_PROFILING - if (!waiting) { - wait_start = get_sys_cnt_aicpu(); - waiting = true; - } -#endif - - // Progress detection: reset spin counter if heap_tail advances - uint64_t cur_tail = tail_ptr->load(std::memory_order_acquire); - if (cur_tail != prev_tail) { -#if PTO2_SPIN_VERBOSE_LOGGING - LOG_INFO( - "[HeapRing] Progress: tail %" PRIu64 " -> %" PRIu64 " (reset spin_count=%d)", prev_tail, cur_tail, - spin_count - ); -#endif - spin_count = 0; - prev_tail = cur_tail; - } - -#if PTO2_SPIN_VERBOSE_LOGGING - // Periodic block notification - if (spin_count % PTO2_BLOCK_NOTIFY_INTERVAL == 0 && spin_count > 0 && spin_count < PTO2_HEAP_SPIN_LIMIT) { - uint64_t top = top_ptr->load(std::memory_order_acquire); - LOG_WARN( - "[HeapRing] BLOCKED: requesting %" PRIu64 " bytes" - ", top=%" PRIu64 ", tail=%" PRIu64 ", spins=%d", - size, top, cur_tail, spin_count - ); - notified
= true; - } -#endif - - if (spin_count >= PTO2_HEAP_SPIN_LIMIT) { - uint64_t top = top_ptr->load(std::memory_order_acquire); - LOG_ERROR("========================================"); - LOG_ERROR("FATAL: Heap Ring Deadlock Detected!"); - LOG_ERROR("========================================"); - LOG_ERROR("Orchestrator blocked waiting for heap space after %d spins (no tail progress).", spin_count); - LOG_ERROR(" - Requested: %" PRIu64 " bytes", size); - LOG_ERROR(" - Heap top: %" PRIu64, top); - LOG_ERROR(" - Heap tail: %" PRIu64 " (stuck here)", cur_tail); - LOG_ERROR(" - Heap size: %" PRIu64, this->size); - LOG_ERROR(" - Available: %" PRIu64 " bytes", pto2_heap_ring_available()); - LOG_ERROR("Diagnosis:"); - LOG_ERROR(" heap_tail is not advancing, which means last_task_alive"); - LOG_ERROR(" is stuck. Check TaskRing diagnostics for root cause."); - LOG_ERROR("Solution: Increase heap size or investigate task stall."); - LOG_ERROR(" Compile-time: PTO2_HEAP_SIZE in pto_runtime2_types.h"); - LOG_ERROR( - " Runtime env: PTO2_RING_HEAP=<bytes> (e.g. %lu)", (unsigned long)(this->size * 2) - ); - LOG_ERROR("========================================"); - if (error_code_ptr) { - error_code_ptr->store(PTO2_ERROR_HEAP_RING_DEADLOCK, std::memory_order_release); - } - return nullptr; - } - - SPIN_WAIT_HINT(); - } - } - - /** - * Try to allocate memory without stalling (thread-safe via CAS) - * - * @param size Requested size in bytes - * @return Pointer to allocated memory, or NULL if no space - */ - void *pto2_heap_ring_try_alloc(uint64_t alloc_size) { - // Align size for DMA efficiency - alloc_size = PTO2_ALIGN_UP(alloc_size, PTO2_ALIGN_SIZE); - - while (true) { - uint64_t top = top_ptr->load(std::memory_order_acquire); - // Read latest tail from shared memory (Scheduler updates this) - uint64_t tail = tail_ptr->load(std::memory_order_acquire); - uint64_t new_top; - void *result; - - if (top >= tail) { - // Case 1: top is at or ahead of tail (normal case) - uint64_t space_at_end = size - top; - - if (space_at_end >= alloc_size) { - new_top = top + alloc_size; - result = (char *)base + top; - } else if (tail > alloc_size) { - // Wrap to beginning - new_top = alloc_size; - result = base; - } else { - return NULL; - } - } else { - // Case 2: top has wrapped, tail is ahead - uint64_t gap = tail - top; - if (gap >= alloc_size) { - new_top = top + alloc_size; - result = (char *)base + top; - } else { - return NULL; - } - } - - if (top_ptr->compare_exchange_weak(top, new_top, std::memory_order_acq_rel, std::memory_order_acquire)) { - return result; - } - // CAS failed, retry with updated top - } - } - - /** - * Get available space in heap ring - */ - uint64_t pto2_heap_ring_available() { - uint64_t top = top_ptr->load(std::memory_order_acquire); - uint64_t tail = tail_ptr->load(std::memory_order_acquire); - - if (top >= tail) { - uint64_t at_end = size - top; - uint64_t at_begin = tail; - return at_end > at_begin ?
at_end : at_begin; - } else { - return tail - top; - } - } -}; - -/** - * Initialize heap ring buffer - * - * @param ring Heap ring to initialize - * @param base Base address of heap memory - * @param size Total heap size in bytes - * @param tail_ptr Pointer to shared memory heap_tail - * @param top_ptr Pointer to shared memory heap_top - */ -void pto2_heap_ring_init( - PTO2HeapRing *ring, void *base, uint64_t size, std::atomic<uint64_t> *tail_ptr, std::atomic<uint64_t> *top_ptr -); - -// ============================================================================= -// Task Ring Buffer -// ============================================================================= - -/** - * Task ring buffer structure - * - * Fixed-size sliding window for task management. - * Provides back-pressure when window is full. - */ -struct PTO2TaskRing { - PTO2TaskDescriptor *descriptors; // Task descriptor array (from shared memory) - int32_t window_size; // Window size (power of 2) - std::atomic<int32_t> *current_index_ptr; // Shared atomic in SM header - - // Reference to shared memory last_task_alive (for back-pressure) - std::atomic<int32_t> *last_alive_ptr; // Points to header->last_task_alive - - // Error code pointer for fatal error reporting (→ sm_header->orch_error_code) - std::atomic<int32_t> *error_code_ptr = nullptr; - - /** - * Allocate a task slot from task ring - * - * May STALL (spin-wait) if window is full (back-pressure). - * Initializes the task descriptor to default values. - * - * @return Allocated task ID (absolute, not wrapped) - */ - int32_t pto2_task_ring_alloc() { - // Spin-wait if window is full (back-pressure from Scheduler) - int spin_count = 0; - int32_t prev_last_alive = last_alive_ptr->load(std::memory_order_acquire); -#if PTO2_SPIN_VERBOSE_LOGGING - bool notified = false; -#endif -#if PTO2_ORCH_PROFILING - uint64_t wait_start = 0; - bool waiting = false; -#endif - - while (1) { - int32_t task_id = pto2_task_ring_try_alloc(); - if (task_id >= 0) { -#if PTO2_SPIN_VERBOSE_LOGGING - if (notified) { - LOG_INFO("[TaskRing] Unblocked after %d spins, task_id=%d", spin_count, task_id); - } -#endif -#if PTO2_ORCH_PROFILING - if (waiting) { - extern uint64_t g_orch_alloc_wait_cycle; - g_orch_alloc_wait_cycle += (get_sys_cnt_aicpu() - wait_start); - } - { - extern uint64_t g_orch_alloc_atomic_count; - g_orch_alloc_atomic_count += - spin_count + 1; // spin_count retries + 1 success (each try_alloc = 1 load) - } -#endif - return task_id; - } - - // Window is full, spin-wait (with yield to prevent CPU starvation) - spin_count++; -#if PTO2_ORCH_PROFILING - if (!waiting) { - wait_start = get_sys_cnt_aicpu(); - waiting = true; - } -#endif - - // Progress detection: reset spin counter if last_task_alive advances - int32_t cur_last_alive = last_alive_ptr->load(std::memory_order_acquire); - if (cur_last_alive > prev_last_alive) { -#if PTO2_SPIN_VERBOSE_LOGGING - LOG_INFO( - "[TaskRing] Progress: last_alive %d -> %d (reset spin_count=%d)", prev_last_alive, cur_last_alive, - spin_count - ); -#endif - spin_count = 0; - prev_last_alive = cur_last_alive; - } - -#if PTO2_SPIN_VERBOSE_LOGGING - // Periodic block notification - if (spin_count % PTO2_BLOCK_NOTIFY_INTERVAL == 0 && spin_count > 0 && - spin_count < PTO2_FLOW_CONTROL_SPIN_LIMIT) { - int32_t current = current_index_ptr->load(std::memory_order_acquire); - int32_t active_count = current - cur_last_alive; - LOG_WARN( - "[TaskRing] BLOCKED (Flow Control): current=%d, last_alive=%d, " - "active=%d/%d (%.1f%%), spins=%d", - current, cur_last_alive, active_count, window_size, 100.0 * active_count / window_size, spin_count - ); - notified = true; - }
-#endif - - // Deadlock: no progress after SPIN_LIMIT spins - if (spin_count >= PTO2_FLOW_CONTROL_SPIN_LIMIT) { - int32_t current = current_index_ptr->load(std::memory_order_acquire); - int32_t active_count = current - cur_last_alive; - - LOG_ERROR("========================================"); - LOG_ERROR("FATAL: Flow Control Deadlock Detected!"); - LOG_ERROR("========================================"); - LOG_ERROR("Task Ring is FULL and no progress after %d spins.", spin_count); - LOG_ERROR(" - Current task index: %d", current); - LOG_ERROR(" - Last task alive: %d (stuck here)", cur_last_alive); - LOG_ERROR(" - Active tasks: %d / %d", active_count, window_size); - LOG_ERROR(" - Window utilization: %.1f%%", 100.0 * active_count / window_size); - LOG_ERROR("Diagnosis:"); - LOG_ERROR(" last_task_alive is stuck at %d, meaning task %d", cur_last_alive, cur_last_alive); - LOG_ERROR(" cannot transition to CONSUMED. Possible causes:"); - LOG_ERROR(" 1. Task %d still executing (subtasks not complete)", cur_last_alive); - LOG_ERROR(" 2. Task %d fanout not fully released (downstream not done)", cur_last_alive); - LOG_ERROR(" 3. Scope reference not released (scope_end not called)"); - LOG_ERROR(" 4. Orchestrator blocked here -> can't call scope_end -> circular wait"); - LOG_ERROR("Solution:"); - LOG_ERROR(" Increase task window size (current: %d, recommended: %d)", window_size, active_count * 2); - LOG_ERROR(" Compile-time: PTO2_TASK_WINDOW_SIZE in pto_runtime2_types.h"); - LOG_ERROR(" Runtime env: PTO2_RING_TASK_WINDOW=<size> (e.g. %d)", active_count * 2); - LOG_ERROR("========================================"); - if (error_code_ptr) { - error_code_ptr->store(PTO2_ERROR_FLOW_CONTROL_DEADLOCK, std::memory_order_release); - } - return -1; - } - - SPIN_WAIT_HINT(); - } - } - - /** - * Try to allocate task slot without stalling (thread-safe via fetch_add) - * - * @return Task ID, or -1 if window is full - */ - int32_t pto2_task_ring_try_alloc() { - // Optimistically allocate a task ID - int32_t task_id = current_index_ptr->fetch_add(1, std::memory_order_acq_rel); - int32_t last_alive = last_alive_ptr->load(std::memory_order_acquire); - int32_t active_count = task_id - last_alive; - - // Check if there's room (leave at least 1 slot empty) - if (active_count < window_size - 1) { - return task_id; - } - - // Window is full — roll back the optimistic increment - current_index_ptr->fetch_sub(1, std::memory_order_release); - return -1; - } - - int32_t get_task_slot(int32_t task_id) const { return task_id & (window_size - 1); } - - /** - * Get task descriptor by ID - */ - PTO2TaskDescriptor &get_task(int32_t task_id) { return descriptors[task_id & (window_size - 1)]; } - - /** - * Get task descriptor by task slot - */ - PTO2TaskDescriptor &get_task_by_slot(int32_t task_slot) { return descriptors[task_slot]; } -}; - -/** - * Initialize task ring buffer - * - * @param ring Task ring to initialize - * @param descriptors Task descriptor array from shared memory - * @param window_size Window size (must be power of 2) - * @param last_alive_ptr Pointer to shared memory last_task_alive - * @param current_index_ptr Pointer to shared memory current task index - */ -void pto2_task_ring_init( - PTO2TaskRing *ring, PTO2TaskDescriptor *descriptors, int32_t window_size, std::atomic<int32_t> *last_alive_ptr, - std::atomic<int32_t> *current_index_ptr -); - -/** - * Get number of active tasks in window - */ -static inline int32_t pto2_task_ring_active_count(PTO2TaskRing *ring) { - int32_t last_alive = ring->last_alive_ptr->load(std::memory_order_acquire); - return ring->current_index_ptr->load(std::memory_order_acquire) -
last_alive; -} - -/** - * Check if task ring has space for more tasks - */ -static inline bool pto2_task_ring_has_space(PTO2TaskRing *ring) { - int32_t active = pto2_task_ring_active_count(ring); - return active < ring->window_size - 1; -} - -/** - * Get task descriptor by ID - */ -static inline PTO2TaskDescriptor *pto2_task_ring_get(PTO2TaskRing *ring, int32_t task_id) { - return &ring->descriptors[task_id & (ring->window_size - 1)]; -} - -// ============================================================================= -// Dependency List Pool -// ============================================================================= - -/** - * Dependency list pool structure - * - * True ring buffer for allocating linked list entries. - * Entries are reclaimed when their producer tasks become CONSUMED, - * as tracked by the orchestrator via dep_pool_mark per task. - * - * Linear counters (top, tail) grow monotonically; the physical index - * is obtained via modulo: base[linear_index % capacity]. - */ -struct PTO2DepListPool { - PTO2DepListEntry *base; // Pool base address - int32_t capacity; // Total number of entries - int32_t top; // Linear next-allocation counter (starts from 1) - int32_t tail; // Linear first-alive counter (entries before this are dead) - int32_t high_water; // Peak concurrent usage (top - tail) - int32_t last_reclaimed{0}; // last_task_alive at last successful reclamation - - // Error code pointer for fatal error reporting (→ sm_header->orch_error_code) - std::atomic<int32_t> *error_code_ptr = nullptr; - - /** - * Initialize dependency list pool - * - * @param base Pool base address from shared memory - * @param capacity Total number of entries - */ - void init(PTO2DepListEntry *in_base, int32_t in_capacity, std::atomic<int32_t> *in_error_code_ptr) { - base = in_base; - capacity = in_capacity; - top = 1; // Start from 1, 0 means NULL/empty - tail = 1; // Match initial top (no reclaimable entries yet) - high_water = 0; - last_reclaimed = 0; - - // Initialize entry 0 as NULL marker - base[0].slot_state = nullptr; - base[0].next = nullptr; - - error_code_ptr = in_error_code_ptr; - } - - /** - * Reclaim dead entries based on scheduler's slot state dep_pool_mark. - * Safe to call multiple times — only advances tail forward. - * - * @param sched Scheduler state (for reading slot dep_pool_mark) - * @param ring_id Ring layer index - * @param sm_last_task_alive Current last_task_alive from shared memory - */ - void reclaim(PTO2SchedulerState &sched, uint8_t ring_id, int32_t sm_last_task_alive); - - /** - * Ensure dep pool for a specific ring has at least `needed` entries available. - * Spin-waits for reclamation if under pressure. Detects deadlock if no progress.
- */ - void ensure_space(PTO2SchedulerState &sched, PTO2RingFlowControl &fc, uint8_t ring_id, int32_t needed); - - /** - * Allocate a single entry from the pool (single-thread per pool instance) - * - * @return Pointer to allocated entry, or nullptr on fatal error - */ - PTO2DepListEntry *alloc() { - int32_t used = top - tail; - if (used >= capacity) { - LOG_ERROR("========================================"); - LOG_ERROR("FATAL: Dependency Pool Overflow!"); - LOG_ERROR("========================================"); - LOG_ERROR("DepListPool exhausted: %d entries alive (capacity=%d).", used, capacity); - LOG_ERROR(" - Pool top: %d (linear)", top); - LOG_ERROR(" - Pool tail: %d (linear)", tail); - LOG_ERROR(" - High water: %d", high_water); - LOG_ERROR("Solution:"); - LOG_ERROR(" Increase dep pool capacity (current: %d, recommended: %d).", capacity, capacity * 2); - LOG_ERROR(" Compile-time: PTO2_DEP_LIST_POOL_SIZE in pto_runtime2_types.h"); - LOG_ERROR(" Runtime env: PTO2_RING_DEP_POOL=%d", capacity * 2); - LOG_ERROR("========================================"); - if (error_code_ptr) { - error_code_ptr->store(PTO2_ERROR_DEP_POOL_OVERFLOW, std::memory_order_release); - } - return nullptr; - } - int32_t idx = top % capacity; - top++; - used++; - if (used > high_water) high_water = used; - return &base[idx]; - } - - /** - * Advance the tail pointer, reclaiming dead entries. - * Called by the orchestrator based on last_task_alive advancement. - */ - void advance_tail(int32_t new_tail) { - if (new_tail > tail) { - tail = new_tail; - } - } - - /** - * Prepend a task slot state to a dependency list - * - * O(1) operation: allocates new entry and links to current head. - * - * @param cur Current list head (nullptr = empty list) - * @param slot_state Task slot state to prepend - * @return New head entry, or nullptr on fatal pool overflow - */ - PTO2DepListEntry *prepend(PTO2DepListEntry *cur, PTO2TaskSlotState *slot_state) { - PTO2DepListEntry *new_entry = alloc(); - if (!new_entry) return nullptr; - new_entry->slot_state = slot_state; - new_entry->next = cur; - return new_entry; - } - - int32_t used() const { return top - tail; } - - int32_t available() const { return capacity - used(); } -}; - -// ============================================================================= -// Ring Set (per-depth aggregate) -// ============================================================================= - -/** - * Groups a HeapRing, TaskRing, and DepPool into one per-depth unit. - * PTO2_MAX_RING_DEPTH instances provide independent reclamation per scope depth. - */ -struct PTO2RingSet { - PTO2HeapRing heap_ring; - PTO2TaskRing task_ring; - PTO2DepListPool dep_pool; -}; - -#endif // PTO_RING_BUFFER_H diff --git a/src/a2a3/runtime/aicpu_build_graph/runtime/pto_runtime2.cpp b/src/a2a3/runtime/aicpu_build_graph/runtime/pto_runtime2.cpp deleted file mode 100644 index 97d5486e9..000000000 --- a/src/a2a3/runtime/aicpu_build_graph/runtime/pto_runtime2.cpp +++ /dev/null @@ -1,183 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ - -/** - * PTO Runtime2 - Main Implementation - * - * Implements the unified runtime API that combines orchestrator and scheduler. - * - * Based on: docs/RUNTIME_LOGIC.md - */ - -#include "pto_runtime2.h" - -#include <cstdio> -#include <cstdlib> -#include <cstring> - -#include "common/unified_log.h" - -// ============================================================================= -// Orchestration Ops Table (function-pointer dispatch for orchestration .so) -// ============================================================================= - -static SubmitResult submit_task_impl(PTO2Runtime *rt, const MixedKernels &mixed_kernels, const Arg &args) { - return pto2_submit_mixed_task(&rt->orchestrator, mixed_kernels, args); -} - -static void add_dependency_impl(PTO2Runtime *rt, PTO2TaskId producer, PTO2TaskId consumer) { - pto2_add_dependency(&rt->orchestrator, producer, consumer); -} - -void rt_scope_begin(PTO2Runtime *rt) { pto2_scope_begin(&rt->orchestrator); } - -void rt_scope_end(PTO2Runtime *rt) { pto2_scope_end(&rt->orchestrator); } - -void rt_orchestration_done(PTO2Runtime *rt) { pto2_orchestrator_done(&rt->orchestrator); } - -static bool is_fatal_impl(PTO2Runtime *rt) { return rt->orchestrator.fatal; } - -static const PTO2RuntimeOps s_runtime_ops = { - .submit_task = submit_task_impl, - .add_dependency = add_dependency_impl, - .scope_begin = rt_scope_begin, - .scope_end = rt_scope_end, - .orchestration_done = rt_orchestration_done, - .is_fatal = is_fatal_impl, - .log_error = unified_log_error, - .log_warn = unified_log_warn, - .log_info = unified_log_info, - .log_debug = unified_log_debug, - .log_always = unified_log_always, -}; - -// ============================================================================= -// Runtime Creation and Destruction -// ============================================================================= - -PTO2Runtime *pto2_runtime_create(PTO2RuntimeMode mode) { - return pto2_runtime_create_custom(mode, PTO2_TASK_WINDOW_SIZE, PTO2_HEAP_SIZE); -} - -PTO2Runtime *pto2_runtime_create_custom( - PTO2RuntimeMode mode, uint64_t task_window_size, uint64_t heap_size, int32_t dep_pool_capacity -) { - // Allocate runtime context - PTO2Runtime *rt = reinterpret_cast<PTO2Runtime *>(calloc(1, sizeof(PTO2Runtime))); - if (!rt) { - return NULL; - } - - rt->ops = &s_runtime_ops; - rt->mode = mode; - rt->sm_handle = pto2_sm_create(task_window_size, heap_size); - if (!rt->sm_handle) { - free(rt); - return NULL; - } - - // Allocate GM heap for output buffers (all rings combined) - uint64_t total_heap_size = heap_size * PTO2_MAX_RING_DEPTH; - rt->gm_heap_size = total_heap_size; -#if defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE >= 200112L - if (posix_memalign(&rt->gm_heap, PTO2_ALIGN_SIZE, total_heap_size) != 0) { - pto2_sm_destroy(rt->sm_handle); - free(rt); - return NULL; - } -#else - rt->gm_heap = aligned_alloc(PTO2_ALIGN_SIZE, total_heap_size); - if (!rt->gm_heap) { - pto2_sm_destroy(rt->sm_handle); - free(rt); - return NULL; - } -#endif - rt->gm_heap_owned = true; - - // Initialize orchestrator - if (!pto2_orchestrator_init(&rt->orchestrator, rt->sm_handle, rt->gm_heap, heap_size, dep_pool_capacity)) { - free(rt->gm_heap); - pto2_sm_destroy(rt->sm_handle); - free(rt); - return NULL; - } - - // Initialize scheduler (heap_size = per-ring heap size) - if (!pto2_scheduler_init(&rt->scheduler, rt->sm_handle, rt->gm_heap,
heap_size)) { - pto2_orchestrator_destroy(&rt->orchestrator); - free(rt->gm_heap); - pto2_sm_destroy(rt->sm_handle); - free(rt); - return NULL; - } - - // Connect orchestrator to scheduler (for simulated mode) - pto2_orchestrator_set_scheduler(&rt->orchestrator, &rt->scheduler); - - return rt; -} - -PTO2Runtime *pto2_runtime_create_from_sm( - PTO2RuntimeMode mode, PTO2SharedMemoryHandle *sm_handle, void *gm_heap, uint64_t heap_size, - int32_t dep_pool_capacity -) { - if (!sm_handle) return NULL; - - PTO2Runtime *rt = reinterpret_cast<PTO2Runtime *>(calloc(1, sizeof(PTO2Runtime))); - if (!rt) return NULL; - - rt->ops = &s_runtime_ops; - rt->mode = mode; - rt->sm_handle = sm_handle; - rt->gm_heap = gm_heap; - rt->gm_heap_size = heap_size > 0 ? heap_size * PTO2_MAX_RING_DEPTH : 0; - rt->gm_heap_owned = false; - - if (!pto2_orchestrator_init(&rt->orchestrator, rt->sm_handle, rt->gm_heap, heap_size, dep_pool_capacity)) { - free(rt); - return NULL; - } - - // Initialize scheduler (heap_size = per-ring heap size) - if (!pto2_scheduler_init(&rt->scheduler, rt->sm_handle, rt->gm_heap, heap_size)) { - pto2_orchestrator_destroy(&rt->orchestrator); - free(rt); - return NULL; - } - - pto2_orchestrator_set_scheduler(&rt->orchestrator, &rt->scheduler); - - return rt; -} - -void pto2_runtime_destroy(PTO2Runtime *rt) { - if (!rt) return; - - pto2_scheduler_destroy(&rt->scheduler); - pto2_orchestrator_destroy(&rt->orchestrator); - - if (rt->gm_heap_owned && rt->gm_heap) { - free(rt->gm_heap); - } - - if (rt->sm_handle) { - pto2_sm_destroy(rt->sm_handle); - } - - free(rt); -} - -void pto2_runtime_set_mode(PTO2Runtime *rt, PTO2RuntimeMode mode) { - if (rt) { - rt->mode = mode; - } -} diff --git a/src/a2a3/runtime/aicpu_build_graph/runtime/pto_runtime2.h b/src/a2a3/runtime/aicpu_build_graph/runtime/pto_runtime2.h deleted file mode 100644 index cfc4e394d..000000000 --- a/src/a2a3/runtime/aicpu_build_graph/runtime/pto_runtime2.h +++ /dev/null @@ -1,281 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ - -/** - * PTO Runtime2 - Main Interface - * - * This is the main header for the PTO Runtime2 system. - * It provides a unified API for task graph construction and execution. - * - * Key Features: - * - Ring buffer based memory management (zero allocation overhead) - * - Explicit dependency management via add_dependency() - * - Scope-based buffer lifecycle management with batch publish - * - Per-task spinlocks for concurrent fanout updates - * - Orchestrator-Scheduler decoupling via shared memory - * - * Usage: - * 1. Create runtime: pto2_runtime_create() - * 2. Build task graph in orchestration function: - * - pto2_scope_begin() / pto2_scope_end() - * - pto2_submit_mixed_task() - * 3. Mark orchestration complete: pto2_orchestrator_done() - * 4.
Destroy runtime: pto2_runtime_destroy() - * - * Based on: docs/RUNTIME_LOGIC.md - */ - -#ifndef SRC_A2A3_RUNTIME_AICPU_BUILD_GRAPH_RUNTIME_PTO_RUNTIME2_H_ -#define SRC_A2A3_RUNTIME_AICPU_BUILD_GRAPH_RUNTIME_PTO_RUNTIME2_H_ - -#include "pto_orchestrator.h" -#include "pto_ring_buffer.h" -#include "pto_runtime2_types.h" -#include "pto_scheduler.h" -#include "pto_shared_memory.h" -#include "pto_submit_types.h" - -// ============================================================================= -// Runtime Context -// ============================================================================= - -/** - * Runtime execution mode - */ -enum PTO2RuntimeMode { - PTO2_MODE_EXECUTE = 0, // Execute tasks on workers - PTO2_MODE_SIMULATE = 1, // Simulate task execution with cycle counting - PTO2_MODE_GRAPH_ONLY = 2 // Build graph only, no execution -}; - -/** - * Function-pointer ops table for runtime operations. - * - * The orchestration .so calls runtime functions through this table - * (via pto_orchestration_api.h inline wrappers), so it has zero link - * dependencies on runtime .cpp files. - */ -typedef struct PTO2Runtime PTO2Runtime; // forward declare for ops signatures - -struct PTO2RuntimeOps { - SubmitResult (*submit_task)(PTO2Runtime *rt, const MixedKernels &mixed_kernels, const Arg &args); - void (*add_dependency)(PTO2Runtime *rt, PTO2TaskId producer, PTO2TaskId consumer); - void (*scope_begin)(PTO2Runtime *rt); - void (*scope_end)(PTO2Runtime *rt); - void (*orchestration_done)(PTO2Runtime *rt); - bool (*is_fatal)(PTO2Runtime *rt); - - // Logging (populated by runtime, called by orchestration) - void (*log_error)(const char *func, const char *fmt, ...); - void (*log_warn)(const char *func, const char *fmt, ...); - void (*log_info)(const char *func, const char *fmt, ...); - void (*log_debug)(const char *func, const char *fmt, ...); - void (*log_always)(const char *func, const char *fmt, ...); -}; - -/** - * PTO Runtime2 context - * - * Contains all state for orchestration and scheduling. - * In simulated mode, runs in single process with shared address space. - */ -struct PTO2Runtime { - // Ops table (first field — used by orchestration .so via function pointers) - const PTO2RuntimeOps *ops; - - // Components - PTO2SharedMemoryHandle *sm_handle; - PTO2OrchestratorState orchestrator; - PTO2SchedulerState scheduler; - - // GM Heap for output buffers - void *gm_heap; - uint64_t gm_heap_size; - bool gm_heap_owned; // True if we allocated it - - // Mode - PTO2RuntimeMode mode; - - // Statistics - int64_t total_cycles; -}; - -// ============================================================================= -// Runtime Lifecycle API -// ============================================================================= - -/** - * Create a new runtime instance - * - * @param mode Execution mode - * @return Runtime context, or NULL on failure - */ -PTO2Runtime *pto2_runtime_create(PTO2RuntimeMode mode); - -/** - * Create runtime with custom sizes - * - * @param mode Execution mode - * @param task_window_size Number of task slots - * @param heap_size Size of GM heap - * @return Runtime context, or NULL on failure - */ -PTO2Runtime *pto2_runtime_create_custom( - PTO2RuntimeMode mode, uint64_t task_window_size, uint64_t heap_size, - int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE -); - -/** - * Create runtime from existing shared memory and GM heap (e.g. on device). - * Does not allocate sm_handle or gm_heap; caller owns them. 
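 *
 * Illustrative device-side setup (editorial sketch; sm, heap, and heap_sz
 * are hypothetical names for the caller-owned resources):
 *
 *   PTO2Runtime *rt = pto2_runtime_create_from_sm(PTO2_MODE_EXECUTE, sm, heap, heap_sz);
 *   // ... run orchestration through rt->ops ...
 *   pto2_runtime_destroy(rt);  // gm_heap is not freed (gm_heap_owned == false)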
- * - * @param mode Execution mode - * @param sm_handle Pre-created shared memory handle (e.g. from pto2_sm_create_from_buffer) - * @param gm_heap GM heap base for output buffers (or NULL if not used) - * @param heap_size GM heap size in bytes - * @return Runtime context, or NULL on failure - */ -PTO2Runtime *pto2_runtime_create_from_sm( - PTO2RuntimeMode mode, PTO2SharedMemoryHandle *sm_handle, void *gm_heap, uint64_t heap_size, - int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE -); - -/** - * Destroy runtime and free all resources - */ -void pto2_runtime_destroy(PTO2Runtime *rt); - -/** - * Set execution mode - */ -void pto2_runtime_set_mode(PTO2Runtime *rt, PTO2RuntimeMode mode); - -// ============================================================================= -// Orchestration API (called by orchestration function) -// ============================================================================= - -/** - * Begin a new scope - * - * All tasks submitted within this scope will have their lifetime - * bounded by the scope. When scope_end() is called, the scope - * releases its reference to all enclosed tasks. - */ -void rt_scope_begin(PTO2Runtime *rt); - -/** - * End current scope - * - * Releases scope reference for all tasks submitted since scope_begin(). - * Tasks whose refcount reaches zero will have their buffers released. - */ -void rt_scope_end(PTO2Runtime *rt); - -/** - * Mark orchestration as complete - * - * Signals that no more tasks will be submitted. - */ -void rt_orchestration_done(PTO2Runtime *rt); - -/** - * Scope helper macros for C - * - * These macros provide scope management for C code. - * For C++, prefer using PTO2_SCOPE_GUARD or PTO2_SCOPE (see below). - * - * Usage (C): - * PTO2_SCOPE_BEGIN(rt); - * rt_submit_task(...); - * rt_submit_task(...); - * PTO2_SCOPE_END(rt); - */ -#define PTO2_SCOPE_BEGIN(rt) rt_scope_begin(rt) -#define PTO2_SCOPE_END(rt) rt_scope_end(rt) - -/** - * RAII Scope Guard for C++ - * - * PTO2ScopeGuard is a C++ RAII wrapper that automatically manages scope lifetime. - * It calls rt_scope_begin() on construction and rt_scope_end() on destruction, - * ensuring proper cleanup even in error paths. - * - * Usage Option 1 - Direct instantiation (recommended): - * PTO2ScopeGuard scope_guard(rt); - * rt_submit_task(...); - * rt_submit_task(...); - * // scope automatically ends here when scope_guard destructor is called - * - * Usage Option 2 - Macro for anonymous guard: - * PTO2_SCOPE_GUARD(rt); - * rt_submit_task(...); - * // scope automatically ends at end of current block - * - * Usage Option 3 - Scoped block with if statement: - * PTO2_SCOPE(rt) { - * rt_submit_task(...); - * rt_submit_task(...); - * } // scope automatically ends here - * - * Benefits: - * - Exception-safe: scope ends even if exceptions are thrown - * - Error-safe: no need to manually call PTO2_SCOPE_END in error paths - * - Cleaner code: less boilerplate, automatic cleanup - * - Less error-prone: impossible to forget scope cleanup - */ -class PTO2ScopeGuard { -public: - explicit PTO2ScopeGuard(PTO2Runtime *rt) : - rt_(rt) { - rt_scope_begin(rt_); - } - ~PTO2ScopeGuard() { rt_scope_end(rt_); } - -private: - PTO2Runtime *rt_; -}; - -/** - * Macro to create an anonymous scope guard with a unique name. - * The [[maybe_unused]] attribute suppresses warnings if the guard - * variable is not explicitly used. 
- * - * Example: - * PTO2_SCOPE_GUARD(rt); - * rt_submit_task(...); - */ -#define _PTO2_CONCATENATE_IMPL(x, y) x##y -#define _PTO2_CONCATENATE(x, y) _PTO2_CONCATENATE_IMPL(x, y) -#define PTO2_SCOPE_GUARD(rt) [[maybe_unused]] PTO2ScopeGuard _PTO2_CONCATENATE(scope_guard_, __COUNTER__)(rt) - -/** - * Macro to create a scoped block with automatic scope management. - * Uses if-statement initialization (C++17) to create guard and execute block. - * - * Example: - * PTO2_SCOPE(rt) { - * rt_submit_task(...); - * } // scope automatically ends here - */ -#define PTO2_SCOPE(rt) if (PTO2_SCOPE_GUARD(rt); true) - -/** - * Slim config struct exported by orchestration .so via aicpu_orchestration_config(). - * Shared definition with pto_orchestration_api.h (same layout, guarded). - */ -#ifndef PTO2_ORCHESTRATION_CONFIG_DEFINED -#define PTO2_ORCHESTRATION_CONFIG_DEFINED -struct PTO2OrchestrationConfig { - int expected_arg_count; -}; -#endif - -#endif // SRC_A2A3_RUNTIME_AICPU_BUILD_GRAPH_RUNTIME_PTO_RUNTIME2_H_ diff --git a/src/a2a3/runtime/aicpu_build_graph/runtime/pto_runtime2_types.h b/src/a2a3/runtime/aicpu_build_graph/runtime/pto_runtime2_types.h deleted file mode 100644 index b75834dfa..000000000 --- a/src/a2a3/runtime/aicpu_build_graph/runtime/pto_runtime2_types.h +++ /dev/null @@ -1,431 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. 
- * ----------------------------------------------------------------------------------------------------------- - */ - -/** - * PTO Runtime2 - Core Type Definitions - * - * This header defines all fundamental types used by the PTO Runtime2 system: - * - Configuration constants - * - Worker types and task states - * - Tensor regions and task parameters - * - Task descriptors with fanin/fanout tracking - * - Dependency list entries - * - * Based on: docs/RUNTIME_LOGIC.md - */ - -#ifndef SRC_A2A3_RUNTIME_AICPU_BUILD_GRAPH_RUNTIME_PTO_RUNTIME2_TYPES_H_ -#define SRC_A2A3_RUNTIME_AICPU_BUILD_GRAPH_RUNTIME_PTO_RUNTIME2_TYPES_H_ - -#include <cstddef> -#include <cstdint> -#include <cstring> - -#include <atomic> - -#include "pto_submit_types.h" -#include "pto_types.h" - -// ============================================================================= -// Profiling Configuration -// ============================================================================= - -#ifndef PTO2_PROFILING -#define PTO2_PROFILING 1 -#endif - -#ifndef PTO2_ORCH_PROFILING -#define PTO2_ORCH_PROFILING 0 -#endif - -#ifndef PTO2_SCHED_PROFILING -#define PTO2_SCHED_PROFILING 0 -#endif - -#if PTO2_ORCH_PROFILING && !PTO2_PROFILING -#error "PTO2_ORCH_PROFILING requires PTO2_PROFILING=1" -#endif - -#if PTO2_SCHED_PROFILING && !PTO2_PROFILING -#error "PTO2_SCHED_PROFILING requires PTO2_PROFILING=1" -#endif - -// ============================================================================= -// AICPU Error Codes (written to shared memory for Host-side diagnosis) -// ============================================================================= - -// Orchestrator errors (1-99): detected in orchestrator thread -#define PTO2_ERROR_NONE 0 -#define PTO2_ERROR_SCOPE_DEADLOCK 1 -#define PTO2_ERROR_HEAP_RING_DEADLOCK 2 -#define PTO2_ERROR_FLOW_CONTROL_DEADLOCK 3 -#define PTO2_ERROR_DEP_POOL_OVERFLOW 4 -#define PTO2_ERROR_INVALID_ARGS 5 // Arg construction error (invalid args) - -// Scheduler errors (100+): detected in scheduler threads -#define PTO2_ERROR_SCHEDULER_TIMEOUT 100 - -// ============================================================================= -// Configuration Constants -// ============================================================================= - -// Task management -// NOTE: PTO2_TASK_WINDOW_SIZE is now a per-ring default value. -// Actual window size is passed at runtime to pto2_runtime_create_threaded_custom(). -// Use pto2_task_slot(sched, task_id) for slot calculation.
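-// Worked example (illustrative): with the 16384-slot default below, the slot mask is -// 16383, so local_id 16385 lands in slot 1 once the ring wraps; likewise -// PTO2_ALIGN_UP(100, 64) == 128 (see the macro below).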
-#define PTO2_TASK_WINDOW_SIZE 16384 // Default per-ring task window size (power of 2) - -// Multi-ring: number of independent ring layers (HeapRing + TaskRing + DepPool per layer) -// Scope depth maps to ring index via: min(scope_depth, PTO2_MAX_RING_DEPTH - 1) -#define PTO2_MAX_RING_DEPTH 4 - -// Memory pools (per-ring defaults; total = value × PTO2_MAX_RING_DEPTH) -#define PTO2_HEAP_SIZE (256 * 1024 * 1024) // 256MB per ring (1GB total) -#define PTO2_DEP_LIST_POOL_SIZE 16384 // Per-ring dependency list pool entries - -// Scope management -#define PTO2_MAX_SCOPE_DEPTH 64 // Maximum nesting depth -#define PTO2_SCOPE_TASKS_INIT_CAP 65536 // Initial capacity for scope task buffer - -// Ready queue -#define PTO2_READY_QUEUE_SIZE 65536 // Per-shape queue size - -// Memory alignment -#define PTO2_ALIGN_SIZE 64 // Cache line alignment -#define PTO2_PACKED_OUTPUT_ALIGN 1024 // Each output in packed buffer aligned to 1024B; gap is padding -#define PTO2_ALIGN_UP(x, align) (((x) + (align) - 1) & ~((align) - 1)) - -// Dep pool cleanup interval -#define PTO2_DEP_POOL_CLEANUP_INTERVAL 64 // Cleanup every N retired tasks - -// ============================================================================= -// Multi-Ring task_id Encoding -// ============================================================================= - -/** - * TaskId: 64-bit encoding used across Runtime2. - * - * raw encoding: (ring_id << 32) | local_id - * - * ring_id: which ring layer (0..PTO2_MAX_RING_DEPTH-1) - * local_id: per-ring monotonic counter - */ -struct PTO2TaskId { - uint64_t raw; - - constexpr PTO2TaskId() : - raw(0) {} - constexpr explicit PTO2TaskId(uint64_t v) : - raw(v) {} - - constexpr uint8_t ring() const { return static_cast<uint8_t>(raw >> 32); } - constexpr uint32_t local() const { return static_cast<uint32_t>(raw & 0xFFFFFFFFu); } - - constexpr bool operator==(const PTO2TaskId &other) const { return raw == other.raw; } - constexpr bool operator!=(const PTO2TaskId &other) const { return raw != other.raw; } -}; - -static_assert(sizeof(PTO2TaskId) == 8, "PTO2TaskId must stay 8 bytes (shared memory ABI)"); - -static inline PTO2TaskId pto2_make_task_id(uint8_t ring_id, uint32_t local_id) { - return PTO2TaskId{(static_cast<uint64_t>(ring_id) << 32) | static_cast<uint64_t>(local_id)}; -} - -static inline uint8_t pto2_task_id_ring(PTO2TaskId task_id) { return task_id.ring(); } - -static inline uint32_t pto2_task_id_local(PTO2TaskId task_id) { return task_id.local(); } - -static inline uint64_t pto2_task_id_raw(PTO2TaskId task_id) { return task_id.raw; } - -/** - * SubmitResult — return value from pto2_submit_mixed_task. - * Bundles the task_id (for explicit dependencies) and the materialized - * output tensors (for referencing runtime-allocated outputs). - */ -struct SubmitResult { - PTO2TaskId task_id; - TaskOutputTensors outputs; -}; - -// ============================================================================= -// Worker Types -// ============================================================================= - -/** - * Worker type enumeration - * Each worker type has its own ready queue for load balancing - */ -typedef enum { - PTO2_WORKER_CUBE = 0, // AICore CUBE unit (matrix ops) - PTO2_WORKER_VECTOR = 1, // AICore VECTOR unit (element-wise ops) - PTO2_WORKER_AI_CPU = 2, // AI_CPU (scalar ops, control flow) - PTO2_WORKER_ACCELERATOR = 3, // Fixed-function accelerators (DMA, etc.)
- PTO2_NUM_WORKER_TYPES = 4 -} PTO2WorkerType; - -// ============================================================================= -// Task States -// ============================================================================= - -/** - * Task state enumeration - * - * State transitions: - * PENDING -> READY -> RUNNING -> COMPLETED -> CONSUMED - * - * Conditions: - * PENDING->READY: fanin_refcount == fanin_count - * COMPLETED->CONSUMED: fanout_refcount == fanout_count && state == COMPLETED - */ -typedef enum { - PTO2_TASK_PENDING = 0, // Waiting for dependencies (fanin_refcount < fanin_count) - PTO2_TASK_READY = 1, // All dependencies satisfied, waiting in ready queue - PTO2_TASK_RUNNING = 2, // Currently executing on a worker - PTO2_TASK_COMPLETED = 3, // Execution finished, output may still be in use - PTO2_TASK_CONSUMED = 4 // Output fully consumed, buffers can be released -} PTO2TaskState; - -// ============================================================================= -// Dependency List Entry -// ============================================================================= - -/** - * Dependency list entry (singly-linked list node) - * Stored in DepListPool ring buffer - * - * Used for both fanin_list and fanout_list - */ -struct PTO2TaskSlotState; // Forward declaration -struct PTO2DepListEntry { - PTO2TaskSlotState *slot_state; // Consumer slot state (direct pointer) - PTO2DepListEntry *next; // next entry -}; - -// ============================================================================= -// Task Descriptor -// ============================================================================= - -/** - * Task descriptor structure (shared memory) - * - * Stored in the TaskDescriptor ring buffer in shared memory. - * Contains static identification and buffer pointers only. - * Dynamic scheduling state (fanin/fanout/task_state) is in PTO2TaskSlotState. - * - * Fields set by Orchestrator at submission, read by Scheduler for dispatch. - */ -struct PTO2TaskDescriptor { - // Mixed-task identification (encodes ring_id in upper 32 bits) - PTO2TaskId task_id; // raw: (ring_id << 32) | local_id - - // Per-slot kernel IDs (INVALID_KERNEL_ID = inactive) - int32_t kernel_id[PTO2_SUBTASK_SLOT_COUNT]; - - // Packed output buffer (all outputs packed into single contiguous buffer) - void *packed_buffer_base; // Start of packed buffer in GM Heap - void *packed_buffer_end; // End of packed buffer (for heap reclamation) -}; - -// ============================================================================= -// Per-Slot Scheduling State -// ============================================================================= - -/** - * Task payload data (cold path - only accessed during orchestration and dispatch) - * - * Layout: metadata (counts, fanin pointers) packed in the first 3 cache lines, - * followed by bulk tensor and scalar data. This gives sequential write access - * during orchestration and groups scheduler-hot fields (fanin_actual_count + - * fanin_slot_states) together for on_task_release. 
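 - * - * Illustrative numbers: init() copies the scalars with a single memcpy rounded up - * to a cache line, so 5 scalars (40 bytes) are copied as 64 bytes; both the source - * and destination scalar arrays are 1024B, so the round-up cannot overrun.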
- */ -struct PTO2TaskPayload { - // === Cache line 0 (64B) — metadata === - int32_t tensor_count{0}; - int32_t scalar_count{0}; - int32_t fanin_actual_count{0}; // Actual fanin count (without the +1 redundancy) - int32_t _reserved{0}; // Reserved (dep_pool_mark moved to SlotState for local access) - PTO2TaskSlotState *fanin_slot_states[PTO2_MAX_INPUTS]; // Producer slot states (used by on_task_release) - // === Cache lines 3-34 (2048B) — tensors (alignas(64) forces alignment) === - Tensor tensors[MAX_TENSOR_ARGS]; - // === Cache lines 35-50 (1024B) — scalars === - uint64_t scalars[MAX_SCALAR_ARGS]; - - void init(const Arg &args, const TaskOutputTensors &materialized_outputs) { - tensor_count = args.tensor_count(); - scalar_count = args.scalar_count(); - int32_t out_idx = 0; - for (int32_t i = 0; i < args.tensor_count(); i++) { - const Tensor *src; - if (args.tag(i) == TensorArgType::OUTPUT) { - src = materialized_outputs.output_ptr(out_idx++); - } else { - src = args.tensor(i).ptr; - } - tensors[i].copy(*src); - } - // Round up to cache line boundary. Both arrays are 1024B so no overrun. - // Eliminates branches; extra bytes within the same CL have zero additional cost. - memcpy(scalars, args.scalar_data(), PTO2_ALIGN_UP(args.scalar_count() * sizeof(uint64_t), 64)); - } -}; - -/** - * Per-task slot scheduling state (scheduler-private, NOT in shared memory) - * - * Consolidates all hot-path scheduling fields into a single cache-friendly - * structure (64 bytes = one cache line). Accessing any field of a task's - * slot state brings all related fields into the same cache line. - * - * Concurrency notes: - * - fanout_head, fanout_count protected by fanout_lock (per-task spinlock) - * - fanin_count set once at submission, read-only after (hot path for ready check) - * - task_state, fanin_refcount, fanout_refcount updated atomically - */ -struct alignas(64) PTO2TaskSlotState { - // Fanout lock + list (accessed together under lock in on_task_complete) - std::atomic<int32_t> fanout_lock; // Per-task spinlock (0=unlocked, 1=locked) - int32_t fanout_count; // 1 (owning scope) + number of consumers - - PTO2DepListEntry *fanout_head; // Pointer to first fanout entry (nullptr = empty) - - // Task state (completion, consumed check, ready check) - std::atomic<PTO2TaskState> task_state; // PENDING/READY/RUNNING/COMPLETED/CONSUMED - - // Fanin (accessed together in release_fanin_and_check_ready) - std::atomic<int32_t> fanin_refcount; // Dynamic: counts completed producers - int32_t fanin_count; // Number of producer dependencies (set once) - - // Fanout refcount (accessed with fanout_count in check_and_handle_consumed) - std::atomic<int32_t> fanout_refcount; // Dynamic: counts released references - - PTO2TaskPayload *payload; - - PTO2TaskDescriptor *task; - - // Hot-path completion fields (moved from TaskDescriptor to avoid cross-struct access) - uint8_t active_mask; // Bitmask of active subtask slots (set once) - std::atomic<uint8_t> subtask_done_mask; // Each subtask sets its done bit on completion - uint8_t ring_id; // Ring layer this task belongs to (for per-ring reclamation) - int32_t dep_pool_mark{0}; // Dep pool top after this task's submission (orchestrator-only, local memory) -}; - -static_assert(sizeof(PTO2TaskSlotState) == 64); - -// ============================================================================= -// Cycle Cost Function Type -// ============================================================================= - -/** - * Cycle cost function pointer type - * Returns estimated cycle count for the InCore function - */ -typedef int64_t
(*PTO2CycleCostFunc)(void **args, int32_t num_args); - -// ============================================================================= -// InCore Function Type -// ============================================================================= - -/** - * InCore function signature - * All InCore functions must match this signature - */ -typedef void (*PTO2InCoreFunc)(void **args, int32_t num_args); - -// ============================================================================= -// Utility Macros -// ============================================================================= - -/** - * Memory barrier macros for different architectures - */ -#if defined(__aarch64__) -#define PTO2_MEMORY_BARRIER() __asm__ __volatile__("dmb sy" ::: "memory") -#elif defined(__x86_64__) -#define PTO2_MEMORY_BARRIER() __asm__ __volatile__("mfence" ::: "memory") -#else -#define PTO2_MEMORY_BARRIER() __sync_synchronize() -#endif - -// Spin-wait hint for AICPU threads. On real hardware the AICPU has dedicated -// ARM A55 cores — no OS yield is needed, so the hint is a no-op. In simulation -// all threads share host CPU cores, so we yield to prevent starvation. -// This header is also compiled into the Host .so (for struct definitions only), -// where the hint is never called — the fallback no-op keeps Host builds clean. -#if __has_include("spin_hint.h") -#include "spin_hint.h" -#else -#define SPIN_WAIT_HINT() ((void)0) -#endif - -// ============================================================================= -// Per-task fanout spinlock helpers -// -// Used by BOTH the orchestrator (pto_orchestrator.cpp) and the scheduler -// (aicpu_executor.cpp). Placing them here ensures both translation units use -// identical acquire/release semantics. -// -// The fanout_lock MUST be held whenever reading or writing fanout_head / -// fanout_count, because the orchestrator adds consumers concurrently with the -// scheduler traversing the list after task completion. 
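-// -// Usage pattern (sketch, mirroring the scheduler's on_mixed_task_complete): -// -// pto2_fanout_lock(slot_state); -// PTO2DepListEntry *head = slot_state.fanout_head; // snapshot under the lock -// pto2_fanout_unlock(slot_state); -// // ... traverse 'head' after unlocking ...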
-// ============================================================================= - -#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING -#include "aicpu/device_time.h" -#endif - -#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING -static inline void pto2_fanout_lock(PTO2TaskSlotState &slot_state, uint64_t &atomic_count, uint64_t &wait_cycle) { - uint64_t t0 = get_sys_cnt_aicpu(); - bool contended = false; - uint32_t atomic_ops = 0; - - for (;;) { - while (slot_state.fanout_lock.load(std::memory_order_acquire) != 0) { - contended = true; - atomic_ops++; // each load = 1 atomic - SPIN_WAIT_HINT(); - } - int32_t expected = 0; - if (slot_state.fanout_lock.compare_exchange_weak( - expected, 1, std::memory_order_acquire, std::memory_order_relaxed - )) { - atomic_ops++; // successful CAS = 1 atomic - atomic_count += atomic_ops; - if (contended) { - wait_cycle += (get_sys_cnt_aicpu() - t0); - } - return; - } - contended = true; - atomic_ops++; // failed CAS = 1 atomic - } -} -#endif - -static inline void pto2_fanout_lock(PTO2TaskSlotState &slot_state) { - for (;;) { - while (slot_state.fanout_lock.load(std::memory_order_acquire) != 0) { - SPIN_WAIT_HINT(); - } - int32_t expected = 0; - if (slot_state.fanout_lock.compare_exchange_weak( - expected, 1, std::memory_order_acquire, std::memory_order_relaxed - )) { - return; - } - } -} - -static inline void pto2_fanout_unlock(PTO2TaskSlotState &slot_state) { - slot_state.fanout_lock.store(0, std::memory_order_release); -} - -#endif // SRC_A2A3_RUNTIME_AICPU_BUILD_GRAPH_RUNTIME_PTO_RUNTIME2_TYPES_H_ diff --git a/src/a2a3/runtime/aicpu_build_graph/runtime/pto_scheduler.cpp b/src/a2a3/runtime/aicpu_build_graph/runtime/pto_scheduler.cpp deleted file mode 100644 index 38308ff81..000000000 --- a/src/a2a3/runtime/aicpu_build_graph/runtime/pto_scheduler.cpp +++ /dev/null @@ -1,241 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -/** - * PTO Runtime2 - Scheduler Implementation - * - * Implements scheduler state management, ready queues, and task lifecycle. 
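 - * - * Ready-queue capacities are expected to be powers of two, since - * pto2_ready_queue_init() computes mask = capacity - 1. Minimal sketch of the - * init/teardown pair defined here: - * - * PTO2ReadyQueue q; - * if (pto2_ready_queue_init(&q, PTO2_READY_QUEUE_SIZE)) { ... pto2_ready_queue_destroy(&q); }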
- * - * Based on: docs/RUNTIME_LOGIC.md - */ - -#include "pto_scheduler.h" -#include <cinttypes> -#include <cstdlib> -#include <new> -#include <utility> -#include "common/unified_log.h" - -// ============================================================================= -// Scheduler Profiling Counters -// ============================================================================= - -#if PTO2_SCHED_PROFILING -#include "common/platform_config.h" - -uint64_t g_sched_lock_cycle[PLATFORM_MAX_AICPU_THREADS] = {}; -uint64_t g_sched_fanout_cycle[PLATFORM_MAX_AICPU_THREADS] = {}; -uint64_t g_sched_fanin_cycle[PLATFORM_MAX_AICPU_THREADS] = {}; -uint64_t g_sched_self_consumed_cycle[PLATFORM_MAX_AICPU_THREADS] = {}; -uint64_t g_sched_lock_wait_cycle[PLATFORM_MAX_AICPU_THREADS] = {}; -uint64_t g_sched_push_wait_cycle[PLATFORM_MAX_AICPU_THREADS] = {}; -uint64_t g_sched_pop_wait_cycle[PLATFORM_MAX_AICPU_THREADS] = {}; -uint64_t g_sched_lock_atomic_count[PLATFORM_MAX_AICPU_THREADS] = {}; -uint64_t g_sched_fanout_atomic_count[PLATFORM_MAX_AICPU_THREADS] = {}; -uint64_t g_sched_fanin_atomic_count[PLATFORM_MAX_AICPU_THREADS] = {}; -uint64_t g_sched_self_atomic_count[PLATFORM_MAX_AICPU_THREADS] = {}; -uint64_t g_sched_pop_atomic_count[PLATFORM_MAX_AICPU_THREADS] = {}; -uint64_t g_sched_complete_count[PLATFORM_MAX_AICPU_THREADS] = {}; - -PTO2SchedProfilingData pto2_scheduler_get_profiling(int thread_idx) { - PTO2SchedProfilingData d; - d.lock_cycle = std::exchange(g_sched_lock_cycle[thread_idx], 0); - d.fanout_cycle = std::exchange(g_sched_fanout_cycle[thread_idx], 0); - d.fanin_cycle = std::exchange(g_sched_fanin_cycle[thread_idx], 0); - d.self_consumed_cycle = std::exchange(g_sched_self_consumed_cycle[thread_idx], 0); - d.lock_wait_cycle = std::exchange(g_sched_lock_wait_cycle[thread_idx], 0); - d.push_wait_cycle = std::exchange(g_sched_push_wait_cycle[thread_idx], 0); - d.pop_wait_cycle = std::exchange(g_sched_pop_wait_cycle[thread_idx], 0); - d.lock_atomic_count = std::exchange(g_sched_lock_atomic_count[thread_idx], 0); - d.fanout_atomic_count = std::exchange(g_sched_fanout_atomic_count[thread_idx], 0); - d.fanin_atomic_count = std::exchange(g_sched_fanin_atomic_count[thread_idx], 0); - d.self_atomic_count = std::exchange(g_sched_self_atomic_count[thread_idx], 0); - d.pop_atomic_count = std::exchange(g_sched_pop_atomic_count[thread_idx], 0); - d.complete_count = std::exchange(g_sched_complete_count[thread_idx], 0); - return d; -} -#endif - -// ============================================================================= -// Task State Names -// ============================================================================= - -const char *pto2_task_state_name(PTO2TaskState state) { - switch (state) { - case PTO2_TASK_PENDING: - return "PENDING"; - case PTO2_TASK_READY: - return "READY"; - case PTO2_TASK_RUNNING: - return "RUNNING"; - case PTO2_TASK_COMPLETED: - return "COMPLETED"; - case PTO2_TASK_CONSUMED: - return "CONSUMED"; - default: - return "UNKNOWN"; - } -} - -// ============================================================================= -// Ready Queue Implementation -// ============================================================================= - -bool pto2_ready_queue_init(PTO2ReadyQueue *queue, uint64_t capacity) { - queue->slots = (PTO2ReadyQueueSlot *)malloc(capacity * sizeof(PTO2ReadyQueueSlot)); - if (!queue->slots) { - return false; - } - - queue->capacity = capacity; - queue->mask = capacity - 1; - queue->enqueue_pos.store(0, std::memory_order_relaxed); - queue->dequeue_pos.store(0, std::memory_order_relaxed); - - for
(uint64_t i = 0; i < capacity; i++) { - queue->slots[i].sequence.store((int64_t)i, std::memory_order_relaxed); - queue->slots[i].slot_state = nullptr; - } - - return true; -} - -void pto2_ready_queue_destroy(PTO2ReadyQueue *queue) { - if (queue->slots) { - free(queue->slots); - queue->slots = NULL; - } -} - -// ============================================================================= -// Scheduler Initialization -// ============================================================================= - -bool PTO2SchedulerState::RingSchedState::init( - PTO2SharedMemoryHandle *sm_handle, int32_t ring_id, void *gm_heap_base, uint64_t per_ring_heap_size -) { - task_descriptors = sm_handle->task_descriptors[ring_id]; - heap_base = (char *)gm_heap_base + ring_id * per_ring_heap_size; - task_window_size = sm_handle->header->rings[ring_id].task_window_size; - task_window_mask = static_cast<int32_t>(task_window_size - 1); - last_task_alive = 0; - last_heap_consumed = 0; - heap_tail = 0; - slot_states = nullptr; - advance_lock.store(0, std::memory_order_relaxed); - - // Allocate per-task slot state array (dynamically sized based on runtime window_size) - slot_states = new (std::nothrow) PTO2TaskSlotState[task_window_size]; - if (!slot_states) { - return false; - } - - // Zero-initialize all per-task slot state fields. - for (uint64_t i = 0; i < task_window_size; i++) { - slot_states[i].fanout_lock.store(0, std::memory_order_relaxed); - slot_states[i].fanout_count = 0; - slot_states[i].fanout_head = nullptr; - slot_states[i].task_state.store(static_cast<PTO2TaskState>(0), std::memory_order_relaxed); - slot_states[i].fanin_refcount.store(0, std::memory_order_relaxed); - slot_states[i].fanin_count = 0; - slot_states[i].fanout_refcount.store(0, std::memory_order_relaxed); - slot_states[i].payload = nullptr; - slot_states[i].task = nullptr; - slot_states[i].active_mask = 0; - slot_states[i].subtask_done_mask.store(0, std::memory_order_relaxed); - slot_states[i].ring_id = 0; - } - - return true; -} - -void PTO2SchedulerState::RingSchedState::destroy() { - if (!slot_states) return; - delete[] slot_states; - slot_states = nullptr; -} - -bool pto2_scheduler_init( - PTO2SchedulerState *sched, PTO2SharedMemoryHandle *sm_handle, void *gm_heap_base, uint64_t per_ring_heap_size -) { - sched->sm_handle = sm_handle; -#if PTO2_SCHED_PROFILING - sched->tasks_completed.store(0, std::memory_order_relaxed); - sched->tasks_consumed.store(0, std::memory_order_relaxed); -#endif - - // Initialize per-ring state - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - if (!sched->ring_sched_states[r].init(sm_handle, r, gm_heap_base, per_ring_heap_size)) { - for (int j = 0; j < r; j++) { - sched->ring_sched_states[j].destroy(); - } - return false; - } - } - - // Initialize ready queues (one per resource shape, global) - for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) { - if (!pto2_ready_queue_init(&sched->ready_queues[i], PTO2_READY_QUEUE_SIZE)) { - // Cleanup on failure - for (int j = 0; j < i; j++) { - pto2_ready_queue_destroy(&sched->ready_queues[j]); - } - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - sched->ring_sched_states[r].destroy(); - } - return false; - } - } - - return true; -} - -void pto2_scheduler_destroy(PTO2SchedulerState *sched) { - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - sched->ring_sched_states[r].destroy(); - } - - for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) { - pto2_ready_queue_destroy(&sched->ready_queues[i]); - } -} - -// ============================================================================= -// Debug
Utilities -// ============================================================================= - -void pto2_scheduler_print_stats(PTO2SchedulerState *sched) { - LOG_INFO("=== Scheduler Statistics ==="); - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - if (sched->ring_sched_states[r].last_task_alive > 0 || sched->ring_sched_states[r].heap_tail > 0) { - LOG_INFO("Ring %d:", r); - LOG_INFO(" last_task_alive: %d", sched->ring_sched_states[r].last_task_alive); - LOG_INFO(" heap_tail: %" PRIu64, sched->ring_sched_states[r].heap_tail); - } - } -#if PTO2_SCHED_PROFILING - LOG_INFO("tasks_completed: %lld", (long long)sched->tasks_completed.load(std::memory_order_relaxed)); - LOG_INFO("tasks_consumed: %lld", (long long)sched->tasks_consumed.load(std::memory_order_relaxed)); -#endif - LOG_INFO("============================"); -} - -void pto2_scheduler_print_queues(PTO2SchedulerState *sched) { - LOG_INFO("=== Ready Queues ==="); - - const char *shape_names[] = {"AIC_ONLY", "AIV_X1", "AIV_X2", "AIC_AIV_X1", "AIC_AIV_X2"}; - - for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) { - LOG_INFO(" %s: count=%" PRIu64, shape_names[i], sched->ready_queues[i].size()); - } - - LOG_INFO("===================="); -} diff --git a/src/a2a3/runtime/aicpu_build_graph/runtime/pto_scheduler.h b/src/a2a3/runtime/aicpu_build_graph/runtime/pto_scheduler.h deleted file mode 100644 index 080e9e598..000000000 --- a/src/a2a3/runtime/aicpu_build_graph/runtime/pto_scheduler.h +++ /dev/null @@ -1,729 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -/** - * PTO Runtime2 - Scheduler Interface - * - * The Scheduler is responsible for: - * 1. Maintaining per-resource-shape ready queues - * 2. Tracking task state (PENDING -> READY -> RUNNING -> COMPLETED -> CONSUMED) - * 3. Managing fanin/fanout refcounts for dependency resolution - * 4. Advancing last_task_alive for heap reclamation - * 5. 
Two-stage mixed-task completion (subtask done bits → mixed-task complete) - * - * The Scheduler runs on Device AI_CPU and processes: - * - Task state transitions based on fanin_refcount - * - Buffer lifecycle based on fanout_refcount - * - Ring pointer advancement for flow control - * - * Based on: docs/RUNTIME_LOGIC.md - */ - -#ifndef PTO_SCHEDULER_H -#define PTO_SCHEDULER_H - -#include <atomic> - -#include "pto_runtime2_types.h" -#include "pto_shared_memory.h" -#include "pto_ring_buffer.h" - -#include "common/core_type.h" - -#if PTO2_SCHED_PROFILING -#include "aicpu/device_time.h" -#define PTO2_SCHED_CYCLE_START() uint64_t _st0 = get_sys_cnt_aicpu(), _st1 -#define PTO2_SCHED_CYCLE_LAP(acc) \ - do { \ - _st1 = get_sys_cnt_aicpu(); \ - acc += (_st1 - _st0); \ - _st0 = _st1; \ - } while (0) -#endif - -// ============================================================================= -// Ready Queue (Lock-free bounded MPMC — Vyukov design) -// ============================================================================= - -/** - * Per-slot entry: sequence counter for ABA safety + task payload - */ -struct PTO2ReadyQueueSlot { - std::atomic<int64_t> sequence; - PTO2TaskSlotState *slot_state; -}; - -/** - * Thread-local ready buffer for local-first dispatch optimization. - * - * Two buffers per scheduling thread, one per CoreType (AIC=0, AIV=1). - * Initialized once before the scheduling loop; must be empty at - * the start of each iteration (verified by always_assert). - * - * Phase 1 fills per-CoreType buffers via on_task_complete(). - * dispatch_ready_tasks_to_idle_cores drains them: local-first via - * get_ready_task, then remaining tasks pushed to global readyQ. - */ -// Number of CoreType values eligible for local dispatch (AIC=0, AIV=1) -static constexpr int PTO2_LOCAL_DISPATCH_TYPE_NUM = 2; - -struct PTO2LocalReadyBuffer { - PTO2TaskSlotState **slot_states = nullptr; - int count = 0; - int capacity = 0; - - void reset(PTO2TaskSlotState **buf, int cap) { - slot_states = buf; - count = 0; - capacity = cap; - } - - bool try_push(PTO2TaskSlotState *s) { - if (slot_states && count < capacity) { - slot_states[count++] = s; - return true; - } - return false; - } - - PTO2TaskSlotState *pop() { return (count > 0) ? slot_states[--count] : nullptr; } -}; - -/** - * Lock-free bounded MPMC queue (Dmitry Vyukov design) - * - * Key properties: - * - enqueue_pos and dequeue_pos on separate cache lines (no false sharing) - * - Per-slot sequence counter prevents ABA problem - * - Empty queue pop returns immediately (single atomic load, no lock) - * - CAS contention is split: producers only touch enqueue_pos, - * consumers only touch dequeue_pos - */ -struct alignas(64) PTO2ReadyQueue { - PTO2ReadyQueueSlot *slots; - uint64_t capacity; - uint64_t mask; // capacity - 1 - char _pad0[64 - 24]; // Pad to own cache line - - std::atomic<uint64_t> enqueue_pos; - char _pad1[64 - sizeof(std::atomic<uint64_t>)]; // Own cache line - - std::atomic<uint64_t> dequeue_pos; - char _pad2[64 - sizeof(std::atomic<uint64_t>)]; // Own cache line - - uint64_t size() { - uint64_t e = enqueue_pos.load(std::memory_order_relaxed); - uint64_t d = dequeue_pos.load(std::memory_order_relaxed); - return (e >= d) ?
(e - d) : 0; - } - - bool push(PTO2TaskSlotState *slot_state) { - uint64_t pos; - PTO2ReadyQueueSlot *slot; - while (true) { - pos = enqueue_pos.load(std::memory_order_relaxed); - slot = &slots[pos & mask]; - int64_t seq = slot->sequence.load(std::memory_order_acquire); - int64_t diff = seq - (int64_t)pos; - if (diff == 0) { - if (enqueue_pos.compare_exchange_weak( - pos, pos + 1, std::memory_order_relaxed, std::memory_order_relaxed - )) { - break; - } - } else if (diff < 0) { - return false; // Queue full - } - } - - slot->slot_state = slot_state; - slot->sequence.store((int64_t)(pos + 1), std::memory_order_release); - return true; - } - -#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING - bool push(PTO2TaskSlotState *slot_state, uint64_t &atomic_count, uint64_t &wait_cycle) { - uint64_t pos; - PTO2ReadyQueueSlot *slot; - uint64_t t0 = get_sys_cnt_aicpu(); - bool contended = false; - uint32_t atomic_ops = 0; - while (true) { - pos = enqueue_pos.load(std::memory_order_relaxed); - slot = &slots[pos & mask]; - int64_t seq = slot->sequence.load(std::memory_order_acquire); - int64_t diff = seq - (int64_t)pos; - atomic_ops += 2; // enqueue_pos.load + sequence.load - if (diff == 0) { - if (enqueue_pos.compare_exchange_weak( - pos, pos + 1, std::memory_order_relaxed, std::memory_order_relaxed - )) { - atomic_ops++; // successful CAS - break; - } - contended = true; - atomic_ops++; // failed CAS - } else if (diff < 0) { - return false; // Queue full - } else { - contended = true; // diff > 0: slot not yet released, spin - } - } - atomic_ops++; // final sequence.store - atomic_count += atomic_ops; - if (contended) { - wait_cycle += (get_sys_cnt_aicpu() - t0); - } - - slot->slot_state = slot_state; - slot->sequence.store((int64_t)(pos + 1), std::memory_order_release); - return true; - } -#endif - - PTO2TaskSlotState *pop() { - // Fast-path: skip slot load when queue is clearly empty - uint64_t d = dequeue_pos.load(std::memory_order_relaxed); - uint64_t e = enqueue_pos.load(std::memory_order_relaxed); - if (d >= e) { - return nullptr; - } - - uint64_t pos; - PTO2ReadyQueueSlot *slot; - while (true) { - pos = dequeue_pos.load(std::memory_order_relaxed); - slot = &slots[pos & mask]; - int64_t seq = slot->sequence.load(std::memory_order_acquire); - int64_t diff = seq - (int64_t)(pos + 1); - if (diff == 0) { - if (dequeue_pos.compare_exchange_weak( - pos, pos + 1, std::memory_order_relaxed, std::memory_order_relaxed - )) - break; - } else if (diff < 0) { - return nullptr; // Queue empty - } - } - - PTO2TaskSlotState *result = slot->slot_state; - slot->sequence.store((int64_t)(pos + mask + 1), std::memory_order_release); - return result; - } - -#if PTO2_SCHED_PROFILING - PTO2TaskSlotState *pop(uint64_t &atomic_count, uint64_t &wait_cycle) { - // Fast-path: skip slot load when queue is clearly empty - uint64_t d = dequeue_pos.load(std::memory_order_relaxed); - uint64_t e = enqueue_pos.load(std::memory_order_relaxed); - atomic_count += 2; // dequeue_pos.load + enqueue_pos.load - if (d >= e) { - return nullptr; - } - - uint64_t pos; - PTO2ReadyQueueSlot *slot; - uint64_t t0 = get_sys_cnt_aicpu(); - bool contended = false; - uint32_t atomic_ops = 0; - while (true) { - pos = dequeue_pos.load(std::memory_order_relaxed); - slot = &slots[pos & mask]; - int64_t seq = slot->sequence.load(std::memory_order_acquire); - int64_t diff = seq - (int64_t)(pos + 1); - atomic_ops += 2; // dequeue_pos.load + sequence.load - if (diff == 0) { - if (dequeue_pos.compare_exchange_weak( - pos, pos + 1, std::memory_order_relaxed, 
std::memory_order_relaxed - )) { - atomic_ops++; // successful CAS - break; - } - contended = true; - atomic_ops++; // failed CAS - } else if (diff < 0) { - atomic_count += atomic_ops; - return nullptr; // Queue empty - } else { - contended = true; - } - } - atomic_ops++; // final sequence.store - atomic_count += atomic_ops; - if (contended) { - wait_cycle += (get_sys_cnt_aicpu() - t0); - } - - PTO2TaskSlotState *result = slot->slot_state; - slot->sequence.store((int64_t)(pos + mask + 1), std::memory_order_release); - return result; - } -#endif -}; - -// Cold-path ready queue operations (defined in pto_scheduler.cpp) -bool pto2_ready_queue_init(PTO2ReadyQueue *queue, uint64_t capacity); -void pto2_ready_queue_destroy(PTO2ReadyQueue *queue); - -// ============================================================================= -// Scheduler State -// ============================================================================= - -/** - * Statistics returned by mixed-task completion processing - */ -struct PTO2CompletionStats { - int32_t fanout_edges; // Number of fanout edges traversed (notify consumers) - int32_t tasks_enqueued; // Number of consumers that became READY - int32_t fanin_edges; // Number of fanin edges traversed (release producers) - bool mixed_task_completed; // True only when this callback completed a mixed task -}; - -/** - * Scheduler state structure - * - * Contains dynamic state updated during task execution. - * Separated from shared memory for cache efficiency. - * Hot-path methods are defined inline (implicitly inline as member functions). - */ -struct PTO2SchedulerState { - // Shared memory access - PTO2SharedMemoryHandle *sm_handle; - - // Per-ring state - struct RingSchedState { - PTO2TaskDescriptor *task_descriptors; - PTO2TaskSlotState *slot_states; - int32_t last_task_alive; - int32_t last_heap_consumed; - uint64_t heap_tail; - void *heap_base; - int32_t task_window_mask; - uint64_t task_window_size; - // Try-lock used to advance this ring's pointers (CONSUMED scanning + heap tail update). 
- std::atomic<int32_t> advance_lock; - - bool init(PTO2SharedMemoryHandle *sm_handle, int32_t ring_id, void *gm_heap_base, uint64_t per_ring_heap_size); - void destroy(); - - PTO2TaskSlotState &get_slot_state_by_task_id(int32_t local_id) { - return slot_states[local_id & task_window_mask]; - } - PTO2TaskSlotState &get_slot_state_by_slot(int32_t slot) { return slot_states[slot]; } - - void sync_to_sm(PTO2SharedMemoryRingHeader &ring) { - ring.fc.last_task_alive.store(last_task_alive, std::memory_order_release); - ring.fc.heap_tail.store(heap_tail, std::memory_order_release); - } - - void advance_ring_pointers(PTO2SharedMemoryRingHeader &ring) { - int32_t current_task_index = ring.fc.current_task_index.load(std::memory_order_acquire); - - while (last_task_alive < current_task_index) { - PTO2TaskSlotState &slot_state = get_slot_state_by_task_id(last_task_alive); - if (slot_state.task_state.load(std::memory_order_acquire) != PTO2_TASK_CONSUMED) { - break; - } - last_task_alive++; - } - - if (last_task_alive > 0) { - int32_t last_consumed_id = last_task_alive - 1; - PTO2TaskSlotState &slot_state = get_slot_state_by_task_id(last_consumed_id); - PTO2TaskDescriptor &task = *slot_state.task; - if (task.packed_buffer_end != NULL) { - heap_tail = (uint64_t)((char *)task.packed_buffer_end - (char *)heap_base); - } - } - - sync_to_sm(ring); - } - } ring_sched_states[PTO2_MAX_RING_DEPTH]; - - // Ready queues remain global (scheduling is ring-agnostic) - PTO2ReadyQueue ready_queues[PTO2_NUM_RESOURCE_SHAPES]; - - // Statistics -#if PTO2_SCHED_PROFILING - std::atomic<int64_t> tasks_completed; - std::atomic<int64_t> tasks_consumed; -#endif - // ========================================================================= - // Inline hot-path methods - // ========================================================================= - PTO2TaskSlotState &get_slot_state(int32_t ring_id, int32_t local_id) { - return ring_sched_states[ring_id].get_slot_state_by_task_id(local_id); - } - PTO2TaskSlotState &get_slot_state_by_slot(int32_t ring_id, int32_t slot) { - return ring_sched_states[ring_id].get_slot_state_by_slot(slot); - } - - void check_and_handle_consumed(PTO2TaskSlotState &slot_state) { - if (slot_state.fanout_refcount.load(std::memory_order_acquire) != slot_state.fanout_count) return; - - PTO2TaskState expected = PTO2_TASK_COMPLETED; - if (!slot_state.task_state.compare_exchange_strong( - expected, PTO2_TASK_CONSUMED, std::memory_order_acq_rel, std::memory_order_acquire - )) { - return; - } - -#if PTO2_SCHED_PROFILING - tasks_consumed.fetch_add(1, std::memory_order_relaxed); -#endif - - int32_t ring_id = slot_state.ring_id; - // Try-lock — if another thread is advancing this ring, it will scan our CONSUMED task - int32_t expected_lock = 0; - if (ring_sched_states[ring_id].advance_lock.compare_exchange_strong( - expected_lock, 1, std::memory_order_acquire, std::memory_order_relaxed - )) { - ring_sched_states[ring_id].advance_ring_pointers(sm_handle->header->rings[ring_id]); - ring_sched_states[ring_id].advance_lock.store(0, std::memory_order_release); - } - } - -#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING - void check_and_handle_consumed(PTO2TaskSlotState &slot_state, uint64_t &atomic_count) { - int32_t fc = slot_state.fanout_count; - int32_t rc = slot_state.fanout_refcount.load(std::memory_order_acquire); - - atomic_count += 2; // fanout_count.load + fanout_refcount.load - - if (rc != fc) return; - - PTO2TaskState expected = PTO2_TASK_COMPLETED; - if (!slot_state.task_state.compare_exchange_strong( - expected, PTO2_TASK_CONSUMED,
std::memory_order_acq_rel, std::memory_order_acquire - )) { - atomic_count += 1; // failed CAS - return; - } - - atomic_count += 1; // successful CAS - -#if PTO2_SCHED_PROFILING - tasks_consumed.fetch_add(1, std::memory_order_relaxed); -#endif - - int32_t ring_id = slot_state.ring_id; - // Try-lock — if another thread is advancing this ring, it will scan our CONSUMED task - int32_t expected_lock = 0; - if (ring_sched_states[ring_id].advance_lock.compare_exchange_strong( - expected_lock, 1, std::memory_order_acquire, std::memory_order_relaxed - )) { - ring_sched_states[ring_id].advance_ring_pointers(sm_handle->header->rings[ring_id]); - ring_sched_states[ring_id].advance_lock.store(0, std::memory_order_release); - atomic_count += 2; // try-lock CAS + unlock store - } else { - atomic_count += 1; // failed try-lock CAS - } - } -#endif - - void release_producer(PTO2TaskSlotState &slot_state) { - slot_state.fanout_refcount.fetch_add(1, std::memory_order_acq_rel); - check_and_handle_consumed(slot_state); - } - -#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING - void release_producer(PTO2TaskSlotState &slot_state, uint64_t &atomic_count) { - slot_state.fanout_refcount.fetch_add(1, std::memory_order_acq_rel); - atomic_count += 1; // fanout_refcount.fetch_add - check_and_handle_consumed(slot_state, atomic_count); - } -#endif - - bool release_fanin_and_check_ready(PTO2TaskSlotState &slot_state, PTO2LocalReadyBuffer *local_bufs = nullptr) { - // Atomically increment fanin_refcount and check if all producers are done - // ACQ_REL on fanin_refcount already synchronizes with the orchestrator's - // init release, making fanin_count visible — plain load suffices. - int32_t new_refcount = slot_state.fanin_refcount.fetch_add(1, std::memory_order_acq_rel) + 1; - - if (new_refcount == slot_state.fanin_count) { - // Local-first: try per-CoreType thread-local buffer before global queue - // Route by active_mask: AIC-containing tasks → buf[0], AIV-only → buf[1] - PTO2ResourceShape shape = pto2_active_mask_to_shape(slot_state.active_mask); - bool pushed_local = false; - if (local_bufs) { - int32_t buf_idx = (slot_state.active_mask & 0x01) ? 0 : 1; - pushed_local = local_bufs[buf_idx].try_push(&slot_state); - } - if (!pushed_local) { - ready_queues[static_cast<int>(shape)].push(&slot_state); - } - return true; - } - return false; - } - -#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING - bool release_fanin_and_check_ready( - PTO2TaskSlotState &slot_state, uint64_t &atomic_count, uint64_t &push_wait, - PTO2LocalReadyBuffer *local_bufs = nullptr - ) { - int32_t new_refcount = slot_state.fanin_refcount.fetch_add(1, std::memory_order_acq_rel) + 1; - atomic_count += 1; // fanin_refcount.fetch_add - - if (new_refcount == slot_state.fanin_count) { - PTO2TaskState expected = PTO2_TASK_PENDING; - if (slot_state.task_state.compare_exchange_strong( - expected, PTO2_TASK_READY, std::memory_order_acq_rel, std::memory_order_acquire - )) { - atomic_count += 1; // CAS(task_state PENDING→READY) - // Local-first: try per-CoreType thread-local buffer before global queue - PTO2ResourceShape shape = pto2_active_mask_to_shape(slot_state.active_mask); - bool pushed_local = false; - if (local_bufs) { - int32_t buf_idx = (slot_state.active_mask & 0x01) ?
0 : 1; - pushed_local = local_bufs[buf_idx].try_push(&slot_state); - } - if (!pushed_local) { - ready_queues[static_cast<int>(shape)].push(&slot_state, atomic_count, push_wait); - } - return true; - } - } - return false; - } -#endif - - PTO2TaskSlotState *get_ready_task(PTO2ResourceShape shape) { - return ready_queues[static_cast<int>(shape)].pop(); - } - - template <CoreType CT> - PTO2TaskSlotState *get_ready_task(PTO2LocalReadyBuffer *local_bufs) { - constexpr int ct = static_cast<int>(CT); - if (local_bufs && local_bufs[ct].count > 0) { - return local_bufs[ct].pop(); - } - return ready_queues[ct].pop(); - } - -#if PTO2_SCHED_PROFILING - PTO2TaskSlotState *get_ready_task(PTO2ResourceShape shape, uint64_t &atomic_count, uint64_t &wait_cycle) { - return ready_queues[static_cast<int>(shape)].pop(atomic_count, wait_cycle); - } - - template <CoreType CT> - PTO2TaskSlotState *get_ready_task(PTO2LocalReadyBuffer *local_bufs, uint64_t &atomic_count, uint64_t &wait_cycle) { - constexpr int ct = static_cast<int>(CT); - if (local_bufs && local_bufs[ct].count > 0) { - return local_bufs[ct].pop(); - } - return ready_queues[ct].pop(atomic_count, wait_cycle); - } -#endif - - /** - * Requeue a ready task that could not be dispatched (no suitable cluster). - * Pushes the task back into its shape-based queue. - */ - void requeue_ready_task(PTO2TaskSlotState &slot_state) { - PTO2ResourceShape shape = pto2_active_mask_to_shape(slot_state.active_mask); - ready_queues[static_cast<int>(shape)].push(&slot_state); - } - - void on_scope_end(PTO2TaskSlotState **task_slot_states, int32_t count) { -#if PTO2_ORCH_PROFILING - extern uint64_t g_orch_scope_end_atomic_count; - for (int32_t i = 0; i < count; i++) { - release_producer(*task_slot_states[i], g_orch_scope_end_atomic_count); - } -#else - for (int32_t i = 0; i < count; i++) { - release_producer(*task_slot_states[i]); - } -#endif - } - - /** - * Two-stage completion: first stage. - * Called when a single subtask (AIC, AIV0, or AIV1) finishes. - * Sets the corresponding done bit in subtask_done_mask. - * - * @return true if this subtask was the last one, completing the mixed task. - */ - bool on_subtask_complete(PTO2TaskSlotState &slot_state, PTO2SubtaskSlot subslot) { - uint8_t done_bit = (1u << static_cast<int>(subslot)); - uint8_t prev_mask = slot_state.subtask_done_mask.fetch_or(done_bit, std::memory_order_acq_rel); - uint8_t new_mask = prev_mask | done_bit; - - return new_mask == slot_state.active_mask; - } - - /** - * Two-stage completion: second stage. - * Called exactly once when all subtasks of a mixed task are done - * (i.e., on_subtask_complete returned true). - * Handles fanout notification, fanin release, and self-consumption check.
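 - * - * Sketch of the two-stage sequence as seen from a completion handler - * (non-profiling build; 'slot' and 'subslot' are placeholders for the - * finished task's state and subtask slot): - * - * if (sched->on_subtask_complete(*slot, subslot)) { - * sched->on_mixed_task_complete(*slot, local_bufs); - * sched->on_task_release(*slot); - * }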
- */ -#if PTO2_SCHED_PROFILING - PTO2CompletionStats -#else - void -#endif - on_mixed_task_complete( - PTO2TaskSlotState &slot_state, -#if PTO2_SCHED_PROFILING - int thread_idx, -#endif - - PTO2LocalReadyBuffer *local_bufs = nullptr - ) { -#if PTO2_SCHED_PROFILING - PTO2CompletionStats stats = {0, 0, 0, true}; -#endif -#if PTO2_SCHED_PROFILING - extern uint64_t g_sched_lock_cycle[], g_sched_fanout_cycle[]; - extern uint64_t g_sched_lock_atomic_count[], g_sched_lock_wait_cycle[]; - extern uint64_t g_sched_fanout_atomic_count[], g_sched_push_wait_cycle[]; - uint64_t lock_atomics = 0, lock_wait = 0; - PTO2_SCHED_CYCLE_START(); -#endif - -#if PTO2_SCHED_PROFILING - pto2_fanout_lock(slot_state, lock_atomics, lock_wait); -#else - pto2_fanout_lock(slot_state); -#endif - slot_state.task_state.store(PTO2_TASK_COMPLETED, std::memory_order_release); - PTO2DepListEntry *current = slot_state.fanout_head; // Protected by fanout_lock - pto2_fanout_unlock(slot_state); - -#if PTO2_SCHED_PROFILING - lock_atomics += 2; // state.store + unlock.store - g_sched_lock_atomic_count[thread_idx] += lock_atomics; - g_sched_lock_wait_cycle[thread_idx] += lock_wait; - PTO2_SCHED_CYCLE_LAP(g_sched_lock_cycle[thread_idx]); -#endif - - // Fanout: notify consumers -#if PTO2_SCHED_PROFILING - uint64_t fanout_atomics = 0, push_wait = 0; -#endif - while (current != nullptr) { - PTO2TaskSlotState &consumer_slot = *current->slot_state; -#if PTO2_SCHED_PROFILING - stats.fanout_edges++; - if (release_fanin_and_check_ready(consumer_slot, fanout_atomics, push_wait, local_bufs)) { - stats.tasks_enqueued++; - } -#else - release_fanin_and_check_ready(consumer_slot, local_bufs); -#endif - current = current->next; - } - -#if PTO2_SCHED_PROFILING - g_sched_fanout_atomic_count[thread_idx] += fanout_atomics; - g_sched_push_wait_cycle[thread_idx] += push_wait; - PTO2_SCHED_CYCLE_LAP(g_sched_fanout_cycle[thread_idx]); - return stats; -#endif - } - - /** - * Cold path: release producers (fanin traversal) + check self for CONSUMED. - * Returns fanin edge count for profiling. 
- */ - -#if PTO2_SCHED_PROFILING - int32_t on_task_release(PTO2TaskSlotState &slot_state, int32_t thread_idx) { - PTO2_SCHED_CYCLE_START(); - extern uint64_t g_sched_fanin_cycle[], g_sched_fanin_atomic_count[]; - extern uint64_t g_sched_self_atomic_count[]; - extern uint64_t g_sched_self_consumed_cycle[]; - extern uint64_t g_sched_complete_count[]; - uint64_t fanin_atomics = 0; -#else - int32_t on_task_release(PTO2TaskSlotState &slot_state) { -#endif - PTO2TaskPayload *payload = slot_state.payload; - int32_t fanin_edges = payload->fanin_actual_count; - for (int32_t i = 0; i < fanin_edges; i++) { -#if PTO2_SCHED_PROFILING - release_producer(*payload->fanin_slot_states[i], fanin_atomics); -#else - release_producer(*payload->fanin_slot_states[i]); -#endif - } -#if PTO2_SCHED_PROFILING - g_sched_fanin_atomic_count[thread_idx] += fanin_atomics; - PTO2_SCHED_CYCLE_LAP(g_sched_fanin_cycle[thread_idx]); -#endif - - // Self consumed check -#if PTO2_SCHED_PROFILING - uint64_t self_atomics = 0; - check_and_handle_consumed(slot_state, self_atomics); - g_sched_self_atomic_count[thread_idx] += self_atomics; - PTO2_SCHED_CYCLE_LAP(g_sched_self_consumed_cycle[thread_idx]); - g_sched_complete_count[thread_idx]++; -#else - check_and_handle_consumed(slot_state); -#endif - return fanin_edges; - } -}; - -// ============================================================================= -// Scheduler API (cold path, defined in pto_scheduler.cpp) -// ============================================================================= - -bool pto2_scheduler_init( - PTO2SchedulerState *sched, PTO2SharedMemoryHandle *sm_handle, void *gm_heap_base, uint64_t per_ring_heap_size -); -void pto2_scheduler_destroy(PTO2SchedulerState *sched); - -// ============================================================================= -// Debug Utilities (cold path, defined in pto_scheduler.cpp) -// ============================================================================= - -void pto2_scheduler_print_stats(PTO2SchedulerState *sched); -void pto2_scheduler_print_queues(PTO2SchedulerState *sched); -const char *pto2_task_state_name(PTO2TaskState state); - -// ============================================================================= -// Scheduler Profiling Data -// ============================================================================= - -#if PTO2_SCHED_PROFILING -struct PTO2SchedProfilingData { - // Sub-phase cycle breakdown within on_mixed_task_complete - uint64_t lock_cycle; // pto2_fanout_lock + state store + unlock - uint64_t fanout_cycle; // fanout traversal - uint64_t fanin_cycle; // fanin traversal - uint64_t self_consumed_cycle; // self check_and_handle_consumed - - // Wait times - uint64_t lock_wait_cycle; // spin-wait in fanout_lock - uint64_t push_wait_cycle; // CAS contention in push() - uint64_t pop_wait_cycle; // CAS contention in pop() - - // Atomic counts per sub-phase - uint64_t lock_atomic_count; - uint64_t fanout_atomic_count; - uint64_t fanin_atomic_count; - uint64_t self_atomic_count; - uint64_t pop_atomic_count; - - int64_t complete_count; -}; - -/** - * Get and reset scheduler profiling data for a specific thread. - * Returns accumulated profiling data and resets counters. 
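 - * - * Aggregation sketch (PLATFORM_MAX_AICPU_THREADS comes from common/platform_config.h; - * total_lock_cycles is a placeholder name): - * - * uint64_t total_lock_cycles = 0; - * for (int t = 0; t < PLATFORM_MAX_AICPU_THREADS; t++) { - * total_lock_cycles += pto2_scheduler_get_profiling(t).lock_cycle; // read-and-reset - * }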
- */ -PTO2SchedProfilingData pto2_scheduler_get_profiling(int thread_idx); -#endif - -#endif // PTO_SCHEDULER_H diff --git a/src/a2a3/runtime/aicpu_build_graph/runtime/pto_shared_memory.cpp b/src/a2a3/runtime/aicpu_build_graph/runtime/pto_shared_memory.cpp deleted file mode 100644 index 4c511d0f8..000000000 --- a/src/a2a3/runtime/aicpu_build_graph/runtime/pto_shared_memory.cpp +++ /dev/null @@ -1,276 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -/** - * PTO Runtime2 - Shared Memory Implementation - * - * Implements shared memory allocation, initialization, and management - * for Orchestrator-Scheduler communication. - * - * Based on: docs/RUNTIME_LOGIC.md - */ - -#include "pto_shared_memory.h" -#include <cinttypes> -#include <cstdlib> -#include <cstring> -#include "common/unified_log.h" - -// ============================================================================= -// Size Calculation -// ============================================================================= - -uint64_t pto2_sm_calculate_size(uint64_t task_window_size) { - uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]; - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - task_window_sizes[r] = task_window_size; - } - return pto2_sm_calculate_size_per_ring(task_window_sizes); -} - -uint64_t pto2_sm_calculate_size_per_ring(const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]) { - uint64_t size = 0; - - // Header (aligned to cache line) - size += PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE); - - // Per-ring task descriptors and payloads - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - size += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE); - size += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE); - } - - return size; -} - -// ============================================================================= -// Creation and Destruction -// ============================================================================= - -static void -pto2_sm_setup_pointers_per_ring(PTO2SharedMemoryHandle *handle, const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]) { - char *ptr = (char *)handle->sm_base; - - // Header - handle->header = (PTO2SharedMemoryHeader *)ptr; - ptr += PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE); - - // Per-ring task descriptors and payloads - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - handle->task_descriptors[r] = (PTO2TaskDescriptor *)ptr; - ptr += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE); - - handle->task_payloads[r] = (PTO2TaskPayload *)ptr; - ptr += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE); - } -} - -static void pto2_sm_setup_pointers(PTO2SharedMemoryHandle *handle, uint64_t task_window_size) { - uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]; - for (int r =
0; r < PTO2_MAX_RING_DEPTH; r++) { - task_window_sizes[r] = task_window_size; - } - pto2_sm_setup_pointers_per_ring(handle, task_window_sizes); -} - -PTO2SharedMemoryHandle *pto2_sm_create(uint64_t task_window_size, uint64_t heap_size) { - // Allocate handle - PTO2SharedMemoryHandle *handle = (PTO2SharedMemoryHandle *)calloc(1, sizeof(PTO2SharedMemoryHandle)); - if (!handle) { - return NULL; - } - - // Calculate total size - uint64_t sm_size = pto2_sm_calculate_size(task_window_size); - -// Allocate shared memory (aligned for DMA efficiency) -#if defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE >= 200112L - if (posix_memalign(&handle->sm_base, PTO2_ALIGN_SIZE, static_cast<size_t>(sm_size)) != 0) { - free(handle); - return NULL; - } -#else - handle->sm_base = aligned_alloc(PTO2_ALIGN_SIZE, static_cast<size_t>(sm_size)); - if (!handle->sm_base) { - free(handle); - return NULL; - } -#endif - - handle->sm_size = sm_size; - handle->is_owner = true; - - // Initialize to zero - memset(handle->sm_base, 0, static_cast<size_t>(sm_size)); - - // Set up pointers - pto2_sm_setup_pointers(handle, task_window_size); - - // Initialize header - pto2_sm_init_header(handle, task_window_size, heap_size); - - return handle; -} - -PTO2SharedMemoryHandle *pto2_sm_create_default(void) { return pto2_sm_create(PTO2_TASK_WINDOW_SIZE, PTO2_HEAP_SIZE); } - -PTO2SharedMemoryHandle * -pto2_sm_create_from_buffer(void *sm_base, uint64_t sm_size, uint64_t task_window_size, uint64_t heap_size) { - if (!sm_base || sm_size == 0) return NULL; - - uint64_t required = pto2_sm_calculate_size(task_window_size); - if (sm_size < required) return NULL; - - PTO2SharedMemoryHandle *handle = (PTO2SharedMemoryHandle *)calloc(1, sizeof(PTO2SharedMemoryHandle)); - if (!handle) return NULL; - - handle->sm_base = sm_base; - handle->sm_size = sm_size; - handle->is_owner = false; - - pto2_sm_setup_pointers(handle, task_window_size); - pto2_sm_init_header(handle, task_window_size, heap_size); - - return handle; -} - -void pto2_sm_destroy(PTO2SharedMemoryHandle *handle) { - if (!handle) return; - - if (handle->is_owner && handle->sm_base) { - free(handle->sm_base); - } - - free(handle); -} - -// ============================================================================= -// Initialization -// ============================================================================= -// -// No need to init pool data here; pool entries are initialized when first used -void pto2_sm_init_header(PTO2SharedMemoryHandle *handle, uint64_t task_window_size, uint64_t heap_size) { - uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]; - uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]; - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - task_window_sizes[r] = task_window_size; - heap_sizes[r] = heap_size; - } - pto2_sm_init_header_per_ring(handle, task_window_sizes, heap_sizes); -} - -void pto2_sm_init_header_per_ring( - PTO2SharedMemoryHandle *handle, const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH], - const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH] -) { - PTO2SharedMemoryHeader *header = handle->header; - - // Per-ring flow control (start at 0) - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - header->rings[r].fc.init(); - } - - header->orchestrator_done.store(0, std::memory_order_relaxed); - - // Per-ring layout info - uint64_t offset = PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE); - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - header->rings[r].task_window_size = task_window_sizes[r]; - header->rings[r].heap_size = heap_sizes[r]; - header->rings[r].task_descriptors_offset = offset; - offset +=
PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE); - offset += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE); - } - - header->total_size = handle->sm_size; - header->graph_output_ptr.store(0, std::memory_order_relaxed); - header->graph_output_size.store(0, std::memory_order_relaxed); - - // Error reporting - header->orch_error_code.store(PTO2_ERROR_NONE, std::memory_order_relaxed); - header->sched_error_bitmap.store(0, std::memory_order_relaxed); - header->sched_error_code.store(PTO2_ERROR_NONE, std::memory_order_relaxed); - header->sched_error_thread.store(-1, std::memory_order_relaxed); -} - -// ============================================================================= -// Debug Utilities -// ============================================================================= - -void pto2_sm_print_layout(PTO2SharedMemoryHandle *handle) { - if (!handle || !handle->header) return; - - PTO2SharedMemoryHeader *h = handle->header; - - LOG_INFO("=== PTO2 Shared Memory Layout ==="); - LOG_INFO("Base address: %p", handle->sm_base); - LOG_INFO("Total size: %" PRIu64 " bytes", h->total_size); - LOG_INFO("Ring depth: %d", PTO2_MAX_RING_DEPTH); - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - LOG_INFO("Ring %d:", r); - LOG_INFO(" task_window_size: %" PRIu64, h->rings[r].task_window_size); - LOG_INFO(" heap_size: %" PRIu64 " bytes", h->rings[r].heap_size); - LOG_INFO( - " descriptors_off: %" PRIu64 " (0x%" PRIx64 ")", h->rings[r].task_descriptors_offset, - h->rings[r].task_descriptors_offset - ); - LOG_INFO(" heap_top: %" PRIu64, h->rings[r].fc.heap_top.load(std::memory_order_acquire)); - LOG_INFO(" heap_tail: %" PRIu64, h->rings[r].fc.heap_tail.load(std::memory_order_acquire)); - LOG_INFO(" current_task_idx: %d", h->rings[r].fc.current_task_index.load(std::memory_order_acquire)); - LOG_INFO(" last_task_alive: %d", h->rings[r].fc.last_task_alive.load(std::memory_order_acquire)); - } - LOG_INFO("orchestrator_done: %d", h->orchestrator_done.load(std::memory_order_acquire)); - LOG_INFO("Error state:"); - LOG_INFO(" orch_error_code: %d", h->orch_error_code.load(std::memory_order_relaxed)); - LOG_INFO(" sched_error_bitmap: 0x%x", h->sched_error_bitmap.load(std::memory_order_relaxed)); - LOG_INFO(" sched_error_code: %d", h->sched_error_code.load(std::memory_order_relaxed)); - LOG_INFO(" sched_error_thread: %d", h->sched_error_thread.load(std::memory_order_relaxed)); - LOG_INFO("================================"); -} - -bool pto2_sm_validate(PTO2SharedMemoryHandle *handle) { - if (!handle) return false; - if (!handle->sm_base) return false; - if (!handle->header) return false; - - PTO2SharedMemoryHeader *h = handle->header; - - for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { - if (!h->rings[r].fc.validate(handle, r)) return false; - } - - return true; -} - -bool PTO2RingFlowControl::validate(PTO2SharedMemoryHandle *handle, int32_t ring_id) const { - if (!handle) return false; - if (!handle->header) return false; - if (ring_id < 0 || ring_id >= PTO2_MAX_RING_DEPTH) return false; - - const PTO2SharedMemoryHeader *h = handle->header; - - // Check that offsets are within bounds - if (h->rings[ring_id].task_descriptors_offset >= h->total_size) return false; - - // Check pointer alignment - if ((uintptr_t)handle->task_descriptors[ring_id] % PTO2_ALIGN_SIZE != 0) return false; - - // Check flow control pointer sanity - int32_t current = current_task_index.load(std::memory_order_acquire); - int32_t last_alive = 
last_task_alive.load(std::memory_order_acquire); - uint64_t top = heap_top.load(std::memory_order_acquire); - uint64_t tail = heap_tail.load(std::memory_order_acquire); - if (current < 0) return false; - if (last_alive < 0) return false; - if (top > h->rings[ring_id].heap_size) return false; - if (tail > h->rings[ring_id].heap_size) return false; - - return true; -} diff --git a/src/a2a3/runtime/aicpu_build_graph/runtime/pto_shared_memory.h b/src/a2a3/runtime/aicpu_build_graph/runtime/pto_shared_memory.h deleted file mode 100644 index d7880f482..000000000 --- a/src/a2a3/runtime/aicpu_build_graph/runtime/pto_shared_memory.h +++ /dev/null @@ -1,233 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -/** - * PTO Runtime2 - Shared Memory Layout - * - * Defines the shared memory structure for Orchestrator-Scheduler communication. - * - * Memory Layout (per-ring sections repeat for each ring 0..PTO2_MAX_RING_DEPTH-1): - * +---------------------------+ - * | SharedMemoryHeader | (per-ring flow control + sync) - * +---------------------------+ - * | Ring 0: TaskDescriptor[] | - * | Ring 0: TaskPayload[] | - * +---------------------------+ - * | Ring 1: TaskDescriptor[] | - * | Ring 1: TaskPayload[] | - * +---------------------------+ - * | ... | - * +---------------------------+ - * - * Design principles: - * - Only data needed for Orchestrator<->Scheduler communication is here - * - Scope_stack, ready_queues, dep_pool are in private memory - * - Flow control via atomic counters/flags (no locks needed for single-word R/W) - * - * Based on: docs/RUNTIME_LOGIC.md - */ - -#ifndef PTO_SHARED_MEMORY_H -#define PTO_SHARED_MEMORY_H - -#include "pto_runtime2_types.h" - -#ifdef __cplusplus -extern "C" { -#endif - -// ============================================================================= -// Shared Memory Header -// ============================================================================= - -struct PTO2SharedMemoryHandle; - -/** - * Per-ring flow control state in shared memory. - * Written/read by Orchestrator and Scheduler for synchronization. 
- */ -struct PTO2RingFlowControl { - // Written by Orchestrator, Read by Scheduler - std::atomic<uint64_t> heap_top; // Heap ring allocation pointer - std::atomic<int32_t> current_task_index; // Task ring head (next to allocate) - int32_t _pad0; // Alignment padding - - // Written by Scheduler, Read by Orchestrator (for back-pressure) - std::atomic<uint64_t> heap_tail; // Heap ring free pointer - std::atomic<int32_t> last_task_alive; // Task ring tail (oldest active task) - int32_t _pad1; // Alignment padding - - void init() { - heap_top.store(0, std::memory_order_relaxed); - current_task_index.store(0, std::memory_order_relaxed); - heap_tail.store(0, std::memory_order_relaxed); - last_task_alive.store(0, std::memory_order_relaxed); - } - - bool validate(PTO2SharedMemoryHandle *handle, int32_t ring_id) const; -}; - -/** - * Per-ring shared memory header section. - * - * Groups flow-control and layout info for a single ring to avoid parallel arrays. - */ -struct PTO2SharedMemoryRingHeader { - PTO2RingFlowControl fc; - uint64_t task_window_size; - uint64_t heap_size; - uint64_t task_descriptors_offset; // Offset from SM base, in bytes -}; - -/** - * Shared memory header structure - * - * Contains per-ring flow control and global layout information. - */ -struct alignas(PTO2_ALIGN_SIZE) PTO2SharedMemoryHeader { - // === PER-RING FLOW CONTROL + LAYOUT INFO (set once at init) === - PTO2SharedMemoryRingHeader rings[PTO2_MAX_RING_DEPTH]; - - // === GLOBAL FIELDS === - std::atomic<int32_t> orchestrator_done; // Flag: orchestration complete - - // Total shared memory size (for validation) - uint64_t total_size; - - // Graph output for copy-back (set by orchestrator when using packed buffer) - // Host finalize copies from this address instead of dev_ptr when non-zero - std::atomic<uint64_t> graph_output_ptr; // Address where final output was written (packed buffer) - std::atomic<uint64_t> graph_output_size; // Size in bytes - - // === ERROR REPORTING === - - // Orchestrator fatal error code (Orchestrator → Scheduler, AICPU → Host) - // Non-zero signals fatal error. Written by orchestrator, read by scheduler and host. - std::atomic<int32_t> orch_error_code; - - // Scheduler error state (Scheduler → Host, independent of orchestrator) - // Written by scheduler threads on timeout; read by orchestrator and host.
- std::atomic<uint32_t> sched_error_bitmap; // Bit X set = thread X had error - std::atomic<int32_t> sched_error_code; // Last scheduler error code (last-writer-wins) - std::atomic<int32_t> sched_error_thread; // Thread index of last error writer -}; - -static_assert( - sizeof(PTO2SharedMemoryHeader) % PTO2_ALIGN_SIZE == 0, - "PTO2SharedMemoryHeader must be aligned to cache line (PTO2_ALIGN_SIZE)" -); - -// ============================================================================= -// Shared Memory Handle -// ============================================================================= - -/** - * Handle for shared memory access - * Provides both Orchestrator and Scheduler views of the same memory - */ -struct PTO2SharedMemoryHandle { - void *sm_base; // Base address of shared memory - uint64_t sm_size; // Total size of shared memory - - // Quick pointers into shared memory regions (per-ring) - PTO2SharedMemoryHeader *header; - PTO2TaskDescriptor *task_descriptors[PTO2_MAX_RING_DEPTH]; - PTO2TaskPayload *task_payloads[PTO2_MAX_RING_DEPTH]; - - // Ownership flag - bool is_owner; // True if this handle allocated the memory -}; - -// ============================================================================= -// Shared Memory API -// ============================================================================= - -/** - * Calculate required shared memory size - * - * @param task_window_size Number of task slots per ring - * @return Total bytes required - */ -uint64_t pto2_sm_calculate_size(uint64_t task_window_size); - -/** - * Calculate required shared memory size for per-ring task windows. - * - * @param task_window_sizes Array of window sizes per ring - * @return Total bytes required - */ -uint64_t pto2_sm_calculate_size_per_ring(const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]); - -/** - * Create shared memory for Orchestrator and Scheduler - * - * @param task_window_size Number of task slots per ring - * @param heap_size Heap size per ring for output buffers - * @return Handle with both views, or NULL on failure - */ -PTO2SharedMemoryHandle *pto2_sm_create(uint64_t task_window_size, uint64_t heap_size); - -/** - * Create shared memory with default sizes - */ -PTO2SharedMemoryHandle *pto2_sm_create_default(void); - -/** - * Wrap an existing buffer as shared memory (e.g. device GM buffer). - * Caller owns the buffer; handle will not free sm_base. - * - * @param sm_base Base address of pre-allocated buffer - * @param sm_size Total size in bytes - * @param task_window_size Number of task slots per ring (must match buffer layout) - * @param heap_size Heap size per ring (for layout; buffer has no heap region) - * @return Handle, or NULL on failure - */ -PTO2SharedMemoryHandle * -pto2_sm_create_from_buffer(void *sm_base, uint64_t sm_size, uint64_t task_window_size, uint64_t heap_size); - -/** - * Destroy shared memory and free resources - */ -void pto2_sm_destroy(PTO2SharedMemoryHandle *handle); - -/** - * Initialize shared memory header with layout information - * Called after memory is allocated - */ -void pto2_sm_init_header(PTO2SharedMemoryHandle *handle, uint64_t task_window_size, uint64_t heap_size); - -/** - * Initialize shared memory header with per-ring layout information.
- */ -void pto2_sm_init_header_per_ring( - PTO2SharedMemoryHandle *handle, const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH], - const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH] -); - -// ============================================================================= -// Debug Utilities -// ============================================================================= - -/** - * Print shared memory layout info - */ -void pto2_sm_print_layout(PTO2SharedMemoryHandle *handle); - -/** - * Validate shared memory integrity - * @return true if valid, false if corrupted - */ -bool pto2_sm_validate(PTO2SharedMemoryHandle *handle); - -#ifdef __cplusplus -} -#endif - -#endif // PTO_SHARED_MEMORY_H diff --git a/src/a2a3/runtime/aicpu_build_graph/runtime/pto_submit_types.h b/src/a2a3/runtime/aicpu_build_graph/runtime/pto_submit_types.h deleted file mode 100644 index d27decf3b..000000000 --- a/src/a2a3/runtime/aicpu_build_graph/runtime/pto_submit_types.h +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -/** - * PTO Submit Types - Shared submit-contract definitions - * - * Header-only definitions shared by orchestration-facing and runtime-facing - * headers. Keeps orchestration slim (no dependency on pto_runtime2_types.h). - */ - -#ifndef PTO_SUBMIT_TYPES_H -#define PTO_SUBMIT_TYPES_H - -#include <cstdint> - -inline constexpr int32_t INVALID_KERNEL_ID = -1; - -/** - * Subtask slot count: AIC, AIV0, AIV1 - */ -inline constexpr int32_t PTO2_SUBTASK_SLOT_COUNT = 3; - -/** - * Subtask slot indices - */ -enum class PTO2SubtaskSlot : uint8_t { - AIC = 0, - AIV0 = 1, - AIV1 = 2, -}; - -/** - * Subtask mask bits (for active_mask / subtask_done_mask) - */ -inline constexpr uint8_t PTO2_SUBTASK_MASK_AIC = (1u << 0); // 0x1 -inline constexpr uint8_t PTO2_SUBTASK_MASK_AIV0 = (1u << 1); // 0x2 -inline constexpr uint8_t PTO2_SUBTASK_MASK_AIV1 = (1u << 2); // 0x4 - -/** - * Test whether a subtask slot is active in a given mask - */ -static inline bool pto2_subtask_active(uint8_t mask, PTO2SubtaskSlot slot) { - return (mask & (1u << static_cast<uint8_t>(slot))) != 0; -} - -/** - * Mixed-task submit contract. - * - * Each field holds either a valid kernel ID or INVALID_KERNEL_ID (inactive). - * At least one slot must be valid. - */ -struct MixedKernels { - int32_t aic_kernel_id{INVALID_KERNEL_ID}; - int32_t aiv0_kernel_id{INVALID_KERNEL_ID}; - int32_t aiv1_kernel_id{INVALID_KERNEL_ID}; -}; - -/** - * Resource shape — classifies a MixedKernels into one of 5 queue buckets. - */ -enum class PTO2ResourceShape : uint8_t { - AIC_ONLY = 0, // AIC only - AIV_X1 = 1, // One AIV slot - AIV_X2 = 2, // Both AIV slots - AIC_AIV_X1 = 3, // AIC + one AIV - AIC_AIV_X2 = 4, // AIC + both AIV -}; - -inline constexpr int32_t PTO2_NUM_RESOURCE_SHAPES = 5; - -/** - * Derive resource shape from active_mask.
- * Caller must ensure active_mask is valid (at least one bit set). - */ -static inline PTO2ResourceShape pto2_active_mask_to_shape(uint8_t active_mask) { - bool has_aic = (active_mask & PTO2_SUBTASK_MASK_AIC) != 0; - int aiv_count = ((active_mask & PTO2_SUBTASK_MASK_AIV0) != 0) + ((active_mask & PTO2_SUBTASK_MASK_AIV1) != 0); - - if (has_aic) { - if (aiv_count == 0) return PTO2ResourceShape::AIC_ONLY; - if (aiv_count == 1) return PTO2ResourceShape::AIC_AIV_X1; - return PTO2ResourceShape::AIC_AIV_X2; - } - if (aiv_count == 1) return PTO2ResourceShape::AIV_X1; - return PTO2ResourceShape::AIV_X2; -} - -/** - * Compute active_mask from MixedKernels. - */ -static inline uint8_t pto2_mixed_kernels_to_active_mask(const MixedKernels &mk) { - uint8_t mask = 0; - if (mk.aic_kernel_id != INVALID_KERNEL_ID) mask |= PTO2_SUBTASK_MASK_AIC; - if (mk.aiv0_kernel_id != INVALID_KERNEL_ID) mask |= PTO2_SUBTASK_MASK_AIV0; - if (mk.aiv1_kernel_id != INVALID_KERNEL_ID) mask |= PTO2_SUBTASK_MASK_AIV1; - return mask; -} - -#endif // PTO_SUBMIT_TYPES_H diff --git a/src/a2a3/runtime/aicpu_build_graph/runtime/pto_types.h b/src/a2a3/runtime/aicpu_build_graph/runtime/pto_types.h deleted file mode 100644 index b4d9bb1cd..000000000 --- a/src/a2a3/runtime/aicpu_build_graph/runtime/pto_types.h +++ /dev/null @@ -1,279 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -/** - * Orchestration Build Graph Types - Data structures for orchestration runtime extensions - * - * Standalone header defining orchestration-specific types for: - * - TaskOutputTensors: Return value from submit containing materialized output Tensors - * - TensorRef: Tagged union for tensor slots (Tensor* or TensorCreateInfo) - * - SubmitResult: Combined return value (PTO2TaskId + TaskOutputTensors) - * - Arg: Aggregated argument container for pto_submit_task API - * - * Tensor descriptor types (Tensor, PTOBufferHandle, TensorCreateInfo) are - * defined in tensor.h. - * - * This header is independent of orch_build_graph_runtime.h to allow inclusion from runtime.h - * without type conflicts (Handshake, TensorPair, HostApi). 
- */ - -#ifndef SRC_A2A3_RUNTIME_AICPU_BUILD_GRAPH_RUNTIME_PTO_TYPES_H_ -#define SRC_A2A3_RUNTIME_AICPU_BUILD_GRAPH_RUNTIME_PTO_TYPES_H_ - -#include <cstdint> -#include <cstring> - -#if defined(__aarch64__) -#include <arm_neon.h> -#endif - -#include "task_args.h" -#include "tensor.h" -#include "tensor_arg.h" - -// Task arguments -#define MAX_TENSOR_ARGS 16 // Maximum tensor parameters per task -#define MAX_SCALAR_ARGS 32 // Maximum scalar parameters per task -#define PTO2_MAX_OUTPUTS 16 // Maximum outputs per task -#define PTO2_MAX_INPUTS 16 // Maximum inputs per task -#define PTO2_MAX_INOUTS 8 // Maximum in-out args per task - -// Forward declaration for SubmitResult -struct PTO2TaskId; - -// ============================================================================= -// Task Output Tensors (return value from submit) -// ============================================================================= - -/** - * TaskOutputTensors — returned by submit, holds materialized output Tensors. - * - * Only runtime-created outputs are stored here, indexed in add_output order. - * - * The underlying storage is uninitialized; only output_count elements are - * valid after submit returns. This avoids default-constructing Tensor[] - * on the hot path (2 KB of unnecessary zeroing per submit). - * - * Users must hold a named TaskOutputTensors variable and borrow via get_ref(); - * binding get_ref() on an rvalue is compile-time rejected to prevent dangling. - */ -class TaskOutputTensors { -public: - TaskOutputTensors() : - output_count_(0) {} - - bool empty() const { return output_count_ == 0; } - uint32_t size() const { return output_count_; } - - /// Borrow a materialized output tensor by index (lvalue only). - const Tensor &get_ref(uint32_t index) const & { - always_assert(index < output_count_); - return *reinterpret_cast<const Tensor *>(_storage + index * sizeof(Tensor)); - } - const Tensor &get_ref(uint32_t index) const && = delete; - - /// Runtime-internal: append one materialized output Tensor. - Tensor &materialize_output(const TensorCreateInfo &ci, void *addr, int32_t version) { - always_assert(output_count_ < PTO2_MAX_OUTPUTS); - Tensor *out = output_ptr(output_count_); - out->init_from_create_info(ci, addr, version); - output_count_++; - return *out; - } - - /// Runtime-internal: writable pointer for materialization. - Tensor *output_ptr(uint32_t index) { return reinterpret_cast<Tensor *>(_storage + index * sizeof(Tensor)); } - const Tensor *output_ptr(uint32_t index) const { - return reinterpret_cast<const Tensor *>(_storage + index * sizeof(Tensor)); - } - -private: - uint32_t output_count_; - alignas(Tensor) unsigned char _storage[PTO2_MAX_OUTPUTS * sizeof(Tensor)]; -}; - -// ============================================================================= -// Argument Types (for pto_submit_task API) -// ============================================================================= - -// TensorArgType is defined in tensor_arg.h (included above) - -/** - * Tagged union for a single Arg slot — either a Tensor* or a TensorCreateInfo value. - * The active member is determined by TensorArgType (OUTPUT → create_info, else → ptr). - */ -union TensorRef { - const Tensor *ptr; - TensorCreateInfo create_info; - TensorRef() : - ptr(nullptr) {} -}; - -/** - * Aggregated argument container for pto_submit_task - * - * Inherits storage from TaskArgsTpl. - * Each tensor slot stores a TensorRef union (Tensor* or TensorCreateInfo) - * discriminated by the corresponding tag(). - * Tensors are dispatched first in kernel args, followed by scalars.
- * - * Output arguments follow two distinct ownership models: - * - add_output(const TensorCreateInfo&): OUTPUT — runtime allocates buffer - * and materializes a new Tensor, returned via TaskOutputTensors. - * - add_inout(const Tensor&): INOUT — reuses an existing Tensor as the write target. - * - * Example: - * Tensor x = make_tensor_external(dev_a, shapes, 2); - * Arg args; - * args.add_input(x); - * args.add_output(TensorCreateInfo(shapes, 2)); - * args.add_scalar(some_value); - * SubmitResult r = rt_submit_aic_task(rt, kernel_id, args); - * const Tensor& y = r.outputs.get_ref(0); - */ -struct Arg : TaskArgsTpl { - bool has_error{false}; - const char *error_msg{nullptr}; - - void reset() { - clear(); - has_error = false; - error_msg = nullptr; - } - - void set_error(const char *msg) { - if (!has_error) { - has_error = true; - error_msg = msg; - } - } - - bool check_add_tensor_valid() { - if (scalar_count_ != 0) { - set_error( - "add_input/add_output/add_inout called after add_scalar: " - "all tensors must be added before any scalars" - ); - return false; - } - if (tensor_count_ >= MAX_TENSOR_ARGS) { - set_error("Too many tensor args (exceeds MAX_TENSOR_ARGS=16)"); - return false; - } - return true; - } - - void add_input(const Tensor &t) { - if (!check_add_tensor_valid()) { - return; - } - tensors_[tensor_count_].ptr = &t; - tags_[tensor_count_] = TensorArgType::INPUT; - tensor_count_++; - } - - /// Standard future-output path: runtime allocates buffer from heap, - /// materializes Tensor into TaskOutputTensors. - void add_output(const TensorCreateInfo &ci) { - if (!check_add_tensor_valid()) { - return; - } - tensors_[tensor_count_].create_info = ci; - tags_[tensor_count_] = TensorArgType::OUTPUT; - tensor_count_++; - } - - void add_inout(const Tensor &t) { - if (!check_add_tensor_valid()) { - return; - } - tensors_[tensor_count_].ptr = &t; - tags_[tensor_count_] = TensorArgType::INOUT; - tensor_count_++; - } - - /** - * Add a scalar value. Type is deduced from the argument; - * the value is bit-cast to uint64_t for storage. - * - * args.add_scalar(uint64_val); // existing usage unchanged - * args.add_scalar(3.14f); // float, auto bit-cast - * args.add_scalar(int32_t(42)); // int32, auto bit-cast - */ - template <typename T> - void add_scalar(T value) { - static_assert(is_supported_scalar_arg_v<T>, "add_scalar: type must be arithmetic or enum"); - if (scalar_count_ >= MAX_SCALAR_ARGS) { - set_error("Too many scalar args (exceeds MAX_SCALAR_ARGS=32)"); - return; - } - scalars_[scalar_count_++] = to_u64(value); - } - - void add_scalars(const uint64_t *values, int count) { - if (scalar_count_ + count > MAX_SCALAR_ARGS) { - set_error("Too many scalar args (exceeds MAX_SCALAR_ARGS=32)"); - return; - } - memcpy(&scalars_[scalar_count_], values, count * sizeof(uint64_t)); - scalar_count_ += count; - } - - /** - * Zero-extend int32 bit patterns into uint64 scalar slots. - * Negative values are treated as their unsigned 32-bit representation - * (e.g., -1 → 0x00000000FFFFFFFF, not 0xFFFFFFFFFFFFFFFF). - * Uses NEON to process 4 elements per iteration on aarch64.
- */ - void add_scalars_i32(const int32_t *values, int count) { - if (scalar_count_ + count > MAX_SCALAR_ARGS) { - set_error("Too many scalar args (exceeds MAX_SCALAR_ARGS=32)"); - return; - } - uint64_t *dst = &scalars_[scalar_count_]; -#if defined(__aarch64__) - int i = 0; - for (; i + 4 <= count; i += 4) { - uint32x4_t v = vld1q_u32(reinterpret_cast<const uint32_t *>(values + i)); - uint64x2_t lo = vmovl_u32(vget_low_u32(v)); - uint64x2_t hi = vmovl_u32(vget_high_u32(v)); - vst1q_u64(dst + i, lo); - vst1q_u64(dst + i + 2, hi); - } - for (; i < count; i++) { - dst[i] = static_cast<uint64_t>(static_cast<uint32_t>(values[i])); - } -#else - for (int i = 0; i < count; i++) { - dst[i] = static_cast<uint64_t>(static_cast<uint32_t>(values[i])); - } -#endif - scalar_count_ += count; - } - - /** - * Copy scalars from another Arg's scalar array. - * Useful when multiple tasks share the same scalar data (e.g., block indices). - */ - void copy_scalars_from(const Arg &src, int src_offset, int count) { - if (src_offset + count > src.scalar_count_) { - set_error("Source scalar range out of bounds in copy_scalars_from"); - return; - } - if (scalar_count_ + count > MAX_SCALAR_ARGS) { - set_error("Too many scalar args (exceeds MAX_SCALAR_ARGS=32)"); - return; - } - memcpy(&scalars_[scalar_count_], &src.scalars_[src_offset], count * sizeof(uint64_t)); - scalar_count_ += count; - } -}; - -#endif // SRC_A2A3_RUNTIME_AICPU_BUILD_GRAPH_RUNTIME_PTO_TYPES_H_ diff --git a/src/a2a3/runtime/aicpu_build_graph/runtime/runtime.cpp b/src/a2a3/runtime/aicpu_build_graph/runtime/runtime.cpp deleted file mode 100644 index 5d8886cdf..000000000 --- a/src/a2a3/runtime/aicpu_build_graph/runtime/runtime.cpp +++ /dev/null @@ -1,146 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -/** - * Runtime Class - Implementation - * - * Device execution and handshake control. - * Task graph construction is handled by PTO2Runtime. - */ - -#include "runtime.h" - -#include "common/unified_log.h" -#include "pto_runtime2_types.h" -#include "pto_shared_memory.h" - -// ============================================================================= -// Constructor -// ============================================================================= - -Runtime::Runtime() { - // NOTE: host_api is initialized in InitRuntime() (host-only code) - // because the CApi functions don't exist when compiled for device.
- - // Initialize handshake buffers - memset(workers, 0, sizeof(workers)); - worker_count = 0; - sche_cpu_num = 1; - ready_queue_shards = RUNTIME_DEFAULT_READY_QUEUE_SHARDS; - task_window_size = 0; - heap_size = 0; - dep_pool_size = 0; - orch_to_sched = false; - - // Initialize tensor pairs - tensor_pair_count = 0; - - // Initialize device orchestration state - orch_built_on_host_ = true; - gm_sm_ptr_ = nullptr; - gm_heap_ptr_ = nullptr; - slot_states_ptr_ = nullptr; - orch_args_storage_.clear(); - - // Initialize device orchestration SO binary - dev_orch_so_addr_ = 0; - dev_orch_so_size_ = 0; - has_new_orch_so_ = false; - - // Initialize kernel binary tracking - registered_kernel_count_ = 0; - - // Initialize function address mapping - for (int i = 0; i < RUNTIME_MAX_FUNC_ID; i++) { - func_id_to_addr_[i] = 0; - } -} - -// ============================================================================= -// Tensor Pair Management -// ============================================================================= - -void Runtime::record_tensor_pair(void *host_ptr, void *dev_ptr, size_t size) { - if (tensor_pair_count >= RUNTIME_MAX_TENSOR_PAIRS) { - LOG_ERROR("[Runtime] Tensor pairs full (max=%d)", RUNTIME_MAX_TENSOR_PAIRS); - return; - } - tensor_pairs[tensor_pair_count].host_ptr = host_ptr; - tensor_pairs[tensor_pair_count].dev_ptr = dev_ptr; - tensor_pairs[tensor_pair_count].size = size; - tensor_pair_count++; - LOG_INFO("Recorded tensor pair: host=%p dev=%p size=%zu", host_ptr, dev_ptr, size); -} - -TensorPair *Runtime::get_tensor_pairs() { return tensor_pairs; } - -int Runtime::get_tensor_pair_count() const { return tensor_pair_count; } - -void Runtime::clear_tensor_pairs() { tensor_pair_count = 0; } - -// ============================================================================= -// Device orchestration -// ============================================================================= - -bool Runtime::get_orch_built_on_host() const { return orch_built_on_host_; } -void *Runtime::get_gm_sm_ptr() const { return gm_sm_ptr_; } -void *Runtime::get_gm_heap_ptr() const { return gm_heap_ptr_; } -const ChipStorageTaskArgs &Runtime::get_orch_args() const { return orch_args_storage_; } -void Runtime::set_orch_built_on_host(bool v) { orch_built_on_host_ = v; } -void Runtime::set_gm_sm_ptr(void *p) { gm_sm_ptr_ = p; } -void Runtime::set_gm_heap(void *p) { gm_heap_ptr_ = p; } -void Runtime::set_slot_states_ptr(void *p) { slot_states_ptr_ = p; } -void Runtime::set_orch_args(const ChipStorageTaskArgs &args) { orch_args_storage_ = args; } - -// Device orchestration SO metadata (bytes live in a separate device buffer -// owned by DeviceRunner; only the address/size/dirty-flag travels in Runtime). 
-void Runtime::set_dev_orch_so(uint64_t dev_addr, uint64_t size, bool is_new) { - dev_orch_so_addr_ = dev_addr; - dev_orch_so_size_ = size; - has_new_orch_so_ = is_new; -} - -uint64_t Runtime::get_dev_orch_so_addr() const { return dev_orch_so_addr_; } - -uint64_t Runtime::get_dev_orch_so_size() const { return dev_orch_so_size_; } - -bool Runtime::has_new_orch_so() const { return has_new_orch_so_; } - -uint64_t Runtime::get_function_bin_addr(int func_id) const { - if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) return 0; - return func_id_to_addr_[func_id]; -} - -void Runtime::set_function_bin_addr(int func_id, uint64_t addr) { - if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) { - LOG_ERROR("[Runtime] func_id=%d is out of range [0, %d)", func_id, RUNTIME_MAX_FUNC_ID); - return; - } - if (addr != 0 && func_id_to_addr_[func_id] == 0) { - if (registered_kernel_count_ < RUNTIME_MAX_FUNC_ID) { - registered_kernel_func_ids_[registered_kernel_count_++] = func_id; - } else { - LOG_ERROR( - "[Runtime] Registration limit reached (%d). Cannot track func_id=%d for cleanup.", RUNTIME_MAX_FUNC_ID, - func_id - ); - } - } - func_id_to_addr_[func_id] = addr; -} - -int Runtime::get_registered_kernel_count() const { return registered_kernel_count_; } - -int Runtime::get_registered_kernel_func_id(int index) const { - if (index < 0 || index >= registered_kernel_count_) return -1; - return registered_kernel_func_ids_[index]; -} - -void Runtime::clear_registered_kernels() { registered_kernel_count_ = 0; } diff --git a/src/a2a3/runtime/aicpu_build_graph/runtime/runtime.h b/src/a2a3/runtime/aicpu_build_graph/runtime/runtime.h deleted file mode 100644 index 340c78ae0..000000000 --- a/src/a2a3/runtime/aicpu_build_graph/runtime/runtime.h +++ /dev/null @@ -1,293 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -/** - * Runtime Class - Device Execution and Handshake Control - * - * This class manages device-side execution through AICPU-AICore handshake - * protocol. Task graph construction is handled by PTO2Runtime; this class - * only handles: - * - Handshake buffers for AICPU-AICore communication - * - Execution parameters (block_dim, sche_cpu_num) - * - Tensor pair management for host-device memory tracking - * - Device orchestration state (gm_sm_ptr_, orch_args_) - * - Function address mapping (func_id_to_addr_) - * - * Task dispatch uses PTO2DispatchPayload from PTO2 shared memory. 
- */ - -#ifndef SRC_A2A3_RUNTIME_AICPU_BUILD_GRAPH_RUNTIME_RUNTIME_H_ -#define SRC_A2A3_RUNTIME_AICPU_BUILD_GRAPH_RUNTIME_RUNTIME_H_ - -#include <cstddef> -#include <cstdint> -#include <cstdio> // for fprintf, printf -#include <cstring> // for memset - -#include "common/core_type.h" -#include "common/l2_perf_profiling.h" -#include "common/platform_config.h" -#include "pto2_dispatch_payload.h" -#include "task_args.h" - -// ============================================================================= -// Configuration Macros -// ============================================================================= - -#define RUNTIME_MAX_ARGS 128 -#define RUNTIME_MAX_WORKER 72 // 24 AIC + 48 AIV cores -#define RUNTIME_MAX_TENSOR_PAIRS 64 -#define RUNTIME_MAX_FUNC_ID 1024 -#define RUNTIME_MAX_ORCH_SO_SIZE (4 * 1024 * 1024) // 4 MB max for orchestration SO - -// Default ready queue shards: one shard per worker thread (total minus orchestrator) -constexpr int RUNTIME_DEFAULT_READY_QUEUE_SHARDS = PLATFORM_MAX_AICPU_THREADS - 1; - -// ============================================================================= -// Data Structures -// ============================================================================= - -/** - * Handshake Structure - Shared between Host, AICPU, and AICore - * - * This structure facilitates communication and synchronization between - * AICPU and AICore during task execution. - * - * Protocol State Machine: - * 1. Initialization: AICPU sets aicpu_ready=1 - * 2. Acknowledgment: AICore sets aicore_done=core_id+1 - * 3. Task Dispatch: AICPU writes DATA_MAIN_BASE after updating the per-core task pointer - * 4. Task Execution: AICore reads the dispatched task and executes - * 5. Task Completion: AICore writes FIN to COND; AICPU observes completion - * 6. Shutdown: AICPU sets control=1, AICore exits - * - * Each AICore instance has its own handshake buffer to enable concurrent - * task execution across multiple cores. - */ - -/** - * Handshake buffer for AICPU-AICore communication - * - * Each AICore has its own handshake buffer for synchronization with AICPU. - * The structure is cache-line aligned (64 bytes) to prevent false sharing - * between cores and optimize cache coherency operations.
- * - * enable_profiling_flag bit definitions (umbrella bitmask — "profiling" - * is the umbrella, each bit is a parallel diagnostics sub-feature): - * - bit0: tensor dump enabled - * - bit1: L2 swimlane enabled - * - bit2: PMU enabled - * - * Field Access Patterns: - * - aicpu_ready: Written by AICPU, read by AICore - * - aicore_done: Written by AICore, read by AICPU - * - task: Written by AICPU, read by AICore (0 = no task, non-zero = PTO2DispatchPayload*) - * - core_type: Written by AICPU, read by AICore (CoreType::AIC or CoreType::AIV) - * - enable_profiling_flag: Written by host/AICPU init, read by AICore (bitmask) - */ -struct Handshake { - volatile uint32_t aicpu_ready; // AICPU ready signal: 0=not ready, 1=ready - volatile uint32_t aicore_done; // AICore ready signal: 0=not ready, core_id+1=ready - volatile uint64_t task; // Task pointer: 0=no task, non-zero=PTO2DispatchPayload* - volatile CoreType core_type; // Core type: CoreType::AIC or CoreType::AIV - volatile uint64_t l2_perf_records_addr; // Performance records address - volatile uint32_t physical_core_id; // Physical core ID - volatile uint32_t aicpu_regs_ready; // AICPU register init done: 0=pending, 1=done - volatile uint32_t aicore_regs_ready; // AICore ID reported: 0=pending, 1=done - volatile uint32_t - enable_profiling_flag; // Umbrella diagnostics bitmask; bit0=dump_tensor, bit1=l2_swimlane, bit2=pmu -} __attribute__((aligned(64))); - -/** - * Tensor pair for tracking host-device memory mappings. - * Used for copy-back during finalize. - */ -struct TensorPair { - void *host_ptr; - void *dev_ptr; - size_t size; -}; - -/** - * Host API function pointers for device memory operations. - * Allows runtime to use pluggable device memory backends. - */ -struct HostApi { - void *(*device_malloc)(size_t size); - void (*device_free)(void *dev_ptr); - int (*copy_to_device)(void *dev_ptr, const void *host_ptr, size_t size); - int (*copy_from_device)(void *host_ptr, const void *dev_ptr, size_t size); - uint64_t (*upload_kernel_binary)(int func_id, const uint8_t *bin_data, size_t bin_size); - void (*remove_kernel_binary)(int func_id); -}; - -/** - * Task structure - Compatibility stub for platform layer - * - * RT2 uses PTO2DispatchPayload instead of Task for task dispatch. - * This stub exists only for API compatibility with device_runner.cpp. - * Since get_task_count() returns 0, this struct is never actually used. - */ -struct Task { - int func_id; - uint64_t function_bin_addr; -}; - -// ============================================================================= -// Runtime Class -// ============================================================================= - -/** - * Runtime class for device execution and handshake control - * - * This class manages AICPU-AICore communication through handshake buffers. - * Task graph construction is handled by PTO2Runtime; this class only handles - * execution control and device orchestration state. 
- */ -class Runtime { -public: - // Handshake buffers for AICPU-AICore communication - Handshake workers[RUNTIME_MAX_WORKER]; // Worker (AICore) handshake buffers - int worker_count; // Number of active workers - - // Execution parameters for AICPU scheduling - int sche_cpu_num; // Number of AICPU threads for scheduling - int ready_queue_shards; // Number of ready queue shards (1..MAX_AICPU_THREADS, default MAX-1) - - // Ring buffer size overrides (0 = use compile-time defaults) - uint64_t task_window_size; - uint64_t heap_size; - uint64_t dep_pool_size; - - // PTO2 integration: kernel_id -> GM function_bin_addr mapping - // NOTE: Made public for direct access from aicore code - uint64_t func_id_to_addr_[RUNTIME_MAX_FUNC_ID]; - - // Orchestrator-to-scheduler transition control - // When true, orchestrator threads convert to scheduler threads after orchestration completes. - // When false (default), orchestrator threads exit after orchestration without dispatching tasks. - // Controlled via PTO2_ORCH_TO_SCHED environment variable. - bool orch_to_sched; - -private: - // Tensor pairs for host-device memory tracking - TensorPair tensor_pairs[RUNTIME_MAX_TENSOR_PAIRS]; - int tensor_pair_count; - - // Kernel binary tracking for cleanup - int registered_kernel_func_ids_[RUNTIME_MAX_FUNC_ID]; - int registered_kernel_count_; - - // Device orchestration: when false, orchestration runs on device (thread 3) - bool orch_built_on_host_; - void *gm_sm_ptr_; // GM pointer to PTO2 shared memory (device) - void *gm_heap_ptr_; // GM heap for orchestrator output buffers (device) - void *slot_states_ptr_; // Pointer to PTO2TaskSlotState array (scheduler-private, for profiling) - ChipStorageTaskArgs orch_args_storage_; // Copy of args for device - - // Device orchestration SO (for dlopen on AICPU thread 3). - // Bytes live in a separate device buffer owned by DeviceRunner; only the - // metadata travels in Runtime. `has_new_orch_so_` tells AICPU to reload. - uint64_t dev_orch_so_addr_; - uint64_t dev_orch_so_size_; - bool has_new_orch_so_; - -public: - /** - * Constructor - zero-initialize all arrays - */ - Runtime(); - - // ========================================================================= - // Tensor Pair Management - // ========================================================================= - - /** - * Record a host-device tensor pair for copy-back during finalize. - */ - void record_tensor_pair(void *host_ptr, void *dev_ptr, size_t size); - - /** - * Get pointer to tensor pairs array. - */ - TensorPair *get_tensor_pairs(); - - /** - * Get number of recorded tensor pairs. - */ - int get_tensor_pair_count() const; - - /** - * Clear all recorded tensor pairs. 
- */ - void clear_tensor_pairs(); - - // ========================================================================= - // Performance Profiling - // ========================================================================= - - // ========================================================================= - // Device orchestration (for AICPU thread 3) - // ========================================================================= - - bool get_orch_built_on_host() const; - void *get_gm_sm_ptr() const; - void *get_gm_heap_ptr() const; - const ChipStorageTaskArgs &get_orch_args() const; - void set_orch_built_on_host(bool v); - void set_gm_sm_ptr(void *p); - void set_gm_heap(void *p); - void set_slot_states_ptr(void *p); - void set_orch_args(const ChipStorageTaskArgs &args); - - // Device orchestration SO binary (for dlopen on AICPU thread 3) - void set_dev_orch_so(uint64_t dev_addr, uint64_t size, bool is_new); - uint64_t get_dev_orch_so_addr() const; - uint64_t get_dev_orch_so_size() const; - bool has_new_orch_so() const; - - uint64_t get_function_bin_addr(int func_id) const; - void set_function_bin_addr(int func_id, uint64_t addr); - - int get_registered_kernel_count() const; - int get_registered_kernel_func_id(int index) const; - void clear_registered_kernels(); - - // ========================================================================= - // Deprecated API (for platform compatibility, always returns 0/nullptr) - // Task graph is now managed by PTO2Runtime, not Runtime - // ========================================================================= - - /** @deprecated Task count is now in PTO2 shared memory */ - int get_task_count() const { return 0; } - - /** @deprecated RT2 uses PTO2DispatchPayload, not Task. Always returns nullptr. */ - Task *get_task(int) { return nullptr; } - - /** @deprecated Use PTO2 dispatch mode */ - bool get_use_pto2_dispatch() const { return true; } - - /** @deprecated Use PTO2 dispatch mode */ - void set_use_pto2_dispatch(bool) {} - - // ========================================================================= - // Host API (host-only, not copied to device) - // ========================================================================= - - // Host API function pointers for device memory operations - // NOTE: Placed at end of class to avoid affecting device memory layout - HostApi host_api; - - // Host-only staging for orchestration SO; consumed by DeviceRunner. - const void *pending_orch_so_data_{nullptr}; - size_t pending_orch_so_size_{0}; -}; - -#endif // SRC_A2A3_RUNTIME_AICPU_BUILD_GRAPH_RUNTIME_RUNTIME_H_ diff --git a/src/a2a3/runtime/aicpu_build_graph/runtime/tensor.h b/src/a2a3/runtime/aicpu_build_graph/runtime/tensor.h deleted file mode 100644 index 15af8992b..000000000 --- a/src/a2a3/runtime/aicpu_build_graph/runtime/tensor.h +++ /dev/null @@ -1,409 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. 
- * ----------------------------------------------------------------------------------------------------------- - */ - -#pragma once - -#include <cstddef> -#include <cstdint> - -#include <cstring> -#include <sstream> -#include <string> -#include <utility> - -#include "common.h" -#include "data_type.h" - -constexpr int RUNTIME_MAX_TENSOR_DIMS = 5; - -/** - * Buffer Handle - * - * Represents a device memory buffer with address and total size in bytes. - * This is the underlying memory allocation that a Tensor describes access patterns for. - */ -struct PTOBufferHandle { - uint64_t addr; // Device memory address (bytes) - uint64_t size; // Total buffer size in bytes -}; - -enum class OverlapStatus { - NO_OVERLAP, - COVERED, - OTHER, -}; - -struct Segment { - uint64_t begin; - uint64_t end; - - bool line_segment_intersection(const Segment &other) const { return end > other.begin && other.end > begin; } - bool contains(const Segment &other) const { return begin <= other.begin && other.end <= end; } -}; - -/** - * TensorCreateInfo — metadata for runtime-allocated output tensors. - * - * Captures shape, dtype, and buffer size without allocating memory. - * Passed by value to Arg::add_output(); the runtime allocates from the heap - * and materializes a full Tensor via Tensor::init_from_create_info(). - */ -struct TensorCreateInfo { - DataType dtype; - uint32_t ndims; - uint32_t raw_shapes[RUNTIME_MAX_TENSOR_DIMS]; - bool manual_dep; - bool has_initial_value; - uint64_t initial_value; - - TensorCreateInfo( - const uint32_t shapes[], uint32_t ndims, DataType dtype = DataType::FLOAT32, bool manual_dep = false - ) : - dtype(dtype), - ndims(ndims), - manual_dep(manual_dep), - has_initial_value(false), - initial_value(0) { - for (uint32_t i = 0; i < ndims; i++) { - raw_shapes[i] = shapes[i]; - } - } - - void set_initial_value(uint64_t value) { - has_initial_value = true; - initial_value = value; - } - - uint64_t buffer_size_bytes() const { - uint64_t total = 1; - for (uint32_t i = 0; i < ndims; i++) { - total *= raw_shapes[i]; - } - return total * get_element_size(dtype); - } -}; - -/** - * Tensor descriptor for Task input/output (128B = 2 cache lines) - * - * Describes a memory access pattern on Global Memory (GM) using - * raw_shapes (underlying buffer dimensions), shapes (current view dimensions), - * and offsets (multi-dimensional offset into the buffer). - * - * - `buffer` contains the underlying memory allocation (addr in bytes, size in bytes) - * - `raw_shapes[]`, `shapes[]`, `offsets[]` are in ELEMENTS - * - `dtype` specifies element type for interpreting buffer contents - * - * Fast-path flags (both on cache line 1): - * - is_all_offset_zero: when true, offsets[] are implicitly zero — skip offset read/write - * - is_raw_eq_shapes: when true, raw_shapes[] == shapes[] — skip raw_shapes read/write, - * use shapes[] wherever raw_shapes would be needed - * - * When BOTH flags are true, cache line 2 is never accessed. - * - * Layout: cache line 1 holds hot-path fields (buffer, start_offset, version, - * dtype, ndims, flags, shapes); cache line 2 holds warm-path fields (raw_shapes, offsets).
- */ -struct alignas(64) Tensor { - // === Cache line 1 (64B) — hot path === - PTOBufferHandle buffer; // Underlying memory buffer (addr in bytes, size in bytes) - uint64_t start_offset; // Cached 1D element offset (precomputed from raw_shapes + offsets); only computed before - // in-core execution, unused during orchestration - int32_t version; // Tensor version for overlap detection - DataType dtype; // Data type of tensor elements - uint32_t ndims; // Number of dimensions used - bool is_all_offset_zero; // True when all offsets[] are zero (skip offset read/write) - bool is_raw_eq_shapes; // True when raw_shapes[] == shapes[] (skip raw_shapes read/write) - bool manual_dep; // True when dependency is managed manually (skip tensormap lookup/insert) - uint32_t shapes[RUNTIME_MAX_TENSOR_DIMS]; // Current view shape per dimension - uint32_t __padding__; - - // === Cache line 2 (64B) — warm path === - uint32_t raw_shapes[RUNTIME_MAX_TENSOR_DIMS]; // Underlying buffer shape per dimension - uint32_t offsets[RUNTIME_MAX_TENSOR_DIMS]; // Multi-dimensional offset per dimension - - Tensor() = default; - Tensor(const Tensor &) = default; - Tensor &operator=(const Tensor &) = default; - Tensor(Tensor &&) = default; - Tensor &operator=(Tensor &&) = default; - ~Tensor() = default; - - /// Return the effective raw_shapes pointer (shapes[] when is_raw_eq_shapes). - /// Avoids cache line 2 access for the common case. - const uint32_t *get_raw_shapes() const { return is_raw_eq_shapes ? shapes : raw_shapes; } - - Tensor( - void *addr, uint64_t buffer_size_bytes, const uint32_t raw_shapes[], const uint32_t shapes[], - const uint32_t offsets[], uint32_t ndims, DataType dtype, int32_t version, bool is_all_offset_zero = false, - bool is_raw_eq_shapes = false, bool manual_dep = false - ) { - init( - addr, buffer_size_bytes, raw_shapes, shapes, offsets, ndims, dtype, version, is_all_offset_zero, - is_raw_eq_shapes, manual_dep - ); - } - - // --- Initialization --- - void init( - void *addr, uint64_t buffer_size_bytes, const uint32_t in_raw_shapes[], const uint32_t in_shapes[], - const uint32_t in_offsets[], uint32_t in_ndims, DataType in_dtype, int32_t in_version, - bool in_is_all_offset_zero = false, bool in_is_raw_eq_shapes = false, bool in_manual_dep = false - ) { - buffer = {reinterpret_cast<uint64_t>(addr), buffer_size_bytes}; - ndims = in_ndims; - dtype = in_dtype; - version = in_version; - is_all_offset_zero = in_is_all_offset_zero; - is_raw_eq_shapes = in_is_raw_eq_shapes; - manual_dep = in_manual_dep; - for (uint32_t i = 0; i < in_ndims; i++) { - shapes[i] = in_shapes[i]; - } - if (!in_is_raw_eq_shapes) { - for (uint32_t i = 0; i < in_ndims; i++) { - raw_shapes[i] = in_raw_shapes[i]; - } - } - if (!in_is_all_offset_zero) { - for (uint32_t i = 0; i < in_ndims; i++) { - offsets[i] = in_offsets[i]; - } - } - } - - void init(const Tensor &other) { - memcpy(this, &other, 64); // fast copy cache line 1 - if (!other.is_raw_eq_shapes) { - for (uint32_t i = 0; i < ndims; i++) { - raw_shapes[i] = other.raw_shapes[i]; - } - } - if (!other.is_all_offset_zero) { - for (uint32_t i = 0; i < ndims; i++) { - offsets[i] = other.offsets[i]; - } - } - } - - void init_with_view( - const Tensor &other, const uint32_t view_shapes[], const uint32_t view_offsets[], bool in_manual_dep = false - ) { - buffer = other.buffer; - ndims = other.ndims; - dtype = other.dtype; - version = other.version; - manual_dep = in_manual_dep; - // view always diverges shapes from raw_shapes, so is_raw_eq_shapes = false.
- // Read parent's effective raw_shapes (avoids parent cache line 2 when parent is_raw_eq_shapes). - is_raw_eq_shapes = false; - const uint32_t *parent_raw = other.get_raw_shapes(); - for (uint32_t i = 0; i < ndims; i++) { - raw_shapes[i] = parent_raw[i]; - shapes[i] = view_shapes[i]; - } - // Compute offsets and zero-flag - bool all_zero = true; - if (other.is_all_offset_zero) { - for (uint32_t i = 0; i < ndims; i++) { - if (view_offsets[i] != 0) { - all_zero = false; - break; - } - } - if (!all_zero) { - for (uint32_t i = 0; i < ndims; i++) { - offsets[i] = view_offsets[i]; - } - } - } else { - all_zero = false; - for (uint32_t i = 0; i < ndims; i++) { - offsets[i] = other.offsets[i] + view_offsets[i]; - } - } - is_all_offset_zero = all_zero; - } - - // --- Operations --- - void update_start_offset() { - if (is_all_offset_zero) { - start_offset = 0; - return; - } - const uint32_t *rs = get_raw_shapes(); - uint64_t result = 0; - uint64_t stride = 1; - for (int i = static_cast<int>(ndims) - 1; i >= 0; i--) { - result += offsets[i] * stride; - stride *= rs[i]; - } - start_offset = result; - } - - void copy(const Tensor &other) { init(other); } - - Tensor view(const uint32_t view_shapes[], const uint32_t view_offsets[], bool manual_dep = false) const { - Tensor result; - result.init_with_view(*this, view_shapes, view_offsets, manual_dep); - return result; - } - - bool is_contiguous() const { - if (is_raw_eq_shapes || ndims == 0) { - return true; - } - for (uint32_t i = 1; i < ndims; i++) { - if (shapes[i] != raw_shapes[i]) { - return false; - } - } - return true; - } - - bool valid_reshape(const uint32_t new_shapes[], uint32_t new_ndims) const { - uint64_t x = numel(); - uint64_t y = 1; - for (uint32_t i = 0; i < new_ndims; i++) { - y *= new_shapes[i]; - } - return x == y; - } - - Tensor reshape(const uint32_t new_shapes[], uint32_t new_ndims, bool manual_dep = false) const { - debug_assert(valid_reshape(new_shapes, new_ndims)); - always_assert(is_contiguous()); - Tensor result; - result.copy(*this); - result.ndims = new_ndims; - result.is_all_offset_zero = true; - result.is_raw_eq_shapes = true; - result.manual_dep = manual_dep; - for (uint32_t i = 0; i < new_ndims; i++) { - result.shapes[i] = new_shapes[i]; - } - return result; - } - - bool valid_transpose(uint32_t x, uint32_t y) const { return x < ndims && y < ndims; } - - Tensor transpose(uint32_t x, uint32_t y, bool manual_dep = false) const { - debug_assert(valid_transpose(x, y)); - Tensor result; - result.copy(*this); - result.manual_dep = manual_dep; - // transpose swaps the same dims in both arrays, so equality is preserved - std::swap(result.shapes[x], result.shapes[y]); - if (!result.is_raw_eq_shapes) { - std::swap(result.raw_shapes[x], result.raw_shapes[y]); - } - if (!result.is_all_offset_zero) { - std::swap(result.offsets[x], result.offsets[y]); - } - return result; - } - - uint64_t numel() const { - if (ndims == 0) { - return 0; - } - uint64_t total = 1; - for (uint32_t i = 0; i < ndims; i++) { - total *= shapes[i]; - } - return total; - } - - bool is_same_memref(const Tensor &other) const { return buffer.addr == other.buffer.addr; } - - /// Materialize a TensorCreateInfo into this Tensor (fresh contiguous output).
-    void init_from_create_info(const struct TensorCreateInfo &ci, void *addr, int32_t version_val) {
-        init(
-            addr, ci.buffer_size_bytes(), ci.raw_shapes, ci.raw_shapes, nullptr, ci.ndims, ci.dtype, version_val,
-            /*is_all_offset_zero=*/true,
-            /*is_raw_eq_shapes=*/true, ci.manual_dep
-        );
-    }
-
-    std::string dump() const {
-        std::stringstream ss;
-        std::string indent = " ";
-        ss << "{" << '\n';
-        ss << indent << "buffer.addr: " << buffer.addr << '\n';
-        ss << indent << "buffer.size: " << buffer.size << " bytes" << '\n';
-        ss << indent << "dtype: " << get_dtype_name(dtype) << '\n';
-        ss << indent << "ndims: " << ndims << '\n';
-        ss << indent << "version: " << version << '\n';
-
-        const uint32_t *rs = get_raw_shapes();
-        ss << indent << "raw_shapes: [";
-        for (uint32_t i = 0; i < ndims; i++) {
-            if (i > 0) {
-                ss << ", ";
-            }
-            ss << rs[i];
-        }
-        ss << "]" << '\n';
-        ss << indent << "shapes: [";
-        for (uint32_t i = 0; i < ndims; i++) {
-            if (i > 0) {
-                ss << ", ";
-            }
-            ss << shapes[i];
-        }
-        ss << "]" << '\n';
-        ss << indent << "offsets: [";
-        for (uint32_t i = 0; i < ndims; i++) {
-            if (i > 0) {
-                ss << ", ";
-            }
-            ss << (is_all_offset_zero ? 0u : offsets[i]);
-        }
-        ss << "]" << '\n';
-        ss << "}" << '\n';
-        return ss.str();
-    }
-};
-
-static_assert(sizeof(Tensor) == 128, "Tensor must be exactly 2 cache lines (128 bytes)");
-static_assert(offsetof(Tensor, raw_shapes) == 64);
-
-using TensorData = Tensor;
-
-// =============================================================================
-// Factory Helpers
-// =============================================================================
-/**
- * Create a Tensor for pre-allocated external memory.
- */
-static inline Tensor make_tensor_external(
-    void *addr, const uint32_t shapes[], uint32_t ndims, DataType dtype = DataType::FLOAT32, bool manual_dep = false,
-    int32_t version = 0
-) {
-    static uint32_t zero_offsets[RUNTIME_MAX_TENSOR_DIMS] = {};
-    uint64_t total = 1;
-    for (uint32_t i = 0; i < ndims; i++) {
-        total *= shapes[i];
-    }
-    return {
-        addr,
-        total * get_element_size(dtype),
-        shapes,
-        shapes,
-        zero_offsets,
-        ndims,
-        dtype,
-        version,
-        /*is_all_offset_zero=*/true,
-        /*is_raw_eq_shapes=*/true,
-        manual_dep
-    };
-}
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md
index a1d1f1540..8d6b97f13 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md
@@ -35,7 +35,7 @@ PTO2 (Parallel Task Orchestration v2) is a runtime system for executing task gra
 
 ## 1. Runtime Variants
 
-Three runtime backends exist under `src/runtime/`, each representing a different orchestration and scheduling strategy.
+Two runtime backends exist under `src/runtime/`, each representing a different orchestration and scheduling strategy.
 
 ### 1.1 host_build_graph
 
@@ -45,15 +45,7 @@ The host builds the complete task graph before launching device execution. The o
 - **Scheduling**: AICPU receives the pre-built graph and dispatches tasks by traversing dependencies
 - **Use case**: development and debugging; no device-side orchestration overhead
 
-### 1.2 aicpu_build_graph
-
-The orchestration runs on an AICPU thread, building the task graph on device. Supports concurrent build + schedule (`build_mode=1`).
- -- **Task storage**: same `Task[]` array as host_build_graph -- **AicpuBuildApi**: `add_task`, `add_successor_conditional`, `publish_task`, `device_malloc` -- **Use case**: reduced host→device data transfer; graph can depend on device-side data - -### 1.3 tensormap_and_ringbuffer (PTO2) +### 1.2 tensormap_and_ringbuffer (PTO2) The primary production runtime. Uses ring buffers for task slots and output memory, with a TensorMap for automatic dependency tracking. diff --git a/src/a5/platform/include/aicore/l2_perf_collector_aicore.h b/src/a5/platform/include/aicore/l2_perf_collector_aicore.h index dbc1aa512..00a30af1e 100644 --- a/src/a5/platform/include/aicore/l2_perf_collector_aicore.h +++ b/src/a5/platform/include/aicore/l2_perf_collector_aicore.h @@ -38,8 +38,8 @@ * Buffer management and final commit are handled by AICPU. * * AICore writes L2PerfRecord.task_id as the register dispatch token (low 32 bits, zero-extended). - * For multi-ring runtimes (tensormap_and_ringbuffer, aicpu_build_graph), AICPU overwrites - * with the full (ring_id << 32) | local_id encoding after handshake match. + * For tensormap_and_ringbuffer, AICPU overwrites with the full (ring_id << 32) | local_id + * encoding after handshake match. * * @param l2_perf_buf Performance buffer pointer * @param task_id Register dispatch id (DATA_MAIN_BASE), stored in task_id low 32 bits diff --git a/src/a5/platform/include/aicpu/l2_perf_collector_aicpu.h b/src/a5/platform/include/aicpu/l2_perf_collector_aicpu.h index 2eecb6a41..a6a5e6f68 100644 --- a/src/a5/platform/include/aicpu/l2_perf_collector_aicpu.h +++ b/src/a5/platform/include/aicpu/l2_perf_collector_aicpu.h @@ -109,7 +109,7 @@ void l2_perf_aicpu_init_phase_profiling(int num_sched_threads); * @param loop_iter Current loop iteration number * @param tasks_processed Number of tasks processed in this batch (scheduler phases), or * full PTO2 task_id encoding (ring_id << 32) | local_id (orchestrator - * phases in multi-ring runtimes: tensormap_and_ringbuffer, aicpu_build_graph) + * phases in tensormap_and_ringbuffer) */ void l2_perf_aicpu_record_phase( int thread_idx, AicpuPhaseId phase_id, uint64_t start_time, uint64_t end_time, uint32_t loop_iter, @@ -146,9 +146,8 @@ void l2_perf_aicpu_set_orch_thread_idx(int thread_idx); * @param start_time Phase start timestamp * @param end_time Phase end timestamp * @param submit_idx Task submission index (acts as loop_iter) - * @param task_id Task identifier. For multi-ring runtimes (tensormap_and_ringbuffer, aicpu_build_graph), this is the - * full PTO2 encoding: (ring_id << 32) | local_id, enabling cross-view correlation between orchestrator and scheduler - * swimlanes. + * @param task_id Task identifier. For tensormap_and_ringbuffer, this is the full PTO2 encoding: + * (ring_id << 32) | local_id, enabling cross-view correlation between orchestrator and scheduler swimlanes. */ void l2_perf_aicpu_record_orch_phase( AicpuPhaseId phase_id, uint64_t start_time, uint64_t end_time, uint32_t submit_idx, uint64_t task_id diff --git a/src/a5/platform/include/common/l2_perf_profiling.h b/src/a5/platform/include/common/l2_perf_profiling.h index 7c26d7c23..98168375a 100644 --- a/src/a5/platform/include/common/l2_perf_profiling.h +++ b/src/a5/platform/include/common/l2_perf_profiling.h @@ -61,8 +61,8 @@ struct L2PerfRecord { uint64_t finish_time; // AICPU timestamp: when AICPU observed task completion // AICore writes the register dispatch token (low 32 bits only) zero-extended into task_id. 
- // For multi-ring runtimes (tensormap_and_ringbuffer, aicpu_build_graph), AICPU overwrites - // with the full PTO2 encoding (ring_id << 32) | local_id after FIN/perf row match. + // For tensormap_and_ringbuffer, AICPU overwrites with the full PTO2 encoding + // (ring_id << 32) | local_id after FIN/perf row match. // For host_build_graph, task_id stays as the plain integer task index (ring_id = 0). uint64_t task_id; uint32_t func_id; // Kernel function identifier @@ -140,8 +140,8 @@ struct AicpuPhaseRecord { uint32_t loop_iter; // Loop iteration number AicpuPhaseId phase_id; // Phase type union { - uint64_t task_id; // Multi-ring runtimes (tensormap_and_ringbuffer, aicpu_build_graph): - // full PTO2 encoding (ring_id << 32) | local_id for cross-view correlation. + uint64_t task_id; // tensormap_and_ringbuffer: full PTO2 encoding + // (ring_id << 32) | local_id for cross-view correlation. uint64_t tasks_processed; // Scheduler phases: number of tasks processed in this batch }; }; diff --git a/src/a5/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md b/src/a5/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md index 4fd07bee2..1f151dfb5 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md +++ b/src/a5/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md @@ -35,7 +35,7 @@ PTO2 (Parallel Task Orchestration v2) is a runtime system for executing task gra ## 1. Runtime Variants -Three runtime backends exist under `src/runtime/`, each representing a different orchestration and scheduling strategy. +Two runtime backends exist under `src/runtime/`, each representing a different orchestration and scheduling strategy. ### 1.1 host_build_graph @@ -45,15 +45,7 @@ The host builds the complete task graph before launching device execution. The o - **Scheduling**: AICPU receives the pre-built graph and dispatches tasks by traversing dependencies - **Use case**: development and debugging; no device-side orchestration overhead -### 1.2 aicpu_build_graph - -The orchestration runs on an AICPU thread, building the task graph on device. Supports concurrent build + schedule (`build_mode=1`). - -- **Task storage**: same `Task[]` array as host_build_graph -- **AicpuBuildApi**: `add_task`, `add_successor_conditional`, `publish_task`, `device_malloc` -- **Use case**: reduced host→device data transfer; graph can depend on device-side data - -### 1.3 tensormap_and_ringbuffer (PTO2) +### 1.2 tensormap_and_ringbuffer (PTO2) The primary production runtime. Uses ring buffers for task slots and output memory, with a TensorMap for automatic dependency tracking. 
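[Reference note for reviewers: the deleted `tensor.h` earlier in this patch is the only self-contained description of the two-cache-line `Tensor` and its view arithmetic. The sketch below restates the offset linearization in plain C++ so the deleted examples further down remain readable; `linear_offset`, the tile constants, and the `main` harness are illustrative only, distilled from the removed `update_start_offset` and `init_with_view`, not an API that remains in the tree.]

```cpp
#include <cstdint>

// Row-major linearization as in the deleted Tensor::update_start_offset:
// walk dimensions right to left, scaling each offset by the product of the
// trailing *raw* (underlying buffer) extents, not the view extents.
static uint64_t linear_offset(const uint32_t offsets[], const uint32_t raw_shapes[], uint32_t ndims) {
    uint64_t result = 0;
    uint64_t stride = 1;
    for (int i = static_cast<int>(ndims) - 1; i >= 0; i--) {
        result += static_cast<uint64_t>(offsets[i]) * stride;
        stride *= raw_shapes[i];
    }
    return result;
}

int main() {
    // A 4x4 grid of 64x64 tiles stored tile-first, as in the deleted bgemm
    // example below: selecting tile (m=1, k=2) means offsets {1, 2, 0, 0}
    // against raw shape {4, 4, 64, 64}, i.e. (1*4 + 2) * 64 * 64 = 24576
    // elements. Nested views simply add their per-dimension offsets
    // (init_with_view) before this linearization runs.
    const uint32_t raw[4] = {4, 4, 64, 64};
    const uint32_t off[4] = {1, 2, 0, 0};
    return linear_offset(off, raw, 4) == 24576 ? 0 : 1;
}
```

The `is_all_offset_zero` and `is_raw_eq_shapes` flags in the struct exist precisely to skip this computation and the second cache line when a tensor is a trivial full view.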
diff --git a/tests/conftest.py b/tests/conftest.py index 6b2baa0e1..fd106ebe8 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -127,11 +127,6 @@ def pytest_collection_modifyitems(session, config, items): available_runtimes = discover_runtimes_for_arch(arch) for item in items: - # Skip aicpu_build_graph tests for architectures that don't have it - if "test_discovers_aicpu_build_graph" in item.nodeid: - if "aicpu_build_graph" not in available_runtimes: - item.add_marker(pytest.mark.skip(reason=f"aicpu_build_graph not available for {arch} architecture")) - # Skip tensormap_and_ringbuffer tests for architectures that don't have it if "tensormap_and_ringbuffer" in item.nodeid: if "tensormap_and_ringbuffer" not in available_runtimes: diff --git a/tests/st/a2a3/aicpu_build_graph/bgemm/README.md b/tests/st/a2a3/aicpu_build_graph/bgemm/README.md deleted file mode 100644 index 504c14c5c..000000000 --- a/tests/st/a2a3/aicpu_build_graph/bgemm/README.md +++ /dev/null @@ -1,86 +0,0 @@ -# BGEMM Example (AICPU Build Graph Runtime) - -Tiled matrix multiplication example demonstrating Cube (AIC) and Vector (AIV) core cooperation. - -## Computation - -```text -C = A @ B -``` - -Tiled computation with 4x4x4 grid: - -- Tile size: 64 x 64 -- Matrix A: 256 x 256 (4x4 tiles) -- Matrix B: 256 x 256 (4x4 tiles) -- Matrix C: 256 x 256 (4x4 tiles) - -## Task Graph - -For each output tile C[m,n]: - -```text -for k in [0, GRID_K): - P = A[m,k] @ B[k,n] (gemm_tile on Cube core) - C[m,n] = C[m,n] + P (tile_add on Vector core) -``` - -Dependencies: - -- gemm_tile → tile_add: P must be computed before accumulation -- tile_add[k] → gemm_tile[k+1]: K-dimension accumulation is sequential - -Total tasks: 128 (64 gemm + 64 add) - -## Kernels - -| Kernel | Core Type | Function | -| ------ | --------- | -------- | -| kernel_gemm_tile | AIC (Cube) | 64x64 matrix multiplication | -| kernel_tile_add | AIV (Vector) | 64x64 element-wise addition | - -## File Structure - -```text -bgemm/ -├── golden.py # Test specification -├── README.md # This file -└── kernels/ - ├── kernel_config.py # Kernel configuration - ├── orchestration/ - │ └── bgemm_orch.cpp # Task graph builder - ├── aic/ - │ └── kernel_gemm_tile.cpp # Cube core matmul kernel - └── aiv/ - └── kernel_tile_add.cpp # Vector core add kernel -``` - -## Technical Details - -### Memory Layout (Tile-First) - -```text -A: [BATCH, GRID_M, GRID_K, TILE_M, TILE_K] -B: [BATCH, GRID_K, GRID_N, TILE_K, TILE_N] -C: [BATCH, GRID_M, GRID_N, TILE_M, TILE_N] -``` - -### Runtime Characteristics - -- Task graph is built on AICPU -- Framework automatically manages I/O tensor device memory -- Orchestration function allocates intermediate buffers via AicpuBuildApi - -### Kernel Implementation - -Both kernels use PTO ISA tile operations: - -- **kernel_gemm_tile**: Uses `TileLeft`, `TileRight`, `TileAcc` types with `TLOAD`, `TMOV`, `TMATMUL`, `TSTORE` instructions -- **kernel_tile_add**: Uses `TileVec` type with `TLOAD`, `TADD`, `TSTORE` instructions - -### Pipeline Synchronization - -Kernels include proper pipeline synchronization: - -- `PIPE_MTE2` → `PIPE_M`/`PIPE_V`: After loads, before compute -- `PIPE_M`/`PIPE_V` → `PIPE_MTE3`: After compute, before store diff --git a/tests/st/a2a3/aicpu_build_graph/bgemm/kernels/aic/kernel_gemm_tile.cpp b/tests/st/a2a3/aicpu_build_graph/bgemm/kernels/aic/kernel_gemm_tile.cpp deleted file mode 100644 index 9682a5278..000000000 --- a/tests/st/a2a3/aicpu_build_graph/bgemm/kernels/aic/kernel_gemm_tile.cpp +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Copyright 
(c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -/** - * Tile-based Matrix Multiplication Kernel (Cube Core) - * - * Computes: output = input_a @ input_b (64x64 tile matmul) - * Uses TMATMUL instruction - * - * Args (Tensor*): - * args[0] = input_a (INPUT) - * args[1] = input_b (INPUT) - * args[2] = output (OUTPUT) - */ - -#include -#include -#include -#include - -#include "tensor.h" - -using namespace pto; - -#include "pipe_sync.h" - -#ifndef __gm__ -#define __gm__ -#endif - -#ifndef __aicore__ -#define __aicore__ [aicore] -#endif - -template -AICORE constexpr inline T CeilAlign(T num_1, T num_2) { - if (num_2 == 0) { - return 0; - } - return (num_1 + num_2 - 1) / num_2 * num_2; -} - -static __aicore__ void -gemm_tile_impl(__gm__ Tensor *input_a_tensor, __gm__ Tensor *input_b_tensor, __gm__ Tensor *output_tensor) { - __gm__ float *input_a = - reinterpret_cast<__gm__ float *>(input_a_tensor->buffer.addr) + input_a_tensor->start_offset; - __gm__ float *input_b = - reinterpret_cast<__gm__ float *>(input_b_tensor->buffer.addr) + input_b_tensor->start_offset; - __gm__ float *output = reinterpret_cast<__gm__ float *>(output_tensor->buffer.addr) + output_tensor->start_offset; - - constexpr int TILE = 64; - constexpr int blockAlign = C0_SIZE_BYTE / sizeof(float); - constexpr int M = CeilAlign(TILE, 16); - constexpr int K = CeilAlign(TILE, blockAlign); - constexpr int N = CeilAlign(TILE, blockAlign); - - using GlobalDataA = - GlobalTensor, Stride<1 * TILE * TILE, 1 * TILE * TILE, TILE * TILE, TILE, 1>>; - using GlobalDataB = - GlobalTensor, Stride<1 * TILE * TILE, 1 * TILE * TILE, TILE * TILE, TILE, 1>>; - using GlobalDataC = - GlobalTensor, Stride<1 * TILE * TILE, 1 * TILE * TILE, TILE * TILE, TILE, 1>>; - - GlobalDataA src0Global(input_a); - GlobalDataB src1Global(input_b); - GlobalDataC dstGlobal(output); - - using TileMatA = Tile; - using TileMatB = Tile; - - using LeftTile = TileLeft; - using RightTile = TileRight; - using AccTile = TileAcc; - - TileMatA aMatTile; - TileMatB bMatTile; - TASSIGN(aMatTile, 0x0); - TASSIGN(bMatTile, 0x20000); - - LeftTile aTile; - RightTile bTile; - AccTile cTile; - TASSIGN(aTile, 0x0); - TASSIGN(bTile, 0x0); - TASSIGN(cTile, 0x0); - - TLOAD(aMatTile, src0Global); - TLOAD(bMatTile, src1Global); - - set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); - wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); - - TMOV(aTile, aMatTile); - TMOV(bTile, bMatTile); - - set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); - wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); - - TMATMUL(cTile, aTile, bTile); - - set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); - wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); - - TSTORE(dstGlobal, cTile); - - pipe_sync(); -} - -extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { - __gm__ Tensor *input_a = reinterpret_cast<__gm__ Tensor *>(args[0]); - __gm__ Tensor *input_b = reinterpret_cast<__gm__ Tensor 
*>(args[1]); - __gm__ Tensor *output = reinterpret_cast<__gm__ Tensor *>(args[2]); - - gemm_tile_impl(input_a, input_b, output); -} diff --git a/tests/st/a2a3/aicpu_build_graph/bgemm/kernels/aiv/kernel_tile_add.cpp b/tests/st/a2a3/aicpu_build_graph/bgemm/kernels/aiv/kernel_tile_add.cpp deleted file mode 100644 index 123c1abc1..000000000 --- a/tests/st/a2a3/aicpu_build_graph/bgemm/kernels/aiv/kernel_tile_add.cpp +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -/** - * Tile-based Element-wise Addition Kernel (Vector Core) - INOUT Pattern - * - * Computes: C_tile = C_tile + P (64x64 tile accumulation) - * Uses TADD instruction - * - * Args (Tensor*): - * args[0] = C_tile (INOUT: read + write accumulator) - * args[1] = P (INPUT: matmul result to accumulate) - */ - -#include -#include -#include - -#include "tensor.h" - -using namespace pto; - -#include "pipe_sync.h" - -#ifndef __gm__ -#define __gm__ -#endif - -#ifndef __aicore__ -#define __aicore__ [aicore] -#endif - -extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { - __gm__ Tensor *c_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]); - __gm__ Tensor *p_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]); - - __gm__ float *c_ptr = reinterpret_cast<__gm__ float *>(c_tensor->buffer.addr) + c_tensor->start_offset; - __gm__ float *p_ptr = reinterpret_cast<__gm__ float *>(p_tensor->buffer.addr) + p_tensor->start_offset; - - constexpr int TILE = 64; - - using DynShapeDim5 = Shape<1, 1, 1, TILE, TILE>; - using DynStridDim5 = Stride<1, 1, 1, TILE, 1>; - using GlobalData = GlobalTensor; - using TileData = Tile; - - TileData cTile(TILE, TILE); - TileData pTile(TILE, TILE); - TileData outTile(TILE, TILE); - TASSIGN(cTile, 0x0); - TASSIGN(pTile, 0x10000); - TASSIGN(outTile, 0x20000); - - GlobalData cGlobal(c_ptr); - GlobalData pGlobal(p_ptr); - GlobalData outGlobal(c_ptr); // write back to same C location - - TLOAD(cTile, cGlobal); - TLOAD(pTile, pGlobal); - set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - TADD(outTile, cTile, pTile); - set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - TSTORE(outGlobal, outTile); - - pipe_sync(); -} diff --git a/tests/st/a2a3/aicpu_build_graph/bgemm/kernels/orchestration/bgemm_orch.cpp b/tests/st/a2a3/aicpu_build_graph/bgemm/kernels/orchestration/bgemm_orch.cpp deleted file mode 100644 index acae7cfa6..000000000 --- a/tests/st/a2a3/aicpu_build_graph/bgemm/kernels/orchestration/bgemm_orch.cpp +++ /dev/null @@ -1,137 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. 
You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -/** - * BGEMM Orchestration Function (aicpu_build_graph Runtime) - * - * Builds the task graph for tiled matrix multiplication: C = A @ B - * - * Configuration: - * - Tile size: 64 x 64 - * - Grid: 4 x 4 x 4 (GRID_M x GRID_K x GRID_N) - * - Batch: 1 - * - * Memory layout (tile-first, 5D flattened): - * A: [BATCH, GRID_M, GRID_K, TILE, TILE] - * B: [BATCH, GRID_K, GRID_N, TILE, TILE] - * C: [BATCH, GRID_M, GRID_N, TILE, TILE] - * - * Task graph per output tile C[batch, m, n]: - * for k in [0, GRID_K): - * P = A[m,k] @ B[k,n] (gemm_tile on Cube core, func_id=0) - * C[m,n] = C[m,n] + P (tile_add on Vector core, func_id=1) - * - * Dependencies are explicit via rt_add_dependency: - * - gemm(k) -> add(k): add reads P which gemm produces - * - add(k-1) -> add(k): add reads/writes C_view (K accumulation chain) - * - * Arg layout: [A, B, C] — shape/dtype/size in ContinuousTensor metadata - */ - -#include -#include - -#include "pto_orchestration_api.h" // NOLINT(build/include_subdir) - -#define FUNC_GEMM_TILE 0 -#define FUNC_TILE_ADD 1 - -static constexpr int TILE = 64; -static constexpr int GRID_M = 4; -static constexpr int GRID_K = 4; -static constexpr int GRID_N = 4; -static constexpr int BATCH = 1; - -static constexpr uint32_t TILE_ELEMS = TILE * TILE; -static constexpr uint64_t TILE_BYTES = TILE_ELEMS * sizeof(float); - -extern "C" { - -__attribute__((visibility("default"))) PTO2OrchestrationConfig -aicpu_orchestration_config(const ChipStorageTaskArgs &orch_args) { - (void)orch_args; // NOLINT(readability/casting) - return PTO2OrchestrationConfig{ - .expected_arg_count = 3, - }; -} - -__attribute__((visibility("default"))) void -aicpu_orchestration_entry(PTO2Runtime *rt, const ChipStorageTaskArgs &orch_args) { - Tensor ext_A = from_tensor_arg(orch_args.tensor(0)); - Tensor ext_B = from_tensor_arg(orch_args.tensor(1)); - Tensor ext_C = from_tensor_arg(orch_args.tensor(2)); - - LOG_INFO(rt, "[bgemm_orch] Grid: %dx%dx%d, Batch: %d, Tile: %d", GRID_M, GRID_K, GRID_N, BATCH, TILE); - - uint32_t tile_shapes[1] = {TILE_ELEMS}; - - for (int batch = 0; batch < BATCH; batch++) { - for (int m_idx = 0; m_idx < GRID_M; m_idx++) { - for (int n_idx = 0; n_idx < GRID_N; n_idx++) { - PTO2_SCOPE(rt) { - uint32_t c_elem_offset = (static_cast(batch) * GRID_M * GRID_N + - static_cast(m_idx) * GRID_N + static_cast(n_idx)) * - TILE_ELEMS; - uint32_t c_view_offsets[1] = {c_elem_offset}; - Tensor C_view = ext_C.view(tile_shapes, c_view_offsets); - - PTO2TaskId last_add_task = PTO2TaskId{}; - bool has_last_add = false; - - for (int k_idx = 0; k_idx < GRID_K; k_idx++) { - uint32_t a_elem_offset = - (static_cast(batch) * GRID_M * GRID_K + static_cast(m_idx) * GRID_K + - static_cast(k_idx)) * - TILE_ELEMS; - uint32_t b_elem_offset = - (static_cast(batch) * GRID_K * GRID_N + static_cast(k_idx) * GRID_N + - static_cast(n_idx)) * - TILE_ELEMS; - - uint32_t a_view_offsets[1] = {a_elem_offset}; - Tensor A_view = ext_A.view(tile_shapes, a_view_offsets); - uint32_t b_view_offsets[1] = {b_elem_offset}; - Tensor B_view = ext_B.view(tile_shapes, 
b_view_offsets); - - // P = A[m,k] @ B[k,n] - Arg args_gemm; - args_gemm.add_input(A_view); - args_gemm.add_input(B_view); - args_gemm.add_output(TensorCreateInfo(tile_shapes, 1, DataType::FLOAT32)); - SubmitResult r_gemm = rt_submit_aic_task(rt, FUNC_GEMM_TILE, args_gemm); - - // C[m,n] += P - Arg args_add; - args_add.add_inout(C_view); - args_add.add_input(r_gemm.outputs.get_ref(0)); - SubmitResult r_add = rt_submit_aiv_task(rt, FUNC_TILE_ADD, args_add); - - // gemm -> add: add reads P which gemm produces - rt_add_dependency(rt, r_gemm.task_id, r_add.task_id); - // K accumulation chain: previous add -> current add - if (has_last_add) { - rt_add_dependency(rt, last_add_task, r_add.task_id); - } - - last_add_task = r_add.task_id; - has_last_add = true; - } - } - } - } - } - - LOG_INFO( - rt, "[bgemm_orch] Submitted tasks for %d batches, %dx%d output tiles, %d K steps each", BATCH, GRID_M, GRID_N, - GRID_K - ); -} - -} // extern "C" diff --git a/tests/st/a2a3/aicpu_build_graph/bgemm/test_bgemm.py b/tests/st/a2a3/aicpu_build_graph/bgemm/test_bgemm.py deleted file mode 100644 index 9be0dc72b..000000000 --- a/tests/st/a2a3/aicpu_build_graph/bgemm/test_bgemm.py +++ /dev/null @@ -1,94 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) PyPTO Contributors. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. -# ----------------------------------------------------------------------------------------------------------- - -"""BGEMM — aicpu_build_graph runtime with tiled matrix multiplication. - -Computation: C = A @ B (4x4x4 grid, 64x64 tiles). -Tests AIC (Cube) + AIV (Vector) cooperation with tile-first memory layout. 
-""" - -import torch -from simpler.task_interface import ArgDirection as D - -from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test - -TILE_M = 64 -TILE_K = 64 -TILE_N = 64 -GRID_M = 4 -GRID_K = 4 -GRID_N = 4 -BATCH = 1 - - -@scene_test(level=2, runtime="aicpu_build_graph") -class TestBgemm(SceneTestCase): - """BGEMM: tiled C = A @ B with AIC gemm + AIV tile add.""" - - RTOL = 1e-3 - ATOL = 1e-3 - - CALLABLE = { - "orchestration": { - "source": "kernels/orchestration/bgemm_orch.cpp", - "function_name": "aicpu_orchestration_entry", - "signature": [D.IN, D.IN, D.OUT], - }, - "incores": [ - { - "func_id": 0, - "source": "kernels/aic/kernel_gemm_tile.cpp", - "core_type": "aic", - "signature": [D.IN, D.IN, D.OUT], - }, - { - "func_id": 1, - "source": "kernels/aiv/kernel_tile_add.cpp", - "core_type": "aiv", - "signature": [D.INOUT, D.IN], - }, - ], - } - - CASES = [ - { - "name": "default", - "platforms": ["a2a3sim", "a2a3"], - "config": {"aicpu_thread_num": 4, "block_dim": 3}, - "params": {}, - }, - ] - - def generate_args(self, params): - A = torch.randn(BATCH, GRID_M, GRID_K, TILE_M, TILE_K, dtype=torch.float32) * 0.01 - B = torch.randn(BATCH, GRID_K, GRID_N, TILE_K, TILE_N, dtype=torch.float32) * 0.01 - C = torch.zeros(BATCH, GRID_M, GRID_N, TILE_M, TILE_N, dtype=torch.float32) - - return TaskArgsBuilder( - Tensor("A", A.flatten()), - Tensor("B", B.flatten()), - Tensor("C", C.flatten()), - ) - - def compute_golden(self, args, params): - A = args.A.reshape(BATCH, GRID_M, GRID_K, TILE_M, TILE_K) - B = args.B.reshape(BATCH, GRID_K, GRID_N, TILE_K, TILE_N) - C = args.C.reshape(BATCH, GRID_M, GRID_N, TILE_M, TILE_N) - - C[:] = 0.0 - for batch in range(BATCH): - for m_idx in range(GRID_M): - for n_idx in range(GRID_N): - for k_idx in range(GRID_K): - C[batch, m_idx, n_idx] += torch.matmul(A[batch, m_idx, k_idx], B[batch, k_idx, n_idx]) - - -if __name__ == "__main__": - SceneTestCase.run_module(__name__) diff --git a/tests/st/a2a3/aicpu_build_graph/orch_so_cache/kernels/orchestration/example_orchestration.cpp b/tests/st/a2a3/aicpu_build_graph/orch_so_cache/kernels/orchestration/example_orchestration.cpp deleted file mode 100644 index babd6f685..000000000 --- a/tests/st/a2a3/aicpu_build_graph/orch_so_cache/kernels/orchestration/example_orchestration.cpp +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. 
- * ----------------------------------------------------------------------------------------------------------- - */ -/** - * Minimal orchestration for orch_so_cache test (a2a3 aicpu_build_graph) - * - * Computes: f = a + b (single AIV task) - * - * Args layout (3 args): - * [0] = a (INPUT) - 128 x 128 float32 - * [1] = b (INPUT) - 128 x 128 float32 - * [2] = f (OUTPUT) - 128 x 128 float32 - */ - -#include -#include - -#include "pto_orchestration_api.h" // NOLINT(build/include_subdir) - -#define FUNC_ADD 0 // kernel_add: args[0..2] -> f = a + b - -extern "C" { - -__attribute__((visibility("default"))) PTO2OrchestrationConfig -aicpu_orchestration_config(const ChipStorageTaskArgs &orch_args) { - (void)orch_args; - return PTO2OrchestrationConfig{ - .expected_arg_count = 3, - }; -} - -__attribute__((visibility("default"))) void -aicpu_orchestration_entry(PTO2Runtime *rt, const ChipStorageTaskArgs &orch_args) { - Tensor ext_a = from_tensor_arg(orch_args.tensor(0)); - Tensor ext_b = from_tensor_arg(orch_args.tensor(1)); - Tensor ext_f = from_tensor_arg(orch_args.tensor(2)); - - PTO2_SCOPE(rt) { - // f = a + b - Arg args; - args.add_input(ext_a); - args.add_input(ext_b); - args.add_inout(ext_f); - rt_submit_aiv_task(rt, FUNC_ADD, args); - } -} - -} // extern "C" diff --git a/tests/st/a2a3/aicpu_build_graph/orch_so_cache/test_orch_so_cache.py b/tests/st/a2a3/aicpu_build_graph/orch_so_cache/test_orch_so_cache.py deleted file mode 100644 index 7fa638e2c..000000000 --- a/tests/st/a2a3/aicpu_build_graph/orch_so_cache/test_orch_so_cache.py +++ /dev/null @@ -1,107 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) PyPTO Contributors. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. -# ----------------------------------------------------------------------------------------------------------- -"""End-to-end coverage for the orchestration SO host-side cache (a2a3 aicpu_build_graph). - -The host hashes the orchestration SO's GNU Build-ID, skips re-uploading bytes -that already live on device, and tells AICPU to reuse the cached `dlopen` -handle. The framework reuses one `Worker` (and therefore one `DeviceRunner`) -across cases inside a `SceneTestCase`, so running multiple cases against the -same `CALLABLE` exercises the cache-hit path on every case after the first. - -This test deliberately: - - Reuses the vector_example kernel_add (args[0..2] -> f = a + b). - - Spans three cases with different (a, b) inputs — proves cache hit doesn't - leak any per-run state across iterations. - - Uses the same tensor size (128*128) because the AIV kernel has a hardcoded - tile shape (128x128) and does not accept a runtime size. - - Runs on both sim and hardware (sim DeviceRunner uses the same code path, - just with `mem_alloc_` returning host memory). - -Verification is purely outcome-based: every case must produce the correct -result. A regression in cache logic (stale handle, wrong device buffer, -missing dlopen on first run) shows up as wrong output or a runtime failure. 
-""" - -import torch -from simpler.task_interface import ArgDirection as D - -from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test - -_VECTOR_KERNELS = "../vector_example/kernels" - - -@scene_test(level=2, runtime="aicpu_build_graph") -class TestOrchSoCache(SceneTestCase): - """Same callable, three cases — case 0 misses the cache, cases 1-2 hit it.""" - - CALLABLE = { - "orchestration": { - "source": "kernels/orchestration/example_orchestration.cpp", - "function_name": "aicpu_orchestration_entry", - "signature": [D.IN, D.IN, D.OUT], - }, - "incores": [ - { - "func_id": 0, - "source": f"{_VECTOR_KERNELS}/aiv/kernel_add.cpp", - "core_type": "aiv", - "signature": [D.IN, D.IN, D.OUT], - }, - ], - } - - # Three cases sharing one callable. The framework iterates them on a - # single Worker; cases after the first land on cache-hit. Different - # (a, b) values verify that no per-run state leaks across iterations. - _COMMON_CONFIG = {"aicpu_thread_num": 4, "block_dim": 3} - _PLATFORMS = ["a2a3sim", "a2a3"] - - # All cases use the same size (128*128) because the AIV kernel has a - # hardcoded tile shape (128x128) and does not read a runtime size - # argument — running with a smaller tensor would cause an out-of-bounds - # access. - CASES = [ - { - "name": "first_miss", - "platforms": _PLATFORMS, - "config": _COMMON_CONFIG, - "params": {"size": 128 * 128, "a": 2.0, "b": 3.0}, - }, - { - "name": "second_hit", - "platforms": _PLATFORMS, - "config": _COMMON_CONFIG, - "params": {"size": 128 * 128, "a": 1.0, "b": 4.0}, - }, - { - "name": "third_hit", - "platforms": _PLATFORMS, - "config": _COMMON_CONFIG, - "params": {"size": 128 * 128, "a": 0.5, "b": 0.5}, - }, - ] - - def generate_args(self, params): - size = params["size"] - a = params["a"] - b = params["b"] - return TaskArgsBuilder( - Tensor("a", torch.full((size,), a, dtype=torch.float32)), - Tensor("b", torch.full((size,), b, dtype=torch.float32)), - Tensor("f", torch.zeros(size, dtype=torch.float32)), - ) - - def compute_golden(self, args, params): - # f = a + b - args.f[:] = args.a + args.b - - -if __name__ == "__main__": - SceneTestCase.run_module(__name__) diff --git a/tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aic/aic_hub.cpp b/tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aic/aic_hub.cpp deleted file mode 100644 index 45f90aab3..000000000 --- a/tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aic/aic_hub.cpp +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. 
- * ----------------------------------------------------------------------------------------------------------- - */ -#include -#include - -using namespace pto; - -#ifndef __gm__ -#define __gm__ -#endif - -#ifndef __aicore__ -#define __aicore__ [aicore] -#endif - -constexpr int M = 16; -constexpr int K = 16; -constexpr int N = 16; - -extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {} diff --git a/tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp b/tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp deleted file mode 100644 index 04aa9b5f6..000000000 --- a/tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -// PV Matmul Kernel: pij(M, K) @ vj(K, N) -> oi_new(M, N) -// -// Supports two tile configurations via runtime dispatch: -// Case1: (16, 128) @ (128, 128) -> (16, 128) -// Case2: (64, 64) @ ( 64, 128) -> (64, 128) -// -// pij is bfloat16 (converted from fp32 in softmax_prepare via TCVT). -// vj is stored as (K, N) = (block_size, head_dim) in row-major (ND) layout. -// Standard non-transposed B pattern: ND GlobalB + ColMajor/RowMajor TileMatB. 
- -#include -#include - -#include "tensor.h" - -using namespace pto; - -#include "pipe_sync.h" - -#ifndef __gm__ -#define __gm__ -#endif - -#ifndef __aicore__ -#define __aicore__ [aicore] -#endif - -template -static __aicore__ void pv_matmul_impl(__gm__ Tensor *pij, __gm__ Tensor *vj, __gm__ Tensor *oi) { - __gm__ bfloat16_t *pij_addr = reinterpret_cast<__gm__ bfloat16_t *>(pij->buffer.addr); - __gm__ bfloat16_t *vj_addr = reinterpret_cast<__gm__ bfloat16_t *>(vj->buffer.addr); - __gm__ float *oi_addr = reinterpret_cast<__gm__ float *>(oi->buffer.addr); - - // pij (M, K) bf16, vj (K, N) bf16 in ND (row-major), oi_new (M, N) fp32 - using GlobalA = GlobalTensor, Stride>; - using GlobalB = GlobalTensor, Stride>; - using GlobalOut = GlobalTensor, Stride>; - - GlobalA pijGlobal(pij_addr + pij->start_offset); - GlobalB vjGlobal(vj_addr + vj->start_offset); - GlobalOut oiGlobal(oi_addr + oi->start_offset); - - // L1 Mat tiles: standard ND pattern for both A and B - using TileMatA = Tile; - using TileMatB = Tile; - - // L0 tiles - using LeftTile = TileLeft; - using RightTile = TileRight; - using AccTile = TileAcc; - - TileMatA aMatTile; - TileMatB bMatTile; - TASSIGN(aMatTile, 0x0); - TASSIGN(bMatTile, 0x20000); - - LeftTile aTile; - RightTile bTile; - AccTile cTile; - TASSIGN(aTile, 0x0); - TASSIGN(bTile, 0x0); - TASSIGN(cTile, 0x0); - - // Load pij and vj to L1 with separate events for pipeline overlap - TLOAD(aMatTile, pijGlobal); - set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); // A load done - TLOAD(bMatTile, vjGlobal); - set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1); // B load done - - // Move A to L0A as soon as A load completes (B may still be loading) - wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); - TMOV(aTile, aMatTile); - // Move B to L0B after B load completes - wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1); - TMOV(bTile, bMatTile); - - set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); - wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); - - // Single matmul: (M,K) x (K,N) -> (M,N) - TMATMUL(cTile, aTile, bTile); - - set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); - wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); - - TSTORE(oiGlobal, cTile); - - pipe_sync(); -} - -extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { - __gm__ Tensor *pij = reinterpret_cast<__gm__ Tensor *>(args[0]); - __gm__ Tensor *vj = reinterpret_cast<__gm__ Tensor *>(args[1]); - __gm__ Tensor *oi_new = reinterpret_cast<__gm__ Tensor *>(args[2]); - uint64_t q_tile_size = static_cast(pij->shapes[0]); - // args[4] = block_size, args[5] = head_dim - - if (q_tile_size == 16) { - pv_matmul_impl<16, 128, 128>(pij, vj, oi_new); - } else { - pv_matmul_impl<64, 64, 128>(pij, vj, oi_new); - } -} diff --git a/tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp b/tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp deleted file mode 100644 index f65656605..000000000 --- a/tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
- * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -// QK Matmul Kernel: qi(M, K) @ kj.T(K, N) -> sij(M, N) -// -// Supports two tile configurations via runtime dispatch: -// Case1: (16, 128) @ (128, 128).T -> (16, 128) -// Case2: (64, 128) @ (128, 64).T -> (64, 64) -// -// kj is stored as (N, K) = (block_size, head_dim) in row-major memory. -// This is equivalent to (K, N) in column-major (DN) layout. -// Using DN GlobalB + RowMajor/ColMajor TileMatB to handle the transposed B pattern. - -#include -#include - -#include "tensor.h" - -using namespace pto; - -#include "pipe_sync.h" - -#ifndef __gm__ -#define __gm__ -#endif - -#ifndef __aicore__ -#define __aicore__ [aicore] -#endif - -template -static __aicore__ void qk_matmul_impl(__gm__ Tensor *qi, __gm__ Tensor *kj, __gm__ Tensor *sij) { - __gm__ bfloat16_t *qi_addr = reinterpret_cast<__gm__ bfloat16_t *>(qi->buffer.addr); - __gm__ bfloat16_t *kj_addr = reinterpret_cast<__gm__ bfloat16_t *>(kj->buffer.addr); - __gm__ float *sij_addr = reinterpret_cast<__gm__ float *>(sij->buffer.addr); - - // qi (M, K) bf16 in ND (row-major) layout - using GlobalA = GlobalTensor, Stride>; - // kj stored as (N, K) row-major = (K, N) column-major -> DN layout - using GlobalB = GlobalTensor, Stride, Layout::DN>; - using GlobalOut = GlobalTensor, Stride>; - - GlobalA qiGlobal(qi_addr + qi->start_offset); - GlobalB kjGlobal(kj_addr + kj->start_offset); - GlobalOut sijGlobal(sij_addr + sij->start_offset); - - // L1 Mat tiles: A is standard ND, B uses transposed-B pattern (RowMajor/ColMajor) - using TileMatA = Tile; - using TileMatB = Tile; - - // L0 tiles - using LeftTile = TileLeft; - using RightTile = TileRight; - using AccTile = TileAcc; - - TileMatA aMatTile; - TileMatB bMatTile; - TASSIGN(aMatTile, 0x0); - TASSIGN(bMatTile, 0x20000); - - LeftTile aTile; - RightTile bTile; - AccTile cTile; - TASSIGN(aTile, 0x0); - TASSIGN(bTile, 0x0); - TASSIGN(cTile, 0x0); - - // Load A and B to L1 with separate events for pipeline overlap - TLOAD(aMatTile, qiGlobal); - set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); // A load done - TLOAD(bMatTile, kjGlobal); - set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1); // B load done - - // Move A to L0A as soon as A load completes (B may still be loading) - wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); - TMOV(aTile, aMatTile); - // Move B to L0B after B load completes - wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1); - TMOV(bTile, bMatTile); - - set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); - wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); - - // Matmul - TMATMUL(cTile, aTile, bTile); - - set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); - wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); - - TSTORE(sijGlobal, cTile); - - pipe_sync(); -} - -extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { - __gm__ Tensor *qi = reinterpret_cast<__gm__ Tensor *>(args[0]); - __gm__ Tensor *kj = reinterpret_cast<__gm__ Tensor *>(args[1]); - __gm__ Tensor *sij = reinterpret_cast<__gm__ Tensor *>(args[2]); - uint64_t q_tile_size = static_cast(qi->shapes[0]); - // args[4] = head_dim (128), args[5] = block_size - - if (q_tile_size == 16) { - qk_matmul_impl<16, 128, 128>(qi, kj, sij); - } else { - qk_matmul_impl<64, 128, 64>(qi, kj, sij); - } -} diff --git a/tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aiv/aiv_hub.cpp b/tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aiv/aiv_hub.cpp deleted file mode 100644 index 
45f90aab3..000000000 --- a/tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aiv/aiv_hub.cpp +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -#include -#include - -using namespace pto; - -#ifndef __gm__ -#define __gm__ -#endif - -#ifndef __aicore__ -#define __aicore__ [aicore] -#endif - -constexpr int M = 16; -constexpr int K = 16; -constexpr int N = 16; - -extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {} diff --git a/tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp b/tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp deleted file mode 100644 index bfdddc75e..000000000 --- a/tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp +++ /dev/null @@ -1,255 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -// Online Softmax Update + Normalize Kernel (AIV) -// -// Operates on full tiles where M=q_tile_size, N=head_dim (128): -// Case1: oi/oi_new are (16, 128), mij/lij/mi/li are 16-element vectors -// Case2: oi/oi_new are (64, 128), mij/lij/mi/li are 64-element vectors -// -// Scalar layout strategy using TRESHAPE (zero-copy UB reshape): -// Scalars loaded as DN ColMajor (M, 1) for TROWEXPANDMUL/TROWEXPANDDIV. -// For element-wise ops (TMAX, TSUB, TEXP, etc.), TRESHAPE to RowMajor (1, M). -// After arithmetic, TRESHAPE back to ColMajor (M, 1) for row-broadcast ops. -// This eliminates the GM round-trip (TSTORE ND → TLOAD DN) used in the original. 
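[Aside: stripped of the tile plumbing, the recurrence this deleted kernel implements is the standard streaming-softmax update. A hedged scalar restatement follows; the function name and flat-array signature are illustrative, not part of any runtime API.]

```cpp
#include <algorithm>
#include <cmath>

// One row's online-softmax merge: fold a new block's statistics
// (mij = block row-max, lij = block row-sum, oi_new = block PV output)
// into the running accumulators (mi, li, oi), mirroring the kernel's
// TMAX/TSUB/TEXP (rescale factors) and TROWEXPANDMUL/TADD (apply) steps.
void online_update_row(float mij, float lij, const float *oi_new,
                       float &mi, float &li, float *oi, int head_dim) {
    const float mi_new = std::max(mi, mij);
    const float alpha = std::exp(mi - mi_new);  // rescales the old accumulator
    const float beta = std::exp(mij - mi_new);  // rescales the new block
    for (int d = 0; d < head_dim; d++) {
        oi[d] = alpha * oi[d] + beta * oi_new[d];
    }
    li = alpha * li + beta * lij;
    mi = mi_new;
    // After the last block the kernel also normalizes: dst[d] = oi[d] / li
    // (TROWEXPANDDIV); the is_first path skips the merge and just copies.
}
```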
- -#include -#include - -#include "tensor.h" - -using namespace pto; - -#include "pipe_sync.h" - -#ifndef __gm__ -#define __gm__ -#endif - -#ifndef __aicore__ -#define __aicore__ [aicore] -#endif - -template -static __aicore__ void online_update_impl( - __gm__ Tensor *mij, __gm__ Tensor *lij, __gm__ Tensor *oi_new, __gm__ Tensor *mi, __gm__ Tensor *li, - __gm__ Tensor *oi, uint64_t is_first, uint64_t is_last, __gm__ Tensor *dst -) { - __gm__ float *mij_ptr = reinterpret_cast<__gm__ float *>(mij->buffer.addr); - __gm__ float *lij_ptr = reinterpret_cast<__gm__ float *>(lij->buffer.addr); - __gm__ float *oi_new_ptr = reinterpret_cast<__gm__ float *>(oi_new->buffer.addr); - __gm__ float *mi_ptr = reinterpret_cast<__gm__ float *>(mi->buffer.addr); - __gm__ float *li_ptr = reinterpret_cast<__gm__ float *>(li->buffer.addr); - __gm__ float *oi_ptr = reinterpret_cast<__gm__ float *>(oi->buffer.addr); - __gm__ float *dst_ptr = reinterpret_cast<__gm__ float *>(dst->buffer.addr); - - // Aligned rows for ColMajor DN tiles (32-byte alignment) - constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float)); - - // --- GlobalTensor types --- - - // Data (M, N) RowMajor - using GlobalDataMxN = GlobalTensor, Stride<1, 1, 1, N, 1>>; - - // Scalar DN: M contiguous floats as (kAlignedRows, 1) ColMajor for TROWEXPAND ops and loading - using GlobalScalarDN = GlobalTensor, Stride<1, 1, 1, 1, 1>, Layout::DN>; - - // Scalar ND: for storing mi_new and li_new back to GM - constexpr int kScalarCols = 32 / sizeof(float); - constexpr int kScalarRows = M / kScalarCols; - using GlobalScalarND = - GlobalTensor, Stride<1, 1, 1, kScalarCols, 1>>; - - // --- GlobalTensor instances --- - - GlobalDataMxN oiNewGlobal(oi_new_ptr + oi_new->start_offset); - GlobalDataMxN oiGlobal(oi_ptr + oi->start_offset); - GlobalDataMxN dstGlobal(dst_ptr + dst->start_offset); - - // DN globals for loading scalars as ColMajor - GlobalScalarDN mijGlobalDN(mij_ptr + mij->start_offset); - GlobalScalarDN lijGlobalDN(lij_ptr + lij->start_offset); - GlobalScalarDN miGlobalDN(mi_ptr + mi->start_offset); - GlobalScalarDN liGlobalDN(li_ptr + li->start_offset); - - // ND globals for storing scalar results - GlobalScalarND miGlobalND(mi_ptr + mi->start_offset); - GlobalScalarND liGlobalND(li_ptr + li->start_offset); - - // --- Tile types --- - - using TileDataMxN = Tile; - using TileScalarDN = Tile; - - // RowMajor (1, M) tiles for element-wise arithmetic via TRESHAPE - using TileScalarRow = Tile; - - // ND tile for storing back to GM - using TileScalarND = - Tile; - - // --- UB memory layout --- - - constexpr int kDataBytes = M * N * sizeof(float); - constexpr int kScalarDNBytes = kAlignedRows * sizeof(float); - - // Data tiles - TileDataMxN oiNewTile; - TileDataMxN oiTile; - - // Scalar DN tiles loaded from GM (ColMajor) - TileScalarDN mijDN, lijDN, miDN, liDN; - - // Temporary DN tiles for results - TileScalarDN miNewDN, alphaDN, betaDN, liNewDN, tmpDN; - - TASSIGN(oiNewTile, 0); - TASSIGN(oiTile, kDataBytes); - TASSIGN(mijDN, 2 * kDataBytes); - TASSIGN(lijDN, 2 * kDataBytes + kScalarDNBytes); - TASSIGN(miDN, 2 * kDataBytes + 2 * kScalarDNBytes); - TASSIGN(liDN, 2 * kDataBytes + 3 * kScalarDNBytes); - TASSIGN(miNewDN, 2 * kDataBytes + 4 * kScalarDNBytes); - TASSIGN(alphaDN, 2 * kDataBytes + 5 * kScalarDNBytes); - TASSIGN(betaDN, 2 * kDataBytes + 6 * kScalarDNBytes); - TASSIGN(liNewDN, 2 * kDataBytes + 7 * kScalarDNBytes); - TASSIGN(tmpDN, 2 * kDataBytes + 8 * kScalarDNBytes); - - if (is_first) { - // --- First block: copy inputs to 
accumulators --- - TLOAD(oiNewTile, oiNewGlobal); - TLOAD(mijDN, mijGlobalDN); - TLOAD(lijDN, lijGlobalDN); - set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - - // Store mi = mij, li = lij, oi = oi_new - // Alias ND tiles to the same UB as DN tiles for storing as ND format - TileScalarND mijND, lijND; - TASSIGN(mijND, 2 * kDataBytes); // alias same UB as mijDN - TASSIGN(lijND, 2 * kDataBytes + kScalarDNBytes); // alias same UB as lijDN - - set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - TSTORE(miGlobalND, mijND); // mi = mij - TSTORE(liGlobalND, lijND); // li = lij - TSTORE(oiGlobal, oiNewTile); // oi = oi_new - - if (is_last) { - // Single block: normalize dst = oi_new / lij - // lijDN already in ColMajor DN format, use directly for TROWEXPANDDIV - set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); - wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); - TROWEXPANDDIV(oiNewTile, oiNewTile, lijDN); - set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); - wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); - TSTORE(dstGlobal, oiNewTile); - } - } else { - // --- Subsequent blocks: accumulate --- - - // Load all inputs - TLOAD(oiNewTile, oiNewGlobal); - TLOAD(oiTile, oiGlobal); - TLOAD(mijDN, mijGlobalDN); - TLOAD(lijDN, lijGlobalDN); - TLOAD(miDN, miGlobalDN); - TLOAD(liDN, liGlobalDN); - set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - - // TRESHAPE: ColMajor(M,1) → RowMajor(1,M) for element-wise arithmetic - TileScalarRow miRow, mijRow, liRow, lijRow; - TRESHAPE(miRow, miDN); - TRESHAPE(mijRow, mijDN); - TRESHAPE(liRow, liDN); - TRESHAPE(lijRow, lijDN); - - // Scalar arithmetic in RowMajor (1, M) layout - TileScalarRow miNewRow, alphaRow, betaRow, liNewRow, tmpRow; - TASSIGN(miNewRow, 2 * kDataBytes + 4 * kScalarDNBytes); - TASSIGN(alphaRow, 2 * kDataBytes + 5 * kScalarDNBytes); - TASSIGN(betaRow, 2 * kDataBytes + 6 * kScalarDNBytes); - TASSIGN(liNewRow, 2 * kDataBytes + 7 * kScalarDNBytes); - TASSIGN(tmpRow, 2 * kDataBytes + 8 * kScalarDNBytes); - - TMAX(miNewRow, miRow, mijRow); // mi_new = max(mi, mij) - pipe_barrier(PIPE_V); - TSUB(alphaRow, miRow, miNewRow); // alpha_exp = mi - mi_new - pipe_barrier(PIPE_V); - TEXP(alphaRow, alphaRow); // alpha = exp(mi - mi_new) - pipe_barrier(PIPE_V); - TSUB(betaRow, mijRow, miNewRow); // beta_exp = mij - mi_new - pipe_barrier(PIPE_V); - TEXP(betaRow, betaRow); // beta = exp(mij - mi_new) - pipe_barrier(PIPE_V); - TMUL(tmpRow, alphaRow, liRow); // alpha * li - pipe_barrier(PIPE_V); - TMUL(liNewRow, betaRow, lijRow); // beta * lij - pipe_barrier(PIPE_V); - TADD(liNewRow, tmpRow, liNewRow); // li_new = alpha*li + beta*lij - - // TRESHAPE back: RowMajor(1,M) → ColMajor(M,1) for TROWEXPANDMUL - TRESHAPE(alphaDN, alphaRow); - TRESHAPE(betaDN, betaRow); - - // Scale data tiles using row-broadcast multiply - TROWEXPANDMUL(oiTile, oiTile, alphaDN); // oi *= alpha - TROWEXPANDMUL(oiNewTile, oiNewTile, betaDN); // oi_new *= beta - pipe_barrier(PIPE_V); - TADD(oiTile, oiTile, oiNewTile); // oi = alpha*oi + beta*oi_new - - // Store mi_new and li_new to GM (ND format) - // Alias ND tiles to the same UB locations as miNewRow and liNewRow - TileScalarND miNewND, liNewND; - TASSIGN(miNewND, 2 * kDataBytes + 4 * kScalarDNBytes); - TASSIGN(liNewND, 2 * kDataBytes + 7 * kScalarDNBytes); - - if (is_last) { - // Normalize and output: dst = oi / li_new - TRESHAPE(liNewDN, liNewRow); - pipe_barrier(PIPE_V); - TROWEXPANDDIV(oiTile, oiTile, liNewDN); - set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - wait_flag(PIPE_V, 
PIPE_MTE3, EVENT_ID0); - TSTORE(miGlobalND, miNewND); // persist mi_new - TSTORE(liGlobalND, liNewND); // persist li_new - TSTORE(dstGlobal, oiTile); - } else { - // Store updated accumulators - set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - TSTORE(miGlobalND, miNewND); // persist mi_new - TSTORE(liGlobalND, liNewND); // persist li_new - TSTORE(oiGlobal, oiTile); - } - } - pipe_sync(); -} - -extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { - __gm__ Tensor *mij = reinterpret_cast<__gm__ Tensor *>(args[0]); - __gm__ Tensor *lij = reinterpret_cast<__gm__ Tensor *>(args[1]); - __gm__ Tensor *oi_new = reinterpret_cast<__gm__ Tensor *>(args[2]); - __gm__ Tensor *mi = reinterpret_cast<__gm__ Tensor *>(args[3]); - __gm__ Tensor *li = reinterpret_cast<__gm__ Tensor *>(args[4]); - __gm__ Tensor *oi = reinterpret_cast<__gm__ Tensor *>(args[5]); - __gm__ Tensor *dst = reinterpret_cast<__gm__ Tensor *>(args[6]); - uint64_t is_first = static_cast(args[7]); - uint64_t is_last = static_cast(args[8]); - uint64_t q_tile_size = static_cast(mij->shapes[0]); - // args[10] = head_dim (128) - - if (q_tile_size == 16) { - online_update_impl<16, 128>(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst); - } else { - online_update_impl<64, 128>(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst); - } -} diff --git a/tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp b/tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp deleted file mode 100644 index 0669123c2..000000000 --- a/tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp +++ /dev/null @@ -1,154 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -// Softmax Preparation Kernel (AIV) with partial block masking -// -// Operates on (M, N) tile where M=q_tile_size, N=block_size: -// Case1: sij is (16, 128) -// Case2: sij is (64, 64) -// -// For partial blocks (valid_len < N), positions [valid_len, N) in sij are -// filled with -inf via TFILLPAD_INPLACE before softmax, ensuring exp(-inf)=0 -// so that invalid key positions contribute zero attention weight. 
-// -// Computes: -// sij_masked = TFILLPAD(sij, valid_len, pad=-inf) -// sij_scale = sij_masked * scale -// mij = row_max(sij_scale) -> (M, 1) -// pij = exp(sij_scale - mij) -> (M, N) -// lij = row_sum(pij) -> (M, 1) - -#include -#include - -#include "tensor.h" - -using namespace pto; - -#include "pipe_sync.h" - -#ifndef __gm__ -#define __gm__ -#endif - -#ifndef __aicore__ -#define __aicore__ [aicore] -#endif - -template -static __aicore__ void softmax_prepare_impl( - __gm__ Tensor *sij, float scale_value, __gm__ Tensor *pij, __gm__ Tensor *mij, __gm__ Tensor *lij -) { - uint64_t valid_len = static_cast(sij->shapes[1]); - __gm__ float *sij_addr = reinterpret_cast<__gm__ float *>(sij->buffer.addr); - __gm__ bfloat16_t *pij_addr = reinterpret_cast<__gm__ bfloat16_t *>(pij->buffer.addr); - __gm__ float *mij_addr = reinterpret_cast<__gm__ float *>(mij->buffer.addr); - __gm__ float *lij_addr = reinterpret_cast<__gm__ float *>(lij->buffer.addr); - - constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float)); - - using GlobalDataMxN = GlobalTensor, Stride<1, 1, 1, N, 1>>; - using GlobalDataMxN_bf16 = GlobalTensor, Stride<1, 1, 1, N, 1>>; - using GlobalScalarDN = GlobalTensor, Stride<1, 1, 1, 1, 1>, Layout::DN>; - - GlobalDataMxN sijGlobal(sij_addr + sij->start_offset); - GlobalDataMxN_bf16 pijGlobal(pij_addr + pij->start_offset); - GlobalScalarDN mijGlobal(mij_addr + mij->start_offset); - GlobalScalarDN lijGlobal(lij_addr + lij->start_offset); - - // Dynamic-cols tile: marks which columns are valid for TFILLPAD boundary - using TileSijDyn = Tile; - // Padded tile: TFILLPAD_INPLACE fills positions [valid_len, N) with -inf - using TileSijPad = Tile; - - using TileVecMxN = Tile; - using TileVecMxN_bf16 = Tile; - using TileScalarDN = Tile; - - TileVecMxN sijTile; - TileSijDyn sijDynTile(static_cast(valid_len)); - TileSijPad sijPadTile; - TileVecMxN pijTile; - TileVecMxN tmpTile; - TileScalarDN maxTile; - TileScalarDN sumTile; - TileVecMxN_bf16 pijBf16Tile; - - // All sij tiles share UB address 0x0 (in-place masking) - TASSIGN(sijTile, 0x0); - TASSIGN(sijDynTile, 0x0); - TASSIGN(sijPadTile, 0x0); - TASSIGN(pijTile, M * N * sizeof(float)); - TASSIGN(tmpTile, 2 * M * N * sizeof(float)); - TASSIGN(maxTile, 3 * M * N * sizeof(float)); - TASSIGN(sumTile, 3 * M * N * sizeof(float) + kAlignedRows * sizeof(float)); - TASSIGN(pijBf16Tile, 3 * M * N * sizeof(float) + 2 * kAlignedRows * sizeof(float)); - - // Load full sij (M, N) tile from GM - all N columns including garbage for partial blocks - // printf("sij addr incore %x\n", sij->buffer.addr); - TLOAD(sijTile, sijGlobal); - set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - - // Mask columns [valid_len, N) with -inf. sijDynTile provides the valid boundary, - // sijPadTile provides PadValue::Min as the fill value. No-op when valid_len == N. 
- TFILLPAD_INPLACE(sijPadTile, sijDynTile); - pipe_barrier(PIPE_V); - - TMULS(sijTile, sijTile, scale_value); - pipe_barrier(PIPE_V); - TROWMAX(maxTile, sijTile, tmpTile); - pipe_barrier(PIPE_V); - TROWEXPANDSUB(pijTile, sijTile, maxTile); - pipe_barrier(PIPE_V); - TEXP(pijTile, pijTile); - // Truncate pij to bf16 first - pipe_barrier(PIPE_V); - TCVT(pijBf16Tile, pijTile, RoundMode::CAST_ROUND); - set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); // pij bf16 ready, can store early - - // Continue computing: bf16 → f32 and rowsum while pij store proceeds in parallel - pipe_barrier(PIPE_V); - TCVT(pijTile, pijBf16Tile, RoundMode::CAST_ROUND); - pipe_barrier(PIPE_V); - TROWSUM(sumTile, pijTile, tmpTile); - set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); // sum ready - - // Store pij (overlaps with TCVT + TROWSUM above) - wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - TSTORE(pijGlobal, pijBf16Tile); - - // Store max and sum - TSTORE(mijGlobal, maxTile); - wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); - TSTORE(lijGlobal, sumTile); - - pipe_sync(); -} - -extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { - __gm__ Tensor *sij = reinterpret_cast<__gm__ Tensor *>(args[0]); - __gm__ Tensor *pij = reinterpret_cast<__gm__ Tensor *>(args[1]); - __gm__ Tensor *mij = reinterpret_cast<__gm__ Tensor *>(args[2]); - __gm__ Tensor *lij = reinterpret_cast<__gm__ Tensor *>(args[3]); - union { - uint64_t u; - float f; - } scale_conv; - scale_conv.u = static_cast(args[4]); - float scale_value = scale_conv.f; - uint64_t q_tile_size = static_cast(sij->shapes[0]); - - if (q_tile_size == 16) { - softmax_prepare_impl<16, 128>(sij, scale_value, pij, mij, lij); - } else { - softmax_prepare_impl<64, 64>(sij, scale_value, pij, mij, lij); - } -} diff --git a/tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp deleted file mode 100644 index 9ded96c08..000000000 --- a/tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp +++ /dev/null @@ -1,196 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -/** - * Paged Attention Orchestration — Per-Block Version - * (aicpu_build_graph variant: explicit add_dependency, no TensorMap) - * - * For each batch, for each head tile, for each KV block: - * 1. QK matmul: qi @ kj^T → sij (q_tile, block_size) - * 2. Softmax: sij → pij, mi, li - * 3. PV matmul: pij @ vj → oi_tmp (q_tile, head_dim) - * 4. 
Update: online softmax accumulation - * - * Dependency graph per block: - * QK → Softmax → PV → Update - * └──────────→ Update - * Update(prev block) ──→ Update(this block) - * Hub(init) ────────────→ Update(first block) - */ - -#include -#include -#include - -#include "pto_orchestration_api.h" // NOLINT(build/include_subdir) - -#define FUNC_QK_MATMUL 0 -#define FUNC_SOFTMAX_PREPARE 1 -#define FUNC_PV_MATMUL 2 -#define FUNC_ONLINE_UPDATE 3 -#define FUNC_AIC_HUB 4 -#define FUNC_AIV_HUB 5 - -extern "C" { - -__attribute__((visibility("default"))) PTO2OrchestrationConfig -aicpu_orchestration_config(const ChipStorageTaskArgs &orch_args) { - (void)orch_args; // NOLINT(readability/casting) - return PTO2OrchestrationConfig{ - .expected_arg_count = 7, - }; -} - -__attribute__((visibility("default"))) void -aicpu_orchestration_entry(PTO2Runtime *rt, const ChipStorageTaskArgs &orch_args) { - // Read dimensions from tensor metadata - // query: shape=[batch, num_heads, head_dim] - uint64_t batch = orch_args.tensor(0).shapes[0]; - uint64_t num_heads = orch_args.tensor(0).shapes[1]; - uint64_t head_dim = orch_args.tensor(0).shapes[2]; - DataType data_type = orch_args.tensor(0).dtype; - - // key_cache: shape=[total_blocks, block_size, kv_head_num, head_dim] - uint64_t block_size = orch_args.tensor(1).shapes[1]; - - // block_table: shape=[batch, max_num_blocks_per_req] - uint64_t block_num = orch_args.tensor(3).shapes[1]; - - // scale from scalar arg - uint64_t scale_value = orch_args.scalar(0); - - uint64_t q_head_num = num_heads; - uint64_t q_tile = std::min(num_heads, static_cast(128)); - uint64_t q_loop = (q_head_num + q_tile - 1) / q_tile; - - // Reshape tensors for kernel consumption (2D flattened) - void *query_ptr = orch_args.tensor(0).data_as(); - void *kc_ptr = orch_args.tensor(1).data_as(); - void *vc_ptr = orch_args.tensor(2).data_as(); - void *out_ptr = orch_args.tensor(5).data_as(); - - uint64_t total_blocks_count = orch_args.tensor(1).shapes[0]; - - uint32_t query_shapes[2] = {static_cast(batch * num_heads), static_cast(head_dim)}; - uint32_t key_cache_shapes[2] = { - static_cast(total_blocks_count * block_size), static_cast(head_dim) - }; - uint32_t value_cache_shapes[2] = { - static_cast(total_blocks_count * block_size), static_cast(head_dim) - }; - uint32_t out_shapes[2] = {static_cast(batch * num_heads), static_cast(head_dim)}; - Tensor query = make_tensor_external(query_ptr, query_shapes, 2, data_type, false); - Tensor key_cache = make_tensor_external(kc_ptr, key_cache_shapes, 2, data_type, false); - Tensor value_cache = make_tensor_external(vc_ptr, value_cache_shapes, 2, data_type, false); - Tensor out = make_tensor_external(out_ptr, out_shapes, 2, DataType::FLOAT32); - - int *host_block_table = orch_args.tensor(3).data_as(); - int *host_context_lens = orch_args.tensor(4).data_as(); - - for (uint64_t b_idx = 0; b_idx < batch; b_idx++) { - uint64_t cur_seq = host_context_lens[b_idx]; - uint64_t bn_this_batch = (cur_seq + block_size - 1) / block_size; - - for (uint64_t q_idx = 0; q_idx < q_loop; q_idx++) { - PTO2_SCOPE(rt) { - uint64_t cur_offset = b_idx * q_head_num + q_idx * q_tile; - - uint32_t oi_shapes[2] = {static_cast(q_tile), static_cast(head_dim)}; - uint32_t li_shapes[1] = {static_cast(q_tile)}; - uint32_t mi_shapes[1] = {static_cast(q_tile)}; - uint32_t qi_shapes[2] = {static_cast(q_tile), static_cast(head_dim)}; - uint32_t qi_offsets[2] = {static_cast(cur_offset), 0}; - Tensor qi = query.view(qi_shapes, qi_offsets); - uint32_t out_view_shapes[2] = {static_cast(q_tile), 
static_cast(head_dim)}; - uint32_t out_view_offsets[2] = {static_cast(cur_offset), 0}; - Tensor out_view = out.view(out_view_shapes, out_view_offsets); - - // Hub task: zero-initialize accumulators - Arg args_inplace; - args_inplace.add_output(TensorCreateInfo(oi_shapes, 2, DataType::FLOAT32)); - args_inplace.add_output(TensorCreateInfo(li_shapes, 1, DataType::FLOAT32)); - args_inplace.add_output(TensorCreateInfo(mi_shapes, 1, DataType::FLOAT32)); - SubmitResult r_hub = rt_submit_aiv_task(rt, FUNC_AIV_HUB, args_inplace); - const Tensor &oi = r_hub.outputs.get_ref(0); - const Tensor &li_update = r_hub.outputs.get_ref(1); - const Tensor &mi_update = r_hub.outputs.get_ref(2); - - PTO2TaskId prev_update_task = r_hub.task_id; - - for (uint64_t bn = 0; bn < bn_this_batch; bn++) { - uint64_t cur_block_idx = host_block_table[b_idx * block_num + bn]; - uint64_t valid_len = std::min(block_size, cur_seq - bn * block_size); - - // KV views for this block - uint32_t kv_shapes[2] = {static_cast(block_size), static_cast(head_dim)}; - uint32_t kv_offsets[2] = {static_cast(cur_block_idx * block_size), 0}; - Tensor kj = key_cache.view(kv_shapes, kv_offsets); - Tensor vj = value_cache.view(kv_shapes, kv_offsets); - - // === Task 1: QK matmul === - uint32_t sij_shapes[2] = {static_cast(q_tile), static_cast(block_size)}; - - Arg args_qk; - args_qk.add_input(qi); - args_qk.add_input(kj); - args_qk.add_output(TensorCreateInfo(sij_shapes, 2, DataType::FLOAT32)); - SubmitResult r_qk = rt_submit_aic_task(rt, FUNC_QK_MATMUL, args_qk); - - // === Task 2: Softmax === - uint32_t sij_valid_shapes[2] = {static_cast(q_tile), static_cast(valid_len)}; - uint32_t sij_valid_offsets[2] = {0, 0}; - Tensor sij_valid = r_qk.outputs.get_ref(0).view(sij_valid_shapes, sij_valid_offsets); - - Arg args_sf; - args_sf.add_input(sij_valid); - args_sf.add_output(TensorCreateInfo(sij_shapes, 2, data_type)); - args_sf.add_output(TensorCreateInfo(mi_shapes, 1, DataType::FLOAT32)); - args_sf.add_output(TensorCreateInfo(li_shapes, 1, DataType::FLOAT32)); - args_sf.add_scalar(scale_value); - SubmitResult r_sf = rt_submit_aiv_task(rt, FUNC_SOFTMAX_PREPARE, args_sf); - rt_add_dependency(rt, r_qk.task_id, r_sf.task_id); - - // === Task 3: PV matmul === - uint32_t oi_tmp_shapes[2] = {static_cast(q_tile), static_cast(head_dim)}; - - Arg args_pv; - args_pv.add_input(r_sf.outputs.get_ref(0)); - args_pv.add_input(vj); - args_pv.add_output(TensorCreateInfo(oi_tmp_shapes, 2, DataType::FLOAT32)); - SubmitResult r_pv = rt_submit_aic_task(rt, FUNC_PV_MATMUL, args_pv); - rt_add_dependency(rt, r_sf.task_id, r_pv.task_id); - - // === Task 4: Online update === - uint64_t is_first = (bn == 0) ? 1 : 0; - uint64_t is_last = (bn == bn_this_batch - 1) ? 
1 : 0; - - Arg args_up; - args_up.add_input(r_sf.outputs.get_ref(1)); - args_up.add_input(r_sf.outputs.get_ref(2)); - args_up.add_input(r_pv.outputs.get_ref(0)); - args_up.add_inout(mi_update); - args_up.add_inout(li_update); - args_up.add_inout(oi); - args_up.add_inout(out_view); - args_up.add_scalar(is_first); - args_up.add_scalar(is_last); - SubmitResult r_up = rt_submit_aiv_task(rt, FUNC_ONLINE_UPDATE, args_up); - rt_add_dependency(rt, r_sf.task_id, r_up.task_id); - rt_add_dependency(rt, r_pv.task_id, r_up.task_id); - rt_add_dependency(rt, prev_update_task, r_up.task_id); - - prev_update_task = r_up.task_id; - } - } - } - } -} - -} // extern "C" diff --git a/tests/st/a2a3/aicpu_build_graph/paged_attention/test_paged_attention.py b/tests/st/a2a3/aicpu_build_graph/paged_attention/test_paged_attention.py deleted file mode 100644 index b4ee7a376..000000000 --- a/tests/st/a2a3/aicpu_build_graph/paged_attention/test_paged_attention.py +++ /dev/null @@ -1,129 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) PyPTO Contributors. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. -# ----------------------------------------------------------------------------------------------------------- -"""Paged attention — aicpu_build_graph runtime (production scale, bfloat16). - -Tests aicpu_build_graph runtime with hub kernels (aic_hub, aiv_hub), -INOUT tensors, and AIC+AIV mixed execution. 
-""" - -import torch -from simpler.task_interface import ArgDirection as D - -from simpler_setup import Scalar, SceneTestCase, TaskArgsBuilder, Tensor, scene_test -from simpler_setup.goldens.paged_attention import compute_golden as _pa_compute_golden # noqa: PLC0415 -from simpler_setup.goldens.paged_attention import generate_inputs as _pa_generate_inputs # noqa: PLC0415 - - -@scene_test(level=2, runtime="aicpu_build_graph") -class TestPagedAttentionAicpuBuildGraph(SceneTestCase): - """Paged attention with aicpu_build_graph runtime and hub kernels.""" - - RTOL = 1e-3 - ATOL = 1e-3 - - CALLABLE = { - "orchestration": { - "source": "kernels/orchestration/paged_attention_orch.cpp", - "function_name": "aicpu_orchestration_entry", - "signature": [D.IN, D.IN, D.IN, D.IN, D.IN, D.OUT], - }, - "incores": [ - { - "func_id": 0, - "source": "kernels/aic/aic_qk_matmul.cpp", - "core_type": "aic", - "signature": [D.IN, D.IN, D.OUT], - }, - { - "func_id": 2, - "source": "kernels/aic/aic_pv_matmul.cpp", - "core_type": "aic", - "signature": [D.IN, D.IN, D.OUT], - }, - { - "func_id": 4, - "source": "kernels/aic/aic_hub.cpp", - "core_type": "aic", - "signature": [], - }, - { - "func_id": 1, - "source": "kernels/aiv/aiv_softmax_prepare.cpp", - "core_type": "aiv", - "signature": [D.IN, D.OUT, D.OUT, D.OUT], - }, - { - "func_id": 3, - "source": "kernels/aiv/aiv_online_update.cpp", - "core_type": "aiv", - "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT], - }, - { - "func_id": 5, - "source": "kernels/aiv/aiv_hub.cpp", - "core_type": "aiv", - "signature": [], - }, - ], - } - - CASES = [ - { - "name": "case1", - "platforms": ["a2a3"], - "config": {"aicpu_thread_num": 4, "block_dim": 24}, - "params": { - "batch": 256, - "num_heads": 16, - "kv_head_num": 1, - "head_dim": 128, - "block_size": 128, - "context_len": 8192, - "max_model_len": 32768, - "dtype": "bfloat16", - }, - }, - { - "name": "case2", - "platforms": ["a2a3"], - "config": {"aicpu_thread_num": 4, "block_dim": 24}, - "params": { - "batch": 64, - "num_heads": 64, - "kv_head_num": 1, - "head_dim": 128, - "block_size": 64, - "context_len": 8192, - "max_model_len": 32768, - "dtype": "bfloat16", - }, - }, - ] - - def generate_args(self, params): - inputs = _pa_generate_inputs(params) - specs = [] - for name, val in inputs: - if isinstance(val, torch.Tensor): - specs.append(Tensor(name, val)) - else: - specs.append(Scalar(name, val)) - return TaskArgsBuilder(*specs) - - def compute_golden(self, args, params): - tensors = {s.name: s.value for s in args.specs if isinstance(s, Tensor)} - _pa_compute_golden(tensors, params) - for s in args.specs: - if isinstance(s, Tensor) and s.name in tensors: - getattr(args, s.name)[:] = tensors[s.name] - - -if __name__ == "__main__": - SceneTestCase.run_module(__name__) diff --git a/tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/kernels/aic/aic_hub.cpp b/tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/kernels/aic/aic_hub.cpp deleted file mode 100644 index 45f90aab3..000000000 --- a/tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/kernels/aic/aic_hub.cpp +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. 
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -#include -#include - -using namespace pto; - -#ifndef __gm__ -#define __gm__ -#endif - -#ifndef __aicore__ -#define __aicore__ [aicore] -#endif - -constexpr int M = 16; -constexpr int K = 16; -constexpr int N = 16; - -extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {} diff --git a/tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/kernels/aic/aic_pv_matmul.cpp b/tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/kernels/aic/aic_pv_matmul.cpp deleted file mode 100644 index d06e1e06c..000000000 --- a/tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/kernels/aic/aic_pv_matmul.cpp +++ /dev/null @@ -1,152 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -// SplitK PV Matmul Kernel: Accumulated P @ V across n_blocks -// -// Processes n_blocks blocks using SplitK accumulation pattern: -// Block 0: TMATMUL(C, A, B) — initialize accumulator -// Block i: TMATMUL_ACC(C, C, A, B) — accumulate into same C -// -// Per-block pij addresses: contiguous slices of pij_buf (n_blocks * M * K) -// Per-block vj addresses: value_cache base + block_indices lookup -// Single output: oi_new (M, N) fp32 = sum of P_i @ V_i across all blocks -// -// Optimizations: -// - Double-buffered L1 tiles (ping/pong for A and B) -// - TLOAD(next pij+vj) overlaps with TMATMUL_ACC(current) via MTE2/PIPE_M parallelism -// -// Supports two tile configurations via runtime dispatch: -// Case1: (16, 128) @ (128, 128) -> (16, 128) -// Case2: (64, 64) @ ( 64, 128) -> (64, 128) -// -// pij is bfloat16 (from softmax_prepare TCVT). -// vj is stored as (K, N) = (block_size, head_dim) in row-major (ND) layout. 
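Stripped of the pipelining, the SplitK pattern is a plain sum of per-block matmuls; the double buffering changes when tiles are loaded, not what is computed. A NumPy sketch of the reference result (hypothetical helper; assumes the (K, N) row-major value blocks described above):

    import numpy as np

    def pv_matmul_ref(pij_buf, value_cache, block_table, n_blocks, M, K, N):
        # pij_buf: (n_blocks * M, K) weights; value_cache: (total_blocks * K, N).
        oi = np.zeros((M, N), dtype=np.float32)  # the accumulator lives in L0C on device
        for i in range(n_blocks):
            p_i = pij_buf[i * M:(i + 1) * M].astype(np.float32)  # contiguous pij slice
            v_i = value_cache[block_table[i] * K:(block_table[i] + 1) * K].astype(np.float32)
            oi += p_i @ v_i  # TMATMUL on i == 0, TMATMUL_ACC afterwards
        return oi

On device, the TMATMUL_ACC for block i overlaps with the TLOAD of block i+1 into the other ping/pong buffer, which is why the loop body above collapses to a single accumulate.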
- -#include -#include - -#include "tensor.h" - -using namespace pto; - -#include "pipe_sync.h" - -#ifndef __gm__ -#define __gm__ -#endif - -#ifndef __aicore__ -#define __aicore__ [aicore] -#endif - -template -static __aicore__ void pv_matmul_n_impl( - __gm__ bfloat16_t *pij_base, __gm__ bfloat16_t *val_base, __gm__ float *oi_base, uint64_t n_blocks, - __gm__ int32_t *block_table -) { - using GlobalA = GlobalTensor, Stride>; - using GlobalB = GlobalTensor, Stride>; - using GlobalOut = GlobalTensor, Stride>; - - using TileMatA = Tile; - using TileMatB = Tile; - - using LeftTile = TileLeft; - using RightTile = TileRight; - using AccTile = TileAcc; - - // Double-buffered L1 tiles (ping/pong) - TileMatA aMatTile_ping, aMatTile_pong; - TileMatB bMatTile_ping, bMatTile_pong; - TASSIGN(aMatTile_ping, 0x0); - TASSIGN(aMatTile_pong, 0x10000); - TASSIGN(bMatTile_ping, 0x20000); - TASSIGN(bMatTile_pong, 0x30000); - - LeftTile aTile; - RightTile bTile; - AccTile cTile; - TASSIGN(aTile, 0x0); - TASSIGN(bTile, 0x0); - TASSIGN(cTile, 0x0); - - GlobalOut oiGlobal(oi_base); - - // Pre-load first iteration's tiles into ping buffers - GlobalA pijGlobal_0(pij_base); - GlobalB vjGlobal_0(val_base + block_table[0] * K * N); - TLOAD(aMatTile_ping, pijGlobal_0); - TLOAD(bMatTile_ping, vjGlobal_0); - - for (uint64_t i = 0; i < n_blocks; i++) { - // Select current buffers based on iteration parity - TileMatA &curA = (i % 2 == 0) ? aMatTile_ping : aMatTile_pong; - TileMatB &curB = (i % 2 == 0) ? bMatTile_ping : bMatTile_pong; - - // Wait for current TLOAD to complete - set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); - wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); - - // Wait for previous matmul to complete (L0A/L0B safe to overwrite) - if (i > 0) { - wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); - } - - TMOV(aTile, curA); - TMOV(bTile, curB); - - set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); - wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); - - if (i == 0) { - TMATMUL(cTile, aTile, bTile); - } else { - TMATMUL_ACC(cTile, cTile, aTile, bTile); - } - - // Prefetch next iteration's data (MTE2 overlaps with matmul completion) - if (i + 1 < n_blocks) { - // Signal matmul completion for next iteration's TMOV guard - set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); - TileMatA &nxtA = (i % 2 == 0) ? aMatTile_pong : aMatTile_ping; - TileMatB &nxtB = (i % 2 == 0) ? 
bMatTile_pong : bMatTile_ping; - GlobalA pijGlobal_next(pij_base + (i + 1) * M * K); - GlobalB vjGlobal_next(val_base + block_table[i + 1] * K * N); - TLOAD(nxtA, pijGlobal_next); - TLOAD(nxtB, vjGlobal_next); - } - } - - set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); - wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); - TSTORE(oiGlobal, cTile); - - pipe_sync(); -} - -extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { - __gm__ TensorData *pij_buf = reinterpret_cast<__gm__ TensorData *>(args[0]); - __gm__ TensorData *value_cache = reinterpret_cast<__gm__ TensorData *>(args[1]); - __gm__ TensorData *oi_new = reinterpret_cast<__gm__ TensorData *>(args[2]); - uint64_t n_blocks = static_cast<uint64_t>(args[3]); - __gm__ int32_t *block_table = reinterpret_cast<__gm__ int32_t *>(args[4]); - - __gm__ bfloat16_t *pij_base = reinterpret_cast<__gm__ bfloat16_t *>(pij_buf->buffer.addr) + pij_buf->start_offset; - __gm__ bfloat16_t *val_base = reinterpret_cast<__gm__ bfloat16_t *>(value_cache->buffer.addr); - __gm__ float *oi_base = reinterpret_cast<__gm__ float *>(oi_new->buffer.addr) + oi_new->start_offset; - - uint64_t q_tile_size = static_cast<uint64_t>(pij_buf->shapes[0]); - - if (q_tile_size == 16) { - pv_matmul_n_impl<16, 128, 128>(pij_base, val_base, oi_base, n_blocks, block_table); - } else { - pv_matmul_n_impl<64, 64, 128>(pij_base, val_base, oi_base, n_blocks, block_table); - } -} diff --git a/tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/kernels/aic/aic_qk_matmul.cpp b/tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/kernels/aic/aic_qk_matmul.cpp deleted file mode 100644 index 5f38ee47f..000000000 --- a/tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/kernels/aic/aic_qk_matmul.cpp +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -// Multi-block QK Matmul Kernel: qi(M, K) @ kj.T(K, N) -> sij(M, N) for each block -// -// Processes n_blocks blocks in a single kernel invocation. -// Per-block kj addresses computed from key_cache base + block_indices lookup. -// qi is shared across all blocks (same query head against different key blocks). -// -// Output layout: n_blocks contiguous (M, N) tiles stacked vertically. -// Block i occupies sij[i*M : (i+1)*M, 0:N].
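The stacked layout lets downstream kernels treat sij_buf as one (n_blocks * M, N) matrix of contiguous per-block tiles. As reference math (hypothetical helper; kj blocks assumed (N, K) row-major, matching the per-block base address block_table[i] * N * K above):

    import numpy as np

    def qk_matmul_ref(qi, key_cache, block_table, n_blocks, M, K, N):
        # qi: (M, K) shared across blocks; key_cache: (total_blocks * N, K) row-major.
        sij_buf = np.empty((n_blocks * M, N), dtype=np.float32)
        for i in range(n_blocks):
            kj = key_cache[block_table[i] * N:(block_table[i] + 1) * N]  # (N, K) block
            sij_buf[i * M:(i + 1) * M] = qi.astype(np.float32) @ kj.astype(np.float32).T
        return sij_buf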
-// -// Optimizations: -// - qi TLOAD hoisted before the loop (constant across all iterations) -// -// Supports two tile configurations via runtime dispatch: -// Case1: (16, 128) @ (128, 128).T -> (16, 128) -// Case2: (64, 128) @ (128, 64).T -> (64, 64) -// -// Template: M=q_tile, K=head_dim, N=block_size - -#include -#include - -#include "tensor.h" - -using namespace pto; - -#include "pipe_sync.h" - -#ifndef __gm__ -#define __gm__ -#endif - -#ifndef __aicore__ -#define __aicore__ [aicore] -#endif - -template -static __aicore__ void qk_matmul_n_impl( - __gm__ bfloat16_t *qi_base, __gm__ bfloat16_t *key_base, __gm__ float *sij_base, uint64_t n_blocks, - __gm__ int32_t *block_table -) { - using GlobalA = GlobalTensor, Stride>; - using GlobalB = GlobalTensor, Stride, Layout::DN>; - using GlobalOut = GlobalTensor, Stride>; - - using TileMatA = Tile; - using TileMatB = Tile; - - using LeftTile = TileLeft; - using RightTile = TileRight; - using AccTile = TileAcc; - - TileMatA aMatTile; - TileMatB bMatTile; - TASSIGN(aMatTile, 0x0); - TASSIGN(bMatTile, 0x20000); - - LeftTile aTile; - RightTile bTile; - AccTile cTile; - TASSIGN(aTile, 0x0); - TASSIGN(bTile, 0x0); - TASSIGN(cTile, 0x0); - - // Hoist qi TLOAD before the loop (qi is constant across all blocks) - GlobalA qiGlobal(qi_base); - TLOAD(aMatTile, qiGlobal); - - for (uint64_t i = 0; i < n_blocks; i++) { - GlobalB kjGlobal(key_base + block_table[i] * N * K); - GlobalOut sijGlobal(sij_base + i * M * N); - - // Load only B each iteration (qi already in L1 from hoist) - TLOAD(bMatTile, kjGlobal); - - set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); - wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); - - // TMOV qi from L1→L0A (re-copy since TMATMUL consumed L0A) and kj from L1→L0B - TMOV(aTile, aMatTile); - TMOV(bTile, bMatTile); - - set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); - wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); - - TMATMUL(cTile, aTile, bTile); - - set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); - wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); - - TSTORE(sijGlobal, cTile); - - if (i + 1 < n_blocks) { - pipe_barrier(PIPE_ALL); - } - } - pipe_sync(); -} - -extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { - __gm__ TensorData *qi = reinterpret_cast<__gm__ TensorData *>(args[0]); - __gm__ TensorData *key_cache = reinterpret_cast<__gm__ TensorData *>(args[1]); - __gm__ TensorData *sij_buf = reinterpret_cast<__gm__ TensorData *>(args[2]); - uint64_t n_blocks = static_cast(args[3]); - __gm__ int32_t *block_table = reinterpret_cast<__gm__ int32_t *>(args[4]); - - __gm__ bfloat16_t *qi_base = reinterpret_cast<__gm__ bfloat16_t *>(qi->buffer.addr) + qi->start_offset; - __gm__ bfloat16_t *key_base = reinterpret_cast<__gm__ bfloat16_t *>(key_cache->buffer.addr); - __gm__ float *sij_base = reinterpret_cast<__gm__ float *>(sij_buf->buffer.addr) + sij_buf->start_offset; - - uint64_t q_tile_size = static_cast(qi->shapes[0]); - - if (q_tile_size == 16) { - qk_matmul_n_impl<16, 128, 128>(qi_base, key_base, sij_base, n_blocks, block_table); - } else { - qk_matmul_n_impl<64, 128, 64>(qi_base, key_base, sij_base, n_blocks, block_table); - } -} diff --git a/tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/kernels/aiv/aiv_hub.cpp b/tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/kernels/aiv/aiv_hub.cpp deleted file mode 100644 index 45f90aab3..000000000 --- a/tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/kernels/aiv/aiv_hub.cpp +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. 
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -#include -#include - -using namespace pto; - -#ifndef __gm__ -#define __gm__ -#endif - -#ifndef __aicore__ -#define __aicore__ [aicore] -#endif - -constexpr int M = 16; -constexpr int K = 16; -constexpr int N = 16; - -extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {} diff --git a/tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/kernels/aiv/aiv_online_update.cpp b/tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/kernels/aiv/aiv_online_update.cpp deleted file mode 100644 index a68908229..000000000 --- a/tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/kernels/aiv/aiv_online_update.cpp +++ /dev/null @@ -1,255 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -// Online Softmax Update + Normalize Kernel (AIV) -// -// Operates on full tiles where M=q_tile_size, N=head_dim (128): -// Case1: oi/oi_new are (16, 128), mij/lij/mi/li are 16-element vectors -// Case2: oi/oi_new are (64, 128), mij/lij/mi/li are 64-element vectors -// -// Scalar layout strategy using TRESHAPE (zero-copy UB reshape): -// Scalars loaded as DN ColMajor (M, 1) for TROWEXPANDMUL/TROWEXPANDDIV. -// For element-wise ops (TMAX, TSUB, TEXP, etc.), TRESHAPE to RowMajor (1, M). -// After arithmetic, TRESHAPE back to ColMajor (M, 1) for row-broadcast ops. -// This eliminates the GM round-trip (TSTORE ND → TLOAD DN) used in the original. 
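The accumulation itself is the standard online-softmax recurrence; the TRESHAPE trick only changes where the per-row scalars live, not the math. A hedged NumPy sketch of the recurrence (reference math, not the kernel's API; all row statistics are (M, 1) columns, oi/oi_new are (M, N)):

    import numpy as np

    def online_update_ref(mij, lij, oi_new, mi, li, oi, is_first, is_last):
        if is_first:                        # first block: copy straight through
            mi, li, oi = mij, lij, oi_new
        else:
            mi_new = np.maximum(mi, mij)    # TMAX
            alpha = np.exp(mi - mi_new)     # rescales the old accumulator
            beta = np.exp(mij - mi_new)     # rescales the incoming block
            li = alpha * li + beta * lij
            oi = alpha * oi + beta * oi_new  # TROWEXPANDMUL + TADD
            mi = mi_new
        dst = oi / li if is_last else None  # final normalization (TROWEXPANDDIV)
        return mi, li, oi, dst

Because alpha and beta depend only on row maxima, oi is never re-exponentiated; each block costs one rescale and one add.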
- -#include -#include - -#include "tensor.h" - -using namespace pto; - -#include "pipe_sync.h" - -#ifndef __gm__ -#define __gm__ -#endif - -#ifndef __aicore__ -#define __aicore__ [aicore] -#endif - -template -static __aicore__ void online_update_impl( - __gm__ TensorData *mij, __gm__ TensorData *lij, __gm__ TensorData *oi_new, __gm__ TensorData *mi, - __gm__ TensorData *li, __gm__ TensorData *oi, uint64_t is_first, uint64_t is_last, __gm__ TensorData *dst -) { - __gm__ float *mij_ptr = reinterpret_cast<__gm__ float *>(mij->buffer.addr); - __gm__ float *lij_ptr = reinterpret_cast<__gm__ float *>(lij->buffer.addr); - __gm__ float *oi_new_ptr = reinterpret_cast<__gm__ float *>(oi_new->buffer.addr); - __gm__ float *mi_ptr = reinterpret_cast<__gm__ float *>(mi->buffer.addr); - __gm__ float *li_ptr = reinterpret_cast<__gm__ float *>(li->buffer.addr); - __gm__ float *oi_ptr = reinterpret_cast<__gm__ float *>(oi->buffer.addr); - __gm__ float *dst_ptr = reinterpret_cast<__gm__ float *>(dst->buffer.addr); - - // Aligned rows for ColMajor DN tiles (32-byte alignment) - constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float)); - - // --- GlobalTensor types --- - - // Data (M, N) RowMajor - using GlobalDataMxN = GlobalTensor, Stride<1, 1, 1, N, 1>>; - - // Scalar DN: M contiguous floats as (kAlignedRows, 1) ColMajor for TROWEXPAND ops and loading - using GlobalScalarDN = GlobalTensor, Stride<1, 1, 1, 1, 1>, Layout::DN>; - - // Scalar ND: for storing mi_new and li_new back to GM - constexpr int kScalarCols = 32 / sizeof(float); - constexpr int kScalarRows = M / kScalarCols; - using GlobalScalarND = - GlobalTensor, Stride<1, 1, 1, kScalarCols, 1>>; - - // --- GlobalTensor instances --- - - GlobalDataMxN oiNewGlobal(oi_new_ptr + oi_new->start_offset); - GlobalDataMxN oiGlobal(oi_ptr + oi->start_offset); - GlobalDataMxN dstGlobal(dst_ptr + dst->start_offset); - - // DN globals for loading scalars as ColMajor - GlobalScalarDN mijGlobalDN(mij_ptr + mij->start_offset); - GlobalScalarDN lijGlobalDN(lij_ptr + lij->start_offset); - GlobalScalarDN miGlobalDN(mi_ptr + mi->start_offset); - GlobalScalarDN liGlobalDN(li_ptr + li->start_offset); - - // ND globals for storing scalar results - GlobalScalarND miGlobalND(mi_ptr + mi->start_offset); - GlobalScalarND liGlobalND(li_ptr + li->start_offset); - - // --- Tile types --- - - using TileDataMxN = Tile; - using TileScalarDN = Tile; - - // RowMajor (1, M) tiles for element-wise arithmetic via TRESHAPE - using TileScalarRow = Tile; - - // ND tile for storing back to GM - using TileScalarND = - Tile; - - // --- UB memory layout --- - - constexpr int kDataBytes = M * N * sizeof(float); - constexpr int kScalarDNBytes = kAlignedRows * sizeof(float); - - // Data tiles - TileDataMxN oiNewTile; - TileDataMxN oiTile; - - // Scalar DN tiles loaded from GM (ColMajor) - TileScalarDN mijDN, lijDN, miDN, liDN; - - // Temporary DN tiles for results - TileScalarDN miNewDN, alphaDN, betaDN, liNewDN, tmpDN; - - TASSIGN(oiNewTile, 0); - TASSIGN(oiTile, kDataBytes); - TASSIGN(mijDN, 2 * kDataBytes); - TASSIGN(lijDN, 2 * kDataBytes + kScalarDNBytes); - TASSIGN(miDN, 2 * kDataBytes + 2 * kScalarDNBytes); - TASSIGN(liDN, 2 * kDataBytes + 3 * kScalarDNBytes); - TASSIGN(miNewDN, 2 * kDataBytes + 4 * kScalarDNBytes); - TASSIGN(alphaDN, 2 * kDataBytes + 5 * kScalarDNBytes); - TASSIGN(betaDN, 2 * kDataBytes + 6 * kScalarDNBytes); - TASSIGN(liNewDN, 2 * kDataBytes + 7 * kScalarDNBytes); - TASSIGN(tmpDN, 2 * kDataBytes + 8 * kScalarDNBytes); - - if (is_first) { - // --- 
First block: copy inputs to accumulators --- - TLOAD(oiNewTile, oiNewGlobal); - TLOAD(mijDN, mijGlobalDN); - TLOAD(lijDN, lijGlobalDN); - set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - - // Store mi = mij, li = lij, oi = oi_new - // Alias ND tiles to same UB as DN tiles for ND-format store - TileScalarND mijND, lijND; - TASSIGN(mijND, 2 * kDataBytes); // alias same UB as mijDN - TASSIGN(lijND, 2 * kDataBytes + kScalarDNBytes); // alias same UB as lijDN - - set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - TSTORE(miGlobalND, mijND); // mi = mij - TSTORE(liGlobalND, lijND); // li = lij - TSTORE(oiGlobal, oiNewTile); // oi = oi_new - - if (is_last) { - // Single block: normalize dst = oi_new / lij - // lijDN already in ColMajor DN format, use directly for TROWEXPANDDIV - set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); - wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); - TROWEXPANDDIV(oiNewTile, oiNewTile, lijDN); - set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); - wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); - TSTORE(dstGlobal, oiNewTile); - } - } else { - // --- Subsequent blocks: accumulate --- - - // Load all inputs as DN (ColMajor) - TLOAD(oiNewTile, oiNewGlobal); - TLOAD(oiTile, oiGlobal); - TLOAD(mijDN, mijGlobalDN); - TLOAD(lijDN, lijGlobalDN); - TLOAD(miDN, miGlobalDN); - TLOAD(liDN, liGlobalDN); - set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - - // TRESHAPE: ColMajor(M,1) → RowMajor(1,M) for element-wise arithmetic - TileScalarRow miRow, mijRow, liRow, lijRow; - TRESHAPE(miRow, miDN); - TRESHAPE(mijRow, mijDN); - TRESHAPE(liRow, liDN); - TRESHAPE(lijRow, lijDN); - - // Scalar arithmetic in RowMajor (1, M) layout - TileScalarRow miNewRow, alphaRow, betaRow, liNewRow, tmpRow; - TASSIGN(miNewRow, 2 * kDataBytes + 4 * kScalarDNBytes); - TASSIGN(alphaRow, 2 * kDataBytes + 5 * kScalarDNBytes); - TASSIGN(betaRow, 2 * kDataBytes + 6 * kScalarDNBytes); - TASSIGN(liNewRow, 2 * kDataBytes + 7 * kScalarDNBytes); - TASSIGN(tmpRow, 2 * kDataBytes + 8 * kScalarDNBytes); - - TMAX(miNewRow, miRow, mijRow); // mi_new = max(mi, mij) - pipe_barrier(PIPE_V); - TSUB(alphaRow, miRow, miNewRow); // alpha_exp = mi - mi_new - pipe_barrier(PIPE_V); - TEXP(alphaRow, alphaRow); // alpha = exp(mi - mi_new) - pipe_barrier(PIPE_V); - TSUB(betaRow, mijRow, miNewRow); // beta_exp = mij - mi_new - pipe_barrier(PIPE_V); - TEXP(betaRow, betaRow); // beta = exp(mij - mi_new) - pipe_barrier(PIPE_V); - TMUL(tmpRow, alphaRow, liRow); // alpha * li - pipe_barrier(PIPE_V); - TMUL(liNewRow, betaRow, lijRow); // beta * lij - pipe_barrier(PIPE_V); - TADD(liNewRow, tmpRow, liNewRow); // li_new = alpha*li + beta*lij - - // TRESHAPE back: RowMajor(1,M) → ColMajor(M,1) for TROWEXPANDMUL - TRESHAPE(alphaDN, alphaRow); - TRESHAPE(betaDN, betaRow); - - // Scale data tiles using row-broadcast multiply - TROWEXPANDMUL(oiTile, oiTile, alphaDN); // oi *= alpha - TROWEXPANDMUL(oiNewTile, oiNewTile, betaDN); // oi_new *= beta - pipe_barrier(PIPE_V); - TADD(oiTile, oiTile, oiNewTile); // oi = alpha*oi + beta*oi_new - - // Store mi_new and li_new to GM (ND format) - // Alias ND tiles to the same UB locations as miNewRow and liNewRow - TileScalarND miNewND, liNewND; - TASSIGN(miNewND, 2 * kDataBytes + 4 * kScalarDNBytes); - TASSIGN(liNewND, 2 * kDataBytes + 7 * kScalarDNBytes); - - if (is_last) { - // Normalize and output: dst = oi / li_new - TRESHAPE(liNewDN, liNewRow); - pipe_barrier(PIPE_V); - TROWEXPANDDIV(oiTile, oiTile, liNewDN); - set_flag(PIPE_V, 
PIPE_MTE3, EVENT_ID0); - wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - TSTORE(miGlobalND, miNewND); // persist mi_new - TSTORE(liGlobalND, liNewND); // persist li_new - TSTORE(dstGlobal, oiTile); - } else { - // Store updated accumulators - set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - TSTORE(miGlobalND, miNewND); // persist mi_new - TSTORE(liGlobalND, liNewND); // persist li_new - TSTORE(oiGlobal, oiTile); - } - } - pipe_sync(); -} - -extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { - __gm__ TensorData *mij = reinterpret_cast<__gm__ TensorData *>(args[0]); - __gm__ TensorData *lij = reinterpret_cast<__gm__ TensorData *>(args[1]); - __gm__ TensorData *oi_new = reinterpret_cast<__gm__ TensorData *>(args[2]); - __gm__ TensorData *mi = reinterpret_cast<__gm__ TensorData *>(args[3]); - __gm__ TensorData *li = reinterpret_cast<__gm__ TensorData *>(args[4]); - __gm__ TensorData *oi = reinterpret_cast<__gm__ TensorData *>(args[5]); - __gm__ TensorData *dst = reinterpret_cast<__gm__ TensorData *>(args[6]); - uint64_t is_first = static_cast(args[7]); - uint64_t is_last = static_cast(args[8]); - uint64_t q_tile_size = static_cast(mij->shapes[0]); - // args[10] = head_dim (128) - - if (q_tile_size == 16) { - online_update_impl<16, 128>(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst); - } else { - online_update_impl<64, 128>(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst); - } -} diff --git a/tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/kernels/aiv/aiv_softmax_prepare.cpp b/tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/kernels/aiv/aiv_softmax_prepare.cpp deleted file mode 100644 index b484a0b8a..000000000 --- a/tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/kernels/aiv/aiv_softmax_prepare.cpp +++ /dev/null @@ -1,263 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -// Two-Pass Softmax Kernel (AIV) for n_blocks tiles -// -// Input: sij_buf (n_blocks * M, N) fp32 — QK results stacked vertically -// Output: pij_buf (n_blocks * M, N) bf16 — attention weights per block -// mij (M,) fp32 — global row max across all blocks -// lij (M,) fp32 — total row sum across all blocks -// -// Pass 1: Iterate over n_blocks tiles, apply scale, mask last block, -// find global m = max over all blocks of rowmax(S_i * scale) -// Uses TRESHAPE for DN↔Row conversion to keep globalMax in UB -// (eliminates 63 × 4 GM round-trip operations). -// Pass 2: Iterate again, compute P_i = exp(S_i * scale - m) -> bf16, -// accumulate l = sum over all blocks of rowsum(P_i) -// Uses double-buffered sij tiles to overlap TLOAD with computation. -// -// Two-pass ensures all P_i tiles share the same scale (global max), -// enabling direct TMATMUL_ACC accumulation in the PV kernel. 
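Equivalent reference math for the two passes (a NumPy sketch under the same shapes; the helper name is hypothetical, and the kernel's bf16 rounding of pij before the row sum is omitted):

    import numpy as np

    def two_pass_softmax_ref(sij_buf, scale, n_blocks, M, N, valid_len_last):
        s = sij_buf.reshape(n_blocks, M, N).astype(np.float32) * scale
        s[-1, :, valid_len_last:] = -np.inf  # mask the partial last block
        m = s.max(axis=(0, 2))               # Pass 1: one global max per query row
        pij = np.exp(s - m[None, :, None])   # Pass 2: every block shares the same max
        lij = pij.sum(axis=(0, 2))           # total row sum across all blocks
        return pij.reshape(n_blocks * M, N), m, lij

Sharing one max per row across all n_blocks tiles is what lets the PV kernel accumulate P_i @ V_i directly with TMATMUL_ACC and defer normalization to the online-update step.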
-// -// Supports two tile configurations via runtime dispatch: -// Case1: M=16, N=128 (q_tile=16, block_size=128) -// Case2: M=64, N=64 (q_tile=64, block_size=64) - -#include -#include - -#include "tensor.h" - -using namespace pto; - -#include "pipe_sync.h" - -#ifndef __gm__ -#define __gm__ -#endif - -#ifndef __aicore__ -#define __aicore__ [aicore] -#endif - -template -static __aicore__ void softmax_prepare_n_impl( - __gm__ float *sij_base, float scale_value, __gm__ bfloat16_t *pij_base, __gm__ float *mij_addr, - __gm__ float *lij_addr, uint64_t n_blocks, uint64_t valid_len_last -) { - constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float)); - constexpr int kScalarCols = 32 / sizeof(float); - constexpr int kScalarRows = M / kScalarCols; - - // --- GlobalTensor types --- - using GlobalDataMxN = GlobalTensor, Stride<1, 1, 1, N, 1>>; - using GlobalDataMxN_bf16 = GlobalTensor, Stride<1, 1, 1, N, 1>>; - using GlobalScalarDN = GlobalTensor, Stride<1, 1, 1, 1, 1>, Layout::DN>; - using GlobalScalarND = - GlobalTensor, Stride<1, 1, 1, kScalarCols, 1>>; - - // --- Tile types --- - using TileSijDyn = Tile; - using TileSijPad = Tile; - using TileVecMxN = Tile; - using TileVecMxN_bf16 = Tile; - using TileScalarDN = Tile; - using TileScalarND = - Tile; - // RowMajor (1, M) tile for element-wise arithmetic via TRESHAPE - using TileScalarRow = Tile; - - // --- UB memory layout (double-buffered sij) --- - constexpr int kDataBytes = M * N * sizeof(float); - constexpr int kScalarDNBytes = kAlignedRows * sizeof(float); - - // Double-buffered sij tiles - TileVecMxN sijTile_A; - TileSijPad sijPadTile_A; - TileVecMxN sijTile_B; - TileSijPad sijPadTile_B; - TileVecMxN pijTile; - TileVecMxN tmpTile; - TileVecMxN sumAccTile; - TileScalarDN localMaxDN; - TileScalarDN globalMaxDN; - TileScalarDN sumDN; - TileVecMxN_bf16 pijBf16Tile; - - // TRESHAPE aliases (same UB address as their DN counterparts) - TileScalarRow localMaxRow; - TileScalarRow globalMaxRow; - - // ND alias for storing globalMax to GM - TileScalarND globalMaxND; - - TASSIGN(sijTile_A, 0x0); - TASSIGN(sijPadTile_A, 0x0); - TASSIGN(sijTile_B, kDataBytes); - TASSIGN(sijPadTile_B, kDataBytes); - TASSIGN(pijTile, 2 * kDataBytes); - TASSIGN(tmpTile, 3 * kDataBytes); - TASSIGN(sumAccTile, 4 * kDataBytes); - int scalarBase = 5 * kDataBytes; - TASSIGN(localMaxDN, scalarBase); - TASSIGN(localMaxRow, scalarBase); // alias: same UB as localMaxDN - TASSIGN(globalMaxDN, scalarBase + kScalarDNBytes); - TASSIGN(globalMaxRow, scalarBase + kScalarDNBytes); // alias: same UB as globalMaxDN - TASSIGN(globalMaxND, scalarBase + kScalarDNBytes); // alias: same UB as globalMaxDN - TASSIGN(sumDN, scalarBase + 2 * kScalarDNBytes); - TASSIGN(pijBf16Tile, scalarBase + 3 * kScalarDNBytes); - - // GM aliases (mij/lij output buffers) - GlobalScalarND mijGlobalND(mij_addr); - GlobalScalarDN lijGlobalDN(lij_addr); - - // ======== Pass 1: Find global row max via TRESHAPE (no GM round-trip) ======== - for (uint64_t i = 0; i < n_blocks; i++) { - GlobalDataMxN sijGlobal(sij_base + i * M * N); - TLOAD(sijTile_A, sijGlobal); - set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - - if (i == n_blocks - 1 && valid_len_last < static_cast(N)) { - TileSijDyn sijDynTile(static_cast(valid_len_last)); - TASSIGN(sijDynTile, 0x0); - TFILLPAD_INPLACE(sijPadTile_A, sijDynTile); - } - - TMULS(sijTile_A, sijTile_A, scale_value); - pipe_barrier(PIPE_V); - TROWMAX(localMaxDN, sijTile_A, tmpTile); - - // TRESHAPE: ColMajor(M,1) → RowMajor(1,M) for 
element-wise TMAX - TRESHAPE(localMaxRow, localMaxDN); - if (i == 0) { - pipe_barrier(PIPE_V); - TMAX(globalMaxRow, localMaxRow, localMaxRow); - } else { - pipe_barrier(PIPE_V); - TMAX(globalMaxRow, globalMaxRow, localMaxRow); - } - } - - // TRESHAPE back: RowMajor(1,M) → ColMajor(M,1) for Pass 2's TROWEXPANDSUB - TRESHAPE(globalMaxDN, globalMaxRow); - - // Store final global max to mij for online_update to consume - set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - TSTORE(mijGlobalND, globalMaxND); - - // ======== Pass 2: Compute softmax with double-buffered sij ======== - // globalMaxDN is already in UB from TRESHAPE — no reload needed. - // Sync MTE3→MTE2 to ensure the mij TSTORE completed before first sij TLOAD. - set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); - wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); - - // Pre-load first sij tile into buffer A - GlobalDataMxN sijGlobal_0(sij_base); - TLOAD(sijTile_A, sijGlobal_0); - - for (uint64_t i = 0; i < n_blocks; i++) { - GlobalDataMxN_bf16 pijGlobal(pij_base + i * M * N); - - // Wait for current tile's TLOAD to complete - set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - - // TFILLPAD on current buffer if last block with partial valid length - if (i == n_blocks - 1 && valid_len_last < static_cast(N)) { - TileSijDyn curSijDyn(static_cast(valid_len_last)); - if (i % 2 == 0) { - TASSIGN(curSijDyn, 0x0); - TFILLPAD_INPLACE(sijPadTile_A, curSijDyn); - } else { - TASSIGN(curSijDyn, static_cast(kDataBytes)); - TFILLPAD_INPLACE(sijPadTile_B, curSijDyn); - } - } - - // Compute on current buffer (select A or B based on iteration parity) - if (i % 2 == 0) { - TMULS(sijTile_A, sijTile_A, scale_value); - pipe_barrier(PIPE_V); - TROWEXPANDSUB(pijTile, sijTile_A, globalMaxDN); - } else { - TMULS(sijTile_B, sijTile_B, scale_value); - pipe_barrier(PIPE_V); - TROWEXPANDSUB(pijTile, sijTile_B, globalMaxDN); - } - pipe_barrier(PIPE_V); - TEXP(pijTile, pijTile); - TCVT(pijBf16Tile, pijTile, RoundMode::CAST_ROUND); - TCVT(pijTile, pijBf16Tile, RoundMode::CAST_ROUND); - - if (i == 0) { - TMULS(sumAccTile, pijTile, 1.0f); - } else { - TADD(sumAccTile, sumAccTile, pijTile); - } - - // Store pij (must complete before next iteration's TCVT overwrites pijBf16Tile) - set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - TSTORE(pijGlobal, pijBf16Tile); - - // Prefetch next sij into alternate buffer (after TSTORE to avoid UB race) - if (i + 1 < n_blocks) { - set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); - wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); - GlobalDataMxN sijGlobal_next(sij_base + (i + 1) * M * N); - if (i % 2 == 0) { - TLOAD(sijTile_B, sijGlobal_next); - } else { - TLOAD(sijTile_A, sijGlobal_next); - } - } - } - - // Compute final row sum from accumulated pij values - pipe_barrier(PIPE_V); - TROWSUM(sumDN, sumAccTile, tmpTile); - - // Store lij (total sum). mij already stored after Pass 1. 
- set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - TSTORE(lijGlobalDN, sumDN); - - pipe_sync(); -} - -extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { - __gm__ TensorData *sij_buf = reinterpret_cast<__gm__ TensorData *>(args[0]); - __gm__ TensorData *pij_buf = reinterpret_cast<__gm__ TensorData *>(args[1]); - __gm__ TensorData *mij = reinterpret_cast<__gm__ TensorData *>(args[2]); - __gm__ TensorData *lij = reinterpret_cast<__gm__ TensorData *>(args[3]); - union { - uint64_t u; - float f; - } scale_conv; - scale_conv.u = static_cast(args[4]); - float scale_value = scale_conv.f; - uint64_t n_blocks = static_cast(args[5]); - uint64_t valid_len_last = static_cast(args[6]); - - __gm__ float *sij_base = reinterpret_cast<__gm__ float *>(sij_buf->buffer.addr) + sij_buf->start_offset; - __gm__ bfloat16_t *pij_base = reinterpret_cast<__gm__ bfloat16_t *>(pij_buf->buffer.addr) + pij_buf->start_offset; - __gm__ float *mij_addr = reinterpret_cast<__gm__ float *>(mij->buffer.addr) + mij->start_offset; - __gm__ float *lij_addr = reinterpret_cast<__gm__ float *>(lij->buffer.addr) + lij->start_offset; - - uint64_t q_tile_size = static_cast(sij_buf->shapes[0]); - - if (q_tile_size == 16) { - softmax_prepare_n_impl<16, 128>(sij_base, scale_value, pij_base, mij_addr, lij_addr, n_blocks, valid_len_last); - } else { - softmax_prepare_n_impl<64, 64>(sij_base, scale_value, pij_base, mij_addr, lij_addr, n_blocks, valid_len_last); - } -} diff --git a/tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp b/tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp deleted file mode 100644 index d1b8a7c1d..000000000 --- a/tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp +++ /dev/null @@ -1,370 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -/** - * Paged Attention Orchestration — N_UNROLL=64, 4 Tasks Per Group - * (aicpu_build_graph variant: explicit add_dependency, no TensorMap) - * - * Batches up to N_UNROLL blocks per group. Each group submits exactly 4 tasks: - * 1. QK matmul: qi @ K^T for n_blocks → sij_buf (q_tile, n_blocks * block_size) - * 2. Softmax: two-pass over sij_buf → pij_buf, mi, li - * 3. PV matmul: SplitK accumulated P @ V → oi_new (q_tile, head_dim) - * 4. 
Update: online softmax accumulation with group-level mi, li, oi_new - * - * Dependency graph per group: - * QK → Softmax → PV → Update - * └──────────→ Update - * Update(prev group) ──→ Update(this group) - * Hub(init) ────────────→ Update(first group) - */ - -#include -#include -#include - -#include "pto_orchestration_api.h" // NOLINT(build/include_subdir) - -#define N_UNROLL 64 - -#define FUNC_QK_MATMUL 0 -#define FUNC_SOFTMAX_PREPARE 1 -#define FUNC_PV_MATMUL 2 -#define FUNC_ONLINE_UPDATE 3 -#define FUNC_AIC_HUB 4 -#define FUNC_AIV_HUB 5 - -constexpr uint64_t PLATFORM_PROF_SYS_CNT_FREQ = 50000000; // 50 MHz - -inline double cycles_to_us(uint64_t cycles) { - return (static_cast(cycles) / PLATFORM_PROF_SYS_CNT_FREQ) * 1000000.0; -} - -inline uint64_t get_sys_cnt_aicpu() { - uint64_t ticks; - asm volatile("mrs %0, cntvct_el0" : "=r"(ticks)); - return ticks; -} - -#ifdef ENABLE_PROFILING -#define CYCLE_COUNT_START() uint64_t _t0 = get_sys_cnt_aicpu(), _t1 -#define CYCLE_COUNT_LAP(acc) \ - do { \ - _t1 = get_sys_cnt_aicpu(); \ - acc += (_t1 - _t0); \ - _t0 = _t1; \ - } while (0) -#else -#define CYCLE_COUNT_START() (void)0 -#define CYCLE_COUNT_LAP(acc) (void)0 -#endif - -extern "C" { -/** - * Orchestration config — the executor reads these values to set up - * shared memory and runtime before calling aicpu_orchestration_entry. - */ -__attribute__((visibility("default"))) PTO2OrchestrationConfig -aicpu_orchestration_config(const ChipStorageTaskArgs &orch_args) { - (void)orch_args; // NOLINT(readability/casting) - return PTO2OrchestrationConfig{ - .expected_arg_count = 7, - }; -} - -__attribute__((visibility("default"))) void -aicpu_orchestration_entry(PTO2Runtime *rt, const ChipStorageTaskArgs &orch_args) { -#ifdef ENABLE_PROFILING - uint64_t prof_param_extract = 0; - uint64_t prof_ext_tensor = 0; - uint64_t prof_make_tensor = 0; - uint64_t prof_tensor_view = 0; - uint64_t prof_param_setup = 0; - uint64_t prof_submit_task = 0; - uint64_t prof_scope_and_loop = 0; - int prof_submit_count = 0; - int prof_make_count = 0; - int prof_view_count = 0; -#endif - - CYCLE_COUNT_START(); - - // Read dimensions from tensor metadata - // query: shape=[batch, num_heads, head_dim] - uint64_t batch = orch_args.tensor(0).shapes[0]; - uint64_t num_heads = orch_args.tensor(0).shapes[1]; - uint64_t head_dim = orch_args.tensor(0).shapes[2]; - DataType data_type = orch_args.tensor(0).dtype; - - // key_cache: shape=[total_blocks, block_size, kv_head_num, head_dim] - uint64_t block_size = orch_args.tensor(1).shapes[1]; - - // block_table: shape=[batch, max_num_blocks_per_req] - uint64_t block_num = orch_args.tensor(3).shapes[1]; - - // scale from scalar arg - uint64_t scale_value = orch_args.scalar(0); - - uint64_t q_head_num = num_heads; - uint64_t q_tile = std::min(num_heads, static_cast(128)); - uint64_t q_loop = (q_head_num + q_tile - 1) / q_tile; - CYCLE_COUNT_LAP(prof_param_extract); - - // Reshape tensors for kernel consumption (2D flattened) - void *query_ptr = orch_args.tensor(0).data_as(); - void *kc_ptr = orch_args.tensor(1).data_as(); - void *vc_ptr = orch_args.tensor(2).data_as(); - void *out_ptr = orch_args.tensor(5).data_as(); - - uint64_t total_blocks_count = orch_args.tensor(1).shapes[0]; - - uint32_t query_shapes[2] = {static_cast(batch * num_heads), static_cast(head_dim)}; - uint32_t key_cache_shapes[2] = { - static_cast(total_blocks_count * block_size), static_cast(head_dim) - }; - uint32_t value_cache_shapes[2] = { - static_cast(total_blocks_count * block_size), static_cast(head_dim) - }; - 
uint32_t out_shapes[2] = {static_cast(batch * num_heads), static_cast(head_dim)}; - Tensor query = make_tensor_external(query_ptr, query_shapes, 2, data_type, false); - Tensor key_cache = make_tensor_external(kc_ptr, key_cache_shapes, 2, data_type, false); - Tensor value_cache = make_tensor_external(vc_ptr, value_cache_shapes, 2, data_type, false); - Tensor out = make_tensor_external(out_ptr, out_shapes, 2, DataType::FLOAT32); - - int *host_block_table = orch_args.tensor(3).data_as(); - int *host_context_lens = orch_args.tensor(4).data_as(); - -#ifdef ENABLE_PROFILING - CYCLE_COUNT_LAP(prof_ext_tensor); -#endif - - // Prefetch first batch's block table data into cache (4 cache lines = 256 bytes) - for (int cl = 0; cl < N_UNROLL * static_cast(sizeof(int)); cl += 64) { - __builtin_prefetch(reinterpret_cast(host_block_table) + cl, 0, 3); - } - __builtin_prefetch(&host_context_lens[0], 0, 3); - - for (uint64_t b_idx = 0; b_idx < batch; b_idx++) { - uint64_t cur_seq = host_context_lens[b_idx]; - uint64_t bn_this_batch = (cur_seq + block_size - 1) / block_size; - // Pre-compute block table base pointer for this batch - int *bt_base = host_block_table + b_idx * block_num; - - // Prefetch next batch's block table + context_lens while processing current batch - if (b_idx + 1 < batch) { - int *bt_next = host_block_table + (b_idx + 1) * block_num; - for (int cl = 0; cl < N_UNROLL * static_cast(sizeof(int)); cl += 64) { - __builtin_prefetch(reinterpret_cast(bt_next) + cl, 0, 3); - } - __builtin_prefetch(&host_context_lens[b_idx + 1], 0, 3); - } - for (uint64_t q_idx = 0; q_idx < q_loop; q_idx++) { - CYCLE_COUNT_LAP(prof_scope_and_loop); - PTO2_SCOPE(rt) { - uint64_t cur_offset = b_idx * q_head_num + q_idx * q_tile; - - uint32_t oi_shapes[2] = {static_cast(q_tile), static_cast(head_dim)}; - uint32_t li_shapes[1] = {static_cast(q_tile)}; - uint32_t mi_shapes[1] = {static_cast(q_tile)}; - -#ifdef ENABLE_PROFILING - prof_make_count += 3; - CYCLE_COUNT_LAP(prof_make_tensor); -#endif - - uint32_t qi_shapes[2] = {static_cast(q_tile), static_cast(head_dim)}; - uint32_t qi_offsets[2] = {static_cast(cur_offset), 0}; - Tensor qi = query.view(qi_shapes, qi_offsets); - uint32_t out_view_shapes[2] = {static_cast(q_tile), static_cast(head_dim)}; - uint32_t out_view_offsets[2] = {static_cast(cur_offset), 0}; - Tensor out_view = out.view(out_view_shapes, out_view_offsets); -#ifdef ENABLE_PROFILING - prof_view_count += 2; - CYCLE_COUNT_LAP(prof_tensor_view); -#endif - // Hub task: zero-initialize oi, li_update, mi_update - Arg args_inplace; - args_inplace.add_output(TensorCreateInfo(oi_shapes, 2, DataType::FLOAT32)); - args_inplace.add_output(TensorCreateInfo(li_shapes, 1, DataType::FLOAT32)); - args_inplace.add_output(TensorCreateInfo(mi_shapes, 1, DataType::FLOAT32)); - CYCLE_COUNT_LAP(prof_param_setup); - SubmitResult r_hub = rt_submit_aiv_task(rt, FUNC_AIV_HUB, args_inplace); - const Tensor &oi = r_hub.outputs.get_ref(0); - const Tensor &li_update = r_hub.outputs.get_ref(1); - const Tensor &mi_update = r_hub.outputs.get_ref(2); -#ifdef ENABLE_PROFILING - prof_submit_count++; - CYCLE_COUNT_LAP(prof_submit_task); -#endif - - // Reusable Arg objects — reset() before each use avoids - // repeated stack-frame construction in the inner loop. 
-                // Reusable Arg objects — reset() before each use avoids
-                // repeated stack-frame construction in the inner loop.
-                Arg args_qk, args_sf, args_pv, args_up;
-
-                PTO2TaskId prev_update_task = r_hub.task_id;
-
-                for (uint64_t bn = 0; bn < bn_this_batch; bn += N_UNROLL) {
-                    uint64_t n_blocks = std::min(static_cast<uint64_t>(N_UNROLL), bn_this_batch - bn);
-
-                    // Valid length for last block in this group
-                    uint64_t last_block_seq_start = (bn + n_blocks - 1) * block_size;
-                    uint64_t valid_len_last = std::min(block_size, cur_seq - last_block_seq_start);
-                    CYCLE_COUNT_LAP(prof_param_extract);
-
-                    // === Task 1: Batched QK matmul ===
-                    uint32_t sij_buf_shapes[2] = {
-                        static_cast<uint32_t>(q_tile), static_cast<uint32_t>(n_blocks * block_size)
-                    };
-
-#ifdef ENABLE_PROFILING
-                    prof_make_count += 1;
-                    CYCLE_COUNT_LAP(prof_make_tensor);
-#endif
-
-                    args_qk.reset();
-                    args_qk.add_input(qi);
-                    args_qk.add_input(key_cache);
-                    args_qk.add_output(TensorCreateInfo(sij_buf_shapes, 2, DataType::FLOAT32));
-                    args_qk.add_scalar(n_blocks);
-                    args_qk.add_scalar(reinterpret_cast<uint64_t>(bt_base + bn));
-                    CYCLE_COUNT_LAP(prof_param_setup);
-                    SubmitResult r_qk = rt_submit_aic_task(rt, FUNC_QK_MATMUL, args_qk);
-#ifdef ENABLE_PROFILING
-                    prof_submit_count++;
-                    CYCLE_COUNT_LAP(prof_submit_task);
-#endif
-
-                    // === Task 2: Two-pass softmax over all blocks in group ===
-                    uint32_t pij_buf_shapes[2] = {
-                        static_cast<uint32_t>(q_tile), static_cast<uint32_t>(n_blocks * block_size)
-                    };
-#ifdef ENABLE_PROFILING
-                    prof_make_count += 3;
-                    CYCLE_COUNT_LAP(prof_make_tensor);
-#endif
-
-                    args_sf.reset();
-                    args_sf.add_input(r_qk.outputs.get_ref(0));
-                    args_sf.add_output(TensorCreateInfo(pij_buf_shapes, 2, data_type));
-                    args_sf.add_output(TensorCreateInfo(mi_shapes, 1, DataType::FLOAT32));
-                    args_sf.add_output(TensorCreateInfo(li_shapes, 1, DataType::FLOAT32));
-                    args_sf.add_scalar(scale_value);
-                    args_sf.add_scalar(n_blocks);
-                    args_sf.add_scalar(valid_len_last);
-                    CYCLE_COUNT_LAP(prof_param_setup);
-                    SubmitResult r_sf = rt_submit_aiv_task(rt, FUNC_SOFTMAX_PREPARE, args_sf);
-                    // QK → Softmax (sij_buf)
-                    rt_add_dependency(rt, r_qk.task_id, r_sf.task_id);
-#ifdef ENABLE_PROFILING
-                    prof_submit_count++;
-                    CYCLE_COUNT_LAP(prof_submit_task);
-#endif
-
-                    // === Task 3: SplitK PV matmul (accumulated P @ V) ===
-                    uint32_t oi_new_shapes[2] = {static_cast<uint32_t>(q_tile), static_cast<uint32_t>(head_dim)};
-#ifdef ENABLE_PROFILING
-                    prof_make_count += 1;
-                    CYCLE_COUNT_LAP(prof_make_tensor);
-#endif
-
-                    args_pv.reset();
-                    args_pv.add_input(r_sf.outputs.get_ref(0));
-                    args_pv.add_input(value_cache);
-                    args_pv.add_output(TensorCreateInfo(oi_new_shapes, 2, DataType::FLOAT32));
-                    args_pv.add_scalar(n_blocks);
-                    args_pv.add_scalar(reinterpret_cast<uint64_t>(bt_base + bn));
-                    CYCLE_COUNT_LAP(prof_param_setup);
-                    SubmitResult r_pv = rt_submit_aic_task(rt, FUNC_PV_MATMUL, args_pv);
-                    // Softmax → PV (pij_buf)
-                    rt_add_dependency(rt, r_sf.task_id, r_pv.task_id);
-#ifdef ENABLE_PROFILING
-                    prof_submit_count++;
-                    CYCLE_COUNT_LAP(prof_submit_task);
-#endif
-
-                    // === Task 4: Online update (per-group) ===
-                    uint64_t is_first = (bn == 0) ? 1 : 0;
-                    uint64_t is_last = (bn + n_blocks >= bn_this_batch) ?
1 : 0; - - args_up.reset(); - args_up.add_input(r_sf.outputs.get_ref(1)); - args_up.add_input(r_sf.outputs.get_ref(2)); - args_up.add_input(r_pv.outputs.get_ref(0)); - args_up.add_inout(mi_update); - args_up.add_inout(li_update); - args_up.add_inout(oi); - args_up.add_inout(out_view); - args_up.add_scalar(is_first); - args_up.add_scalar(is_last); - CYCLE_COUNT_LAP(prof_param_setup); - SubmitResult r_up = rt_submit_aiv_task(rt, FUNC_ONLINE_UPDATE, args_up); - // Softmax → Update (mi, li) - rt_add_dependency(rt, r_sf.task_id, r_up.task_id); - // PV → Update (oi_new) - rt_add_dependency(rt, r_pv.task_id, r_up.task_id); - // Previous update → this update (mi_update, li_update, oi accumulation chain) - rt_add_dependency(rt, prev_update_task, r_up.task_id); -#ifdef ENABLE_PROFILING - prof_submit_count++; - CYCLE_COUNT_LAP(prof_submit_task); -#endif - prev_update_task = r_up.task_id; - } - } - CYCLE_COUNT_LAP(prof_scope_and_loop); - } - } - CYCLE_COUNT_LAP(prof_scope_and_loop); - -#ifdef ENABLE_PROFILING - uint64_t total = prof_param_extract + prof_ext_tensor + prof_make_tensor + prof_tensor_view + prof_param_setup + - prof_submit_task + prof_scope_and_loop; - LOG_ALWAYS( - rt, "=== PagedAttn Orch Profiling: %d submits, %d makes, %d views, total=%.3fus ===", prof_submit_count, - prof_make_count, prof_view_count, cycles_to_us(total) - ); - if (total > 0) { - LOG_ALWAYS( - rt, " param_extract : %7.3fus (%5.1f%%)", cycles_to_us(prof_param_extract), - prof_param_extract * 100.0 / total - ); - LOG_ALWAYS( - rt, " ext_tensor(x4) : %7.3fus (%5.1f%%)", cycles_to_us(prof_ext_tensor), prof_ext_tensor * 100.0 / total - ); - LOG_ALWAYS( - rt, " make_tensor(x%d) : %7.3fus (%5.1f%%) avg=%.3fus", prof_make_count, cycles_to_us(prof_make_tensor), - prof_make_tensor * 100.0 / total, - prof_make_count > 0 ? cycles_to_us(prof_make_tensor) / prof_make_count : 0.0 - ); - LOG_ALWAYS( - rt, " tensor_view(x%d) : %7.3fus (%5.1f%%) avg=%.3fus", prof_view_count, cycles_to_us(prof_tensor_view), - prof_tensor_view * 100.0 / total, - prof_view_count > 0 ? cycles_to_us(prof_tensor_view) / prof_view_count : 0.0 - ); - LOG_ALWAYS( - rt, " param_setup : %7.3fus (%5.1f%%)", cycles_to_us(prof_param_setup), - prof_param_setup * 100.0 / total - ); - LOG_ALWAYS( - rt, " submit_task(x%d) : %7.3fus (%5.1f%%) avg=%.3fus", prof_submit_count, cycles_to_us(prof_submit_task), - prof_submit_task * 100.0 / total, - prof_submit_count > 0 ? cycles_to_us(prof_submit_task) / prof_submit_count : 0.0 - ); - LOG_ALWAYS( - rt, " scope_and_loop : %7.3fus (%5.1f%%)", cycles_to_us(prof_scope_and_loop), - prof_scope_and_loop * 100.0 / total - ); - } -#endif - -#undef CYCLE_COUNT_START -#undef CYCLE_COUNT_LAP -} - -} // extern "C" diff --git a/tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/test_paged_attention_unroll.py b/tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/test_paged_attention_unroll.py deleted file mode 100644 index d0b982df0..000000000 --- a/tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/test_paged_attention_unroll.py +++ /dev/null @@ -1,152 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) PyPTO Contributors. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. 
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. -# ----------------------------------------------------------------------------------------------------------- -"""Paged attention unroll — aicpu_build_graph runtime (production scale, bfloat16). - -Tests aicpu_build_graph runtime with N_UNROLL=64, hub kernels (aic_hub, aiv_hub), -INOUT tensors, and AIC+AIV mixed execution. -""" - -import torch -from simpler.task_interface import ArgDirection as D - -from simpler_setup import Scalar, SceneTestCase, TaskArgsBuilder, Tensor, scene_test -from simpler_setup.goldens.paged_attention import compute_golden as _pa_compute_golden # noqa: PLC0415 -from simpler_setup.goldens.paged_attention import generate_inputs as _pa_generate_inputs # noqa: PLC0415 - - -@scene_test(level=2, runtime="aicpu_build_graph") -class TestPagedAttentionUnrollAicpuBuildGraph(SceneTestCase): - """Paged attention unroll with aicpu_build_graph runtime and hub kernels.""" - - RTOL = 1e-3 - ATOL = 1e-3 - - CALLABLE = { - "orchestration": { - "source": "kernels/orchestration/paged_attention_orch.cpp", - "function_name": "aicpu_orchestration_entry", - "signature": [D.IN, D.IN, D.IN, D.IN, D.IN, D.OUT], - }, - "incores": [ - { - "func_id": 0, - "name": "QK", - "source": "kernels/aic/aic_qk_matmul.cpp", - "core_type": "aic", - "signature": [D.IN, D.IN, D.OUT], - }, - { - "func_id": 2, - "name": "PV", - "source": "kernels/aic/aic_pv_matmul.cpp", - "core_type": "aic", - "signature": [D.IN, D.IN, D.OUT], - }, - { - "func_id": 4, - "name": "AIC_HUB", - "source": "kernels/aic/aic_hub.cpp", - "core_type": "aic", - "signature": [], - }, - { - "func_id": 1, - "name": "SF", - "source": "kernels/aiv/aiv_softmax_prepare.cpp", - "core_type": "aiv", - "signature": [D.IN, D.OUT, D.OUT, D.OUT], - }, - { - "func_id": 3, - "name": "UP", - "source": "kernels/aiv/aiv_online_update.cpp", - "core_type": "aiv", - "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT], - }, - { - "func_id": 5, - "name": "AIV_HUB", - "source": "kernels/aiv/aiv_hub.cpp", - "core_type": "aiv", - "signature": [], - }, - ], - } - - CASES = [ - { - "name": "Case1", - "platforms": ["a2a3"], - "config": {"aicpu_thread_num": 4, "block_dim": 24}, - "params": { - "batch": 256, - "num_heads": 16, - "kv_head_num": 1, - "head_dim": 128, - "block_size": 128, - "context_len": 8192, - "max_model_len": 32768, - "dtype": "bfloat16", - }, - }, - { - "name": "Case2", - "platforms": ["a2a3"], - "config": {"aicpu_thread_num": 4, "block_dim": 24}, - "manual": True, - "params": { - "batch": 64, - "num_heads": 64, - "kv_head_num": 1, - "head_dim": 128, - "block_size": 64, - "context_len": 8192, - "max_model_len": 32768, - "dtype": "bfloat16", - }, - }, - { - "name": "Case3", - "platforms": ["a2a3"], - "config": {"aicpu_thread_num": 4, "block_dim": 24}, - "manual": True, - "params": { - "batch": 64, - "num_heads": 64, - "kv_head_num": 1, - "head_dim": 256, - "block_size": 64, - "context_len": 8192, - "max_model_len": 32768, - "dtype": "bfloat16", - }, - }, - ] - - def generate_args(self, params): - inputs = _pa_generate_inputs(params) - specs = [] - for name, val in inputs: - if isinstance(val, torch.Tensor): - specs.append(Tensor(name, val)) - else: - specs.append(Scalar(name, val)) - return TaskArgsBuilder(*specs) - - def 
compute_golden(self, args, params): - tensors = {s.name: s.value for s in args.specs if isinstance(s, Tensor)} - _pa_compute_golden(tensors, params) - for s in args.specs: - if isinstance(s, Tensor) and s.name in tensors: - getattr(args, s.name)[:] = tensors[s.name] - - -if __name__ == "__main__": - SceneTestCase.run_module(__name__) diff --git a/tests/st/a2a3/aicpu_build_graph/vector_example/README.md b/tests/st/a2a3/aicpu_build_graph/vector_example/README.md deleted file mode 100644 index 13106bc68..000000000 --- a/tests/st/a2a3/aicpu_build_graph/vector_example/README.md +++ /dev/null @@ -1,21 +0,0 @@ -# PTO Runtime Example - AICPU Builds Graph (aicpu_build_graph) - -This example runs the same computation as `host_build_graph_example`, but the task graph is built on **AICPU** (1 builder thread) while scheduling/execution runs on **AICPU** (3 scheduler threads), for a total of **4** AICPU threads. - -## Run (simulation) - -```bash -python tests/st/a2a3/aicpu_build_graph/vector_example/test_vector_example.py -p a2a3sim - -# Or via pytest -pytest tests/st/a2a3/aicpu_build_graph/vector_example --platform a2a3sim -``` - -## Key difference vs host_build_graph/vector_example - -- The framework (`init_runtime_impl`) automatically manages I/O tensor device memory - using `arg_types`/`arg_sizes` and populates `runtime->orch_args[]`. -- `kernels/aicpu/orchestration.cpp` is compiled into a small AICPU-side plugin `.so`. - - The framework embeds the plugin bytes into `Runtime`. - - The AICPU runtime `dlopen()`s the embedded plugin and calls `orchestration(Runtime*)` on device. - - The orchestration allocates intermediate tensors via `api.device_malloc()` (HBM) and builds the task graph. diff --git a/tests/st/a2a3/aicpu_build_graph/vector_example/kernels/aiv/kernel_add.cpp b/tests/st/a2a3/aicpu_build_graph/vector_example/kernels/aiv/kernel_add.cpp deleted file mode 100644 index 50954fdf9..000000000 --- a/tests/st/a2a3/aicpu_build_graph/vector_example/kernels/aiv/kernel_add.cpp +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. 
- * -----------------------------------------------------------------------------------------------------------
- */
-/**
- * Element-wise Tensor Addition Kernel
- *
- * Implements: out[i] = src0[i] + src1[i]
- */
-
-#include <cstddef>
-#include <cstdint>
-
-#include "tensor.h"
-
-using namespace pto;
-
-#include "pipe_sync.h"
-
-#ifndef __gm__
-#define __gm__
-#endif
-
-#ifndef __aicore__
-#define __aicore__ [aicore]
-#endif
-
-extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) {
-    __gm__ Tensor *src0_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
-    __gm__ Tensor *src1_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]);
-    __gm__ Tensor *out_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]);
-    __gm__ float *src0 = reinterpret_cast<__gm__ float *>(src0_tensor->buffer.addr) + src0_tensor->start_offset;
-    __gm__ float *src1 = reinterpret_cast<__gm__ float *>(src1_tensor->buffer.addr) + src1_tensor->start_offset;
-    __gm__ float *out = reinterpret_cast<__gm__ float *>(out_tensor->buffer.addr) + out_tensor->start_offset;
-
-    constexpr int kTRows_ = 128;
-    constexpr int kTCols_ = 128;
-    constexpr int vRows = 128;
-    constexpr int vCols = 128;
-
-    using DynShapeDim5 = Shape<1, 1, 1, vRows, vCols>;
-    using DynStridDim5 = Stride<1, 1, 1, kTCols_, 1>;
-    using GlobalData = GlobalTensor<float, DynShapeDim5, DynStridDim5>;
-    using TileData = Tile<float, kTRows_, kTCols_>;
-
-    TileData src0Tile(vRows, vCols);
-    TileData src1Tile(vRows, vCols);
-    TileData dstTile(vRows, vCols);
-    TASSIGN(src0Tile, 0x0);
-    TASSIGN(src1Tile, 0x10000);
-    TASSIGN(dstTile, 0x20000);
-
-    GlobalData src0Global(src0);
-    GlobalData src1Global(src1);
-    GlobalData dstGlobal(out);
-
-    TLOAD(src0Tile, src0Global);
-    TLOAD(src1Tile, src1Global);
-    set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
-    wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
-    TADD(dstTile, src0Tile, src1Tile);
-    set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
-    wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
-    TSTORE(dstGlobal, dstTile);
-
-    pipe_sync();
-}
diff --git a/tests/st/a2a3/aicpu_build_graph/vector_example/kernels/aiv/kernel_add_scalar.cpp b/tests/st/a2a3/aicpu_build_graph/vector_example/kernels/aiv/kernel_add_scalar.cpp
deleted file mode 100644
index 72f1fbde4..000000000
--- a/tests/st/a2a3/aicpu_build_graph/vector_example/kernels/aiv/kernel_add_scalar.cpp
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-/**
- * Scalar Addition Kernel
- *
- * Implements: out[i] = src[i] + scalar
- */
-
-#include <cstddef>
-#include <cstdint>
-
-#include "tensor.h"
-
-using namespace pto;
-
-#include "pipe_sync.h"
-
-#ifndef __gm__
-#define __gm__
-#endif
-
-#ifndef __aicore__
-#define __aicore__ [aicore]
-#endif
-
-extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) {
-    __gm__ Tensor *src_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
-    __gm__ Tensor *out_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]);
-    __gm__ float *src = reinterpret_cast<__gm__ float *>(src_tensor->buffer.addr) + src_tensor->start_offset;
-    __gm__ float *out = reinterpret_cast<__gm__ float *>(out_tensor->buffer.addr) + out_tensor->start_offset;
-
-    union {
-        uint64_t u64;
-        float f32;
-    } converter;
-    converter.u64 = args[2];
-    float scalar = converter.f32;
-
-    constexpr int kTRows_ = 128;
-    constexpr int kTCols_ = 128;
-    constexpr int vRows = 128;
-    constexpr int vCols = 128;
-
-    using DynShapeDim5 = Shape<1, 1, 1, vRows, vCols>;
-    using DynStridDim5 = Stride<1, 1, 1, kTCols_, 1>;
-    using GlobalData = GlobalTensor<float, DynShapeDim5, DynStridDim5>;
-    using TileData = Tile<float, kTRows_, kTCols_>;
-
-    TileData srcTile(vRows, vCols);
-    TileData dstTile(vRows, vCols);
-    TASSIGN(srcTile, 0x0);
-    TASSIGN(dstTile, 0x10000);
-
-    GlobalData srcGlobal(src);
-    GlobalData dstGlobal(out);
-
-    TLOAD(srcTile, srcGlobal);
-    set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
-    wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
-    TADDS(dstTile, srcTile, scalar);
-    set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
-    wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
-    TSTORE(dstGlobal, dstTile);
-
-    pipe_sync();
-}
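// The kernel above receives its float through the int64 args array. The host
// side (the orchestration's add_scalar(1.0f)) is assumed to pack it the same
// way; a standalone sketch of the round trip (helper names are illustrative):
#include <cstdint>
#include <cstring>

inline int64_t pack_scalar(float v) {
    uint64_t bits = 0;
    std::memcpy(&bits, &v, sizeof(v));  // well-defined alternative to the union
    return static_cast<int64_t>(bits);
}

inline float unpack_scalar(int64_t arg) {
    union { uint64_t u64; float f32; } converter;  // same trick as the kernel
    converter.u64 = static_cast<uint64_t>(arg);
    return converter.f32;
}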
diff --git a/tests/st/a2a3/aicpu_build_graph/vector_example/kernels/aiv/kernel_mul.cpp b/tests/st/a2a3/aicpu_build_graph/vector_example/kernels/aiv/kernel_mul.cpp
deleted file mode 100644
index 6692257b4..000000000
--- a/tests/st/a2a3/aicpu_build_graph/vector_example/kernels/aiv/kernel_mul.cpp
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-/**
- * Element-wise Tensor Multiplication Kernel
- *
- * Implements: out[i] = src0[i] * src1[i]
- */
-
-#include <cstddef>
-#include <cstdint>
-
-#include "tensor.h"
-
-using namespace pto;
-
-#include "pipe_sync.h"
-
-#ifndef __gm__
-#define __gm__
-#endif
-
-#ifndef __aicore__
-#define __aicore__ [aicore]
-#endif
-
-extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) {
-    __gm__ Tensor *src0_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
-    __gm__ Tensor *src1_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]);
-    __gm__ Tensor *out_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]);
-    __gm__ float *src0 = reinterpret_cast<__gm__ float *>(src0_tensor->buffer.addr) + src0_tensor->start_offset;
-    __gm__ float *src1 = reinterpret_cast<__gm__ float *>(src1_tensor->buffer.addr) + src1_tensor->start_offset;
-    __gm__ float *out = reinterpret_cast<__gm__ float *>(out_tensor->buffer.addr) + out_tensor->start_offset;
-
-    constexpr int kTRows_ = 128;
-    constexpr int kTCols_ = 128;
-    constexpr int vRows = 128;
-    constexpr int vCols = 128;
-
-    using DynShapeDim5 = Shape<1, 1, 1, vRows, vCols>;
-    using DynStridDim5 = Stride<1, 1, 1, kTCols_, 1>;
-    using GlobalData = GlobalTensor<float, DynShapeDim5, DynStridDim5>;
-    using TileData = Tile<float, kTRows_, kTCols_>;
-
-    TileData src0Tile(vRows, vCols);
-    TileData src1Tile(vRows, vCols);
-    TileData dstTile(vRows, vCols);
-    TASSIGN(src0Tile, 0x0);
-    TASSIGN(src1Tile, 0x10000);
-    TASSIGN(dstTile, 0x20000);
-
-    GlobalData src0Global(src0);
-    GlobalData src1Global(src1);
-    GlobalData dstGlobal(out);
-
-    TLOAD(src0Tile, src0Global);
-    TLOAD(src1Tile, src1Global);
-    set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
-    wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
-    TMUL(dstTile, src0Tile, src1Tile);
-    set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
-    wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
-    TSTORE(dstGlobal, dstTile);
-
-    pipe_sync();
-}
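// The three kernels above combine into the DAG built by the orchestration that
// follows. A plain-C++ reference of the same computation, as a mental model for
// tasks t0..t3 (a host-side sketch, not runtime code):
#include <cstddef>
#include <vector>

std::vector<float> reference_f(const std::vector<float> &a, const std::vector<float> &b) {
    std::vector<float> f(a.size());
    for (std::size_t i = 0; i < a.size(); ++i) {
        float c = a[i] + b[i];  // t0 (kernel_add)
        float d = c + 1.0f;     // t1 (kernel_add_scalar)
        float e = c + 2.0f;     // t2 (kernel_add_scalar)
        f[i] = d * e;           // t3 (kernel_mul)
    }
    return f;  // with a=2.0, b=3.0: 6 * 7 = 42
}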
diff --git a/tests/st/a2a3/aicpu_build_graph/vector_example/kernels/orchestration/orchestration.cpp b/tests/st/a2a3/aicpu_build_graph/vector_example/kernels/orchestration/orchestration.cpp
deleted file mode 100644
index eeee70764..000000000
--- a/tests/st/a2a3/aicpu_build_graph/vector_example/kernels/orchestration/orchestration.cpp
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-/**
- * AICPU orchestration for the vector example.
- *
- * DAG structure for formula: f = (a + b + 1) * (a + b + 2)
- *   t0: c = a + b   (func_id=0, kernel_add)
- *   t1: d = c + 1   (func_id=1, kernel_add_scalar)
- *   t2: e = c + 2   (func_id=1, kernel_add_scalar)
- *   t3: f = d * e   (func_id=2, kernel_mul)
- * Dependencies: t0->t1, t0->t2, t1->t3, t2->t3
- *
- * Uses explicit add_dependency for all dependency edges (no TensorMap).
- * Tasks are batch-published at scope_end.
- */
-
-#include <cstddef>
-#include <cstdint>
-
-#include "pto_orchestration_api.h"  // NOLINT(build/include_subdir)
-
-extern "C" {
-
-__attribute__((visibility("default"))) PTO2OrchestrationConfig
-aicpu_orchestration_config(const ChipStorageTaskArgs &orch_args) {
-    (void)orch_args;
-    return PTO2OrchestrationConfig{
-        .expected_arg_count = 3,
-    };
-}
-
-__attribute__((visibility("default"))) void
-aicpu_orchestration_entry(PTO2Runtime *rt, const ChipStorageTaskArgs &orch_args) {
-    // golden shape = kernel shape, use from_tensor_arg() directly
-    Tensor ext_a = from_tensor_arg(orch_args.tensor(0));
-    Tensor ext_b = from_tensor_arg(orch_args.tensor(1));
-    Tensor ext_f = from_tensor_arg(orch_args.tensor(2));
-
-    uint32_t SIZE = orch_args.tensor(0).shapes[0];
-
-    uint32_t shapes[1] = {SIZE};
-
-    PTO2_SCOPE(rt) {
-        // t0: c = a + b
-        Arg args_t0;
-        args_t0.add_input(ext_a);
-        args_t0.add_input(ext_b);
-        args_t0.add_output(TensorCreateInfo(shapes, 1, DataType::FLOAT32));
-        SubmitResult r0 = rt_submit_aiv_task(rt, 0, args_t0);
-
-        // t1: d = c + 1.0
-        Arg args_t1;
-        args_t1.add_input(r0.outputs.get_ref(0));
-        args_t1.add_output(TensorCreateInfo(shapes, 1, DataType::FLOAT32));
-        args_t1.add_scalar(1.0f);
-        SubmitResult r1 = rt_submit_aiv_task(rt, 1, args_t1);
-        rt_add_dependency(rt, r0.task_id, r1.task_id);
-
-        // t2: e = c + 2.0
-        Arg args_t2;
-        args_t2.add_input(r0.outputs.get_ref(0));
-        args_t2.add_output(TensorCreateInfo(shapes, 1, DataType::FLOAT32));
-        args_t2.add_scalar(2.0f);
-        SubmitResult r2 = rt_submit_aiv_task(rt, 1, args_t2);
-        rt_add_dependency(rt, r0.task_id, r2.task_id);
-
-        // t3: f = d * e
-        Arg args_t3;
-        args_t3.add_input(r1.outputs.get_ref(0));
-        args_t3.add_input(r2.outputs.get_ref(0));
-        args_t3.add_inout(ext_f);
-        SubmitResult r3 = rt_submit_aiv_task(rt, 2, args_t3);
-        rt_add_dependency(rt, r1.task_id, r3.task_id);
-        rt_add_dependency(rt, r2.task_id, r3.task_id);
-    }  // scope_end: batch-publish all tasks
-}
-
-}  // extern "C"
diff --git a/tests/st/a2a3/aicpu_build_graph/vector_example/test_vector_example.py b/tests/st/a2a3/aicpu_build_graph/vector_example/test_vector_example.py
deleted file mode 100644
index 2e071c78f..000000000
--- a/tests/st/a2a3/aicpu_build_graph/vector_example/test_vector_example.py
+++ /dev/null
@@ -1,85 +0,0 @@
-#!/usr/bin/env python3
-# Copyright (c) PyPTO Contributors.
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
-# CANN Open Software License Agreement Version 2.0 (the "License").
-# Please refer to the License for details. You may not use this file except in compliance with the License.
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
-# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
-# See LICENSE in the root of the software repository for the full text of the License.
-# -----------------------------------------------------------------------------------------------------------
-"""Vector example — aicpu_build_graph runtime with device-side DAG building.
-
-Computation: f = (a + b + 1) * (a + b + 2), where a=2.0, b=3.0, so f=42.0.
-Tests aicpu_build_graph runtime with intermediate tensors allocated from HeapRing.
-""" - -import torch -from simpler.task_interface import ArgDirection as D - -from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test - - -@scene_test(level=2, runtime="aicpu_build_graph") -class TestVectorExample(SceneTestCase): - """Vector example: f = (a + b + 1) * (a + b + 2) via device-side DAG.""" - - RTOL = 1e-5 - ATOL = 1e-5 - - CALLABLE = { - "orchestration": { - "source": "kernels/orchestration/orchestration.cpp", - "function_name": "aicpu_orchestration_entry", - "signature": [D.IN, D.IN, D.OUT], - }, - "incores": [ - { - "func_id": 0, - "source": "kernels/aiv/kernel_add.cpp", - "core_type": "aiv", - "signature": [D.IN, D.IN, D.OUT], - }, - { - "func_id": 1, - "source": "kernels/aiv/kernel_add_scalar.cpp", - "core_type": "aiv", - "signature": [D.IN, D.OUT], - }, - { - "func_id": 2, - "source": "kernels/aiv/kernel_mul.cpp", - "core_type": "aiv", - "signature": [D.IN, D.IN, D.OUT], - }, - ], - } - - CASES = [ - { - "name": "default", - "platforms": ["a2a3sim", "a2a3"], - "config": {"aicpu_thread_num": 4, "block_dim": 3}, - "params": {}, - }, - ] - - def generate_args(self, params): - SIZE = 128 * 128 - a = torch.full((SIZE,), 2.0, dtype=torch.float32) - b = torch.full((SIZE,), 3.0, dtype=torch.float32) - f = torch.zeros(SIZE, dtype=torch.float32) - - return TaskArgsBuilder( - Tensor("a", a), - Tensor("b", b), - Tensor("f", f), - ) - - def compute_golden(self, args, params): - a = args.a - b = args.b - args.f[:] = (a + b + 1) * (a + b + 2) - - -if __name__ == "__main__": - SceneTestCase.run_module(__name__) diff --git a/tests/ut/py/test_runtime_builder.py b/tests/ut/py/test_runtime_builder.py index 6d5951dcd..122cf867b 100644 --- a/tests/ut/py/test_runtime_builder.py +++ b/tests/ut/py/test_runtime_builder.py @@ -28,14 +28,6 @@ def test_discovers_real_runtimes(self, default_test_platform): runtimes = builder.list_runtimes() assert "host_build_graph" in runtimes - def test_discovers_aicpu_build_graph(self, default_test_platform): - """RuntimeBuilder discovers aicpu_build_graph from the real project tree.""" - from simpler_setup.runtime_builder import RuntimeBuilder # noqa: PLC0415 - - builder = RuntimeBuilder(platform=default_test_platform) - runtimes = builder.list_runtimes() - assert "aicpu_build_graph" in runtimes - def test_runtime_dir_resolves_to_project_root(self, default_test_platform, test_arch): """runtime_dir resolves to src/{arch}/runtime/ under the project root.""" from simpler_setup.runtime_builder import RuntimeBuilder # noqa: PLC0415 diff --git a/tools/README.md b/tools/README.md index 53526d9f1..468f27a2a 100644 --- a/tools/README.md +++ b/tools/README.md @@ -22,12 +22,12 @@ elapsed time. ``` Requires `PTO2_PROFILING=1` in the runtime; device log must include the -`orch_*` / `sched_*` lines. The `EXAMPLE_CASES` maps at the top of the script -control which examples/cases are run per runtime. +`orch_*` / `sched_*` lines. The `TMR_EXAMPLE_CASES` map at the top of the +script controls which examples/cases are run. ## verify_packaging.sh -Exercises all 5 install paths × 4 entry points from a fully clean state. +Exercises all 5 install paths × 2 entry points from a fully clean state. CI calls this directly; see [docs/python-packaging.md](../docs/python-packaging.md). Must run from the repo root inside an activated venv. 
diff --git a/tools/benchmark_rounds.sh b/tools/benchmark_rounds.sh index d88a9ef3d..710b60108 100755 --- a/tools/benchmark_rounds.sh +++ b/tools/benchmark_rounds.sh @@ -13,7 +13,7 @@ # Usage: # ./tools/benchmark_rounds.sh [-p ] [-d ] [-n ] [-r ] # -# Edit the EXAMPLE_CASES maps below to control which examples and cases to run. +# Edit the EXAMPLE_CASES map below to control which examples and cases to run. set -euo pipefail @@ -44,14 +44,6 @@ TMR_EXAMPLE_ORDER=( spmd_paged_attention ) -# --- aicpu_build_graph --- -declare -A ABG_EXAMPLE_CASES=( - [paged_attention_unroll]="Case1,Case2" -) -ABG_EXAMPLE_ORDER=( - paged_attention_unroll -) - # --------------------------------------------------------------------------- # Parse arguments # --------------------------------------------------------------------------- @@ -95,7 +87,7 @@ Options: -p, --platform Platform to run on (default: a2a3) -d, --device Device ID (default: 0) -n, --rounds Override number of rounds for each example (default: 100) - -r, --runtime Runtime to benchmark: tensormap_and_ringbuffer (default), aicpu_build_graph + -r, --runtime Runtime to benchmark: tensormap_and_ringbuffer (default) -v, --verbose Save detailed test_*.py output to a timestamped log file -h, --help Show this help @@ -156,12 +148,8 @@ case "$RUNTIME" in declare -n EXAMPLE_CASES=TMR_EXAMPLE_CASES EXAMPLE_ORDER=("${TMR_EXAMPLE_ORDER[@]}") ;; - aicpu_build_graph) - declare -n EXAMPLE_CASES=ABG_EXAMPLE_CASES - EXAMPLE_ORDER=("${ABG_EXAMPLE_ORDER[@]}") - ;; *) - echo "ERROR: unknown runtime '$RUNTIME'. Use tensormap_and_ringbuffer or aicpu_build_graph." + echo "ERROR: unknown runtime '$RUNTIME'. Use tensormap_and_ringbuffer." exit 1 ;; esac diff --git a/tools/verify_packaging.sh b/tools/verify_packaging.sh index a56cf192c..3b06271fa 100755 --- a/tools/verify_packaging.sh +++ b/tools/verify_packaging.sh @@ -7,7 +7,7 @@ # INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. # See LICENSE in the root of the software repository for the full text of the License. # ----------------------------------------------------------------------------------------------------------- -# Verify all 5 install paths x 4 entry points are green. +# Verify all 5 install paths x 2 entry points are green. # # Each mode runs from a fully clean state (uninstall + wipe build artifacts) so # leftover binaries from a previous mode cannot mask a regression in the next. @@ -70,7 +70,7 @@ print('incore helpers OK:', inc_dirs) " echo "::endgroup::" echo "::group::[${mode}] standalone test_*.py --help" - python tests/st/a2a3/aicpu_build_graph/paged_attention/test_paged_attention.py --help >/dev/null + python tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/test_paged_attention_unroll.py --help >/dev/null echo "::endgroup::" echo "smoke[${mode}] OK" }
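# The two user-facing entry points the packaging matrix exercises per install
# mode (paths taken from the hunks above; the pytest line is a sketch of the
# equivalent invocation, with flags as used elsewhere in this repo):
pytest tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_unroll --platform a2a3sim
python tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/test_paged_attention_unroll.py --help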