Merged
2 changes: 1 addition & 1 deletion .claude/commands/perf-runtime-device.md
@@ -4,7 +4,7 @@ If `$ARGUMENTS` is provided, use it as the runtime name. Otherwise, default to `

Reference `tools/benchmark_rounds.sh` for the full implementation pattern (device log resolution, timing parsing, reporting format).

1. Validate the runtime is one of: `host_build_graph`, `aicpu_build_graph`, `tensormap_and_ringbuffer`. If not, list valid runtimes and stop.
1. Validate the runtime is one of: `host_build_graph`, `tensormap_and_ringbuffer`. If not, list valid runtimes and stop.
2. Check `command -v npu-smi` — if not found, tell the user this requires hardware and stop.
3. **Detect platform**: Run `npu-smi info` and parse the chip name. Map `910B`/`910C` → `a2a3`, `950` → `a5`. If unrecognized, warn and default to `a2a3`.
4. Find the lowest-ID idle device (HBM-Usage = 0) from the `npu-smi info` output. If none, stop.
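The chip-name mapping in step 3 can be sketched as a small helper (a hedged sketch: the `grep` pattern for parsing `npu-smi info` output is an assumption, not the tool's documented format):

```shell
# Sketch of step 3's chip -> platform mapping. Parsing `npu-smi info`
# output with grep is an assumption; adjust the pattern to the real format.
map_chip() {
  case "$1" in
    910B|910C) echo a2a3 ;;
    950)       echo a5 ;;
    *)         echo "warn: unrecognized chip '$1', defaulting to a2a3" >&2
               echo a2a3 ;;
  esac
}

# Usage (on hardware):
#   chip=$(npu-smi info | grep -oE '910B|910C|950' | head -n1)
#   platform=$(map_chip "$chip")
```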
2 changes: 1 addition & 1 deletion .claude/commands/test-runtime-device.md
@@ -1,6 +1,6 @@
# Run hardware device tests for a single runtime specified by $ARGUMENTS

1. Validate that `$ARGUMENTS` is one of: `host_build_graph`, `aicpu_build_graph`, `tensormap_and_ringbuffer`. If not, list the valid runtimes and stop.
1. Validate that `$ARGUMENTS` is one of: `host_build_graph`, `tensormap_and_ringbuffer`. If not, list the valid runtimes and stop.
2. Check `command -v npu-smi` — if not found, tell the user to use `/test-runtime-sim` instead and stop.
3. **Detect platform**: Run `npu-smi info` and parse the chip name. Map `910B`/`910C` → `a2a3`, `950` → `a5`. If unrecognized, warn and default to `a2a3`.
4. Read `.github/workflows/ci.yml` to extract the current `--pto-isa-commit` and `--pto-session-timeout` values from the `st-onboard-<platform>` job's `pytest` invocation.
2 changes: 1 addition & 1 deletion .claude/commands/test-runtime-sim.md
@@ -1,6 +1,6 @@
# Run simulation tests for a single runtime specified by $ARGUMENTS

1. Validate that `$ARGUMENTS` is one of: `host_build_graph`, `aicpu_build_graph`, `tensormap_and_ringbuffer`. If not, list the valid runtimes and stop.
1. Validate that `$ARGUMENTS` is one of: `host_build_graph`, `tensormap_and_ringbuffer`. If not, list the valid runtimes and stop.
2. Read `.github/workflows/ci.yml` to extract the current `--pto-isa-commit` and `--pto-session-timeout` values from the `st-sim-*` jobs' `pytest` invocations.
3. **Detect platform**: If `npu-smi` is available, parse the chip name from `npu-smi info`. Map `910B`/`910C` → `a2a3sim`, `950` → `a5sim`. If `npu-smi` is not found, default to `a2a3sim`.
4. Run:
2 changes: 1 addition & 1 deletion .claude/rules/architecture.md
@@ -5,7 +5,7 @@ See [docs/chip-level-arch.md](../../docs/chip-level-arch.md) for the full diagra
## Key Concepts

- **Three programs**: Host `.so`, AICPU `.so`, AICore `.o` — compiled independently, linked at runtime
- **Three runtimes** under `src/{arch}/runtime/`: `host_build_graph`, `aicpu_build_graph`, `tensormap_and_ringbuffer`
- **Two runtimes** under `src/{arch}/runtime/`: `host_build_graph`, `tensormap_and_ringbuffer`
- **Two platform backends** under `src/{arch}/platform/`: `onboard/` (hardware), `sim/` (simulation)

## Python Package Layout
14 changes: 1 addition & 13 deletions .claude/skills/benchmark/SKILL.md
@@ -45,20 +45,8 @@ The `-d` flag specifies NPU device IDs.
`tools/benchmark_rounds.sh` supports `-r <runtime>`:

- `tensormap_and_ringbuffer` (default)
- `aicpu_build_graph`

Each runtime has its own example list defined at the top of the script (`TMR_EXAMPLE_CASES` / `ABG_EXAMPLE_CASES`).

**Auto-detection (compare mode only):** Always benchmark TMR. Also benchmark `aicpu_build_graph` if the diff touches its files:

```bash
RUNTIMES_TO_BENCH=(tensormap_and_ringbuffer)
if git diff --name-only "$MERGE_BASE"...HEAD | grep -q 'aicpu_build_graph'; then
RUNTIMES_TO_BENCH+=(aicpu_build_graph)
fi
```

Run `benchmark_rounds.sh` once per runtime, with `-r <runtime>` appended. **Runtimes are always benchmarked serially** — finish all baseline+current runs for one runtime before starting the next. This ensures no device ever runs two benchmark processes concurrently.
The example list is defined at the top of the script (`TMR_EXAMPLE_CASES`).

## Step 1: Detect Mode

1 change: 0 additions & 1 deletion .github/ISSUE_TEMPLATE/bug_report.yml
@@ -29,7 +29,6 @@ body:
description: Which runtime variant is affected?
options:
- tensormap_and_ringbuffer
- aicpu_build_graph
- host_build_graph
- All / Unknown
validations:
1 change: 0 additions & 1 deletion .github/ISSUE_TEMPLATE/performance_issue.yml
@@ -29,7 +29,6 @@ body:
description: Which runtime variant is affected?
options:
- tensormap_and_ringbuffer
- aicpu_build_graph
- host_build_graph
- All / Unknown
validations:
14 changes: 7 additions & 7 deletions .github/workflows/ci.yml
@@ -209,8 +209,8 @@ jobs:
set +e
pytest examples tests/st --platform a2a3sim --device 0-15 -v --pto-session-timeout 600 --clone-protocol https
rc=$?
if [ $rc -eq 124 ]; then
echo "pytest timed out; retrying with pinned PTO-ISA commit"
if [ $rc -ne 0 ]; then
echo "pytest failed with rc=$rc; retrying with pinned PTO-ISA commit"
pytest examples tests/st --platform a2a3sim --device 0-15 -v \
--pto-session-timeout 600 --pto-isa-commit ${{ env.PTO_ISA_COMMIT }} --clone-protocol https
rc=$?
@@ -267,8 +267,8 @@ jobs:
set +e
pytest examples tests/st --platform a5sim --device 0-15 -v --pto-session-timeout 600 --clone-protocol https
rc=$?
if [ $rc -eq 124 ]; then
echo "pytest timed out; retrying with pinned PTO-ISA commit"
if [ $rc -ne 0 ]; then
echo "pytest failed with rc=$rc; retrying with pinned PTO-ISA commit"
pytest examples tests/st --platform a5sim --device 0-15 -v \
--pto-session-timeout 600 --pto-isa-commit ${{ env.PTO_ISA_COMMIT }} --clone-protocol https
rc=$?
@@ -338,8 +338,8 @@ jobs:
source .venv/bin/activate
python -m pytest examples tests/st --platform a2a3 --device ${DEVICE_RANGE} -v --pto-session-timeout 600 --clone-protocol https
rc=$?
if [ $rc -eq 124 ]; then
echo "pytest timed out; retrying with pinned PTO-ISA commit"
if [ $rc -ne 0 ]; then
echo "pytest failed with rc=$rc; retrying with pinned PTO-ISA commit"
python -m pytest examples tests/st --platform a2a3 --device ${DEVICE_RANGE} -v \
--pto-session-timeout 600 --pto-isa-commit ${{ env.PTO_ISA_COMMIT }} --clone-protocol https
rc=$?
@@ -450,4 +450,4 @@ jobs:
source .venv/bin/activate
DEVICE_LIST=$(python -c "s,e='${DEVICE_RANGE}'.split('-'); print(','.join(str(i) for i in range(int(s),int(e)+1)))")
PYTEST="python -m pytest examples tests/st --platform a5 --device ${DEVICE_RANGE} -v --clone-protocol https"
task-submit --timeout 1800 --max-time 1800 --device "$DEVICE_LIST" --run "set +e; $PYTEST --pto-session-timeout 1200; rc=\$?; if [ \$rc -eq 124 ]; then echo 'pytest timed out; retrying with pinned PTO-ISA commit'; $PYTEST --pto-session-timeout 1200 --pto-isa-commit ${{ env.PTO_ISA_COMMIT }} --clone-protocol https; rc=\$?; fi; exit \$rc"
task-submit --timeout 1800 --max-time 1800 --device "$DEVICE_LIST" --run "set +e; $PYTEST --pto-session-timeout 1200; rc=\$?; if [ \$rc -ne 0 ]; then echo \"pytest failed with rc=\$rc; retrying with pinned PTO-ISA commit\"; $PYTEST --pto-session-timeout 1200 --pto-isa-commit ${{ env.PTO_ISA_COMMIT }} --clone-protocol https; rc=\$?; fi; exit \$rc"
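Unescaped, the `--run` payload in the line above is equivalent to the following script (a readability sketch: `$PYTEST` and `PTO_ISA_COMMIT` are supplied by the surrounding job, and the function wrapper is added here only for clarity):

```shell
# Retry-on-failure logic from the --run payload, unescaped for readability.
# $PYTEST and $PTO_ISA_COMMIT are assumed to come from the surrounding CI job.
run_with_retry() {
  set +e
  $PYTEST --pto-session-timeout 1200
  rc=$?
  if [ $rc -ne 0 ]; then
    echo "pytest failed with rc=$rc; retrying with pinned PTO-ISA commit"
    $PYTEST --pto-session-timeout 1200 \
      --pto-isa-commit "$PTO_ISA_COMMIT" --clone-protocol https
    rc=$?
  fi
  return $rc
}
```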
3 changes: 1 addition & 2 deletions README.md
@@ -29,12 +29,11 @@ PTO ISA headers are automatically cloned on first run. See [Getting Started](doc

## Runtime Variants

Three runtimes under `src/{arch}/runtime/`, each with a different graph-building strategy:
Two runtimes under `src/{arch}/runtime/`, each with a different graph-building strategy:

| Runtime | Graph built on | Use case |
| ------- | -------------- | -------- |
| `host_build_graph` | Host CPU | Development, debugging |
| `aicpu_build_graph` | AICPU (device) | Reduced host-device transfer |
| `tensormap_and_ringbuffer` | AICPU (device) | Production workloads |

See runtime docs per arch: [a2a3](src/a2a3/docs/runtimes.md), [a5](src/a5/docs/runtimes.md).
2 changes: 0 additions & 2 deletions docs/developer-guide.md
@@ -22,7 +22,6 @@ pto-runtime/
│ └── runtime/ # Runtime implementations
│ ├── common/ # Shared components across runtimes
│ ├── host_build_graph/ # Host-built graph runtime
│ ├── aicpu_build_graph/ # AICPU-built graph runtime
│ └── tensormap_and_ringbuffer/ # Advanced production runtime
├── python/ # Language bindings
@@ -55,7 +54,6 @@ pto-runtime/
├── examples/ # Working examples
│ └── {arch}/ # Architecture-specific examples
│ ├── host_build_graph/
│ ├── aicpu_build_graph/
│ └── tensormap_and_ringbuffer/
├── tests/ # Test suite
2 changes: 1 addition & 1 deletion docs/dynamic-linking.md
@@ -221,7 +221,7 @@ SchedulerContext owns its own teardown:
(`initialized_`, `init_done_`, `init_failed_`, `finished_`, `thread_idx_`,
`finished_count_`).

Applies to all 5 runtime executors: a2a3 (abg, hbg, tmr), a5 (hbg, tmr).
Applies to all 4 runtime executors: a2a3 (hbg, tmr), a5 (hbg, tmr).

## SO Handle Caching and Reuse

2 changes: 1 addition & 1 deletion docs/python-packaging.md
@@ -96,7 +96,7 @@ Plus one build-time entry point invoked by CMake during `pip install`:

## Install modes

Five install paths × four entry points = the verification matrix. CI enforces the matrix on macOS and Ubuntu via `.github/workflows/ci.yml::packaging-matrix`.
Five install paths × two entry points = the verification matrix. CI enforces the matrix on macOS and Ubuntu via `.github/workflows/ci.yml::packaging-matrix`.

### Mode-by-mode

17 changes: 8 additions & 9 deletions docs/tensor-dump.md
@@ -6,8 +6,8 @@ runtime observability feature: host pre-allocates buffers on device,
AICPU writes records during execution, host collects data and exports
JSON manifest + binary payload.

Supported on both architectures (`a2a3` / `a5`) and all three runtimes
(`host_build_graph`, `aicpu_build_graph`, `tensormap_and_ringbuffer`).
Supported on both architectures (`a2a3` / `a5`) and both runtimes
(`host_build_graph`, `tensormap_and_ringbuffer`).
Opt-in via `--dump-tensor` — zero overhead when disabled.

The **primary design** (a2a3) uses shared memory (`halHostRegister`) +
@@ -250,8 +250,8 @@ all device-side writes were globally visible.

AICPU only has device addresses and sizes — it does **not** know the
logical shape / dtype / view geometry of each tensor unless the runtime
registers it. Each of the three runtimes exposes metadata through a
slightly different path, but they all converge on `TensorInfo` (see
registers it. Each runtime exposes metadata through a slightly different
path, but they all converge on `TensorInfo` (see
[`tensor_info.h`](../src/a5/runtime/host_build_graph/runtime/tensor_info.h)):

- **`host_build_graph`** — two orchestration-side APIs:
@@ -261,11 +261,10 @@ slightly different path, but they all converge on `TensorInfo` (see
See
[`dump_tensor_orch.cpp`](../tests/st/a5/host_build_graph/dump_tensor_example/kernels/orchestration/dump_tensor_orch.cpp)
for both styles in one file.
- **`aicpu_build_graph`** — runtime layer fills `TensorInfo` from
`PTO2TaskPayload::tensors[]` directly. No orchestration API needed.
- **`tensormap_and_ringbuffer`** — identical to `aicpu_build_graph`;
the ring buffer carries `PTO2TaskPayload` which already contains
shape/offset arrays.
- **`tensormap_and_ringbuffer`** — runtime layer fills `TensorInfo`
from `PTO2TaskPayload::tensors[]` directly. The ring buffer carries
`PTO2TaskPayload` which already contains shape/offset arrays, so no
orchestration API is needed.

When metadata is missing or inconsistent, the task is **skipped for
dump** and a single `LOG_WARN` is emitted (guarded by
1 change: 0 additions & 1 deletion docs/testing.md
@@ -420,7 +420,6 @@ pytest tests/ut --platform a2a3
Small, fast examples that run on both simulation and real hardware. Organized by runtime:

- `host_build_graph/` — HBG examples
- `aicpu_build_graph/` — ABG examples
- `tensormap_and_ringbuffer/` — TMR examples

Each example has a `golden.py` with `generate_inputs()` and `compute_golden()` for result validation.
4 changes: 2 additions & 2 deletions examples/workers/README.md
@@ -35,8 +35,8 @@ workers/

Why no `tensormap_and_ringbuffer/` layer? Because every example here hard-codes
`runtime="tensormap_and_ringbuffer"` in its `Worker(...)` call — that is the
default user-facing runtime. Other runtimes (`host_build_graph`,
`aicpu_build_graph`) are covered by scene tests under `tests/st/`, not here.
default user-facing runtime. The other runtime (`host_build_graph`) is
covered by scene tests under `tests/st/`, not here.

## Prerequisites

2 changes: 1 addition & 1 deletion simpler_setup/kernel_compiler.py
@@ -372,7 +372,7 @@ def compile_orchestration(

Args:
runtime_name: Name of the runtime (e.g., "host_build_graph",
"tensormap_and_ringbuffer", "aicpu_build_graph")
"tensormap_and_ringbuffer")
source_path: Path to orchestration source file (.cpp)
extra_include_dirs: Additional include directories (merged with
the runtime/platform include dirs)
2 changes: 1 addition & 1 deletion simpler_setup/tools/swimlane_converter.py
@@ -1015,7 +1015,7 @@ def generate_chrome_trace_json( # noqa: PLR0912, PLR0915

# Orchestrator → scheduler dispatch:
# - Prefer orch_fanin end → dispatch (explicit deps / fanin path).
# - If no orch_fanin for this task (e.g. aicpu_build_graph without fanin records), use orch_params end → dispatch.
# - If no orch_fanin for this task, use orch_params end → dispatch.
if orchestrator_phases and scheduler_phases:
orch_fanin_by_task = {}
orch_params_by_task = {}
34 changes: 12 additions & 22 deletions src/a2a3/docs/runtimes.md
@@ -1,20 +1,20 @@
# Runtime Variants (a2a3)

Three runtime implementations live under `src/a2a3/runtime/`, each providing a different graph-building strategy. The `RUNTIME_CONFIG.runtime` field in `kernel_config.py` selects which runtime to use.
Two runtime implementations live under `src/a2a3/runtime/`, each providing a different graph-building strategy. The `RUNTIME_CONFIG.runtime` field in `kernel_config.py` selects which runtime to use.

## Comparison

| Feature | host_build_graph | aicpu_build_graph | tensormap_and_ringbuffer |
| ------- | ---------------- | ----------------- | ------------------------ |
| Graph built on | Host CPU | AICPU (device) | AICPU (device) |
| Task storage | Fixed `Task[]` array | Fixed `Task[]` array | Ring buffer (`PTO2TaskDescriptor[]`) |
| Dependencies | Explicit edges | Explicit edges | Auto-derived via TensorMap |
| Memory management | Host-side | Host + device malloc | Ring buffer heap (GM) |
| Concurrent build+schedule | No | Optional (`build_mode=1`) | Yes (always) |
| Profiling support | Basic | Basic | Multi-level hierarchy |
| Batch/streaming | No | No | Yes (flow control, back-pressure) |
| Thread model | N scheduler threads | 1 builder + N schedulers | 1 orchestrator + 3 schedulers |
| Use case | Development, debugging | Reduced host-device transfer | Production workloads |
| Feature | host_build_graph | tensormap_and_ringbuffer |
| ------- | ---------------- | ------------------------ |
| Graph built on | Host CPU | AICPU (device) |
| Task storage | Fixed `Task[]` array | Ring buffer (`PTO2TaskDescriptor[]`) |
| Dependencies | Explicit edges | Auto-derived via TensorMap |
| Memory management | Host-side | Ring buffer heap (GM) |
| Concurrent build+schedule | No | Yes (always) |
| Profiling support | Basic | Multi-level hierarchy |
| Batch/streaming | No | Yes (flow control, back-pressure) |
| Thread model | N scheduler threads | 1 orchestrator + 3 schedulers |
| Use case | Development, debugging | Production workloads |

## host_build_graph

@@ -26,16 +26,6 @@ The simplest runtime. The host CPU builds the complete task dependency graph bef

See [host_build_graph/docs/RUNTIME_LOGIC.md](../runtime/host_build_graph/docs/RUNTIME_LOGIC.md) for details.

## aicpu_build_graph

Orchestration runs on an AICPU thread, building the task graph on device. Supports concurrent build + schedule (`build_mode=1`).

- Same task array as host_build_graph
- Device-side API: `add_task`, `add_successor_conditional`, `publish_task`, `device_malloc`
- Reduces host-device data transfer; graph can depend on device-side data

See [aicpu_build_graph/docs/RUNTIME_LOGIC.md](../runtime/aicpu_build_graph/docs/RUNTIME_LOGIC.md) for details.

## tensormap_and_ringbuffer (PTO2)

The primary production runtime. Uses ring buffers for task slots and output memory, with a TensorMap for automatic dependency tracking.
4 changes: 2 additions & 2 deletions src/a2a3/platform/include/aicore/l2_perf_collector_aicore.h
@@ -38,8 +38,8 @@
* Buffer management and final commit are handled by AICPU.
*
* AICore writes L2PerfRecord.task_id as the register dispatch token (low 32 bits, zero-extended).
* For multi-ring runtimes (tensormap_and_ringbuffer, aicpu_build_graph), AICPU overwrites
* with the full (ring_id << 32) | local_id encoding after handshake match.
* For tensormap_and_ringbuffer, AICPU overwrites with the full (ring_id << 32) | local_id
* encoding after handshake match.
*
* @param l2_perf_buf Performance buffer pointer
* @param task_id Register dispatch id (DATA_MAIN_BASE), stored in task_id low 32 bits
7 changes: 3 additions & 4 deletions src/a2a3/platform/include/aicpu/l2_perf_collector_aicpu.h
@@ -127,7 +127,7 @@ void l2_perf_aicpu_init_phase_profiling(Runtime *runtime, int num_sched_threads)
* @param loop_iter Current loop iteration number
* @param tasks_processed Number of tasks processed in this batch (scheduler phases), or
* full PTO2 task_id encoding (ring_id << 32) | local_id (orchestrator
* phases in multi-ring runtimes: tensormap_and_ringbuffer, aicpu_build_graph)
* phases in tensormap_and_ringbuffer)
*/
void l2_perf_aicpu_record_phase(
int thread_idx, AicpuPhaseId phase_id, uint64_t start_time, uint64_t end_time, uint32_t loop_iter,
Expand Down Expand Up @@ -164,9 +164,8 @@ void l2_perf_aicpu_set_orch_thread_idx(int thread_idx);
* @param start_time Phase start timestamp
* @param end_time Phase end timestamp
* @param submit_idx Task submission index (acts as loop_iter)
* @param task_id Task identifier. For multi-ring runtimes (tensormap_and_ringbuffer, aicpu_build_graph), this is the
* full PTO2 encoding: (ring_id << 32) | local_id, enabling cross-view correlation between orchestrator and scheduler
* swimlanes.
* @param task_id Task identifier. For tensormap_and_ringbuffer, this is the full PTO2 encoding:
* (ring_id << 32) | local_id, enabling cross-view correlation between orchestrator and scheduler swimlanes.
*/
void l2_perf_aicpu_record_orch_phase(
AicpuPhaseId phase_id, uint64_t start_time, uint64_t end_time, uint32_t submit_idx, uint64_t task_id
8 changes: 4 additions & 4 deletions src/a2a3/platform/include/common/l2_perf_profiling.h
@@ -83,8 +83,8 @@ struct L2PerfRecord {
uint64_t finish_time; // AICPU timestamp: when AICPU observed task completion

// AICore writes the register dispatch token (low 32 bits only) zero-extended into task_id.
// For multi-ring runtimes (tensormap_and_ringbuffer, aicpu_build_graph), AICPU overwrites
// with the full PTO2 encoding (ring_id << 32) | local_id after FIN/perf row match.
// For tensormap_and_ringbuffer, AICPU overwrites with the full PTO2 encoding
// (ring_id << 32) | local_id after FIN/perf row match.
// For host_build_graph, task_id stays as the plain integer task index (ring_id = 0).
uint64_t task_id;
uint32_t func_id; // Kernel function identifier
@@ -273,8 +273,8 @@ struct AicpuPhaseRecord {
uint32_t loop_iter; // Loop iteration number
AicpuPhaseId phase_id; // Phase type
union {
uint64_t task_id; // Multi-ring runtimes (tensormap_and_ringbuffer, aicpu_build_graph):
// full PTO2 encoding (ring_id << 32) | local_id for cross-view correlation.
uint64_t task_id; // tensormap_and_ringbuffer: full PTO2 encoding
// (ring_id << 32) | local_id for cross-view correlation.
uint64_t tasks_processed; // Scheduler phases: number of tasks processed in this batch
};
};
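The `(ring_id << 32) | local_id` encoding that these comments describe can be illustrated directly (a sketch using 64-bit shell arithmetic, which bash provides; the helper names are invented for illustration):

```shell
# PTO2 task_id encoding: high 32 bits carry ring_id, low 32 bits local_id.
encode_task_id() { echo $(( ($1 << 32) | $2 )); }
ring_of()        { echo $(( $1 >> 32 )); }
local_of()       { echo $(( $1 & 0xFFFFFFFF )); }

tid=$(encode_task_id 2 7)
echo "task_id=$tid ring=$(ring_of "$tid") local=$(local_of "$tid")"
# -> task_id=8589934599 ring=2 local=7
```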