From e230799fc3cd0284e0b9e8d58dc010f88feb0126 Mon Sep 17 00:00:00 2001 From: chenshengxin Date: Sat, 25 Apr 2026 10:55:23 +0800 Subject: [PATCH] Add: unit tests for PTO2 scheduler core data structures and hot-paths MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PTO2 scheduler (A2A3 and A5) relies on several tightly coupled data structures (task allocator, dep-list pool, fanin pool, SPSC queue, tensor map) and hot-path functions (wire_task, on_mixed_task_complete, on_task_release, advance_ring_pointers) that previously had no unit test coverage. Bugs in these paths — off-by-one in wrap-around, stale fanin references, lost dependency edges — surface only under specific task-graph topologies and are extremely hard to diagnose at the system-test level. This change adds per-component tests, covering both A2A3 and A5 runtimes, that exercise: - task_allocator: heap bump, wrap-around guard, flow-control window - task_state: slot lifecycle through src API, profiling CAS semantics - dep_list_pool / fanin_pool: ring allocation, overflow, tail advance - spsc_queue: cached-index SPSC correctness, wrap, capacity semantics - tensormap: hash distribution, overlap detection, lookup saturation - wiring: end-to-end wire → complete → release → advance cycle These tests also serve as executable documentation of design contracts (e.g. heap_available reports max-not-sum, LIFO dispatch for cache locality, relaxed size() as a hint) that would otherwise exist only as implicit assumptions in the source. --- tests/ut/cpp/CMakeLists.txt | 46 +- tests/ut/cpp/a2a3/test_a2a3_fatal.cpp | 6 + tests/ut/cpp/a2a3/test_dep_list_pool.cpp | 168 +++++++ tests/ut/cpp/a2a3/test_fanin_pool.cpp | 311 ++++++++++++ tests/ut/cpp/a2a3/test_ready_queue.cpp | 446 +++++++++++++++++ tests/ut/cpp/a2a3/test_scheduler_state.cpp | 197 ++++++++ tests/ut/cpp/a2a3/test_shared_memory.cpp | 191 +++++++ tests/ut/cpp/a2a3/test_spsc_queue.cpp | 293 +++++++++++ tests/ut/cpp/a2a3/test_task_allocator.cpp | 407 +++++++++++++++ tests/ut/cpp/a2a3/test_task_state.cpp | 201 ++++++++ tests/ut/cpp/a2a3/test_tensormap.cpp | 551 +++++++++++++++++++++ tests/ut/cpp/a2a3/test_wiring.cpp | 448 +++++++++++++++++ tests/ut/cpp/a5/test_dep_list_pool.cpp | 168 +++++++ tests/ut/cpp/a5/test_fanin_pool.cpp | 311 ++++++++++++ tests/ut/cpp/a5/test_ready_queue.cpp | 446 +++++++++++++++++ tests/ut/cpp/a5/test_scheduler_state.cpp | 197 ++++++++ tests/ut/cpp/a5/test_shared_memory.cpp | 191 +++++++ tests/ut/cpp/a5/test_spsc_queue.cpp | 293 +++++++++++ tests/ut/cpp/a5/test_task_allocator.cpp | 407 +++++++++++++++ tests/ut/cpp/a5/test_task_state.cpp | 201 ++++++++ tests/ut/cpp/a5/test_tensormap.cpp | 551 +++++++++++++++++++++ tests/ut/cpp/a5/test_wiring.cpp | 448 +++++++++++++++++ 22 files changed, 6476 insertions(+), 2 deletions(-) create mode 100644 tests/ut/cpp/a2a3/test_dep_list_pool.cpp create mode 100644 tests/ut/cpp/a2a3/test_fanin_pool.cpp create mode 100644 tests/ut/cpp/a2a3/test_ready_queue.cpp create mode 100644 tests/ut/cpp/a2a3/test_scheduler_state.cpp create mode 100644 tests/ut/cpp/a2a3/test_shared_memory.cpp create mode 100644 tests/ut/cpp/a2a3/test_spsc_queue.cpp create mode 100644 tests/ut/cpp/a2a3/test_task_allocator.cpp create mode 100644 tests/ut/cpp/a2a3/test_task_state.cpp create mode 100644 tests/ut/cpp/a2a3/test_tensormap.cpp create mode 100644 tests/ut/cpp/a2a3/test_wiring.cpp create mode 100644 tests/ut/cpp/a5/test_dep_list_pool.cpp create mode 100644 tests/ut/cpp/a5/test_fanin_pool.cpp create mode 100644 
tests/ut/cpp/a5/test_ready_queue.cpp create mode 100644 tests/ut/cpp/a5/test_scheduler_state.cpp create mode 100644 tests/ut/cpp/a5/test_shared_memory.cpp create mode 100644 tests/ut/cpp/a5/test_spsc_queue.cpp create mode 100644 tests/ut/cpp/a5/test_task_allocator.cpp create mode 100644 tests/ut/cpp/a5/test_task_state.cpp create mode 100644 tests/ut/cpp/a5/test_tensormap.cpp create mode 100644 tests/ut/cpp/a5/test_wiring.cpp diff --git a/tests/ut/cpp/CMakeLists.txt b/tests/ut/cpp/CMakeLists.txt index 06f83d1de..195c601ce 100644 --- a/tests/ut/cpp/CMakeLists.txt +++ b/tests/ut/cpp/CMakeLists.txt @@ -81,8 +81,8 @@ set(A2A3_RUNTIME_DIR ${CMAKE_SOURCE_DIR}/../../../src/a2a3/runtime/tensormap_and set(A2A3_STUB_SOURCES ${CMAKE_SOURCE_DIR}/stubs/test_stubs.cpp) set(A2A3_RUNTIME_SOURCES ${A2A3_RUNTIME_DIR}/pto_ring_buffer.cpp - ${A2A3_RUNTIME_DIR}/pto_shared_memory.cpp - ${A2A3_RUNTIME_DIR}/pto_scheduler.cpp + ${A2A3_RUNTIME_DIR}/shared/pto_shared_memory.cpp + ${A2A3_RUNTIME_DIR}/scheduler/pto_scheduler.cpp ${A2A3_RUNTIME_DIR}/pto_tensormap.cpp ) @@ -230,6 +230,48 @@ add_task_interface_test(test_child_memory types/test_child_memory.cpp) # --------------------------------------------------------------------------- add_a2a3_test(test_a2a3_fatal a2a3/test_a2a3_fatal.cpp) +# PTO2 runtime-linked tests +add_a2a3_runtime_test(test_task_allocator + SOURCES a2a3/test_task_allocator.cpp + EXTRA_SOURCES ${A2A3_RUNTIME_SOURCES} +) +add_a2a3_runtime_test(test_dep_list_pool + SOURCES a2a3/test_dep_list_pool.cpp + EXTRA_SOURCES ${A2A3_RUNTIME_SOURCES} +) +add_a2a3_runtime_test(test_scheduler_state + SOURCES a2a3/test_scheduler_state.cpp + EXTRA_SOURCES ${A2A3_RUNTIME_SOURCES} +) +add_a2a3_runtime_test(test_task_state + SOURCES a2a3/test_task_state.cpp + EXTRA_SOURCES ${A2A3_RUNTIME_SOURCES} +) +add_a2a3_runtime_test(test_ready_queue + SOURCES a2a3/test_ready_queue.cpp + EXTRA_SOURCES ${A2A3_RUNTIME_SOURCES} +) +add_a2a3_runtime_test(test_shared_memory + SOURCES a2a3/test_shared_memory.cpp + EXTRA_SOURCES ${A2A3_RUNTIME_SOURCES} +) +add_a2a3_runtime_test(test_a2a3_tensormap + SOURCES a2a3/test_tensormap.cpp + EXTRA_SOURCES ${A2A3_RUNTIME_SOURCES} +) +add_a2a3_runtime_test(test_fanin_pool + SOURCES a2a3/test_fanin_pool.cpp + EXTRA_SOURCES ${A2A3_RUNTIME_SOURCES} +) +add_a2a3_runtime_test(test_spsc_queue + SOURCES a2a3/test_spsc_queue.cpp + EXTRA_SOURCES ${A2A3_RUNTIME_SOURCES} +) +add_a2a3_runtime_test(test_wiring + SOURCES a2a3/test_wiring.cpp + EXTRA_SOURCES ${A2A3_RUNTIME_SOURCES} +) + # --------------------------------------------------------------------------- # A5 tests (src/a5/runtime/tensormap_and_ringbuffer/) # --------------------------------------------------------------------------- diff --git a/tests/ut/cpp/a2a3/test_a2a3_fatal.cpp b/tests/ut/cpp/a2a3/test_a2a3_fatal.cpp index 4d55788d7..588136f66 100644 --- a/tests/ut/cpp/a2a3/test_a2a3_fatal.cpp +++ b/tests/ut/cpp/a2a3/test_a2a3_fatal.cpp @@ -8,6 +8,12 @@ * See LICENSE in the root of the software repository for the full text of the License. * ----------------------------------------------------------------------------------------------------------- */ +/** + * Unit tests for PTO2 A2A3 fatal error handling. + * + * Tests API short-circuit after fatal state, explicit fatal routing, + * and allocation with invalid arguments. 
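+ * + * Illustrative shape of the short-circuit contract (hypothetical names, + * illustrative only; the real flag and entry points live in the A2A3 + * runtime headers): + * + * if (state->fatal.load(std::memory_order_acquire)) { + * return nullptr; // every public API returns early once fatal is set + * }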
+ */ #include <gtest/gtest.h> diff --git a/tests/ut/cpp/a2a3/test_dep_list_pool.cpp b/tests/ut/cpp/a2a3/test_dep_list_pool.cpp new file mode 100644 index 000000000..a86a393d1 --- /dev/null +++ b/tests/ut/cpp/a2a3/test_dep_list_pool.cpp @@ -0,0 +1,168 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Unit tests for PTO2DepListPool from pto_ring_buffer.h + * + * Tests dependency list pool allocation, prepend chaining, overflow detection, + * tail advancement, and high-water mark tracking. + * + * Design contracts: + * + * - advance_tail(new_tail) only advances if new_tail > tail; it does + * not validate new_tail <= top. Caller contract (monotonic, + * top-bounded). + * + * - The list terminator is literal nullptr. base[0] is a normal pool entry; + * init clearing it is incidental, not an invariant. + */ + +#include <gtest/gtest.h> + +#include <atomic> +#include <cstdint> +#include <cstring> + +#include "pto_ring_buffer.h" + +// ============================================================================= +// Fixture +// ============================================================================= + +class DepListPoolTest : public ::testing::Test { +protected: + static constexpr int32_t POOL_CAP = 8; + PTO2DepListEntry entries[POOL_CAP]{}; + std::atomic<int32_t> error_code{PTO2_ERROR_NONE}; + PTO2DepListPool pool{}; + + void SetUp() override { + std::memset(entries, 0, sizeof(entries)); + error_code.store(PTO2_ERROR_NONE); + pool.init(entries, POOL_CAP, &error_code); + } +}; + +// ============================================================================= +// Normal path +// ============================================================================= + +TEST_F(DepListPoolTest, InitialState) { + EXPECT_EQ(pool.used(), 0); + EXPECT_EQ(pool.available(), POOL_CAP); +} + +TEST_F(DepListPoolTest, SingleAlloc) { + PTO2DepListEntry *entry = pool.alloc(); + ASSERT_NE(entry, nullptr); + EXPECT_EQ(pool.used(), 1); + EXPECT_EQ(pool.available(), POOL_CAP - 1); +} + +TEST_F(DepListPoolTest, OverflowDetection) { + for (int i = 0; i < POOL_CAP; i++) { + PTO2DepListEntry *e = pool.alloc(); + ASSERT_NE(e, nullptr) << "Unexpected failure at alloc " << i; + } + EXPECT_EQ(pool.used(), POOL_CAP); + EXPECT_EQ(pool.available(), 0); + + PTO2DepListEntry *overflow = pool.alloc(); + EXPECT_EQ(overflow, nullptr); + EXPECT_EQ(error_code.load(), PTO2_ERROR_DEP_POOL_OVERFLOW); +} + +// Prepend builds LIFO linked list: verify each slot_state pointer. +TEST_F(DepListPoolTest, PrependChainCorrectness) { + PTO2TaskSlotState slots[5]{}; + PTO2DepListEntry *head = nullptr; + + for (int i = 0; i < 5; i++) { + head = pool.prepend(head, &slots[i]); + ASSERT_NE(head, nullptr); + } + + // LIFO order: head -> slots[4] -> slots[3] -> ... -> slots[0] -> nullptr. 
+ PTO2DepListEntry *cur = head; + for (int i = 4; i >= 0; i--) { + ASSERT_NE(cur, nullptr); + EXPECT_EQ(cur->slot_state, &slots[i]) << "Entry " << (4 - i) << " should point to slots[" << i << "]"; + cur = cur->next; + } + EXPECT_EQ(cur, nullptr) << "Chain should terminate with nullptr"; +} + +TEST_F(DepListPoolTest, AdvanceTail) { + for (int i = 0; i < 4; i++) { + pool.alloc(); + } + EXPECT_EQ(pool.used(), 4); + EXPECT_EQ(pool.available(), POOL_CAP - 4); + + pool.advance_tail(4); + EXPECT_EQ(pool.used(), 1); + EXPECT_EQ(pool.available(), POOL_CAP - 1); +} + +TEST_F(DepListPoolTest, AdvanceTailBackwardsNoop) { + pool.alloc(); + pool.alloc(); + pool.advance_tail(3); + int32_t used_after = pool.used(); + + pool.advance_tail(2); + EXPECT_EQ(pool.used(), used_after); + + pool.advance_tail(3); + EXPECT_EQ(pool.used(), used_after); +} + +TEST_F(DepListPoolTest, HighWaterAccuracy) { + for (int i = 0; i < 5; i++) + pool.alloc(); + EXPECT_EQ(pool.high_water, 5); + + pool.advance_tail(4); + EXPECT_EQ(pool.high_water, 5) << "High water never decreases"; + + for (int i = 0; i < 3; i++) + pool.alloc(); + EXPECT_GE(pool.high_water, 5); +} + +// ============================================================================= +// Boundary conditions +// ============================================================================= + +// Prepend chain integrity under pool exhaustion: chain must be walkable. +TEST_F(DepListPoolTest, PrependUnderExhaustion) { + PTO2TaskSlotState slots[POOL_CAP]{}; + PTO2DepListEntry *head = nullptr; + + int count = 0; + while (count < POOL_CAP + 5) { + PTO2DepListEntry *new_head = pool.prepend(head, &slots[count % POOL_CAP]); + if (!new_head) break; + head = new_head; + count++; + } + + int walk = 0; + PTO2DepListEntry *cur = head; + while (cur) { + walk++; + cur = cur->next; + if (walk > count + 1) { + FAIL() << "Chain has cycle -- walked more entries than allocated"; + break; + } + } + EXPECT_EQ(walk, count); +} diff --git a/tests/ut/cpp/a2a3/test_fanin_pool.cpp b/tests/ut/cpp/a2a3/test_fanin_pool.cpp new file mode 100644 index 000000000..29199ae2e --- /dev/null +++ b/tests/ut/cpp/a2a3/test_fanin_pool.cpp @@ -0,0 +1,311 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Unit tests for PTO2FaninPool and pto2_for_each_fanin_storage/slot_state + * from pto_ring_buffer.h / pto_ring_buffer.cpp + * + * Tests: + * 1. PTO2FaninPool — ring buffer allocation, overflow, tail advance, + * high-water tracking + * 2. 
pto2_for_each_fanin_storage — inline-only, spill without wrap, + * spill with wrap, callback early return + */ + +#include <gtest/gtest.h> + +#include <atomic> +#include <cstring> +#include <vector> + +#include "pto_ring_buffer.h" +#include "pto_shared_memory.h" + +// ============================================================================= +// FaninPool fixture +// ============================================================================= + +class FaninPoolTest : public ::testing::Test { +protected: + static constexpr int32_t POOL_CAP = 32; + + std::vector<PTO2FaninSpillEntry> entries; + std::atomic<int32_t> error_code{PTO2_ERROR_NONE}; + PTO2FaninPool pool{}; + + void SetUp() override { + entries.assign(POOL_CAP, PTO2FaninSpillEntry{nullptr}); + error_code.store(PTO2_ERROR_NONE); + pool.init(entries.data(), POOL_CAP, &error_code); + } +}; + +// ============================================================================= +// FaninPool: basic operations +// ============================================================================= + +TEST_F(FaninPoolTest, InitialState) { + EXPECT_EQ(pool.used(), 0); + EXPECT_EQ(pool.available(), POOL_CAP); + EXPECT_EQ(pool.top, 1); + EXPECT_EQ(pool.tail, 1); + EXPECT_EQ(pool.high_water, 0); +} + +TEST_F(FaninPoolTest, AllocReturnsCorrectModuloIndex) { + // First alloc at index top%cap = 1%32 = 1 + auto *e1 = pool.alloc(); + EXPECT_EQ(e1, &entries[1]); + + auto *e2 = pool.alloc(); + EXPECT_EQ(e2, &entries[2]); +} + +TEST_F(FaninPoolTest, AllocFillsPool) { + for (int i = 0; i < POOL_CAP; i++) { + auto *e = pool.alloc(); + ASSERT_NE(e, nullptr) << "Alloc failed at i=" << i; + } + EXPECT_EQ(pool.used(), POOL_CAP); + EXPECT_EQ(pool.available(), 0); +} + +TEST_F(FaninPoolTest, OverflowReturnsNullptr) { + for (int i = 0; i < POOL_CAP; i++) { + pool.alloc(); + } + auto *overflow = pool.alloc(); + EXPECT_EQ(overflow, nullptr); + EXPECT_EQ(error_code.load(), PTO2_ERROR_DEP_POOL_OVERFLOW); +} + +TEST_F(FaninPoolTest, AdvanceTailFreesSpace) { + for (int i = 0; i < 10; i++) { + pool.alloc(); + } + EXPECT_EQ(pool.used(), 10); + + pool.advance_tail(pool.tail + 5); + EXPECT_EQ(pool.used(), 5); + EXPECT_EQ(pool.available(), POOL_CAP - 5); +} + +TEST_F(FaninPoolTest, AdvanceTailBackwardsIsNoop) { + for (int i = 0; i < 10; i++) { + pool.alloc(); + } + int32_t old_tail = pool.tail; + pool.advance_tail(old_tail - 1); + EXPECT_EQ(pool.tail, old_tail); + EXPECT_EQ(pool.used(), 10); +} + +TEST_F(FaninPoolTest, HighWaterNeverDecreases) { + for (int i = 0; i < 10; i++) { + pool.alloc(); + } + EXPECT_EQ(pool.high_water, 10); + + pool.advance_tail(pool.tail + 5); + EXPECT_EQ(pool.high_water, 10) << "high_water must never decrease"; +} + +TEST_F(FaninPoolTest, WrapAroundAllocation) { + // Fill and drain, then fill again to wrap + for (int i = 0; i < POOL_CAP; i++) { + pool.alloc(); + } + pool.advance_tail(pool.top); + EXPECT_EQ(pool.used(), 0); + + // New allocations wrap around + for (int i = 0; i < 5; i++) { + auto *e = pool.alloc(); + ASSERT_NE(e, nullptr); + // Verify modulo indexing + int32_t expected_idx = (pool.top - 1) % POOL_CAP; + EXPECT_EQ(e, &entries[expected_idx]); + } + EXPECT_EQ(pool.used(), 5); +} + +// ============================================================================= +// pto2_for_each_fanin_storage: inline only +// ============================================================================= + +class ForEachFaninTest : public ::testing::Test { +protected: + static constexpr int32_t POOL_CAP = 32; + + std::vector<PTO2FaninSpillEntry> spill_entries; + std::atomic<int32_t> error_code{PTO2_ERROR_NONE}; + PTO2FaninPool spill_pool{}; + + alignas(64) 
PTO2TaskSlotState slots[64]; + + void SetUp() override { + spill_entries.assign(POOL_CAP, PTO2FaninSpillEntry{nullptr}); + error_code.store(PTO2_ERROR_NONE); + spill_pool.init(spill_entries.data(), POOL_CAP, &error_code); + memset(slots, 0, sizeof(slots)); + } +}; + +TEST_F(ForEachFaninTest, InlineOnlyVoid) { + PTO2TaskSlotState *inline_slots[PTO2_FANIN_INLINE_CAP] = {}; + for (int i = 0; i < 5; i++) { + inline_slots[i] = &slots[i]; + } + + std::vector<PTO2TaskSlotState *> visited; + pto2_for_each_fanin_storage(inline_slots, 5, 0, spill_pool, [&](PTO2TaskSlotState *s) { + visited.push_back(s); + }); + + ASSERT_EQ(visited.size(), 5u); + for (int i = 0; i < 5; i++) { + EXPECT_EQ(visited[i], &slots[i]); + } +} + +TEST_F(ForEachFaninTest, InlineOnlyBoolEarlyReturn) { + PTO2TaskSlotState *inline_slots[PTO2_FANIN_INLINE_CAP] = {}; + for (int i = 0; i < 5; i++) { + inline_slots[i] = &slots[i]; + } + + int count = 0; + bool result = pto2_for_each_fanin_storage(inline_slots, 5, 0, spill_pool, [&](PTO2TaskSlotState *) -> bool { + count++; + return count < 3; // stop after 3rd + }); + + EXPECT_FALSE(result) << "Should return false when callback returns false"; + EXPECT_EQ(count, 3); +} + +TEST_F(ForEachFaninTest, InlineOnlyBoolAllTrue) { + PTO2TaskSlotState *inline_slots[PTO2_FANIN_INLINE_CAP] = {}; + for (int i = 0; i < 3; i++) { + inline_slots[i] = &slots[i]; + } + + bool result = pto2_for_each_fanin_storage(inline_slots, 3, 0, spill_pool, [](PTO2TaskSlotState *) -> bool { + return true; + }); + + EXPECT_TRUE(result); +} + +TEST_F(ForEachFaninTest, ZeroFanin) { + PTO2TaskSlotState *inline_slots[PTO2_FANIN_INLINE_CAP] = {}; + int count = 0; + pto2_for_each_fanin_storage(inline_slots, 0, 0, spill_pool, [&](PTO2TaskSlotState *) { + count++; + }); + EXPECT_EQ(count, 0); +} + +// ============================================================================= +// pto2_for_each_fanin_storage: spill without wrap +// ============================================================================= + +TEST_F(ForEachFaninTest, SpillNoWrap) { + // 18 fanins = 16 inline + 2 spill + PTO2TaskSlotState *inline_slots[PTO2_FANIN_INLINE_CAP] = {}; + for (int i = 0; i < PTO2_FANIN_INLINE_CAP; i++) { + inline_slots[i] = &slots[i]; + } + + // Allocate 2 spill entries + auto *s0 = spill_pool.alloc(); + int32_t spill_start = spill_pool.top - 1; + s0->slot_state = &slots[16]; + auto *s1 = spill_pool.alloc(); + s1->slot_state = &slots[17]; + + std::vector<PTO2TaskSlotState *> visited; + pto2_for_each_fanin_storage(inline_slots, 18, spill_start, spill_pool, [&](PTO2TaskSlotState *s) { + visited.push_back(s); + }); + + ASSERT_EQ(visited.size(), 18u); + for (int i = 0; i < 16; i++) { + EXPECT_EQ(visited[i], &slots[i]) << "Inline slot " << i; + } + EXPECT_EQ(visited[16], &slots[16]); + EXPECT_EQ(visited[17], &slots[17]); +} + +// ============================================================================= +// pto2_for_each_fanin_storage: spill with wrap +// ============================================================================= + +TEST_F(ForEachFaninTest, SpillWithWrap) { + // Push pool near end so spill wraps around + // Pool cap = 32, advance top to 30 so next alloc is at index 30 + spill_pool.top = POOL_CAP - 2; + spill_pool.tail = POOL_CAP - 2; + + PTO2TaskSlotState *inline_slots[PTO2_FANIN_INLINE_CAP] = {}; + for (int i = 0; i < PTO2_FANIN_INLINE_CAP; i++) { + inline_slots[i] = &slots[i]; + } + + // 4 spill entries: indices 30, 31, 0, 1 (wraps around) + int32_t spill_start = spill_pool.top; + for (int i = 0; i < 4; i++) { + auto *e = spill_pool.alloc(); + 
ASSERT_NE(e, nullptr); + e->slot_state = &slots[16 + i]; + } + + std::vector<PTO2TaskSlotState *> visited; + pto2_for_each_fanin_storage(inline_slots, 20, spill_start, spill_pool, [&](PTO2TaskSlotState *s) { + visited.push_back(s); + }); + + ASSERT_EQ(visited.size(), 20u); + // Inline + for (int i = 0; i < 16; i++) { + EXPECT_EQ(visited[i], &slots[i]); + } + // Spill (wrapped) + for (int i = 0; i < 4; i++) { + EXPECT_EQ(visited[16 + i], &slots[16 + i]); + } +} + +// ============================================================================= +// pto2_for_each_fanin_storage: spill with bool callback early return +// ============================================================================= + +TEST_F(ForEachFaninTest, SpillBoolEarlyReturnInSpillRegion) { + PTO2TaskSlotState *inline_slots[PTO2_FANIN_INLINE_CAP] = {}; + for (int i = 0; i < PTO2_FANIN_INLINE_CAP; i++) { + inline_slots[i] = &slots[i]; + } + + int32_t spill_start = spill_pool.top; + for (int i = 0; i < 4; i++) { + auto *e = spill_pool.alloc(); + e->slot_state = &slots[16 + i]; + } + + int count = 0; + bool result = + pto2_for_each_fanin_storage(inline_slots, 20, spill_start, spill_pool, [&](PTO2TaskSlotState *) -> bool { + count++; + return count < 17; // stop on 17th (first spill entry) + }); + + EXPECT_FALSE(result); + EXPECT_EQ(count, 17); +} diff --git a/tests/ut/cpp/a2a3/test_ready_queue.cpp b/tests/ut/cpp/a2a3/test_ready_queue.cpp new file mode 100644 index 000000000..1a139a8f1 --- /dev/null +++ b/tests/ut/cpp/a2a3/test_ready_queue.cpp @@ -0,0 +1,446 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Unit tests for PTO2ReadyQueue and PTO2LocalReadyBuffer from pto_scheduler.h + * + * Tests the lock-free bounded MPMC queue (Vyukov design) and the thread-local + * ready buffer used for local-first dispatch optimization. + * + * Design contracts: + * + * - Sequence wrap: The sequence counter is int64_t. Practically unreachable + * wrap at 2^63; two's-complement comparisons still work. + * + * - Pop fast-path: pop() checks enqueue_pos == dequeue_pos as an early-empty + * hint. A push between the hint and the CAS can race; standard TOCTOU of + * Vyukov MPMC, acceptable. + * + * - Push near full: All producers that see a full slot return false + * simultaneously even if a pop happens right after. Acceptable + * back-pressure. + * + * - size() relaxed ordering: size() reads both positions with + * memory_order_relaxed and is a hint, not a snapshot. If a stale read + * produces d > e the guard returns 0. + * + * - LocalReadyBuffer LIFO dispatch: try_push appends at count++, pop returns + * slot_states[--count]. LIFO reversal is intentional for cache-locality + * when a producer immediately dispatches its fanout. 
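+ * + * Minimal sketch of that LIFO contract (mirrors the LIFOOrdering test + * below; illustrative only): + * + * buf.try_push(&a); // count: 0 -> 1 + * buf.try_push(&b); // count: 1 -> 2 + * buf.pop(); // returns &b (count: 2 -> 1) + * buf.pop(); // returns &a (count: 1 -> 0)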
+ */ + +#include <gtest/gtest.h> + +#include <atomic> +#include <cstdint> +#include <cstring> +#include <thread> +#include <vector> + +#include "scheduler/pto_scheduler.h" + +// ============================================================================= +// ReadyQueue: Single-threaded fixture (malloc-backed) +// ============================================================================= + +class ReadyQueueTest : public ::testing::Test { +protected: + static constexpr uint64_t CAPACITY = 16; // Power of 2 + + PTO2ReadyQueue queue; + + void SetUp() override { ASSERT_TRUE(pto2_ready_queue_init(&queue, CAPACITY)); } + + void TearDown() override { pto2_ready_queue_destroy(&queue); } +}; + +// ============================================================================= +// Normal path +// ============================================================================= + +TEST_F(ReadyQueueTest, EmptyPopReturnsNullptr) { EXPECT_EQ(queue.pop(), nullptr); } + +TEST_F(ReadyQueueTest, SinglePushPop) { + PTO2TaskSlotState item; + ASSERT_TRUE(queue.push(&item)); + + PTO2TaskSlotState *result = queue.pop(); + EXPECT_EQ(result, &item); +} + +TEST_F(ReadyQueueTest, FIFOOrdering) { + PTO2TaskSlotState a, b, c; + + ASSERT_TRUE(queue.push(&a)); + ASSERT_TRUE(queue.push(&b)); + ASSERT_TRUE(queue.push(&c)); + + EXPECT_EQ(queue.pop(), &a); + EXPECT_EQ(queue.pop(), &b); + EXPECT_EQ(queue.pop(), &c); + EXPECT_EQ(queue.pop(), nullptr); +} + +TEST_F(ReadyQueueTest, QueueFullReturnsFalse) { + std::vector<PTO2TaskSlotState> items(CAPACITY); + + for (uint64_t i = 0; i < CAPACITY; i++) { + ASSERT_TRUE(queue.push(&items[i])); + } + + PTO2TaskSlotState extra; + EXPECT_FALSE(queue.push(&extra)); +} + +TEST_F(ReadyQueueTest, SlotReuseAfterFullDrain) { + std::vector<PTO2TaskSlotState> items(CAPACITY); + + for (uint64_t i = 0; i < CAPACITY; i++) { + ASSERT_TRUE(queue.push(&items[i])); + } + for (uint64_t i = 0; i < CAPACITY; i++) { + EXPECT_EQ(queue.pop(), &items[i]); + } + EXPECT_EQ(queue.pop(), nullptr); + + for (uint64_t i = 0; i < CAPACITY; i++) { + ASSERT_TRUE(queue.push(&items[i])); + } + for (uint64_t i = 0; i < CAPACITY; i++) { + EXPECT_EQ(queue.pop(), &items[i]); + } + EXPECT_EQ(queue.pop(), nullptr); +} + +TEST_F(ReadyQueueTest, PushBatchThenIndividualPop) { + constexpr int BATCH_SIZE = 5; + PTO2TaskSlotState items[BATCH_SIZE]; + PTO2TaskSlotState *ptrs[BATCH_SIZE]; + for (int i = 0; i < BATCH_SIZE; i++) { + ptrs[i] = &items[i]; + } + + queue.push_batch(ptrs, BATCH_SIZE); + + for (int i = 0; i < BATCH_SIZE; i++) { + EXPECT_EQ(queue.pop(), &items[i]); + } + EXPECT_EQ(queue.pop(), nullptr); +} + +TEST_F(ReadyQueueTest, PushBatchZeroIsNoop) { + queue.push_batch(nullptr, 0); + + EXPECT_EQ(queue.size(), 0u); + EXPECT_EQ(queue.pop(), nullptr); +} + +TEST_F(ReadyQueueTest, PopBatchReturnsFive) { + constexpr int PUSH_COUNT = 10; + PTO2TaskSlotState items[PUSH_COUNT]; + + for (int i = 0; i < PUSH_COUNT; i++) { + ASSERT_TRUE(queue.push(&items[i])); + } + + PTO2TaskSlotState *out[5]; + int popped = queue.pop_batch(out, 5); + EXPECT_EQ(popped, 5); + + for (int i = 0; i < 5; i++) { + EXPECT_EQ(out[i], &items[i]); + } +} + +TEST_F(ReadyQueueTest, PopBatchPartial) { + constexpr int PUSH_COUNT = 3; + PTO2TaskSlotState items[PUSH_COUNT]; + + for (int i = 0; i < PUSH_COUNT; i++) { + ASSERT_TRUE(queue.push(&items[i])); + } + + PTO2TaskSlotState *out[5]; + int popped = queue.pop_batch(out, 5); + EXPECT_EQ(popped, PUSH_COUNT); + + for (int i = 0; i < PUSH_COUNT; i++) { + EXPECT_EQ(out[i], &items[i]); + } +} + +TEST_F(ReadyQueueTest, PopBatchEmpty) { + PTO2TaskSlotState *out[5]; + int popped = queue.pop_batch(out, 5); + 
EXPECT_EQ(popped, 0); +} + +TEST_F(ReadyQueueTest, SizeAccuracy) { + EXPECT_EQ(queue.size(), 0u); + + PTO2TaskSlotState items[8]; + + queue.push(&items[0]); + EXPECT_EQ(queue.size(), 1u); + + queue.push(&items[1]); + queue.push(&items[2]); + EXPECT_EQ(queue.size(), 3u); + + queue.pop(); + EXPECT_EQ(queue.size(), 2u); + + queue.pop(); + queue.pop(); + EXPECT_EQ(queue.size(), 0u); + + for (int i = 0; i < 5; i++) { + queue.push(&items[i]); + } + EXPECT_EQ(queue.size(), 5u); +} + +// ============================================================================= +// Boundary conditions (small capacity for precise boundary testing) +// ============================================================================= + +class ReadyQueueBoundaryTest : public ::testing::Test { +protected: + static constexpr uint64_t QUEUE_CAP = 8; // Small for boundary testing + PTO2ReadyQueue queue{}; + PTO2TaskSlotState dummy[8]{}; + + void SetUp() override { ASSERT_TRUE(pto2_ready_queue_init(&queue, QUEUE_CAP)); } + void TearDown() override { pto2_ready_queue_destroy(&queue); } +}; + +TEST_F(ReadyQueueBoundaryTest, ExactCapacityFillDrain) { + int pushed = 0; + for (uint64_t i = 0; i < QUEUE_CAP; i++) { + if (queue.push(&dummy[i % 8])) pushed++; + else break; + } + EXPECT_GE(pushed, (int)(QUEUE_CAP - 1)); + + for (int i = 0; i < pushed; i++) { + EXPECT_NE(queue.pop(), nullptr); + } + EXPECT_EQ(queue.pop(), nullptr); +} + +TEST_F(ReadyQueueBoundaryTest, PushToFullThenRecover) { + int pushed = 0; + while (queue.push(&dummy[0])) + pushed++; + + EXPECT_FALSE(queue.push(&dummy[1])) << "Push to full queue returns false"; + + EXPECT_NE(queue.pop(), nullptr); + EXPECT_TRUE(queue.push(&dummy[1])) << "Push succeeds after pop from full queue"; +} + +// size() with relaxed ordering: exact in single-threaded context. +TEST_F(ReadyQueueBoundaryTest, SizeRelaxedOrdering) { + queue.push(&dummy[0]); + queue.push(&dummy[1]); + queue.push(&dummy[2]); + EXPECT_EQ(queue.size(), 3u); + + queue.pop(); + EXPECT_EQ(queue.size(), 2u); + + queue.pop(); + queue.pop(); + EXPECT_EQ(queue.size(), 0u); +} + +// size() guard: after many push/pop cycles, never goes negative. +TEST_F(ReadyQueueBoundaryTest, SizeNeverNegative) { + for (int i = 0; i < 100; i++) { + ASSERT_TRUE(queue.push(&dummy[0])); + queue.pop(); + } + EXPECT_EQ(queue.size(), 0u) << "size() returns 0 after balanced push/pop cycles"; +} + +TEST_F(ReadyQueueBoundaryTest, RepeatedEmptyPop) { + for (int i = 0; i < 100; i++) { + EXPECT_EQ(queue.pop(), nullptr); + } + EXPECT_EQ(queue.size(), 0u); +} + +// Sequence numbers grow large after many cycles but remain correct. 
+TEST_F(ReadyQueueBoundaryTest, ManyPushPopCycles) { + for (int i = 0; i < 10000; i++) { + ASSERT_TRUE(queue.push(&dummy[0])); + PTO2TaskSlotState *s = queue.pop(); + ASSERT_NE(s, nullptr); + EXPECT_EQ(s, &dummy[0]); + } + + EXPECT_EQ(queue.size(), 0u); + EXPECT_TRUE(queue.push(&dummy[1])); + EXPECT_EQ(queue.pop(), &dummy[1]); +} + +// ============================================================================= +// Concurrency +// ============================================================================= + +// Parameterized MPMC stress test: {producers, consumers, items_per_producer} +struct MPMCConfig { + int producers; + int consumers; + int items_per_producer; +}; + +class ReadyQueueMPMCTest : public ::testing::TestWithParam<MPMCConfig> { +protected: + static constexpr uint64_t CAPACITY = 1024; + PTO2ReadyQueue queue; + + void SetUp() override { ASSERT_TRUE(pto2_ready_queue_init(&queue, CAPACITY)); } + void TearDown() override { pto2_ready_queue_destroy(&queue); } +}; + +TEST_P(ReadyQueueMPMCTest, NoDuplicateNoLoss) { + auto cfg = GetParam(); + int total = cfg.producers * cfg.items_per_producer; + + std::vector<PTO2TaskSlotState> items(total); + std::vector<std::atomic<int>> consumed_count(total); + for (int i = 0; i < total; i++) { + consumed_count[i].store(0, std::memory_order_relaxed); + } + + auto item_index = [&](PTO2TaskSlotState *s) -> int { + return static_cast<int>(s - items.data()); + }; + + std::atomic<int> producers_done{0}; + + auto producer = [&](int id) { + for (int i = id; i < total; i += cfg.producers) { + while (!queue.push(&items[i])) {} + } + producers_done.fetch_add(1, std::memory_order_release); + }; + + std::atomic<int> total_consumed{0}; + + auto consumer = [&]() { + while (true) { + PTO2TaskSlotState *item = queue.pop(); + if (item != nullptr) { + consumed_count[item_index(item)].fetch_add(1, std::memory_order_relaxed); + total_consumed.fetch_add(1, std::memory_order_relaxed); + } else if (producers_done.load(std::memory_order_acquire) == cfg.producers) { + // Drain remaining + while ((item = queue.pop()) != nullptr) { + consumed_count[item_index(item)].fetch_add(1, std::memory_order_relaxed); + total_consumed.fetch_add(1, std::memory_order_relaxed); + } + break; + } + } + }; + + std::vector<std::thread> threads; + for (int i = 0; i < cfg.producers; i++) + threads.emplace_back(producer, i); + for (int i = 0; i < cfg.consumers; i++) + threads.emplace_back(consumer); + for (auto &t : threads) + t.join(); + + EXPECT_EQ(total_consumed.load(), total); + for (int i = 0; i < total; i++) { + EXPECT_EQ(consumed_count[i].load(), 1) + << "Item " << i << " consumed " << consumed_count[i].load() << " times (expected 1)"; + } +} + +INSTANTIATE_TEST_SUITE_P( + MPMCVariants, ReadyQueueMPMCTest, + ::testing::Values( + MPMCConfig{2, 2, 200}, // TwoProducersTwoConsumers + MPMCConfig{1, 4, 500}, // OneProducerNConsumers + MPMCConfig{4, 4, 1250} // HighContentionStress + ) +); + +// ============================================================================= +// LocalReadyBuffer +// ============================================================================= + +class LocalReadyBufferTest : public ::testing::Test { +protected: + static constexpr int CAPACITY = 8; + + PTO2LocalReadyBuffer buffer; + PTO2TaskSlotState *backing[CAPACITY]; + + void SetUp() override { buffer.reset(backing, CAPACITY); } +}; + +// --- Normal path --- + +TEST_F(LocalReadyBufferTest, PopEmptyReturnsNullptr) { EXPECT_EQ(buffer.pop(), nullptr); } + +// LIFO dispatch: try_push appends at count++, pop returns slot_states[--count]. 
+TEST_F(LocalReadyBufferTest, LIFOOrdering) { + PTO2TaskSlotState a, b; + + ASSERT_TRUE(buffer.try_push(&a)); + ASSERT_TRUE(buffer.try_push(&b)); + + EXPECT_EQ(buffer.pop(), &b); + EXPECT_EQ(buffer.pop(), &a); + EXPECT_EQ(buffer.pop(), nullptr); +} + +TEST_F(LocalReadyBufferTest, TryPushFullReturnsFalse) { + PTO2TaskSlotState items[CAPACITY + 1]; + + for (int i = 0; i < CAPACITY; i++) { + ASSERT_TRUE(buffer.try_push(&items[i])); + } + + EXPECT_FALSE(buffer.try_push(&items[CAPACITY])); +} + +TEST_F(LocalReadyBufferTest, ResetSetsCleanState) { + EXPECT_EQ(buffer.pop(), nullptr) << "Fresh buffer is empty"; + + PTO2TaskSlotState a, b; + ASSERT_TRUE(buffer.try_push(&a)); + ASSERT_TRUE(buffer.try_push(&b)); + + buffer.reset(backing, CAPACITY); + EXPECT_EQ(buffer.pop(), nullptr) << "Buffer is empty after reset"; + + PTO2TaskSlotState items[CAPACITY]; + for (int i = 0; i < CAPACITY; i++) { + EXPECT_TRUE(buffer.try_push(&items[i])); + } + EXPECT_FALSE(buffer.try_push(&a)) << "Full after pushing capacity items post-reset"; +} + +// --- Boundary conditions --- + +TEST_F(LocalReadyBufferTest, NullBackingBuffer) { + PTO2LocalReadyBuffer buf; + buf.reset(nullptr, 0); + + PTO2TaskSlotState item{}; + EXPECT_FALSE(buf.try_push(&item)) << "Push fails with null backing"; + EXPECT_EQ(buf.pop(), nullptr) << "Pop returns null with null backing"; +} diff --git a/tests/ut/cpp/a2a3/test_scheduler_state.cpp b/tests/ut/cpp/a2a3/test_scheduler_state.cpp new file mode 100644 index 000000000..13647c320 --- /dev/null +++ b/tests/ut/cpp/a2a3/test_scheduler_state.cpp @@ -0,0 +1,197 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Unit tests for PTO2SchedulerState from pto_scheduler.h + * + * Tests task state transitions, fanin/fanout logic, subtask completion. 
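+ * + * Slot lifecycle exercised below (state names from pto_scheduler.h): + * + * PENDING -> READY -> RUNNING -> COMPLETED -> CONSUMED + * + * The COMPLETED -> CONSUMED edge is a CAS that fires only once + * fanout_refcount reaches fanout_count (see the check_and_handle_consumed + * tests).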
+ */ + +#include <gtest/gtest.h> + +#include <cstdint> +#include <cstring> + +#include "scheduler/pto_scheduler.h" + +class SchedulerStateTest : public ::testing::Test { +protected: + PTO2SchedulerState sched; + PTO2SharedMemoryHandle *sm_handle = nullptr; + + void SetUp() override { + sm_handle = pto2_sm_create_default(); + ASSERT_NE(sm_handle, nullptr); + bool ok = pto2_scheduler_init(&sched, sm_handle->header); + ASSERT_TRUE(ok); + } + + void TearDown() override { + pto2_scheduler_destroy(&sched); + if (sm_handle) { + pto2_sm_destroy(sm_handle); + } + } + + void init_slot( + PTO2TaskSlotState &slot, PTO2TaskState state, int32_t fanin_count, int32_t fanout_count, uint8_t ring_id = 0 + ) { + memset(&slot, 0, sizeof(slot)); + slot.task_state.store(state); + slot.fanin_count = fanin_count; + slot.fanin_refcount.store(0); + slot.fanout_count = fanout_count; + slot.fanout_refcount.store(0); + slot.fanout_lock.store(0); + slot.fanout_head = nullptr; + slot.ring_id = ring_id; + slot.active_mask = PTO2_SUBTASK_MASK_AIC; + slot.completed_subtasks.store(0); + slot.total_required_subtasks = 1; + slot.logical_block_num = 1; + } +}; + +// ============================================================================= +// check_and_handle_consumed +// ============================================================================= + +TEST_F(SchedulerStateTest, ConsumedNotReady) { + alignas(64) PTO2TaskSlotState slot; + init_slot(slot, PTO2_TASK_COMPLETED, 1, 2); + slot.fanout_refcount.store(1); // 1 != 2 + + sched.check_and_handle_consumed(slot); + EXPECT_EQ(slot.task_state.load(), PTO2_TASK_COMPLETED); +} + +TEST_F(SchedulerStateTest, ConsumedTransition) { + alignas(64) PTO2TaskSlotState slot; + init_slot(slot, PTO2_TASK_COMPLETED, 1, 2); + slot.fanout_refcount.store(2); // matches fanout_count + + sched.check_and_handle_consumed(slot); + EXPECT_EQ(slot.task_state.load(), PTO2_TASK_CONSUMED); +} + +TEST_F(SchedulerStateTest, ConsumedNotCompletedState) { + alignas(64) PTO2TaskSlotState slot; + init_slot(slot, PTO2_TASK_RUNNING, 1, 1); + slot.fanout_refcount.store(1); + + sched.check_and_handle_consumed(slot); + // CAS fails because state is RUNNING, not COMPLETED + EXPECT_EQ(slot.task_state.load(), PTO2_TASK_RUNNING); +} + +TEST_F(SchedulerStateTest, ConsumedIdempotent) { + alignas(64) PTO2TaskSlotState slot; + init_slot(slot, PTO2_TASK_CONSUMED, 1, 1); + slot.fanout_refcount.store(1); + + sched.check_and_handle_consumed(slot); + EXPECT_EQ(slot.task_state.load(), PTO2_TASK_CONSUMED); +} + +// ============================================================================= +// release_producer +// ============================================================================= + +TEST_F(SchedulerStateTest, ReleaseProducerIncrements) { + alignas(64) PTO2TaskSlotState slot; + init_slot(slot, PTO2_TASK_COMPLETED, 1, 3); + + sched.release_producer(slot); + EXPECT_EQ(slot.fanout_refcount.load(), 1); + + sched.release_producer(slot); + EXPECT_EQ(slot.fanout_refcount.load(), 2); +} + +TEST_F(SchedulerStateTest, ReleaseProducerTriggersConsumed) { + alignas(64) PTO2TaskSlotState slot; + init_slot(slot, PTO2_TASK_COMPLETED, 1, 2); + slot.fanout_refcount.store(1); // One away + + sched.release_producer(slot); + EXPECT_EQ(slot.task_state.load(), PTO2_TASK_CONSUMED); +} + +// ============================================================================= +// on_subtask_complete +// ============================================================================= + +TEST_F(SchedulerStateTest, SubtaskCompleteSingle) { + alignas(64) PTO2TaskSlotState slot; + 
init_slot(slot, PTO2_TASK_RUNNING, 1, 1); + slot.total_required_subtasks = 1; + slot.completed_subtasks.store(0); + + EXPECT_TRUE(sched.on_subtask_complete(slot)); +} + +TEST_F(SchedulerStateTest, SubtaskCompleteMultiBlock) { + alignas(64) PTO2TaskSlotState slot; + init_slot(slot, PTO2_TASK_RUNNING, 1, 1); + slot.total_required_subtasks = 6; // 3 cores * 2 blocks + slot.completed_subtasks.store(0); + + for (int i = 0; i < 5; i++) { + EXPECT_FALSE(sched.on_subtask_complete(slot)); + } + EXPECT_TRUE(sched.on_subtask_complete(slot)); +} + +// ============================================================================= +// on_scope_end +// ============================================================================= + +TEST_F(SchedulerStateTest, ScopeEndBatchRelease) { + constexpr int N = 4; + alignas(64) PTO2TaskSlotState slots[N]; + PTO2TaskSlotState *ptrs[N]; + + for (int i = 0; i < N; i++) { + init_slot(slots[i], PTO2_TASK_COMPLETED, 1, 2); + ptrs[i] = &slots[i]; + } + + sched.on_scope_end(ptrs, N); + + for (int i = 0; i < N; i++) { + EXPECT_EQ(slots[i].fanout_refcount.load(), 1); + } +} + +// ============================================================================= +// get_ready_tasks_batch: local buffer first +// ============================================================================= + +TEST_F(SchedulerStateTest, GetReadyTasksBatchLocalFirst) { + alignas(64) PTO2TaskSlotState slot_a, slot_b; + init_slot(slot_a, PTO2_TASK_READY, 0, 1); + init_slot(slot_b, PTO2_TASK_PENDING, 1, 1); + + PTO2TaskSlotState *local_buf_storage[4]; + PTO2LocalReadyBuffer local_buf; + local_buf.reset(local_buf_storage, 4); + local_buf.try_push(&slot_a); + + // Use src API to route slot_b into the global ready queue + sched.release_fanin_and_check_ready(slot_b); + + PTO2TaskSlotState *out[4]; + int count = sched.get_ready_tasks_batch(PTO2ResourceShape::AIC, local_buf, out, 4); + + EXPECT_EQ(count, 2); + // Local buffer drains first (LIFO), so slot_a comes first + EXPECT_EQ(out[0], &slot_a); + EXPECT_EQ(out[1], &slot_b); +} diff --git a/tests/ut/cpp/a2a3/test_shared_memory.cpp b/tests/ut/cpp/a2a3/test_shared_memory.cpp new file mode 100644 index 000000000..ffcbb7821 --- /dev/null +++ b/tests/ut/cpp/a2a3/test_shared_memory.cpp @@ -0,0 +1,191 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Unit tests for PTO2SharedMemory layout from pto_shared_memory.h + * + * Tests creation, validation, per-ring independence, alignment, size + * calculation, and error handling. + * + * Design contracts: + * + * - pto2_sm_validate checks `top > heap_size`. top == heap_size is a + * legitimate "filled exactly to end" state, so strict > is correct. + * + * - Zero window size: if pto2_sm_calculate_size() is called with 0, all ring + * descriptors/payloads alias the same address. 
Current entry path + * (pto2_sm_create) is called only with valid sizes, but there is no + * explicit guard. pto2_sm_create should reject task_window_size==0. + * + * - Flow control heap_top validation: validate() does not verify + * heap_top <= heap_size. After a corruption, heap_top could exceed + * heap_size without detection. validate should check both bounds. + */ + +#include <gtest/gtest.h> +#include <cstdint> +#include "pto_shared_memory.h" + +// ============================================================================= +// Fixture (default-created handle) +// ============================================================================= + +class SharedMemoryTest : public ::testing::Test { +protected: + PTO2SharedMemoryHandle *handle = nullptr; + + void SetUp() override { + handle = pto2_sm_create_default(); + ASSERT_NE(handle, nullptr); + } + + void TearDown() override { + if (handle) { + pto2_sm_destroy(handle); + handle = nullptr; + } + } +}; + +// ============================================================================= +// Normal path +// ============================================================================= + +TEST_F(SharedMemoryTest, CreateDefaultReturnsNonNull) { + EXPECT_NE(handle->sm_base, nullptr); + EXPECT_GT(handle->sm_size, 0u); +} + +TEST_F(SharedMemoryTest, IsOwner) { EXPECT_TRUE(handle->is_owner); } + +TEST_F(SharedMemoryTest, HeaderInitValues) { + auto *hdr = handle->header; + EXPECT_EQ(hdr->orchestrator_done.load(), 0); + EXPECT_EQ(hdr->orch_error_code.load(), 0); + EXPECT_EQ(hdr->sched_error_bitmap.load(), 0); + EXPECT_EQ(hdr->sched_error_code.load(), 0); + + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + auto &fc = hdr->rings[r].fc; + EXPECT_EQ(fc.current_task_index.load(), 0); + EXPECT_EQ(fc.last_task_alive.load(), 0); + } +} + +TEST_F(SharedMemoryTest, Validate) { EXPECT_TRUE(pto2_sm_validate(handle)); } + +TEST_F(SharedMemoryTest, PerRingIndependence) { + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + EXPECT_NE(handle->header->rings[r].task_descriptors, nullptr) << "Ring " << r; + EXPECT_NE(handle->header->rings[r].task_payloads, nullptr) << "Ring " << r; + } + for (int r = 1; r < PTO2_MAX_RING_DEPTH; r++) { + EXPECT_NE(handle->header->rings[r].task_descriptors, handle->header->rings[0].task_descriptors) << "Ring " << r; + } +} + +TEST_F(SharedMemoryTest, PointerAlignment) { + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + auto addr = reinterpret_cast<uintptr_t>(handle->header->rings[r].task_descriptors); + EXPECT_EQ(addr % PTO2_ALIGN_SIZE, 0u) << "Ring " << r << " descriptors not aligned"; + } +} + +TEST_F(SharedMemoryTest, HeaderAlignment) { + uintptr_t header_addr = (uintptr_t)handle->header; + EXPECT_EQ(header_addr % PTO2_ALIGN_SIZE, 0u) << "Header must be cache-line aligned"; +} + +// Descriptor and payload regions don't overlap within or across rings. 
+TEST_F(SharedMemoryTest, RegionsNonOverlapping) { + uint64_t ws = 64; // Use a known window size for byte arithmetic + PTO2SharedMemoryHandle *h = pto2_sm_create(ws, 4096); + ASSERT_NE(h, nullptr); + + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + uintptr_t desc_start = (uintptr_t)h->header->rings[r].task_descriptors; + uintptr_t desc_end = desc_start + ws * sizeof(PTO2TaskDescriptor); + uintptr_t payload_start = (uintptr_t)h->header->rings[r].task_payloads; + + EXPECT_GE(payload_start, desc_end) << "Ring " << r << ": payload region should not overlap descriptors"; + } + + for (int r = 0; r < PTO2_MAX_RING_DEPTH - 1; r++) { + uintptr_t this_payload_end = (uintptr_t)h->header->rings[r].task_payloads + ws * sizeof(PTO2TaskPayload); + uintptr_t next_desc_start = (uintptr_t)h->header->rings[r + 1].task_descriptors; + EXPECT_GE(next_desc_start, this_payload_end) << "Ring " << r << " and " << (r + 1) << " should not overlap"; + } + + pto2_sm_destroy(h); +} + +// ============================================================================= +// Size calculation +// ============================================================================= + +TEST(SharedMemoryCalcSize, NonZero) { + uint64_t size = pto2_sm_calculate_size(PTO2_TASK_WINDOW_SIZE); + EXPECT_GT(size, 0u); +} + +TEST(SharedMemoryCalcSize, LargerWindowGivesLargerSize) { + uint64_t small_size = pto2_sm_calculate_size(64); + uint64_t large_size = pto2_sm_calculate_size(256); + EXPECT_GT(large_size, small_size); +} + +TEST(SharedMemoryCalcSize, HeaderAligned) { EXPECT_EQ(sizeof(PTO2SharedMemoryHeader) % PTO2_ALIGN_SIZE, 0u); } + +TEST(SharedMemoryCalcSize, PerRingDifferentSizes) { + uint64_t ws[PTO2_MAX_RING_DEPTH] = {128, 256, 512, 1024}; + uint64_t size = pto2_sm_calculate_size_per_ring(ws); + + uint64_t uniform_size = pto2_sm_calculate_size(128); + EXPECT_GT(size, uniform_size); +} + +// ============================================================================= +// Boundary conditions +// ============================================================================= + +// Zero window size: all ring descriptors collapse to same address. +TEST(SharedMemoryBoundary, ZeroWindowSize) { + uint64_t size = pto2_sm_calculate_size(0); + uint64_t header_size = PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE); + EXPECT_EQ(size, header_size); + + PTO2SharedMemoryHandle *h = pto2_sm_create(0, 4096); + if (h) { + for (int r = 0; r < PTO2_MAX_RING_DEPTH - 1; r++) { + EXPECT_EQ(h->header->rings[r].task_descriptors, h->header->rings[r + 1].task_descriptors) + << "Zero window: all rings' descriptor pointers collapse to same address"; + } + pto2_sm_destroy(h); + } +} + +TEST(SharedMemoryBoundary, ValidateDetectsCorruption) { + PTO2SharedMemoryHandle *h = pto2_sm_create(256, 4096); + ASSERT_NE(h, nullptr); + EXPECT_TRUE(pto2_sm_validate(h)); + + h->header->rings[0].fc.current_task_index.store(-1); + EXPECT_FALSE(pto2_sm_validate(h)); + + pto2_sm_destroy(h); +} + +TEST(SharedMemoryBoundary, ValidateNullHandle) { EXPECT_FALSE(pto2_sm_validate(nullptr)); } + +TEST(SharedMemoryBoundary, CreateFromUndersizedBuffer) { + char buf[64]{}; + PTO2SharedMemoryHandle *h = pto2_sm_create_from_buffer(buf, 64, 256, 4096); + EXPECT_EQ(h, nullptr) << "Undersized buffer should fail"; +} diff --git a/tests/ut/cpp/a2a3/test_spsc_queue.cpp b/tests/ut/cpp/a2a3/test_spsc_queue.cpp new file mode 100644 index 000000000..a2c80ca05 --- /dev/null +++ b/tests/ut/cpp/a2a3/test_spsc_queue.cpp @@ -0,0 +1,293 @@ +/* + * Copyright (c) PyPTO Contributors. 
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Unit tests for PTO2SpscQueue from pto_scheduler.h + * + * Tests the Rigtorp cached-index SPSC queue used as the orchestrator → + * scheduler wiring channel: + * - Basic push / pop_batch correctness + * - Full / empty detection (including cached-index lazy refresh) + * - Wrap-around via modulo indexing + * - Usable capacity is capacity-1 (one slot reserved as a sentinel) + * - pop_batch partial reads + * - size() accuracy + */ + +#include <gtest/gtest.h> + +#include <cstring> +#include <thread> +#include <vector> + +#include "scheduler/pto_scheduler.h" + +// ============================================================================= +// Fixture +// ============================================================================= + +class SpscQueueTest : public ::testing::Test { +protected: + static constexpr uint64_t CAPACITY = 16; // must be power of 2 + + PTO2SpscQueue queue{}; + // Dummy slot states used as push values + alignas(64) PTO2TaskSlotState slots[64]{}; + + void SetUp() override { + memset(&queue, 0, sizeof(queue)); + ASSERT_TRUE(queue.init(CAPACITY)); + } + + void TearDown() override { queue.destroy(); } +}; + +// ============================================================================= +// Initialization +// ============================================================================= + +TEST_F(SpscQueueTest, InitValidState) { + EXPECT_EQ(queue.size(), 0u); + EXPECT_EQ(queue.mask_, CAPACITY - 1); + EXPECT_NE(queue.buffer_, nullptr); +} + +TEST_F(SpscQueueTest, InitRejectsNonPowerOfTwo) { + PTO2SpscQueue bad{}; + EXPECT_FALSE(bad.init(3)); + EXPECT_FALSE(bad.init(7)); + EXPECT_FALSE(bad.init(0)); +} + +TEST_F(SpscQueueTest, InitAcceptsPowerOfTwo) { + PTO2SpscQueue q{}; + EXPECT_TRUE(q.init(4)); + q.destroy(); + EXPECT_TRUE(q.init(1024)); + q.destroy(); +} + +// ============================================================================= +// Basic push / pop +// ============================================================================= + +TEST_F(SpscQueueTest, PushPopSingle) { + EXPECT_TRUE(queue.push(&slots[0])); + EXPECT_EQ(queue.size(), 1u); + + PTO2TaskSlotState *out[1]; + int count = queue.pop_batch(out, 1); + ASSERT_EQ(count, 1); + EXPECT_EQ(out[0], &slots[0]); + EXPECT_EQ(queue.size(), 0u); +} + +TEST_F(SpscQueueTest, FIFOOrdering) { + for (int i = 0; i < 5; i++) { + ASSERT_TRUE(queue.push(&slots[i])); + } + + PTO2TaskSlotState *out[5]; + int count = queue.pop_batch(out, 5); + ASSERT_EQ(count, 5); + for (int i = 0; i < 5; i++) { + EXPECT_EQ(out[i], &slots[i]) << "FIFO order violated at i=" << i; + } +} + +TEST_F(SpscQueueTest, PopBatchPartial) { + for (int i = 0; i < 3; i++) { + queue.push(&slots[i]); + } + + // Request 5 but only 3 available + PTO2TaskSlotState *out[5]; + int count = queue.pop_batch(out, 5); + EXPECT_EQ(count, 3); +} + +TEST_F(SpscQueueTest, PopBatchEmpty) { + PTO2TaskSlotState *out[5]; + int count = 
queue.pop_batch(out, 5); + EXPECT_EQ(count, 0); +} + +// ============================================================================= +// Full detection +// ============================================================================= + +TEST_F(SpscQueueTest, FullReturnsFalse) { + // Usable capacity = CAPACITY - 1 = 15 + for (uint64_t i = 0; i < CAPACITY - 1; i++) { + ASSERT_TRUE(queue.push(&slots[i])) << "push failed at i=" << i; + } + EXPECT_EQ(queue.size(), CAPACITY - 1); + + // Queue full + EXPECT_FALSE(queue.push(&slots[CAPACITY - 1])) << "Push to full queue must return false"; +} + +TEST_F(SpscQueueTest, UsableCapacityIsCapacityMinusOne) { + int pushed = 0; + while (queue.push(&slots[pushed % 64])) { + pushed++; + if (pushed > 100) break; // safety + } + EXPECT_EQ(pushed, static_cast<int>(CAPACITY - 1)); +} + +// ============================================================================= +// Full then recover +// ============================================================================= + +TEST_F(SpscQueueTest, FullThenPopThenPush) { + for (uint64_t i = 0; i < CAPACITY - 1; i++) { + queue.push(&slots[i]); + } + EXPECT_FALSE(queue.push(&slots[0])); + + // Pop one + PTO2TaskSlotState *out[1]; + int count = queue.pop_batch(out, 1); + ASSERT_EQ(count, 1); + + // Now push should succeed + EXPECT_TRUE(queue.push(&slots[0])); +} + +// ============================================================================= +// Wrap-around +// ============================================================================= + +TEST_F(SpscQueueTest, WrapAroundCorrectness) { + // Push-pop cycles to advance head/tail past capacity boundary + for (int cycle = 0; cycle < 100; cycle++) { + ASSERT_TRUE(queue.push(&slots[cycle % 64])) << "push failed at cycle=" << cycle; + PTO2TaskSlotState *out[1]; + int count = queue.pop_batch(out, 1); + ASSERT_EQ(count, 1) << "pop_batch failed at cycle=" << cycle; + EXPECT_EQ(out[0], &slots[cycle % 64]); + } + EXPECT_EQ(queue.size(), 0u); +} + +TEST_F(SpscQueueTest, WrapAroundBatchCorrectness) { + // Multiple cycles of batch push/pop across wrap boundary + for (int cycle = 0; cycle < 20; cycle++) { + int batch = 5; + for (int i = 0; i < batch; i++) { + ASSERT_TRUE(queue.push(&slots[(cycle * batch + i) % 64])); + } + PTO2TaskSlotState *out[5]; + int count = queue.pop_batch(out, batch); + ASSERT_EQ(count, batch); + for (int i = 0; i < batch; i++) { + EXPECT_EQ(out[i], &slots[(cycle * batch + i) % 64]); + } + } +} + +// ============================================================================= +// size() accuracy +// ============================================================================= + +TEST_F(SpscQueueTest, SizeTracksOperations) { + EXPECT_EQ(queue.size(), 0u); + + queue.push(&slots[0]); + EXPECT_EQ(queue.size(), 1u); + + queue.push(&slots[1]); + queue.push(&slots[2]); + EXPECT_EQ(queue.size(), 3u); + + PTO2TaskSlotState *out[2]; + queue.pop_batch(out, 2); + EXPECT_EQ(queue.size(), 1u); + + queue.pop_batch(out, 1); + EXPECT_EQ(queue.size(), 0u); +} + +// ============================================================================= +// Producer-consumer (two threads) +// ============================================================================= + +TEST_F(SpscQueueTest, TwoThreadProducerConsumer) { + constexpr int TOTAL = 10000; + std::vector<PTO2TaskSlotState *> consumed; + consumed.reserve(TOTAL); + + // Use a large pool of slot states for unique pointers + std::vector<PTO2TaskSlotState> big_pool(TOTAL); + + std::thread producer([&]() { + for (int i = 0; i < TOTAL; i++) { + while 
(!queue.push(&big_pool[i])) { + // spin + } + } + }); + + std::thread consumer([&]() { + int total = 0; + PTO2TaskSlotState *out[32]; + while (total < TOTAL) { + int count = queue.pop_batch(out, 32); + for (int i = 0; i < count; i++) { + consumed.push_back(out[i]); + } + total += count; + } + }); + + producer.join(); + consumer.join(); + + ASSERT_EQ(consumed.size(), static_cast<size_t>(TOTAL)); + // Verify FIFO order + for (int i = 0; i < TOTAL; i++) { + EXPECT_EQ(consumed[i], &big_pool[i]) << "FIFO violated at i=" << i; + } +} + +// ============================================================================= +// Cached index behavior +// ============================================================================= + +TEST_F(SpscQueueTest, CachedIndexLazyRefresh) { + // Fill queue + for (uint64_t i = 0; i < CAPACITY - 1; i++) { + queue.push(&slots[i]); + } + + // Consumer pops all + PTO2TaskSlotState *out[16]; + int count = queue.pop_batch(out, CAPACITY); + EXPECT_EQ(count, static_cast<int>(CAPACITY - 1)); + + // Producer's tail_cached_ is stale (still thinks queue is full) + // Next push should refresh tail_cached_ and succeed + EXPECT_TRUE(queue.push(&slots[0])); +} + +TEST_F(SpscQueueTest, CachedIndexConsumerRefresh) { + // Consumer calls pop_batch on empty queue (head_cached_ is 0) + PTO2TaskSlotState *out[1]; + EXPECT_EQ(queue.pop_batch(out, 1), 0); + + // Producer pushes + queue.push(&slots[0]); + + // Consumer's head_cached_ is stale, pop_batch must refresh + int count = queue.pop_batch(out, 1); + EXPECT_EQ(count, 1); + EXPECT_EQ(out[0], &slots[0]); +} diff --git a/tests/ut/cpp/a2a3/test_task_allocator.cpp b/tests/ut/cpp/a2a3/test_task_allocator.cpp new file mode 100644 index 000000000..383003900 --- /dev/null +++ b/tests/ut/cpp/a2a3/test_task_allocator.cpp @@ -0,0 +1,407 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Unit tests for PTO2TaskAllocator from pto_ring_buffer.h + * + * Tests ring buffer allocation, heap bump logic, wrap-around, alignment, + * task window flow control, and heap_available semantics. + * + * The allocator is single-threaded (orchestrator thread), so no concurrency + * tests are needed. The unified PTO2TaskAllocator replaces the previous + * separate PTO2HeapRing + PTO2TaskRing. + * + * Design contracts (try_bump_heap): + * + * - Wrap-around guard uses `tail > alloc_size` (strict >). When + * tail == alloc_size the wrap branch returns nullptr. Allowing it + * would create top == tail (full/empty ambiguity). Strict > + * sacrifices one quantum of capacity. + * + * - heap_available() returns max(at_end, at_begin), not the sum. + * A single allocation cannot split across the wrap boundary. + * + * - Zero-size allocation is a no-op returning the current top. + * Two consecutive zero-size allocs return the SAME pointer. 
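+ * + * Illustrative restatement of the zero-size contract: + * + * auto a = allocator.alloc(0); // no heap bump + * auto b = allocator.alloc(0); // b.packed_base == a.packed_base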
+
+#include <gtest/gtest.h>
+
+#include <algorithm>
+#include <atomic>
+#include <cstring>
+#include <set>
+#include <vector>
+
+#include "pto_ring_buffer.h"
+
+// =============================================================================
+// Helpers
+//
+// WHITE-BOX: consume_up_to simulates the scheduler consuming tasks by directly
+// writing descriptor.packed_buffer_end and advancing last_alive. This binds
+// to the internal tail-derivation mechanism. If the allocator's reclaim
+// protocol changes (e.g. explicit tail field instead of packed_buffer_end),
+// this helper and all wrap/reclaim tests must be updated.
+// =============================================================================
+
+static void consume_up_to(
+  std::vector<PTO2TaskDescriptor> &descriptors, std::atomic<int32_t> &last_alive, void *heap_base,
+  int32_t window_size, int32_t new_last_alive, uint64_t heap_tail_offset
+) {
+  int32_t last_consumed = new_last_alive - 1;
+  descriptors[last_consumed & (window_size - 1)].packed_buffer_end =
+    static_cast<uint8_t *>(heap_base) + heap_tail_offset;
+  last_alive.store(new_last_alive, std::memory_order_release);
+}
+
+// =============================================================================
+// Fixture
+// =============================================================================
+
+class TaskAllocatorTest : public ::testing::Test {
+protected:
+  static constexpr int32_t WINDOW_SIZE = 16;
+  static constexpr uint64_t HEAP_SIZE = 4096;
+
+  std::vector<PTO2TaskDescriptor> descriptors;
+  alignas(64) uint8_t heap_buf[HEAP_SIZE]{};
+  std::atomic<int32_t> current_index{0};
+  std::atomic<int32_t> last_alive{0};
+  std::atomic<int32_t> error_code{PTO2_ERROR_NONE};
+  PTO2TaskAllocator allocator{};
+
+  void SetUp() override {
+    descriptors.assign(WINDOW_SIZE, PTO2TaskDescriptor{});
+    std::memset(heap_buf, 0, sizeof(heap_buf));
+    current_index.store(0);
+    last_alive.store(0);
+    error_code.store(PTO2_ERROR_NONE);
+    allocator.init(descriptors.data(), WINDOW_SIZE, &current_index, &last_alive, heap_buf, HEAP_SIZE, &error_code);
+  }
+};
+
+// =============================================================================
+// Normal path
+// =============================================================================
+
+TEST_F(TaskAllocatorTest, InitialState) {
+  EXPECT_EQ(allocator.window_size(), WINDOW_SIZE);
+  EXPECT_EQ(allocator.active_count(), 0);
+  EXPECT_EQ(allocator.heap_top(), 0u);
+  EXPECT_EQ(allocator.heap_capacity(), HEAP_SIZE);
+  EXPECT_EQ(allocator.heap_available(), HEAP_SIZE);
+}
+
+TEST_F(TaskAllocatorTest, AllocNonZeroSize) {
+  auto result = allocator.alloc(100);
+  ASSERT_FALSE(result.failed());
+  EXPECT_EQ(result.task_id, 0);
+  EXPECT_EQ(result.slot, 0);
+  EXPECT_NE(result.packed_base, nullptr);
+  // 100 bytes aligned up to PTO2_ALIGN_SIZE (64) = 128
+  uint64_t expected_aligned = PTO2_ALIGN_UP(100u, PTO2_ALIGN_SIZE);
+  EXPECT_EQ(expected_aligned, 128u);
+  EXPECT_EQ(allocator.heap_top(), expected_aligned);
+  EXPECT_EQ(
+    static_cast<uint8_t *>(result.packed_end) - static_cast<uint8_t *>(result.packed_base),
+    static_cast<int64_t>(expected_aligned)
+  );
+}
+
+TEST_F(TaskAllocatorTest, SequentialTaskIds) {
+  int32_t prev_id = -1;
+  for (int i = 0; i < 5; i++) {
+    auto result = allocator.alloc(0);
+    ASSERT_FALSE(result.failed()) << "Alloc failed at i=" << i;
+    EXPECT_EQ(result.task_id, prev_id + 1) << "Task IDs must be monotonically increasing";
+    EXPECT_EQ(result.slot, result.task_id & (WINDOW_SIZE - 1));
+    prev_id = result.task_id;
+  }
+  EXPECT_EQ(allocator.active_count(), 5);
+}
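+
+// Worked alignment arithmetic, assuming the standard power-of-two round-up
+// form PTO2_ALIGN_UP(n, a) == (n + a - 1) & ~(a - 1) (the macro's exact
+// definition lives in the runtime headers):
+//   PTO2_ALIGN_UP(  1, 64) =  64 & ~63 =  64
+//   PTO2_ALIGN_UP( 33, 64) =  96 & ~63 =  64
+//   PTO2_ALIGN_UP(100, 64) = 163 & ~63 = 128
+// AllocNonZeroSize above and OutputSizeAlignment below assert these values
+// against heap_top().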
+
+TEST_F(TaskAllocatorTest, OutputSizeAlignment) {
+  // 1 byte -> aligned to 64
+  auto r1 = allocator.alloc(1);
+  ASSERT_FALSE(r1.failed());
+  EXPECT_EQ(allocator.heap_top(), 64u);
+
+  // Another 33 bytes -> aligned to 64, total 128
+  auto r2 = allocator.alloc(33);
+  ASSERT_FALSE(r2.failed());
+  EXPECT_EQ(allocator.heap_top(), 128u);
+
+  // Exactly 64 bytes -> stays 64, total 192
+  auto r3 = allocator.alloc(64);
+  ASSERT_FALSE(r3.failed());
+  EXPECT_EQ(allocator.heap_top(), 192u);
+}
+
+TEST_F(TaskAllocatorTest, SlotMappingPowerOfTwoWindow) {
+  std::set<int32_t> slots;
+  for (int i = 0; i < WINDOW_SIZE; i++) {
+    consume_up_to(descriptors, last_alive, heap_buf, WINDOW_SIZE, i, 0);
+    auto r = allocator.alloc(0);
+    ASSERT_FALSE(r.failed());
+    EXPECT_EQ(r.slot, r.task_id & (WINDOW_SIZE - 1));
+    slots.insert(r.slot);
+  }
+  EXPECT_EQ(slots.size(), static_cast<size_t>(WINDOW_SIZE))
+    << "Every slot should be visited exactly once over one window cycle";
+}
+
+TEST_F(TaskAllocatorTest, UpdateHeapTailFromConsumedTask) {
+  auto r1 = allocator.alloc(256);
+  ASSERT_FALSE(r1.failed());
+  EXPECT_EQ(allocator.heap_top(), 256u);
+
+  EXPECT_EQ(allocator.heap_available(), HEAP_SIZE - 256u);
+
+  consume_up_to(descriptors, last_alive, heap_buf, WINDOW_SIZE, 1, 256);
+
+  // Force the allocator to observe the new last_alive by doing another alloc
+  auto r2 = allocator.alloc(0);
+  ASSERT_FALSE(r2.failed());
+
+  // top=256, tail=256: at_end = 4096-256=3840, at_begin = 256
+  EXPECT_EQ(allocator.heap_available(), HEAP_SIZE - 256u);
+}
+
+TEST_F(TaskAllocatorTest, UpdateHeapTailAtTask0) {
+  auto r1 = allocator.alloc(64);
+  ASSERT_FALSE(r1.failed());
+  EXPECT_EQ(r1.task_id, 0);
+
+  descriptors[0].packed_buffer_end = static_cast<uint8_t *>(static_cast<void *>(heap_buf)) + 64;
+  last_alive.store(1, std::memory_order_release);
+
+  auto r2 = allocator.alloc(0);
+  ASSERT_FALSE(r2.failed());
+  EXPECT_EQ(r2.task_id, 1);
+}
+
+TEST_F(TaskAllocatorTest, UpdateHeapTailIdempotent) {
+  auto r1 = allocator.alloc(128);
+  ASSERT_FALSE(r1.failed());
+
+  consume_up_to(descriptors, last_alive, heap_buf, WINDOW_SIZE, 1, 128);
+
+  auto r2 = allocator.alloc(0);
+  ASSERT_FALSE(r2.failed());
+  uint64_t avail_after_first = allocator.heap_available();
+
+  auto r3 = allocator.alloc(0);
+  ASSERT_FALSE(r3.failed());
+  EXPECT_EQ(allocator.heap_available(), avail_after_first);
+}
+
+TEST_F(TaskAllocatorTest, HeapAvailableTopGeTail) {
+  EXPECT_EQ(allocator.heap_available(), HEAP_SIZE);
+
+  auto r1 = allocator.alloc(256);
+  ASSERT_FALSE(r1.failed());
+  EXPECT_EQ(allocator.heap_available(), HEAP_SIZE - 256u);
+}
+
+TEST_F(TaskAllocatorTest, HeapAvailableTopLtTail) {
+  auto r1 = allocator.alloc(HEAP_SIZE - 64);
+  ASSERT_FALSE(r1.failed());
+  consume_up_to(descriptors, last_alive, heap_buf, WINDOW_SIZE, 1, HEAP_SIZE - 64);
+
+  auto r2 = allocator.alloc(128);
+  ASSERT_FALSE(r2.failed());
+  // top=128, tail=HEAP_SIZE-64: available = (HEAP_SIZE-64) - 128
+  EXPECT_EQ(allocator.heap_available(), HEAP_SIZE - 64 - 128);
+}
+
+// =============================================================================
+// Boundary conditions
+// =============================================================================
+
+TEST_F(TaskAllocatorTest, HeapExactFitAtEnd) {
+  // Allocate 4032 bytes to leave exactly 64 at end.
+  auto r1 = allocator.alloc(HEAP_SIZE - 64);
+  ASSERT_FALSE(r1.failed());
+  EXPECT_EQ(allocator.heap_top(), HEAP_SIZE - 64u);
+
+  auto r2 = allocator.alloc(64);
+  ASSERT_FALSE(r2.failed());
+  EXPECT_EQ(allocator.heap_top(), HEAP_SIZE);
+  EXPECT_EQ(static_cast<uint8_t *>(r2.packed_base), reinterpret_cast<uint8_t *>(heap_buf) + HEAP_SIZE - 64);
+}
+
+// Wrap guard `tail > alloc_size` uses strict > to prevent full/empty ambiguity.
+// If the allocation were allowed, heap_top would advance to alloc_size == tail,
+// making top == tail. Because top == tail is the canonical "empty" state, the
+// ring could not distinguish "completely full" from "completely empty".
+TEST_F(TaskAllocatorTest, HeapWrapGuardRejectsTailEqualsAllocSize) {
+  auto r1 = allocator.alloc(HEAP_SIZE);
+  ASSERT_FALSE(r1.failed());
+  EXPECT_EQ(allocator.heap_top(), HEAP_SIZE);
+
+  consume_up_to(descriptors, last_alive, heap_buf, WINDOW_SIZE, 1, 64);
+
+  auto r2 = allocator.alloc(64);
+  EXPECT_TRUE(r2.failed()) << "wrap guard must reject when tail == alloc_size (full/empty ambiguity)";
+}
+
+TEST_F(TaskAllocatorTest, HeapWrapAroundSuccess) {
+  auto r1 = allocator.alloc(HEAP_SIZE);
+  ASSERT_FALSE(r1.failed());
+
+  consume_up_to(descriptors, last_alive, heap_buf, WINDOW_SIZE, 1, 128);
+
+  auto r2 = allocator.alloc(64);
+  ASSERT_FALSE(r2.failed());
+  EXPECT_EQ(r2.packed_base, static_cast<void *>(heap_buf));
+  EXPECT_EQ(allocator.heap_top(), 64u);
+}
+
+// Linear-gap guard `tail - top > alloc_size` uses strict > for the same reason.
+TEST_F(TaskAllocatorTest, HeapLinearGapGuardRejectsExactFit) {
+  // Fill most of heap, leaving just 64 at end so next alloc wraps.
+  auto r1 = allocator.alloc(HEAP_SIZE - 64);
+  ASSERT_FALSE(r1.failed());
+  consume_up_to(descriptors, last_alive, heap_buf, WINDOW_SIZE, 1, HEAP_SIZE - 64);
+
+  // Allocate 128 bytes: space_at_end = 64, not enough -> wrap.
+  // tail = HEAP_SIZE-64, which is > 128 -> wraps to beginning.
+  auto r2 = allocator.alloc(128);
+  ASSERT_FALSE(r2.failed());
+  EXPECT_EQ(allocator.heap_top(), 128u);
+
+  // Now top=128, tail=HEAP_SIZE-64 (top < tail)
+  // gap = (HEAP_SIZE-64) - 128 = HEAP_SIZE-192
+  // Allocate exactly gap bytes: gap > alloc_size -> FALSE
+  uint64_t gap = (HEAP_SIZE - 64) - 128;
+  auto r3 = allocator.alloc(gap);
+  EXPECT_TRUE(r3.failed()) << "linear-gap guard must reject exact fit (full/empty ambiguity)";
+}
+
+TEST_F(TaskAllocatorTest, HeapTopLessThanTailInsufficientSpace) {
+  auto r1 = allocator.alloc(HEAP_SIZE - 64);
+  ASSERT_FALSE(r1.failed());
+  consume_up_to(descriptors, last_alive, heap_buf, WINDOW_SIZE, 1, HEAP_SIZE - 64);
+
+  auto r2 = allocator.alloc(128);
+  ASSERT_FALSE(r2.failed());
+
+  // gap = (HEAP_SIZE-64) - 128. Try to allocate more than gap.
+  auto r3 = allocator.alloc(HEAP_SIZE);
+  EXPECT_TRUE(r3.failed());
+  EXPECT_NE(error_code.load(), 0);
+}
+
+// heap_available reports max(at_end, at_begin), not the sum -- a single
+// allocation cannot split across the wrap boundary.
+TEST_F(TaskAllocatorTest, AvailableReportsMaxNotSum) {
+  auto r1 = allocator.alloc(3008);
+  ASSERT_FALSE(r1.failed());
+  uint64_t actual_top = allocator.heap_top();
+
+  consume_up_to(descriptors, last_alive, heap_buf, WINDOW_SIZE, 1, 1024);
+
+  auto r_probe = allocator.alloc(0);
+  ASSERT_FALSE(r_probe.failed());
+
+  uint64_t avail = allocator.heap_available();
+  uint64_t at_end = HEAP_SIZE - actual_top;
+  uint64_t at_begin = 1024;
+  EXPECT_EQ(avail, std::max(at_end, at_begin));
+  EXPECT_LT(avail, at_end + at_begin);
+}
+
+// Zero-size allocs return the same address and don't advance the top.
+TEST_F(TaskAllocatorTest, ZeroSizeAllocationAliased) {
+  auto r1 = allocator.alloc(0);
+  auto r2 = allocator.alloc(0);
+  ASSERT_FALSE(r1.failed());
+  ASSERT_FALSE(r2.failed());
+
+  EXPECT_EQ(r1.packed_base, r2.packed_base) << "Zero-size allocs return same address";
+  EXPECT_EQ(r1.packed_base, r1.packed_end) << "packed_end == packed_base for zero-size";
+  EXPECT_EQ(allocator.heap_top(), 0u) << "top doesn't advance for zero-size allocs";
+}
+
+// Wrap path: wasted space between old top and heap_size is not reclaimed.
+TEST_F(TaskAllocatorTest, WrapPathWastedSpace) {
+  auto r1 = allocator.alloc(4000);
+  ASSERT_FALSE(r1.failed());
+  uint64_t top_after = allocator.heap_top();
+  EXPECT_GE(top_after, 4000u);
+  EXPECT_LT(top_after, HEAP_SIZE);
+
+  consume_up_to(descriptors, last_alive, heap_buf, WINDOW_SIZE, 1, top_after);
+
+  auto r2 = allocator.alloc(128);
+  ASSERT_FALSE(r2.failed());
+  EXPECT_EQ(r2.packed_base, static_cast<void *>(heap_buf)) << "Allocation wrapped to beginning";
+
+  uint64_t avail = allocator.heap_available();
+  EXPECT_LT(avail, HEAP_SIZE) << "Wasted space at end reduces available capacity";
+}
+
+TEST_F(TaskAllocatorTest, AllocExactlyHeapSize) {
+  auto r1 = allocator.alloc(HEAP_SIZE);
+  ASSERT_FALSE(r1.failed());
+  EXPECT_EQ(r1.packed_base, static_cast<void *>(heap_buf));
+  EXPECT_EQ(allocator.heap_top(), HEAP_SIZE);
+
+  auto r2 = allocator.alloc(64);
+  EXPECT_TRUE(r2.failed()) << "No space after full allocation";
+  EXPECT_EQ(error_code.load(), PTO2_ERROR_HEAP_RING_DEADLOCK);
+}
+
+TEST_F(TaskAllocatorTest, AllocLargerThanHeap) {
+  auto r = allocator.alloc(HEAP_SIZE * 2);
+  EXPECT_TRUE(r.failed()) << "Cannot allocate more than heap size";
+  EXPECT_EQ(error_code.load(), PTO2_ERROR_HEAP_RING_DEADLOCK);
+}
+
+TEST_F(TaskAllocatorTest, TaskWindowSaturates) {
+  for (int i = 0; i < WINDOW_SIZE - 1; i++) {
+    auto r = allocator.alloc(0);
+    ASSERT_FALSE(r.failed()) << "Alloc failed at i=" << i;
+    EXPECT_EQ(r.task_id, i);
+  }
+  EXPECT_EQ(allocator.active_count(), WINDOW_SIZE - 1);
+
+  auto overflow = allocator.alloc(0);
+  EXPECT_TRUE(overflow.failed());
+  EXPECT_EQ(error_code.load(), PTO2_ERROR_FLOW_CONTROL_DEADLOCK);
+}
+
+// Task IDs grow monotonically as int32_t. Near INT32_MAX, the usual
+// signed-overflow concern applies but is cosmetic since we use
+// task_id & window_mask for indexing.
+TEST_F(TaskAllocatorTest, TaskIdNearInt32Max) {
+  current_index.store(INT32_MAX - 2);
+  last_alive.store(INT32_MAX - 2);
+  allocator.init(descriptors.data(), WINDOW_SIZE, &current_index, &last_alive, heap_buf, HEAP_SIZE, &error_code);
+
+  auto r1 = allocator.alloc(0);
+  ASSERT_FALSE(r1.failed());
+  EXPECT_EQ(r1.task_id, INT32_MAX - 2);
+  EXPECT_EQ(r1.slot, (INT32_MAX - 2) & (WINDOW_SIZE - 1));
+
+  auto r2 = allocator.alloc(0);
+  ASSERT_FALSE(r2.failed());
+  EXPECT_EQ(r2.task_id, INT32_MAX - 1);
+
+  auto r3 = allocator.alloc(0);
+  ASSERT_FALSE(r3.failed());
+  EXPECT_EQ(r3.task_id, INT32_MAX);
+  EXPECT_GE(r3.slot, 0);
+  EXPECT_LT(r3.slot, WINDOW_SIZE);
+}
diff --git a/tests/ut/cpp/a2a3/test_task_state.cpp b/tests/ut/cpp/a2a3/test_task_state.cpp
new file mode 100644
index 000000000..7c468a9e7
--- /dev/null
+++ b/tests/ut/cpp/a2a3/test_task_state.cpp
@@ -0,0 +1,201 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Unit tests for PTO2TaskSlotState lifecycle through PTO2SchedulerState API.
+ *
+ * These tests drive state transitions via src methods (release_fanin,
+ * on_subtask_complete, check_and_handle_consumed) rather than manually
+ * operating atomic fields. For concurrent exactly-once semantics of
+ * fanin/subtask/fanout, see test_scheduler_state.cpp which already
+ * covers those paths via the same API.
+ *
+ * This file focuses on:
+ * - Full lifecycle through src API
+ * - Non-profiling ready path behavior (task_state stays PENDING)
+ * - Double subtask completion (counter-model weakness)
+ */
+
+#include <gtest/gtest.h>
+#include <atomic>
+#include <cstdint>
+#include <cstring>
+#include <thread>
+#include "scheduler/pto_scheduler.h"
+
+class TaskStateTest : public ::testing::Test {
+protected:
+  PTO2SchedulerState sched;
+  PTO2SharedMemoryHandle *sm_handle = nullptr;
+
+  void SetUp() override {
+    sm_handle = pto2_sm_create_default();
+    ASSERT_NE(sm_handle, nullptr);
+    bool ok = pto2_scheduler_init(&sched, sm_handle->header);
+    ASSERT_TRUE(ok);
+  }
+
+  void TearDown() override {
+    pto2_scheduler_destroy(&sched);
+    if (sm_handle) {
+      pto2_sm_destroy(sm_handle);
+    }
+  }
+
+  void init_slot(PTO2TaskSlotState &slot, PTO2TaskState state, int32_t fanin_count, int32_t fanout_count) {
+    memset(&slot, 0, sizeof(slot));
+    slot.task_state.store(state);
+    slot.fanin_count = fanin_count;
+    slot.fanin_refcount.store(0);
+    slot.fanout_count = fanout_count;
+    slot.fanout_refcount.store(0);
+    slot.fanout_lock.store(0);
+    slot.fanout_head = nullptr;
+    slot.ring_id = 0;
+    slot.active_mask = PTO2_SUBTASK_MASK_AIC;
+    slot.completed_subtasks.store(0);
+    slot.total_required_subtasks = 1;
+    slot.logical_block_num = 1;
+  }
+};
+
+// =============================================================================
+// Full lifecycle through src API: PENDING -> (fanin) -> READY-equivalent
+// -> (subtask) -> COMPLETED -> (fanout) -> CONSUMED
+// =============================================================================
+TEST_F(TaskStateTest, FullLifecycleThroughAPI) {
+  alignas(64) PTO2TaskSlotState slot;
+  init_slot(slot, PTO2_TASK_PENDING, 1, 1);
+  slot.total_required_subtasks = 1;
+  slot.completed_subtasks.store(0);
+
+  // Fanin satisfied -> task becomes ready
+  bool ready = sched.release_fanin_and_check_ready(slot);
+  EXPECT_TRUE(ready);
+
+  // Subtask completes -> task done
+  bool done = sched.on_subtask_complete(slot);
+  EXPECT_TRUE(done);
+
+  // Manually transition to COMPLETED (normally done by scheduler dispatch loop)
+  slot.task_state.store(PTO2_TASK_COMPLETED, std::memory_order_release);
+
+  // Fanout released -> CONSUMED
+  sched.release_producer(slot);
+  EXPECT_EQ(slot.task_state.load(), PTO2_TASK_CONSUMED);
+}
+
+// =============================================================================
+// Non-profiling release_fanin does not CAS task_state to READY.
+//
+// Readiness is determined solely by fanin_refcount reaching fanin_count.
+// task_state stays PENDING after the non-profiling ready path. This is
+// correct by design -- the profiling overload adds the CAS only to count
+// atomic operations.
+// =============================================================================
+TEST_F(TaskStateTest, NonProfilingReadyPathStaysPending) {
+  alignas(64) PTO2TaskSlotState slot;
+  init_slot(slot, PTO2_TASK_PENDING, 1, 1);
+
+  bool ready = sched.release_fanin_and_check_ready(slot);
+  ASSERT_TRUE(ready) << "Task should be detected as ready via refcount";
+
+  // task_state remains PENDING -- this is correct by design.
+  EXPECT_EQ(slot.task_state.load(), PTO2_TASK_PENDING)
+    << "Non-profiling path intentionally does not transition task_state to READY";
+}
+
+// =============================================================================
+// Multi-fanin: partial release does not trigger ready
+// =============================================================================
+TEST_F(TaskStateTest, MultiFaninPartialNotReady) {
+  alignas(64) PTO2TaskSlotState slot;
+  init_slot(slot, PTO2_TASK_PENDING, 3, 1);
+
+  EXPECT_FALSE(sched.release_fanin_and_check_ready(slot));
+  EXPECT_FALSE(sched.release_fanin_and_check_ready(slot));
+  EXPECT_TRUE(sched.release_fanin_and_check_ready(slot));
+}
+
+// =============================================================================
+// Concurrent fanin: exactly one thread detects ready (via src API)
+// =============================================================================
+TEST_F(TaskStateTest, ConcurrentFaninExactlyOneReady) {
+  constexpr int ROUNDS = 500;
+
+  for (int round = 0; round < ROUNDS; round++) {
+    alignas(64) PTO2TaskSlotState slot;
+    init_slot(slot, PTO2_TASK_PENDING, 3, 1);
+    std::atomic<int> ready_count{0};
+
+    auto release = [&]() {
+      if (sched.release_fanin_and_check_ready(slot)) {
+        ready_count.fetch_add(1);
+      }
+    };
+
+    std::thread t1(release), t2(release), t3(release);
+    t1.join();
+    t2.join();
+    t3.join();
+
+    EXPECT_EQ(ready_count.load(), 1) << "Round " << round;
+  }
+}
+
+// =============================================================================
+// Concurrent subtask completion: exactly one thread sees done (via src API)
+// =============================================================================
+TEST_F(TaskStateTest, ConcurrentSubtaskCompletion) {
+  constexpr int ROUNDS = 500;
+
+  for (int round = 0; round < ROUNDS; round++) {
+    alignas(64) PTO2TaskSlotState slot;
+    init_slot(slot, PTO2_TASK_RUNNING, 1, 1);
+    slot.total_required_subtasks = 3;
+    slot.completed_subtasks.store(0);
+    std::atomic<int> done_count{0};
+
+    auto complete = [&]() {
+      if (sched.on_subtask_complete(slot)) {
+        done_count.fetch_add(1);
+      }
+    };
+
+    std::thread t1(complete), t2(complete), t3(complete);
+    t1.join();
+    t2.join();
+    t3.join();
+
+    EXPECT_EQ(done_count.load(), 1) << "Round " << round;
+    EXPECT_EQ(slot.completed_subtasks.load(), 3);
+  }
+}
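+
+// Counter vs. bitmask, worked through (illustrative; the bitmask variant is
+// the older design this suite contrasts against, not current code): with
+// total_required_subtasks = 2, duplicate completions {A, A} drive the counter
+// 0 -> 1 -> 2 == total, and the task reads as done even though subtask B
+// never ran. A bitmask model (mask |= bit(A)) would saturate at bit(A) and
+// keep the task open. The test below pins the counter behavior as a known
+// trade-off.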
+// =============================================================================
+// Double subtask completion (counter-model weakness).
+// With the counter model, double-completing the same subtask increments
+// completed_subtasks twice, potentially reaching total prematurely.
+// Unlike the old bitmask model, the counter cannot detect duplicates.
+// =============================================================================
+TEST_F(TaskStateTest, DoubleSubtaskCompletionCounterWeakness) {
+  alignas(64) PTO2TaskSlotState slot;
+  init_slot(slot, PTO2_TASK_RUNNING, 1, 1);
+  slot.total_required_subtasks = 2;
+  slot.completed_subtasks.store(0);
+
+  // First subtask completion
+  bool done1 = sched.on_subtask_complete(slot);
+  EXPECT_FALSE(done1) << "Single completion doesn't complete the task";
+
+  // Same subtask completes AGAIN (logic error at caller level)
+  bool done2 = sched.on_subtask_complete(slot);
+  EXPECT_TRUE(done2) << "Counter model: double-completion falsely triggers done";
+}
diff --git a/tests/ut/cpp/a2a3/test_tensormap.cpp b/tests/ut/cpp/a2a3/test_tensormap.cpp
new file mode 100644
index 000000000..10eef0317
--- /dev/null
+++ b/tests/ut/cpp/a2a3/test_tensormap.cpp
@@ -0,0 +1,551 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Unit tests for PTO2TensorMap from pto_tensormap.h / pto_tensormap.cpp
+ *
+ * Tests hash-table-based producer lookup with overlap detection:
+ * - Hash function distribution (golden-ratio multiplicative hash)
+ * - Insert / lookup / cleanup lifecycle
+ * - Overlap detection: fast-path (is_all_offset_zero) and slow-path (offsets)
+ * - Lazy invalidation (stale entries skipped, not truncated)
+ * - Multi-ring isolation in the same hash chain
+ * - Lookup returns all matches (no silent 16-result cap post-#669)
+ * - Entry pool allocation and free-list recycling
+ * - cleanup_retired correctness across task windows
+ */
+
+#include <gtest/gtest.h>
+
+#include <cstdint>
+#include <set>
+#include <vector>
+
+#include "pto_orchestration_api.h"
+#include "pto_tensormap.h"
+
+// =============================================================================
+// Helpers
+// =============================================================================
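+
+// Reference model of a golden-ratio multiplicative hash. A sketch under
+// assumptions: the real constant, shift, and bucket masking live in
+// pto_tensormap.cpp; only the "multiply, then keep the high bits" structure
+// is what the distribution tests below rely on.
+namespace {
+inline uint64_t model_golden_hash(uint64_t addr, uint64_t num_buckets) {
+  // 0x9E3779B97F4A7C15 ~= 2^64 / phi. The multiply mixes entropy into the
+  // high bits, so addresses differing only by aligned strides (e.g. 64KB)
+  // still spread across buckets; num_buckets must be a power of two >= 2.
+  return (addr * 0x9E3779B97F4A7C15ULL) >> (64 - __builtin_ctzll(num_buckets));
+}
+}  // namespace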
+
+// Test-local mirror of the old stack-buffered lookup result. PR #669 removed
+// PTO2LookupResult in favor of a callback-based API; these tests collect
+// matches into a vector-like struct so assertions remain readable.
+struct TestLookupResult {
+  struct Entry {
+    PTO2TensorMapEntry *entry;
+    OverlapStatus overlap_status;
+  };
+  std::vector<Entry> entries;
+  int count = 0;
+};
+
+static void run_lookup(PTO2TensorMap &tmap, const Tensor &tensor, TestLookupResult &out) {
+  tmap.lookup(tensor, [&](PTO2TensorMapEntry &e, OverlapStatus s) -> bool {
+    out.entries.push_back({&e, s});
+    out.count++;
+    return true;
+  });
+}
+
+static Tensor make_test_tensor(uint64_t addr, uint32_t shape0, uint32_t ndims = 1, int32_t version = 0) {
+  uint32_t shapes[RUNTIME_MAX_TENSOR_DIMS] = {shape0};
+  return make_tensor_external(reinterpret_cast<void *>(addr), shapes, ndims, DataType::FLOAT32, false, version);
+}
+
+static Tensor make_test_tensor_2d(uint64_t addr, uint32_t s0, uint32_t s1, int32_t version = 0) {
+  uint32_t shapes[RUNTIME_MAX_TENSOR_DIMS] = {s0, s1};
+  return make_tensor_external(reinterpret_cast<void *>(addr), shapes, 2, DataType::FLOAT32, false, version);
+}
+
+// =============================================================================
+// Fixture
+// =============================================================================
+
+class TensorMapTest : public ::testing::Test {
+protected:
+  static constexpr int32_t NUM_BUCKETS = 16;
+  static constexpr int32_t POOL_SIZE = 64;
+  static constexpr int32_t WINDOW_SIZE = 32;
+
+  PTO2TensorMap tmap{};
+
+  void SetUp() override {
+    int32_t window_sizes[PTO2_MAX_RING_DEPTH] = {WINDOW_SIZE, WINDOW_SIZE, WINDOW_SIZE, WINDOW_SIZE};
+    ASSERT_TRUE(tmap.init(NUM_BUCKETS, POOL_SIZE, window_sizes));
+  }
+
+  void TearDown() override { tmap.destroy(); }
+};
+
+// =============================================================================
+// Initialization
+// =============================================================================
+
+TEST_F(TensorMapTest, InitValidState) {
+  EXPECT_EQ(tmap.num_buckets, NUM_BUCKETS);
+  EXPECT_EQ(tmap.pool_size, POOL_SIZE);
+  EXPECT_EQ(tmap.next_entry_idx, 0);
+  EXPECT_EQ(tmap.free_num, 0);
+  EXPECT_EQ(tmap.valid_count(), 0);
+}
+
+TEST_F(TensorMapTest, InitRequiresPowerOfTwoBuckets) {
+  PTO2TensorMap bad{};
+  int32_t ws[PTO2_MAX_RING_DEPTH] = {8, 8, 8, 8};
+  EXPECT_FALSE(bad.init(3, 64, ws)) << "non-power-of-2 bucket count must fail";
+  EXPECT_FALSE(bad.init(7, 64, ws));
+  EXPECT_TRUE(bad.init(8, 64, ws));
+  bad.destroy();
+}
+
+// =============================================================================
+// Hash function
+// =============================================================================
+
+TEST_F(TensorMapTest, HashDeterministic) {
+  uint64_t addr = 0x1000;
+  EXPECT_EQ(tmap.hash(addr), tmap.hash(addr));
+}
+
+TEST_F(TensorMapTest, HashDistributesAlignedAddresses) {
+  std::set<int32_t> hit_buckets;
+  // Aligned addresses (64KB stride) should still distribute across buckets
+  for (uint64_t i = 0; i < 64; i++) {
+    uint64_t addr = i * 65536;
+    hit_buckets.insert(tmap.hash(addr));
+  }
+  // With golden-ratio hash, 64 aligned addresses across 16 buckets
+  // should hit at least 12 distinct buckets
+  EXPECT_GE(hit_buckets.size(), 12u) << "Aligned addresses must distribute well";
+}
+
+TEST_F(TensorMapTest, HashBoundedByBucketCount) {
+  for (uint64_t addr = 0; addr < 1000; addr++) {
+    EXPECT_LT(tmap.hash(addr), static_cast<int32_t>(NUM_BUCKETS));
+  }
+}
+
+// =============================================================================
+// Insert and lookup: basic
+// =============================================================================
+
+TEST_F(TensorMapTest, InsertThenLookupFindsProducer) {
+  Tensor t = make_test_tensor(0x1000, 256);
+  PTO2TaskId tid =
PTO2TaskId::make(0, 0); + tmap.insert(t, tid); + + TestLookupResult result; + run_lookup(tmap, t, result); + ASSERT_EQ(result.count, 1); + EXPECT_EQ(result.entries[0].entry->producer_task_id, tid); +} + +TEST_F(TensorMapTest, LookupEmptyReturnsZero) { + Tensor t = make_test_tensor(0x1000, 256); + TestLookupResult result; + run_lookup(tmap, t, result); + EXPECT_EQ(result.count, 0); +} + +TEST_F(TensorMapTest, InsertMultipleSameBuffer) { + Tensor t1 = make_test_tensor(0x1000, 256); + Tensor t2 = make_test_tensor(0x1000, 128); + PTO2TaskId tid1 = PTO2TaskId::make(0, 0); + PTO2TaskId tid2 = PTO2TaskId::make(0, 1); + + tmap.insert(t1, tid1); + tmap.insert(t2, tid2); + + TestLookupResult result; + run_lookup(tmap, t1, result); + // Both entries share same buffer_addr, so both should be found + EXPECT_EQ(result.count, 2); +} + +TEST_F(TensorMapTest, InsertDifferentBuffersNoCollision) { + Tensor t1 = make_test_tensor(0x1000, 256); + Tensor t2 = make_test_tensor(0x2000, 256); + tmap.insert(t1, PTO2TaskId::make(0, 0)); + tmap.insert(t2, PTO2TaskId::make(0, 1)); + + TestLookupResult r1; + run_lookup(tmap, t1, r1); + EXPECT_EQ(r1.count, 1); + EXPECT_EQ(r1.entries[0].entry->producer_task_id, PTO2TaskId::make(0, 0)); + + TestLookupResult r2; + run_lookup(tmap, t2, r2); + EXPECT_EQ(r2.count, 1); + EXPECT_EQ(r2.entries[0].entry->producer_task_id, PTO2TaskId::make(0, 1)); +} + +// ============================================================================= +// Overlap detection: fast path (is_all_offset_zero) +// ============================================================================= + +TEST_F(TensorMapTest, OverlapFastPathCovered) { + // Producer output: shape [256], consumer input: shape [512] + // Consumer covers producer -> COVERED + Tensor producer = make_test_tensor(0x1000, 256); + Tensor consumer = make_test_tensor(0x1000, 512); + tmap.insert(producer, PTO2TaskId::make(0, 0)); + + TestLookupResult result; + run_lookup(tmap, consumer, result); + ASSERT_EQ(result.count, 1); + EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::COVERED); +} + +TEST_F(TensorMapTest, OverlapFastPathOther) { + // Producer output: shape [512], consumer input: shape [256] + // Consumer does NOT cover producer -> OTHER + Tensor producer = make_test_tensor(0x1000, 512); + Tensor consumer = make_test_tensor(0x1000, 256); + tmap.insert(producer, PTO2TaskId::make(0, 0)); + + TestLookupResult result; + run_lookup(tmap, consumer, result); + ASSERT_EQ(result.count, 1); + EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::OTHER); +} + +TEST_F(TensorMapTest, OverlapFastPathExactMatch) { + Tensor t = make_test_tensor(0x1000, 256); + tmap.insert(t, PTO2TaskId::make(0, 0)); + + TestLookupResult result; + run_lookup(tmap, t, result); + ASSERT_EQ(result.count, 1); + EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::COVERED); +} + +// ============================================================================= +// Overlap detection: slow path (offsets via view) +// ============================================================================= + +TEST_F(TensorMapTest, OverlapSlowPathNoOverlap) { + // Producer writes [0..128), consumer reads [128..256) -> NO_OVERLAP + Tensor base = make_test_tensor_2d(0x1000, 256, 1); + uint32_t prod_shapes[] = {128, 1}; + uint32_t prod_offsets[] = {0, 0}; + Tensor producer = base.view(prod_shapes, prod_offsets); + + uint32_t con_shapes[] = {128, 1}; + uint32_t con_offsets[] = {128, 0}; + Tensor consumer = base.view(con_shapes, con_offsets); + + tmap.insert(producer, 
PTO2TaskId::make(0, 0)); + + TestLookupResult result; + run_lookup(tmap, consumer, result); + EXPECT_EQ(result.count, 0) << "Non-overlapping regions must return no results"; +} + +TEST_F(TensorMapTest, OverlapSlowPathPartialOverlap) { + // Producer writes [0..192), consumer reads [64..256) -> overlapping, OTHER + Tensor base = make_test_tensor_2d(0x1000, 256, 1); + uint32_t prod_shapes[] = {192, 1}; + uint32_t prod_offsets[] = {0, 0}; + Tensor producer = base.view(prod_shapes, prod_offsets); + + uint32_t con_shapes[] = {192, 1}; + uint32_t con_offsets[] = {64, 0}; + Tensor consumer = base.view(con_shapes, con_offsets); + + tmap.insert(producer, PTO2TaskId::make(0, 0)); + + TestLookupResult result; + run_lookup(tmap, consumer, result); + ASSERT_EQ(result.count, 1); + EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::OTHER); +} + +TEST_F(TensorMapTest, OverlapSlowPathCovered) { + // Producer writes [64..192), consumer reads [0..256) -> consumer covers producer + Tensor base = make_test_tensor_2d(0x1000, 256, 1); + uint32_t prod_shapes[] = {128, 1}; + uint32_t prod_offsets[] = {64, 0}; + Tensor producer = base.view(prod_shapes, prod_offsets); + + uint32_t con_shapes[] = {256, 1}; + uint32_t con_offsets[] = {0, 0}; + Tensor consumer = base.view(con_shapes, con_offsets); + + tmap.insert(producer, PTO2TaskId::make(0, 0)); + + TestLookupResult result; + run_lookup(tmap, consumer, result); + ASSERT_EQ(result.count, 1); + EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::COVERED); +} + +// ============================================================================= +// Version-based overlap detection +// ============================================================================= + +TEST_F(TensorMapTest, VersionMismatchReturnsOther) { + // Producer v0, consumer v1 -> always OTHER regardless of shape match + Tensor producer = make_test_tensor(0x1000, 256, 1, 0); + Tensor consumer = make_test_tensor(0x1000, 256, 1, 1); + + tmap.insert(producer, PTO2TaskId::make(0, 0)); + + TestLookupResult result; + run_lookup(tmap, consumer, result); + ASSERT_EQ(result.count, 1); + EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::OTHER); +} + +// ============================================================================= +// Lazy invalidation +// ============================================================================= + +TEST_F(TensorMapTest, StaleEntriesSkippedDuringLookup) { + Tensor t = make_test_tensor(0x1000, 256); + tmap.insert(t, PTO2TaskId::make(0, 0)); + tmap.insert(t, PTO2TaskId::make(0, 1)); + + // Advance validity to skip task 0 + tmap.sync_validity(0, 1); + + TestLookupResult result; + run_lookup(tmap, t, result); + ASSERT_EQ(result.count, 1); + EXPECT_EQ(result.entries[0].entry->producer_task_id, PTO2TaskId::make(0, 1)); +} + +TEST_F(TensorMapTest, StaleEntriesNotTruncatedAcrossRings) { + Tensor t = make_test_tensor(0x1000, 256); + // Ring 0, task 0 and Ring 1, task 0 -> same bucket + tmap.insert(t, PTO2TaskId::make(0, 0)); + tmap.insert(t, PTO2TaskId::make(1, 0)); + + // Invalidate ring 0 only + tmap.sync_validity(0, 1); + + TestLookupResult result; + run_lookup(tmap, t, result); + // Ring 1 task 0 still valid, ring 0 task 0 invalidated + ASSERT_EQ(result.count, 1); + EXPECT_EQ(result.entries[0].entry->producer_task_id, PTO2TaskId::make(1, 0)); +} + +// ============================================================================= +// cleanup_retired +// ============================================================================= + +TEST_F(TensorMapTest, 
CleanupRetiredRemovesEntriesForRetiredTasks) { + Tensor t = make_test_tensor(0x1000, 256); + tmap.insert(t, PTO2TaskId::make(0, 0)); + tmap.insert(t, PTO2TaskId::make(0, 1)); + tmap.insert(t, PTO2TaskId::make(0, 2)); + EXPECT_EQ(tmap.valid_count(), 3); + + // Cleanup tasks [0, 2) on ring 0 + tmap.cleanup_retired(0, 0, 2); + + EXPECT_EQ(tmap.valid_count(), 1); + + TestLookupResult result; + run_lookup(tmap, t, result); + ASSERT_EQ(result.count, 1); + EXPECT_EQ(result.entries[0].entry->producer_task_id, PTO2TaskId::make(0, 2)); +} + +TEST_F(TensorMapTest, CleanupRetiredPreservesOtherRings) { + Tensor t = make_test_tensor(0x1000, 256); + tmap.insert(t, PTO2TaskId::make(0, 0)); + tmap.insert(t, PTO2TaskId::make(1, 0)); + + tmap.cleanup_retired(0, 0, 1); + + EXPECT_EQ(tmap.valid_count(), 1); + + TestLookupResult result; + run_lookup(tmap, t, result); + ASSERT_EQ(result.count, 1); + EXPECT_EQ(result.entries[0].entry->producer_task_id, PTO2TaskId::make(1, 0)); +} + +TEST_F(TensorMapTest, CleanupRetiredFreesEntriesToPool) { + Tensor t = make_test_tensor(0x1000, 256); + tmap.insert(t, PTO2TaskId::make(0, 0)); + EXPECT_EQ(tmap.free_num, 0); + EXPECT_EQ(tmap.next_entry_idx, 1); + + tmap.cleanup_retired(0, 0, 1); + + EXPECT_EQ(tmap.free_num, 1) << "Cleaned entry should be in free list"; + + // New insert should reuse free entry instead of allocating fresh + tmap.insert(t, PTO2TaskId::make(0, 1)); + EXPECT_EQ(tmap.free_num, 0); + EXPECT_EQ(tmap.next_entry_idx, 1) << "Should reuse freed entry, not allocate new"; +} + +// ============================================================================= +// Multi-ring isolation +// ============================================================================= + +TEST_F(TensorMapTest, MultiRingIndependentLookup) { + Tensor t = make_test_tensor(0x1000, 256); + tmap.insert(t, PTO2TaskId::make(0, 5)); + tmap.insert(t, PTO2TaskId::make(1, 3)); + tmap.insert(t, PTO2TaskId::make(2, 7)); + + TestLookupResult result; + run_lookup(tmap, t, result); + EXPECT_EQ(result.count, 3); + + // Invalidate ring 0 up to task 6 and ring 2 up to task 8 + tmap.sync_validity(0, 6); + tmap.sync_validity(2, 8); + + TestLookupResult result2; + run_lookup(tmap, t, result2); + EXPECT_EQ(result2.count, 1); + EXPECT_EQ(result2.entries[0].entry->producer_task_id, PTO2TaskId::make(1, 3)); +} + +// ============================================================================= +// Lookup returns all matches (PR #669 removed the 16-slot cap) +// ============================================================================= + +TEST_F(TensorMapTest, LookupReturnsAllMatches) { + Tensor t = make_test_tensor(0x1000, 256); + // Insert 20 entries for the same buffer (was capped at 16 before #669) + for (int i = 0; i < 20; i++) { + tmap.insert(t, PTO2TaskId::make(0, i)); + } + + TestLookupResult result; + run_lookup(tmap, t, result); + EXPECT_EQ(result.count, 20) << "Lookup must return every overlapping entry, no silent cap"; +} + +// ============================================================================= +// Entry pool lifecycle +// ============================================================================= + +TEST_F(TensorMapTest, PoolExhaustionAsserts) { + // With pool_size=64, inserting 64 entries should work, 65th should fail + for (int i = 0; i < POOL_SIZE; i++) { + Tensor t = make_test_tensor(0x1000 + i * 0x100, 256); + tmap.insert(t, PTO2TaskId::make(0, i)); + } + EXPECT_EQ(tmap.next_entry_idx, POOL_SIZE); + EXPECT_EQ(tmap.free_num, 0); + + // 65th insert should trigger always_assert (pool 
overflow)
+  Tensor overflow = make_test_tensor(0x9000, 256);
+  EXPECT_THROW(tmap.insert(overflow, PTO2TaskId::make(0, POOL_SIZE)), std::runtime_error);
+}
+
+TEST_F(TensorMapTest, FreeListRecycling) {
+  Tensor t = make_test_tensor(0x1000, 256);
+  // Insert and cleanup 10 entries
+  for (int i = 0; i < 10; i++) {
+    tmap.insert(t, PTO2TaskId::make(0, i));
+  }
+  tmap.cleanup_retired(0, 0, 10);
+  EXPECT_EQ(tmap.free_num, 10);
+
+  // Re-insert should use free list
+  for (int i = 10; i < 20; i++) {
+    tmap.insert(t, PTO2TaskId::make(0, i));
+  }
+  EXPECT_EQ(tmap.free_num, 0);
+  EXPECT_EQ(tmap.next_entry_idx, 10) << "No new pool entries consumed when free list available";
+}
+
+// =============================================================================
+// Task chain integrity (per-task entry list)
+// =============================================================================
+
+TEST_F(TensorMapTest, PerTaskEntryListTracksMultipleOutputs) {
+  Tensor t1 = make_test_tensor(0x1000, 256);
+  Tensor t2 = make_test_tensor(0x2000, 128);
+  PTO2TaskId tid = PTO2TaskId::make(0, 5);
+
+  tmap.insert(t1, tid);
+  tmap.insert(t2, tid);
+  EXPECT_EQ(tmap.valid_count(), 2);
+
+  // Cleanup task 5 should remove both entries
+  tmap.cleanup_retired(0, 5, 6);
+  EXPECT_EQ(tmap.valid_count(), 0);
+  EXPECT_EQ(tmap.free_num, 2);
+}
+
+// =============================================================================
+// Bucket chain integrity (doubly-linked list)
+// =============================================================================
+
+TEST_F(TensorMapTest, RemoveMiddleEntryPreservesChain) {
+  Tensor t = make_test_tensor(0x1000, 256);
+  PTO2TaskId tid0 = PTO2TaskId::make(0, 0);
+  PTO2TaskId tid1 = PTO2TaskId::make(0, 1);
+  PTO2TaskId tid2 = PTO2TaskId::make(0, 2);
+
+  tmap.insert(t, tid0);
+  tmap.insert(t, tid1);
+  tmap.insert(t, tid2);
+
+  // Remove middle entry (task 1)
+  tmap.cleanup_retired(0, 1, 2);
+
+  TestLookupResult result;
+  run_lookup(tmap, t, result);
+  EXPECT_EQ(result.count, 2);
+
+  std::set<uint32_t> found_locals;
+  for (int i = 0; i < result.count; i++) {
+    found_locals.insert(result.entries[i].entry->producer_task_id.local());
+  }
+  EXPECT_TRUE(found_locals.count(0));
+  EXPECT_TRUE(found_locals.count(2));
+}
+
+// =============================================================================
+// PTO2TaskId encoding/decoding
+// =============================================================================
+
+TEST(TaskIdTest, MakeAndDecode) {
+  auto tid = PTO2TaskId::make(3, 42);
+  EXPECT_EQ(tid.ring(), 3);
+  EXPECT_EQ(tid.local(), 42u);
+}
+
+TEST(TaskIdTest, InvalidSentinel) {
+  auto inv = PTO2TaskId::invalid();
+  EXPECT_FALSE(inv.is_valid());
+  EXPECT_EQ(inv.raw, UINT64_MAX);
+}
+
+TEST(TaskIdTest, Equality) {
+  auto a = PTO2TaskId::make(1, 100);
+  auto b = PTO2TaskId::make(1, 100);
+  auto c = PTO2TaskId::make(2, 100);
+  EXPECT_EQ(a, b);
+  EXPECT_NE(a, c);
+}
+
+TEST(TaskIdTest, RingIdMaxValue) {
+  auto tid = PTO2TaskId::make(255, 0);
+  EXPECT_EQ(tid.ring(), 255);
+  EXPECT_EQ(tid.local(), 0u);
+}
+
+TEST(TaskIdTest, LocalIdMaxValue) {
+  auto tid = PTO2TaskId::make(0, UINT32_MAX);
+  EXPECT_EQ(tid.ring(), 0);
+  EXPECT_EQ(tid.local(), UINT32_MAX);
+}
diff --git a/tests/ut/cpp/a2a3/test_wiring.cpp b/tests/ut/cpp/a2a3/test_wiring.cpp
new file mode 100644
index 000000000..964e826f8
--- /dev/null
+++ b/tests/ut/cpp/a2a3/test_wiring.cpp
@@ -0,0 +1,448 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Unit tests for scheduler wiring and completion paths:
+ *
+ * 1. wire_task() — fanout wiring, early-finished detection,
+ *    fanin_count initialization, ready push
+ * 2. on_mixed_task_complete() — COMPLETED transition, fanout traversal,
+ *    consumer fanin release
+ * 3. on_task_release() — fanin traversal, producer release,
+ *    self-CONSUMED check
+ * 4. advance_ring_pointers() — CONSUMED slot scan, reset_for_reuse
+ *
+ * These tests exercise the core scheduling hot-paths that had zero coverage.
+ */
+
+#include <gtest/gtest.h>
+
+#include <atomic>
+#include <cstdint>
+#include <cstring>
+#include <thread>
+
+#include "scheduler/pto_scheduler.h"
+
+// =============================================================================
+// Fixture: sets up a scheduler with shared memory and provides helpers
+// =============================================================================
+
+class WiringTest : public ::testing::Test {
+protected:
+  PTO2SchedulerState sched{};
+  PTO2SharedMemoryHandle *sm_handle = nullptr;
+
+  void SetUp() override {
+    sm_handle = pto2_sm_create_default();
+    ASSERT_NE(sm_handle, nullptr);
+    bool ok = pto2_scheduler_init(&sched, sm_handle->header);
+    ASSERT_TRUE(ok);
+  }
+
+  void TearDown() override {
+    pto2_scheduler_destroy(&sched);
+    if (sm_handle) {
+      pto2_sm_destroy(sm_handle);
+    }
+  }
+
+  // Initialize a slot for testing wiring/completion
+  void init_slot(
+    PTO2TaskSlotState &slot, PTO2TaskState state, int32_t fanin_count, int32_t fanout_count, uint8_t ring_id = 0
+  ) {
+    memset(&slot, 0, sizeof(slot));
+    slot.task_state.store(state);
+    slot.fanin_count = fanin_count;
+    slot.fanin_refcount.store(0);
+    slot.fanout_count = fanout_count;
+    slot.fanout_refcount.store(0);
+    slot.fanout_lock.store(0);
+    slot.fanout_head = nullptr;
+    slot.ring_id = ring_id;
+    slot.active_mask = PTO2_SUBTASK_MASK_AIC;
+    slot.completed_subtasks.store(0);
+    slot.total_required_subtasks = 1;
+    slot.logical_block_num = 1;
+    slot.dep_pool_mark = 0;
+  }
+};
+
+// =============================================================================
+// wire_task: no fanin (independent task)
+// =============================================================================
+
+TEST_F(WiringTest, WireTaskNoFaninBecomesReady) {
+  // A task with 0 actual fanins should immediately be pushed to ready queue
+  alignas(64) PTO2TaskSlotState task_slot;
+  alignas(64) PTO2TaskPayload payload;
+  memset(&payload, 0, sizeof(payload));
+  PTO2TaskDescriptor desc{};
+
+  init_slot(task_slot, PTO2_TASK_PENDING, 0, 1);
+  payload.fanin_actual_count = 0;
+  task_slot.payload = &payload;
+  task_slot.task = &desc;
+
+  auto &rss = sched.ring_sched_states[0];
+  sched.wire_task(rss, &task_slot, 0);
+
+  // fanin_count set to 0 + 1 = 1 (the wiring "+1" sentinel)
+  EXPECT_EQ(task_slot.fanin_count, 1);
+  // fanin_refcount should be 1 (the +1 from no-fanin path)
+  EXPECT_EQ(task_slot.fanin_refcount.load(), 1);
+
+  // Task should be in ready queue
+  PTO2ResourceShape shape = pto2_active_mask_to_shape(task_slot.active_mask);
+  auto *popped = sched.ready_queues[static_cast<size_t>(shape)].pop();
+  EXPECT_EQ(popped, &task_slot);
+}
+
+// =============================================================================
+// wire_task: with fanin, all producers already completed (early-finished)
+// =============================================================================
+
+TEST_F(WiringTest, WireTaskAllProducersEarlyFinished) {
+  alignas(64) PTO2TaskSlotState task_slot;
+  alignas(64) PTO2TaskSlotState producer_slots[2];
+  alignas(64) PTO2TaskPayload payload;
+  memset(&payload, 0, sizeof(payload));
+  PTO2TaskDescriptor desc{};
+
+  // Set up 2 producers that are already COMPLETED
+  for (int i = 0; i < 2; i++) {
+    init_slot(producer_slots[i], PTO2_TASK_COMPLETED, 1, 2);
+  }
+
+  // Consumer task with 2 fanins
+  init_slot(task_slot, PTO2_TASK_PENDING, 0, 1);
+  payload.fanin_actual_count = 2;
+  payload.fanin_inline_slot_states[0] = &producer_slots[0];
+  payload.fanin_inline_slot_states[1] = &producer_slots[1];
+
+  task_slot.payload = &payload;
+  task_slot.task = &desc;
+
+  auto &rss = sched.ring_sched_states[0];
+  sched.wire_task(rss, &task_slot, 2);
+
+  // fanin_count = 2 + 1 = 3
+  EXPECT_EQ(task_slot.fanin_count, 3);
+  // early_finished = 2, init_rc = 2 + 1 = 3, so refcount should hit fanin_count
+  EXPECT_GE(task_slot.fanin_refcount.load(), task_slot.fanin_count);
+
+  // Task should be in ready queue
+  PTO2ResourceShape shape = pto2_active_mask_to_shape(task_slot.active_mask);
+  auto *popped = sched.ready_queues[static_cast<size_t>(shape)].pop();
+  EXPECT_EQ(popped, &task_slot);
+}
+
+// =============================================================================
+// wire_task: with fanin, producers still pending (task NOT ready)
+// =============================================================================
+
+TEST_F(WiringTest, WireTaskProducersPendingTaskNotReady) {
+  alignas(64) PTO2TaskSlotState task_slot;
+  alignas(64) PTO2TaskSlotState producer_slots[2];
+  alignas(64) PTO2TaskPayload payload;
+  memset(&payload, 0, sizeof(payload));
+  PTO2TaskDescriptor desc{};
+
+  // Producers are RUNNING (not yet completed)
+  for (int i = 0; i < 2; i++) {
+    init_slot(producer_slots[i], PTO2_TASK_RUNNING, 1, 2);
+  }
+
+  init_slot(task_slot, PTO2_TASK_PENDING, 0, 1);
+  payload.fanin_actual_count = 2;
+  payload.fanin_inline_slot_states[0] = &producer_slots[0];
+  payload.fanin_inline_slot_states[1] = &producer_slots[1];
+  task_slot.payload = &payload;
+  task_slot.task = &desc;
+
+  auto &rss = sched.ring_sched_states[0];
+  sched.wire_task(rss, &task_slot, 2);
+
+  // fanin_count = 3 (2 + 1)
+  EXPECT_EQ(task_slot.fanin_count, 3);
+  // early_finished = 0, init_rc = 1 -> not ready
+  EXPECT_EQ(task_slot.fanin_refcount.load(), 1);
+  EXPECT_LT(task_slot.fanin_refcount.load(), task_slot.fanin_count);
+
+  // Ready queue should be empty
+  PTO2ResourceShape shape = pto2_active_mask_to_shape(task_slot.active_mask);
+  auto *popped = sched.ready_queues[static_cast<size_t>(shape)].pop();
+  EXPECT_EQ(popped, nullptr);
+
+  // Producers should have fanout_head pointing to task_slot
+  EXPECT_NE(producer_slots[0].fanout_head, nullptr);
+  EXPECT_EQ(producer_slots[0].fanout_head->slot_state, &task_slot);
+  EXPECT_NE(producer_slots[1].fanout_head, nullptr);
+  EXPECT_EQ(producer_slots[1].fanout_head->slot_state, &task_slot);
+}
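+
+// Worked accounting for the "+1 sentinel" (our reading of the three tests
+// above, not a spec): with N declared fanins, wire_task sets
+// fanin_count = N + 1 and seeds fanin_refcount with early_finished + 1,
+// where the +1 is wire_task's own edge. Readiness fires only when
+// refcount == N + 1, so it can trigger inside wire_task (every producer
+// already >= COMPLETED) or later via on_mixed_task_complete, but never
+// while the fanout chains are still half-built.
+//   N = 0:                count 1, rc 1 -> ready at wiring
+//   N = 2, both done:     count 3, rc 3 -> ready at wiring
+//   N = 2, both running:  count 3, rc 1 -> waits for two releases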
+
+// =============================================================================
+// wire_task: mixed early-finished and pending producers
+// =============================================================================
+
+TEST_F(WiringTest, WireTaskMixedProducerStates) {
+  alignas(64) PTO2TaskSlotState task_slot;
+  alignas(64) PTO2TaskSlotState producers[3];
+  alignas(64) PTO2TaskPayload payload;
+  memset(&payload, 0, sizeof(payload));
+  PTO2TaskDescriptor desc{};
+
+  init_slot(producers[0], PTO2_TASK_COMPLETED, 1, 2);  // early finished
+  init_slot(producers[1], PTO2_TASK_RUNNING, 1, 2);    // still running
+  init_slot(producers[2], PTO2_TASK_CONSUMED, 1, 2);   // early finished (>= COMPLETED)
+
+  init_slot(task_slot, PTO2_TASK_PENDING, 0, 1);
+  payload.fanin_actual_count = 3;
+  for (int i = 0; i < 3; i++) {
+    payload.fanin_inline_slot_states[i] = &producers[i];
+  }
+  task_slot.payload = &payload;
+  task_slot.task = &desc;
+
+  auto &rss = sched.ring_sched_states[0];
+  sched.wire_task(rss, &task_slot, 3);
+
+  // fanin_count = 4 (3 + 1)
+  EXPECT_EQ(task_slot.fanin_count, 4);
+  // early_finished = 2 (COMPLETED + CONSUMED), init_rc = 3
+  // Not yet 4 -> not ready (one producer still running)
+  EXPECT_EQ(task_slot.fanin_refcount.load(), 3);
+
+  // Only the running producer should have the consumer in its fanout chain
+  EXPECT_EQ(producers[0].fanout_head, nullptr);  // early finished, no dep entry added
+  EXPECT_NE(producers[1].fanout_head, nullptr);  // running, dep entry added
+  EXPECT_EQ(producers[2].fanout_head, nullptr);  // early finished
+}
+
+// =============================================================================
+// on_mixed_task_complete: notifies consumers via fanout chain
+// =============================================================================
+
+TEST_F(WiringTest, OnMixedTaskCompleteNotifiesConsumers) {
+  alignas(64) PTO2TaskSlotState producer;
+  alignas(64) PTO2TaskSlotState consumer1, consumer2;
+  alignas(64) PTO2TaskPayload prod_payload;
+  memset(&prod_payload, 0, sizeof(prod_payload));
+  PTO2TaskDescriptor desc{};
+
+  // Set up producer in RUNNING state with 2 consumers in fanout chain
+  init_slot(producer, PTO2_TASK_RUNNING, 1, 1);
+  producer.payload = &prod_payload;
+  producer.task = &desc;
+
+  // Consumer1: needs 1 more fanin to become ready
+  init_slot(consumer1, PTO2_TASK_PENDING, 2, 1);
+  consumer1.fanin_refcount.store(1);  // 1 of 2 satisfied
+  consumer1.active_mask = PTO2_SUBTASK_MASK_AIC;
+
+  // Consumer2: this release will make it ready
+  init_slot(consumer2, PTO2_TASK_PENDING, 2, 1);
+  consumer2.fanin_refcount.store(1);  // 1 of 2 satisfied
+  consumer2.active_mask = PTO2_SUBTASK_MASK_AIC;
+
+  // Build fanout chain: producer -> consumer2 -> consumer1
+  PTO2DepListEntry dep_entries[2];
+  dep_entries[0].slot_state = &consumer1;
+  dep_entries[0].next = nullptr;
+  dep_entries[1].slot_state = &consumer2;
+  dep_entries[1].next = &dep_entries[0];
+  producer.fanout_head = &dep_entries[1];
+
+  sched.on_mixed_task_complete(producer);
+
+  // Producer should be COMPLETED
+  EXPECT_EQ(producer.task_state.load(), PTO2_TASK_COMPLETED);
+
+  // Both consumers should have fanin_refcount incremented
+  EXPECT_EQ(consumer1.fanin_refcount.load(), 2);
+  EXPECT_EQ(consumer2.fanin_refcount.load(), 2);
+
+  // Both consumers should be ready (fanin_refcount == fanin_count)
+  PTO2ResourceShape shape = pto2_active_mask_to_shape(consumer1.active_mask);
+  auto *r1 = sched.ready_queues[static_cast<size_t>(shape)].pop();
+  auto *r2 = sched.ready_queues[static_cast<size_t>(shape)].pop();
+  EXPECT_TRUE((r1 == &consumer1 && r2 == &consumer2) || (r1 == &consumer2 && r2 == &consumer1));
+}
+
+// =============================================================================
+// on_task_release: releases producers via fanin traversal
+// =============================================================================
+
+TEST_F(WiringTest, OnTaskReleaseReleasesProducers) {
+  alignas(64) PTO2TaskSlotState task_slot;
+  alignas(64) PTO2TaskSlotState producers[2];
+  alignas(64) PTO2TaskPayload payload;
+  memset(&payload, 0, sizeof(payload));
+  PTO2TaskDescriptor desc{};
+
+  // 2 producers, each COMPLETED with fanout_count=1
+  for (int i = 0; i < 2; i++) {
+    init_slot(producers[i], PTO2_TASK_COMPLETED, 1, 1);
+  }
+
+  init_slot(task_slot, PTO2_TASK_COMPLETED, 3, 1);
+  payload.fanin_actual_count = 2;
+  payload.fanin_inline_slot_states[0] = &producers[0];
+  payload.fanin_inline_slot_states[1] = &producers[1];
+  // Need a valid fanin_spill_pool even though we don't spill
+  PTO2FaninPool dummy_pool{};
+  PTO2FaninSpillEntry dummy_entries[4];
+  std::atomic<int32_t> dummy_error{PTO2_ERROR_NONE};
+  dummy_pool.init(dummy_entries, 4, &dummy_error);
+  payload.fanin_spill_pool = &dummy_pool;
+  task_slot.payload = &payload;
+  task_slot.task = &desc;
+
+  int32_t fanin_count = sched.on_task_release(task_slot);
+  EXPECT_EQ(fanin_count, 2);
+
+  // Each producer should have fanout_refcount incremented
+  EXPECT_EQ(producers[0].fanout_refcount.load(), 1);
+  EXPECT_EQ(producers[1].fanout_refcount.load(), 1);
+
+  // Producers with fanout_refcount == fanout_count AND COMPLETED -> CONSUMED
+  EXPECT_EQ(producers[0].task_state.load(), PTO2_TASK_CONSUMED);
+  EXPECT_EQ(producers[1].task_state.load(), PTO2_TASK_CONSUMED);
+}
+
+// =============================================================================
+// advance_ring_pointers: scans CONSUMED slots, resets, advances last_alive
+// =============================================================================
+
+TEST_F(WiringTest, AdvanceRingPointersScansConsumed) {
+  auto &rss = sched.ring_sched_states[0];
+  auto *ring = rss.ring;
+
+  // Submit 3 tasks via flow control
+  ring->fc.current_task_index.store(3, std::memory_order_release);
+
+  // Mark all 3 as CONSUMED
+  for (int i = 0; i < 3; i++) {
+    auto &slot = ring->get_slot_state_by_task_id(i);
+    slot.task_state.store(PTO2_TASK_CONSUMED, std::memory_order_release);
+  }
+
+  EXPECT_EQ(rss.last_task_alive, 0);
+  rss.advance_ring_pointers();
+  EXPECT_EQ(rss.last_task_alive, 3);
+
+  // Verify SM was synced
+  EXPECT_EQ(ring->fc.last_task_alive.load(), 3);
+}
+
+TEST_F(WiringTest, AdvanceRingPointersStopsAtNonConsumed) {
+  auto &rss = sched.ring_sched_states[0];
+  auto *ring = rss.ring;
+
+  ring->fc.current_task_index.store(5, std::memory_order_release);
+
+  // Tasks 0,1 CONSUMED; task 2 COMPLETED (not consumed)
+  ring->get_slot_state_by_task_id(0).task_state.store(PTO2_TASK_CONSUMED);
+  ring->get_slot_state_by_task_id(1).task_state.store(PTO2_TASK_CONSUMED);
+  ring->get_slot_state_by_task_id(2).task_state.store(PTO2_TASK_COMPLETED);
+
+  rss.advance_ring_pointers();
+  EXPECT_EQ(rss.last_task_alive, 2) << "Should stop at first non-CONSUMED slot";
+}
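+
+// Reclaim contract, as this suite reads it from the advance tests above and
+// below: advance_ring_pointers() walks forward from last_task_alive toward
+// current_task_index, stops at the first slot that is not CONSUMED, calls
+// reset_for_reuse() on every slot it passes, and publishes the new
+// last_task_alive to shared memory so the orchestrator's flow-control
+// window reopens.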
+
+TEST_F(WiringTest, AdvanceRingPointersResetsSlots) {
+  auto &rss = sched.ring_sched_states[0];
+  auto *ring = rss.ring;
+
+  ring->fc.current_task_index.store(1, std::memory_order_release);
+
+  auto &slot = ring->get_slot_state_by_task_id(0);
+  slot.task_state.store(PTO2_TASK_CONSUMED);
+  slot.fanout_count = 5;
+  slot.fanin_refcount.store(3);
+  slot.fanout_refcount.store(2);
+  slot.completed_subtasks.store(1);
+
+  rss.advance_ring_pointers();
+
+  // After reset_for_reuse: fanout_count=1, fanin_refcount=0, etc.
+  EXPECT_EQ(slot.fanout_count, 1);
+  EXPECT_EQ(slot.fanin_refcount.load(), 0);
+  EXPECT_EQ(slot.fanout_refcount.load(), 0);
+  EXPECT_EQ(slot.completed_subtasks.load(), 0);
+  EXPECT_EQ(slot.fanout_head, nullptr);
+}
+
+// =============================================================================
+// drain_wiring_queue: pushes tasks through SPSC queue
+// =============================================================================
+
+TEST_F(WiringTest, DrainWiringQueueProcessesTasks) {
+  alignas(64) PTO2TaskSlotState task_slot;
+  alignas(64) PTO2TaskPayload payload;
+  memset(&payload, 0, sizeof(payload));
+  PTO2TaskDescriptor desc{};
+
+  init_slot(task_slot, PTO2_TASK_PENDING, 0, 1);
+  payload.fanin_actual_count = 0;
+  task_slot.payload = &payload;
+  task_slot.task = &desc;
+
+  // Push into wiring SPSC queue (orchestrator side)
+  ASSERT_TRUE(sched.wiring.queue.push(&task_slot));
+
+  // Drain (scheduler thread 0 side)
+  int wired = sched.drain_wiring_queue(true /* force_drain */);
+  EXPECT_EQ(wired, 1);
+
+  // Task should be ready
+  PTO2ResourceShape shape = pto2_active_mask_to_shape(task_slot.active_mask);
+  auto *popped = sched.ready_queues[static_cast<size_t>(shape)].pop();
+  EXPECT_EQ(popped, &task_slot);
+}
+
+TEST_F(WiringTest, DrainWiringQueueBackoffDefers) {
+  alignas(64) PTO2TaskSlotState task_slot;
+  alignas(64) PTO2TaskPayload payload;
+  memset(&payload, 0, sizeof(payload));
+  PTO2TaskDescriptor desc{};
+
+  init_slot(task_slot, PTO2_TASK_PENDING, 0, 1);
+  payload.fanin_actual_count = 0;
+  task_slot.payload = &payload;
+  task_slot.task = &desc;
+
+  sched.wiring.queue.push(&task_slot);
+
+  // Without force_drain, single item < BATCH_SIZE → backoff
+  sched.wiring.backoff_counter = 0;
+  int wired = sched.drain_wiring_queue(false);
+  EXPECT_EQ(wired, 0) << "Backoff should defer when queue < BATCH_SIZE";
+  EXPECT_EQ(sched.wiring.backoff_counter, 1);
+}
+
+TEST_F(WiringTest, DrainWiringQueueBackoffLimitForcesProcess) {
+  alignas(64) PTO2TaskSlotState task_slot;
+  alignas(64) PTO2TaskPayload payload;
+  memset(&payload, 0, sizeof(payload));
+  PTO2TaskDescriptor desc{};
+
+  init_slot(task_slot, PTO2_TASK_PENDING, 0, 1);
+  payload.fanin_actual_count = 0;
+  task_slot.payload = &payload;
+  task_slot.task = &desc;
+
+  sched.wiring.queue.push(&task_slot);
+
+  // Set backoff at limit → should process
+  sched.wiring.backoff_counter = PTO2SchedulerState::WiringState::BACKOFF_LIMIT;
+  int wired = sched.drain_wiring_queue(false);
+  EXPECT_EQ(wired, 1) << "Backoff limit reached should force processing";
+}
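+
+// Backoff policy pinned by the two tests above (BATCH_SIZE and BACKOFF_LIMIT
+// are the scheduler's own constants): drain_wiring_queue(false) defers while
+// the pending count is below BATCH_SIZE and backoff_counter is below
+// BACKOFF_LIMIT, bumping the counter each time; force_drain, a full batch,
+// or a saturated counter flushes immediately. The effect is to amortize
+// wiring work across polls without letting a lone task starve.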
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Unit tests for PTO2DepListPool from pto_ring_buffer.h
+ *
+ * Tests dependency list pool allocation, prepend chaining, overflow detection,
+ * tail advancement, and high-water mark tracking.
+ *
+ * Design contracts:
+ *
+ * - advance_tail(new_tail) only advances if new_tail > tail; it does
+ *   not validate new_tail <= top. This is a caller contract: new_tail
+ *   must be monotonically non-decreasing and bounded by top.
+ *
+ * - The list terminator is literal nullptr. base[0] is a normal pool entry;
+ *   init clearing it is incidental, not an invariant.
+ */
+
+#include <gtest/gtest.h>
+
+#include <atomic>
+#include <cstdint>
+#include <cstring>
+
+#include "pto_ring_buffer.h"
+
+// =============================================================================
+// Fixture
+// =============================================================================
+
+class DepListPoolTest : public ::testing::Test {
+protected:
+    static constexpr int32_t POOL_CAP = 8;
+    PTO2DepListEntry entries[POOL_CAP]{};
+    std::atomic<int32_t> error_code{PTO2_ERROR_NONE};
+    PTO2DepListPool pool{};
+
+    void SetUp() override {
+        std::memset(entries, 0, sizeof(entries));
+        error_code.store(PTO2_ERROR_NONE);
+        pool.init(entries, POOL_CAP, &error_code);
+    }
+};
+
+// =============================================================================
+// Normal path
+// =============================================================================
+
+TEST_F(DepListPoolTest, InitialState) {
+    EXPECT_EQ(pool.used(), 0);
+    EXPECT_EQ(pool.available(), POOL_CAP);
+}
+
+TEST_F(DepListPoolTest, SingleAlloc) {
+    PTO2DepListEntry *entry = pool.alloc();
+    ASSERT_NE(entry, nullptr);
+    EXPECT_EQ(pool.used(), 1);
+    EXPECT_EQ(pool.available(), POOL_CAP - 1);
+}
+
+TEST_F(DepListPoolTest, OverflowDetection) {
+    for (int i = 0; i < POOL_CAP; i++) {
+        PTO2DepListEntry *e = pool.alloc();
+        ASSERT_NE(e, nullptr) << "Unexpected failure at alloc " << i;
+    }
+    EXPECT_EQ(pool.used(), POOL_CAP);
+    EXPECT_EQ(pool.available(), 0);
+
+    PTO2DepListEntry *overflow = pool.alloc();
+    EXPECT_EQ(overflow, nullptr);
+    EXPECT_EQ(error_code.load(), PTO2_ERROR_DEP_POOL_OVERFLOW);
+}
+
+// Prepend builds LIFO linked list: verify each slot_state pointer.
+TEST_F(DepListPoolTest, PrependChainCorrectness) {
+    PTO2TaskSlotState slots[5]{};
+    PTO2DepListEntry *head = nullptr;
+
+    for (int i = 0; i < 5; i++) {
+        head = pool.prepend(head, &slots[i]);
+        ASSERT_NE(head, nullptr);
+    }
+
+    // LIFO order: head -> slots[4] -> slots[3] -> ... -> slots[0] -> nullptr.
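+    // Illustrative trace (entry names e0..e4 are hypothetical; only the
+    // links are asserted below):
+    //   prepend(nullptr, &slots[0]) -> e0 { slot_state=&slots[0], next=nullptr }
+    //   prepend(e0,      &slots[1]) -> e1 { slot_state=&slots[1], next=e0 }
+    //   ...
+    //   head == e4, so following `next` visits slots[4], slots[3], ..., slots[0].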
+ PTO2DepListEntry *cur = head; + for (int i = 4; i >= 0; i--) { + ASSERT_NE(cur, nullptr); + EXPECT_EQ(cur->slot_state, &slots[i]) << "Entry " << (4 - i) << " should point to slots[" << i << "]"; + cur = cur->next; + } + EXPECT_EQ(cur, nullptr) << "Chain should terminate with nullptr"; +} + +TEST_F(DepListPoolTest, AdvanceTail) { + for (int i = 0; i < 4; i++) { + pool.alloc(); + } + EXPECT_EQ(pool.used(), 4); + EXPECT_EQ(pool.available(), POOL_CAP - 4); + + pool.advance_tail(4); + EXPECT_EQ(pool.used(), 1); + EXPECT_EQ(pool.available(), POOL_CAP - 1); +} + +TEST_F(DepListPoolTest, AdvanceTailBackwardsNoop) { + pool.alloc(); + pool.alloc(); + pool.advance_tail(3); + int32_t used_after = pool.used(); + + pool.advance_tail(2); + EXPECT_EQ(pool.used(), used_after); + + pool.advance_tail(3); + EXPECT_EQ(pool.used(), used_after); +} + +TEST_F(DepListPoolTest, HighWaterAccuracy) { + for (int i = 0; i < 5; i++) + pool.alloc(); + EXPECT_EQ(pool.high_water, 5); + + pool.advance_tail(4); + EXPECT_EQ(pool.high_water, 5) << "High water never decreases"; + + for (int i = 0; i < 3; i++) + pool.alloc(); + EXPECT_GE(pool.high_water, 5); +} + +// ============================================================================= +// Boundary conditions +// ============================================================================= + +// Prepend chain integrity under pool exhaustion: chain must be walkable. +TEST_F(DepListPoolTest, PrependUnderExhaustion) { + PTO2TaskSlotState slots[POOL_CAP]{}; + PTO2DepListEntry *head = nullptr; + + int count = 0; + while (count < POOL_CAP + 5) { + PTO2DepListEntry *new_head = pool.prepend(head, &slots[count % POOL_CAP]); + if (!new_head) break; + head = new_head; + count++; + } + + int walk = 0; + PTO2DepListEntry *cur = head; + while (cur) { + walk++; + cur = cur->next; + if (walk > count + 1) { + FAIL() << "Chain has cycle -- walked more entries than allocated"; + break; + } + } + EXPECT_EQ(walk, count); +} diff --git a/tests/ut/cpp/a5/test_fanin_pool.cpp b/tests/ut/cpp/a5/test_fanin_pool.cpp new file mode 100644 index 000000000..29199ae2e --- /dev/null +++ b/tests/ut/cpp/a5/test_fanin_pool.cpp @@ -0,0 +1,311 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Unit tests for PTO2FaninPool and pto2_for_each_fanin_storage/slot_state + * from pto_ring_buffer.h / pto_ring_buffer.cpp + * + * Tests: + * 1. PTO2FaninPool — ring buffer allocation, overflow, tail advance, + * high-water tracking + * 2. 
pto2_for_each_fanin_storage — inline-only, spill without wrap, + * spill with wrap, callback early return + */ + +#include + +#include +#include +#include + +#include "pto_ring_buffer.h" +#include "pto_shared_memory.h" + +// ============================================================================= +// FaninPool fixture +// ============================================================================= + +class FaninPoolTest : public ::testing::Test { +protected: + static constexpr int32_t POOL_CAP = 32; + + std::vector entries; + std::atomic error_code{PTO2_ERROR_NONE}; + PTO2FaninPool pool{}; + + void SetUp() override { + entries.assign(POOL_CAP, PTO2FaninSpillEntry{nullptr}); + error_code.store(PTO2_ERROR_NONE); + pool.init(entries.data(), POOL_CAP, &error_code); + } +}; + +// ============================================================================= +// FaninPool: basic operations +// ============================================================================= + +TEST_F(FaninPoolTest, InitialState) { + EXPECT_EQ(pool.used(), 0); + EXPECT_EQ(pool.available(), POOL_CAP); + EXPECT_EQ(pool.top, 1); + EXPECT_EQ(pool.tail, 1); + EXPECT_EQ(pool.high_water, 0); +} + +TEST_F(FaninPoolTest, AllocReturnsCorrectModuloIndex) { + // First alloc at index top%cap = 1%32 = 1 + auto *e1 = pool.alloc(); + EXPECT_EQ(e1, &entries[1]); + + auto *e2 = pool.alloc(); + EXPECT_EQ(e2, &entries[2]); +} + +TEST_F(FaninPoolTest, AllocFillsPool) { + for (int i = 0; i < POOL_CAP; i++) { + auto *e = pool.alloc(); + ASSERT_NE(e, nullptr) << "Alloc failed at i=" << i; + } + EXPECT_EQ(pool.used(), POOL_CAP); + EXPECT_EQ(pool.available(), 0); +} + +TEST_F(FaninPoolTest, OverflowReturnsNullptr) { + for (int i = 0; i < POOL_CAP; i++) { + pool.alloc(); + } + auto *overflow = pool.alloc(); + EXPECT_EQ(overflow, nullptr); + EXPECT_EQ(error_code.load(), PTO2_ERROR_DEP_POOL_OVERFLOW); +} + +TEST_F(FaninPoolTest, AdvanceTailFreesSpace) { + for (int i = 0; i < 10; i++) { + pool.alloc(); + } + EXPECT_EQ(pool.used(), 10); + + pool.advance_tail(pool.tail + 5); + EXPECT_EQ(pool.used(), 5); + EXPECT_EQ(pool.available(), POOL_CAP - 5); +} + +TEST_F(FaninPoolTest, AdvanceTailBackwardsIsNoop) { + for (int i = 0; i < 10; i++) { + pool.alloc(); + } + int32_t old_tail = pool.tail; + pool.advance_tail(old_tail - 1); + EXPECT_EQ(pool.tail, old_tail); + EXPECT_EQ(pool.used(), 10); +} + +TEST_F(FaninPoolTest, HighWaterNeverDecreases) { + for (int i = 0; i < 10; i++) { + pool.alloc(); + } + EXPECT_EQ(pool.high_water, 10); + + pool.advance_tail(pool.tail + 5); + EXPECT_EQ(pool.high_water, 10) << "high_water must never decrease"; +} + +TEST_F(FaninPoolTest, WrapAroundAllocation) { + // Fill and drain, then fill again to wrap + for (int i = 0; i < POOL_CAP; i++) { + pool.alloc(); + } + pool.advance_tail(pool.top); + EXPECT_EQ(pool.used(), 0); + + // New allocations wrap around + for (int i = 0; i < 5; i++) { + auto *e = pool.alloc(); + ASSERT_NE(e, nullptr); + // Verify modulo indexing + int32_t expected_idx = (pool.top - 1) % POOL_CAP; + EXPECT_EQ(e, &entries[expected_idx]); + } + EXPECT_EQ(pool.used(), 5); +} + +// ============================================================================= +// pto2_for_each_fanin_storage: inline only +// ============================================================================= + +class ForEachFaninTest : public ::testing::Test { +protected: + static constexpr int32_t POOL_CAP = 32; + + std::vector spill_entries; + std::atomic error_code{PTO2_ERROR_NONE}; + PTO2FaninPool spill_pool{}; + + alignas(64) 
PTO2TaskSlotState slots[64]; + + void SetUp() override { + spill_entries.assign(POOL_CAP, PTO2FaninSpillEntry{nullptr}); + error_code.store(PTO2_ERROR_NONE); + spill_pool.init(spill_entries.data(), POOL_CAP, &error_code); + memset(slots, 0, sizeof(slots)); + } +}; + +TEST_F(ForEachFaninTest, InlineOnlyVoid) { + PTO2TaskSlotState *inline_slots[PTO2_FANIN_INLINE_CAP] = {}; + for (int i = 0; i < 5; i++) { + inline_slots[i] = &slots[i]; + } + + std::vector visited; + pto2_for_each_fanin_storage(inline_slots, 5, 0, spill_pool, [&](PTO2TaskSlotState *s) { + visited.push_back(s); + }); + + ASSERT_EQ(visited.size(), 5u); + for (int i = 0; i < 5; i++) { + EXPECT_EQ(visited[i], &slots[i]); + } +} + +TEST_F(ForEachFaninTest, InlineOnlyBoolEarlyReturn) { + PTO2TaskSlotState *inline_slots[PTO2_FANIN_INLINE_CAP] = {}; + for (int i = 0; i < 5; i++) { + inline_slots[i] = &slots[i]; + } + + int count = 0; + bool result = pto2_for_each_fanin_storage(inline_slots, 5, 0, spill_pool, [&](PTO2TaskSlotState *) -> bool { + count++; + return count < 3; // stop after 3rd + }); + + EXPECT_FALSE(result) << "Should return false when callback returns false"; + EXPECT_EQ(count, 3); +} + +TEST_F(ForEachFaninTest, InlineOnlyBoolAllTrue) { + PTO2TaskSlotState *inline_slots[PTO2_FANIN_INLINE_CAP] = {}; + for (int i = 0; i < 3; i++) { + inline_slots[i] = &slots[i]; + } + + bool result = pto2_for_each_fanin_storage(inline_slots, 3, 0, spill_pool, [](PTO2TaskSlotState *) -> bool { + return true; + }); + + EXPECT_TRUE(result); +} + +TEST_F(ForEachFaninTest, ZeroFanin) { + PTO2TaskSlotState *inline_slots[PTO2_FANIN_INLINE_CAP] = {}; + int count = 0; + pto2_for_each_fanin_storage(inline_slots, 0, 0, spill_pool, [&](PTO2TaskSlotState *) { + count++; + }); + EXPECT_EQ(count, 0); +} + +// ============================================================================= +// pto2_for_each_fanin_storage: spill without wrap +// ============================================================================= + +TEST_F(ForEachFaninTest, SpillNoWrap) { + // 18 fanins = 16 inline + 2 spill + PTO2TaskSlotState *inline_slots[PTO2_FANIN_INLINE_CAP] = {}; + for (int i = 0; i < PTO2_FANIN_INLINE_CAP; i++) { + inline_slots[i] = &slots[i]; + } + + // Allocate 2 spill entries + auto *s0 = spill_pool.alloc(); + int32_t spill_start = spill_pool.top - 1; + s0->slot_state = &slots[16]; + auto *s1 = spill_pool.alloc(); + s1->slot_state = &slots[17]; + + std::vector visited; + pto2_for_each_fanin_storage(inline_slots, 18, spill_start, spill_pool, [&](PTO2TaskSlotState *s) { + visited.push_back(s); + }); + + ASSERT_EQ(visited.size(), 18u); + for (int i = 0; i < 16; i++) { + EXPECT_EQ(visited[i], &slots[i]) << "Inline slot " << i; + } + EXPECT_EQ(visited[16], &slots[16]); + EXPECT_EQ(visited[17], &slots[17]); +} + +// ============================================================================= +// pto2_for_each_fanin_storage: spill with wrap +// ============================================================================= + +TEST_F(ForEachFaninTest, SpillWithWrap) { + // Push pool near end so spill wraps around + // Pool cap = 32, advance top to 30 so next alloc is at index 30 + spill_pool.top = POOL_CAP - 2; + spill_pool.tail = POOL_CAP - 2; + + PTO2TaskSlotState *inline_slots[PTO2_FANIN_INLINE_CAP] = {}; + for (int i = 0; i < PTO2_FANIN_INLINE_CAP; i++) { + inline_slots[i] = &slots[i]; + } + + // 4 spill entries: indices 30, 31, 0, 1 (wraps around) + int32_t spill_start = spill_pool.top; + for (int i = 0; i < 4; i++) { + auto *e = spill_pool.alloc(); + 
ASSERT_NE(e, nullptr); + e->slot_state = &slots[16 + i]; + } + + std::vector visited; + pto2_for_each_fanin_storage(inline_slots, 20, spill_start, spill_pool, [&](PTO2TaskSlotState *s) { + visited.push_back(s); + }); + + ASSERT_EQ(visited.size(), 20u); + // Inline + for (int i = 0; i < 16; i++) { + EXPECT_EQ(visited[i], &slots[i]); + } + // Spill (wrapped) + for (int i = 0; i < 4; i++) { + EXPECT_EQ(visited[16 + i], &slots[16 + i]); + } +} + +// ============================================================================= +// pto2_for_each_fanin_storage: spill with bool callback early return +// ============================================================================= + +TEST_F(ForEachFaninTest, SpillBoolEarlyReturnInSpillRegion) { + PTO2TaskSlotState *inline_slots[PTO2_FANIN_INLINE_CAP] = {}; + for (int i = 0; i < PTO2_FANIN_INLINE_CAP; i++) { + inline_slots[i] = &slots[i]; + } + + int32_t spill_start = spill_pool.top; + for (int i = 0; i < 4; i++) { + auto *e = spill_pool.alloc(); + e->slot_state = &slots[16 + i]; + } + + int count = 0; + bool result = + pto2_for_each_fanin_storage(inline_slots, 20, spill_start, spill_pool, [&](PTO2TaskSlotState *) -> bool { + count++; + return count < 17; // stop on 17th (first spill entry) + }); + + EXPECT_FALSE(result); + EXPECT_EQ(count, 17); +} diff --git a/tests/ut/cpp/a5/test_ready_queue.cpp b/tests/ut/cpp/a5/test_ready_queue.cpp new file mode 100644 index 000000000..1a139a8f1 --- /dev/null +++ b/tests/ut/cpp/a5/test_ready_queue.cpp @@ -0,0 +1,446 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Unit tests for PTO2ReadyQueue and PTO2LocalReadyBuffer from pto_scheduler.h + * + * Tests the lock-free bounded MPMC queue (Vyukov design) and the thread-local + * ready buffer used for local-first dispatch optimization. + * + * Design contracts: + * + * - Sequence wrap: The sequence counter is int64_t. Practically unreachable + * wrap at 2^63; two's-complement comparisons still work. + * + * - Pop fast-path: pop() checks enqueue_pos == dequeue_pos as an early-empty + * hint. A push between the hint and the CAS can race; standard TOCTOU of + * Vyukov MPMC, acceptable. + * + * - Push near full: All producers that see a full slot return false + * simultaneously even if a pop happens right after. Acceptable + * back-pressure. + * + * - size() relaxed ordering: size() reads both positions with + * memory_order_relaxed and is a hint, not a snapshot. If a stale read + * produces d > e the guard returns 0. + * + * - LocalReadyBuffer LIFO dispatch: try_push appends at count++, pop returns + * slot_states[--count]. LIFO reversal is intentional for cache-locality + * when a producer immediately dispatches its fanout. 
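+ *
+ * Local-first dispatch sketch (hand-written illustration of the contract;
+ * the real dispatch loop lives in pto_scheduler.cpp, and `global_queue` is
+ * a stand-in name, not an actual field):
+ *
+ *   PTO2TaskSlotState *backing[8];
+ *   PTO2LocalReadyBuffer local;
+ *   local.reset(backing, 8);
+ *   if (!local.try_push(successor)) {  // prefer the thread-local LIFO buffer
+ *     global_queue.push(successor);    // overflow to the shared MPMC queue
+ *   }
+ *   PTO2TaskSlotState *next = local.pop();  // hottest successor first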
+ */ + +#include + +#include +#include +#include +#include +#include + +#include "scheduler/pto_scheduler.h" + +// ============================================================================= +// ReadyQueue: Single-threaded fixture (malloc-backed) +// ============================================================================= + +class ReadyQueueTest : public ::testing::Test { +protected: + static constexpr uint64_t CAPACITY = 16; // Power of 2 + + PTO2ReadyQueue queue; + + void SetUp() override { ASSERT_TRUE(pto2_ready_queue_init(&queue, CAPACITY)); } + + void TearDown() override { pto2_ready_queue_destroy(&queue); } +}; + +// ============================================================================= +// Normal path +// ============================================================================= + +TEST_F(ReadyQueueTest, EmptyPopReturnsNullptr) { EXPECT_EQ(queue.pop(), nullptr); } + +TEST_F(ReadyQueueTest, SinglePushPop) { + PTO2TaskSlotState item; + ASSERT_TRUE(queue.push(&item)); + + PTO2TaskSlotState *result = queue.pop(); + EXPECT_EQ(result, &item); +} + +TEST_F(ReadyQueueTest, FIFOOrdering) { + PTO2TaskSlotState a, b, c; + + ASSERT_TRUE(queue.push(&a)); + ASSERT_TRUE(queue.push(&b)); + ASSERT_TRUE(queue.push(&c)); + + EXPECT_EQ(queue.pop(), &a); + EXPECT_EQ(queue.pop(), &b); + EXPECT_EQ(queue.pop(), &c); + EXPECT_EQ(queue.pop(), nullptr); +} + +TEST_F(ReadyQueueTest, QueueFullReturnsFalse) { + std::vector items(CAPACITY); + + for (uint64_t i = 0; i < CAPACITY; i++) { + ASSERT_TRUE(queue.push(&items[i])); + } + + PTO2TaskSlotState extra; + EXPECT_FALSE(queue.push(&extra)); +} + +TEST_F(ReadyQueueTest, SlotReuseAfterFullDrain) { + std::vector items(CAPACITY); + + for (uint64_t i = 0; i < CAPACITY; i++) { + ASSERT_TRUE(queue.push(&items[i])); + } + for (uint64_t i = 0; i < CAPACITY; i++) { + EXPECT_EQ(queue.pop(), &items[i]); + } + EXPECT_EQ(queue.pop(), nullptr); + + for (uint64_t i = 0; i < CAPACITY; i++) { + ASSERT_TRUE(queue.push(&items[i])); + } + for (uint64_t i = 0; i < CAPACITY; i++) { + EXPECT_EQ(queue.pop(), &items[i]); + } + EXPECT_EQ(queue.pop(), nullptr); +} + +TEST_F(ReadyQueueTest, PushBatchThenIndividualPop) { + constexpr int BATCH_SIZE = 5; + PTO2TaskSlotState items[BATCH_SIZE]; + PTO2TaskSlotState *ptrs[BATCH_SIZE]; + for (int i = 0; i < BATCH_SIZE; i++) { + ptrs[i] = &items[i]; + } + + queue.push_batch(ptrs, BATCH_SIZE); + + for (int i = 0; i < BATCH_SIZE; i++) { + EXPECT_EQ(queue.pop(), &items[i]); + } + EXPECT_EQ(queue.pop(), nullptr); +} + +TEST_F(ReadyQueueTest, PushBatchZeroIsNoop) { + queue.push_batch(nullptr, 0); + + EXPECT_EQ(queue.size(), 0u); + EXPECT_EQ(queue.pop(), nullptr); +} + +TEST_F(ReadyQueueTest, PopBatchReturnsFive) { + constexpr int PUSH_COUNT = 10; + PTO2TaskSlotState items[PUSH_COUNT]; + + for (int i = 0; i < PUSH_COUNT; i++) { + ASSERT_TRUE(queue.push(&items[i])); + } + + PTO2TaskSlotState *out[5]; + int popped = queue.pop_batch(out, 5); + EXPECT_EQ(popped, 5); + + for (int i = 0; i < 5; i++) { + EXPECT_EQ(out[i], &items[i]); + } +} + +TEST_F(ReadyQueueTest, PopBatchPartial) { + constexpr int PUSH_COUNT = 3; + PTO2TaskSlotState items[PUSH_COUNT]; + + for (int i = 0; i < PUSH_COUNT; i++) { + ASSERT_TRUE(queue.push(&items[i])); + } + + PTO2TaskSlotState *out[5]; + int popped = queue.pop_batch(out, 5); + EXPECT_EQ(popped, PUSH_COUNT); + + for (int i = 0; i < PUSH_COUNT; i++) { + EXPECT_EQ(out[i], &items[i]); + } +} + +TEST_F(ReadyQueueTest, PopBatchEmpty) { + PTO2TaskSlotState *out[5]; + int popped = queue.pop_batch(out, 5); + 
EXPECT_EQ(popped, 0); +} + +TEST_F(ReadyQueueTest, SizeAccuracy) { + EXPECT_EQ(queue.size(), 0u); + + PTO2TaskSlotState items[8]; + + queue.push(&items[0]); + EXPECT_EQ(queue.size(), 1u); + + queue.push(&items[1]); + queue.push(&items[2]); + EXPECT_EQ(queue.size(), 3u); + + queue.pop(); + EXPECT_EQ(queue.size(), 2u); + + queue.pop(); + queue.pop(); + EXPECT_EQ(queue.size(), 0u); + + for (int i = 0; i < 5; i++) { + queue.push(&items[i]); + } + EXPECT_EQ(queue.size(), 5u); +} + +// ============================================================================= +// Boundary conditions (small capacity for precise boundary testing) +// ============================================================================= + +class ReadyQueueBoundaryTest : public ::testing::Test { +protected: + static constexpr uint64_t QUEUE_CAP = 8; // Small for boundary testing + PTO2ReadyQueue queue{}; + PTO2TaskSlotState dummy[8]{}; + + void SetUp() override { ASSERT_TRUE(pto2_ready_queue_init(&queue, QUEUE_CAP)); } + void TearDown() override { pto2_ready_queue_destroy(&queue); } +}; + +TEST_F(ReadyQueueBoundaryTest, ExactCapacityFillDrain) { + int pushed = 0; + for (uint64_t i = 0; i < QUEUE_CAP; i++) { + if (queue.push(&dummy[i % 8])) pushed++; + else break; + } + EXPECT_GE(pushed, (int)(QUEUE_CAP - 1)); + + for (int i = 0; i < pushed; i++) { + EXPECT_NE(queue.pop(), nullptr); + } + EXPECT_EQ(queue.pop(), nullptr); +} + +TEST_F(ReadyQueueBoundaryTest, PushToFullThenRecover) { + int pushed = 0; + while (queue.push(&dummy[0])) + pushed++; + + EXPECT_FALSE(queue.push(&dummy[1])) << "Push to full queue returns false"; + + EXPECT_NE(queue.pop(), nullptr); + EXPECT_TRUE(queue.push(&dummy[1])) << "Push succeeds after pop from full queue"; +} + +// size() with relaxed ordering: exact in single-threaded context. +TEST_F(ReadyQueueBoundaryTest, SizeRelaxedOrdering) { + queue.push(&dummy[0]); + queue.push(&dummy[1]); + queue.push(&dummy[2]); + EXPECT_EQ(queue.size(), 3u); + + queue.pop(); + EXPECT_EQ(queue.size(), 2u); + + queue.pop(); + queue.pop(); + EXPECT_EQ(queue.size(), 0u); +} + +// size() guard: after many push/pop cycles, never goes negative. +TEST_F(ReadyQueueBoundaryTest, SizeNeverNegative) { + for (int i = 0; i < 100; i++) { + ASSERT_TRUE(queue.push(&dummy[0])); + queue.pop(); + } + EXPECT_EQ(queue.size(), 0u) << "size() returns 0 after balanced push/pop cycles"; +} + +TEST_F(ReadyQueueBoundaryTest, RepeatedEmptyPop) { + for (int i = 0; i < 100; i++) { + EXPECT_EQ(queue.pop(), nullptr); + } + EXPECT_EQ(queue.size(), 0u); +} + +// Sequence numbers grow large after many cycles but remain correct. 
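+// (With QUEUE_CAP = 8, the 10000 cycles below keep reusing the same cells; in
+// the Vyukov scheme a cell's sequence grows by the capacity per reuse, so
+// sequences only reach ~10000 here -- nowhere near the int64_t limit, whose
+// wrap would take on the order of 2^63 operations.)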
+TEST_F(ReadyQueueBoundaryTest, ManyPushPopCycles) {
+    for (int i = 0; i < 10000; i++) {
+        ASSERT_TRUE(queue.push(&dummy[0]));
+        PTO2TaskSlotState *s = queue.pop();
+        ASSERT_NE(s, nullptr);
+        EXPECT_EQ(s, &dummy[0]);
+    }
+
+    EXPECT_EQ(queue.size(), 0u);
+    EXPECT_TRUE(queue.push(&dummy[1]));
+    EXPECT_EQ(queue.pop(), &dummy[1]);
+}
+
+// =============================================================================
+// Concurrency
+// =============================================================================
+
+// Parameterized MPMC stress test: {producers, consumers, items_per_producer}
+struct MPMCConfig {
+    int producers;
+    int consumers;
+    int items_per_producer;
+};
+
+class ReadyQueueMPMCTest : public ::testing::TestWithParam<MPMCConfig> {
+protected:
+    static constexpr uint64_t CAPACITY = 1024;
+    PTO2ReadyQueue queue;
+
+    void SetUp() override { ASSERT_TRUE(pto2_ready_queue_init(&queue, CAPACITY)); }
+    void TearDown() override { pto2_ready_queue_destroy(&queue); }
+};
+
+TEST_P(ReadyQueueMPMCTest, NoDuplicateNoLoss) {
+    auto cfg = GetParam();
+    int total = cfg.producers * cfg.items_per_producer;
+
+    std::vector<PTO2TaskSlotState> items(total);
+    std::vector<std::atomic<int>> consumed_count(total);
+    for (int i = 0; i < total; i++) {
+        consumed_count[i].store(0, std::memory_order_relaxed);
+    }
+
+    auto item_index = [&](PTO2TaskSlotState *s) -> int {
+        return static_cast<int>(s - items.data());
+    };
+
+    std::atomic<int> producers_done{0};
+
+    auto producer = [&](int id) {
+        for (int i = id; i < total; i += cfg.producers) {
+            while (!queue.push(&items[i])) {}
+        }
+        producers_done.fetch_add(1, std::memory_order_release);
+    };
+
+    std::atomic<int> total_consumed{0};
+
+    auto consumer = [&]() {
+        while (true) {
+            PTO2TaskSlotState *item = queue.pop();
+            if (item != nullptr) {
+                consumed_count[item_index(item)].fetch_add(1, std::memory_order_relaxed);
+                total_consumed.fetch_add(1, std::memory_order_relaxed);
+            } else if (producers_done.load(std::memory_order_acquire) == cfg.producers) {
+                // Drain remaining
+                while ((item = queue.pop()) != nullptr) {
+                    consumed_count[item_index(item)].fetch_add(1, std::memory_order_relaxed);
+                    total_consumed.fetch_add(1, std::memory_order_relaxed);
+                }
+                break;
+            }
+        }
+    };
+
+    std::vector<std::thread> threads;
+    for (int i = 0; i < cfg.producers; i++)
+        threads.emplace_back(producer, i);
+    for (int i = 0; i < cfg.consumers; i++)
+        threads.emplace_back(consumer);
+    for (auto &t : threads)
+        t.join();
+
+    EXPECT_EQ(total_consumed.load(), total);
+    for (int i = 0; i < total; i++) {
+        EXPECT_EQ(consumed_count[i].load(), 1)
+            << "Item " << i << " consumed " << consumed_count[i].load() << " times (expected 1)";
+    }
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    MPMCVariants, ReadyQueueMPMCTest,
+    ::testing::Values(
+        MPMCConfig{2, 2, 200},   // TwoProducersTwoConsumers
+        MPMCConfig{1, 4, 500},   // OneProducerNConsumers
+        MPMCConfig{4, 4, 1250}   // HighContentionStress
+    )
+);
+
+// =============================================================================
+// LocalReadyBuffer
+// =============================================================================
+
+class LocalReadyBufferTest : public ::testing::Test {
+protected:
+    static constexpr int CAPACITY = 8;
+
+    PTO2LocalReadyBuffer buffer;
+    PTO2TaskSlotState *backing[CAPACITY];
+
+    void SetUp() override { buffer.reset(backing, CAPACITY); }
+};
+
+// --- Normal path ---
+
+TEST_F(LocalReadyBufferTest, PopEmptyReturnsNullptr) { EXPECT_EQ(buffer.pop(), nullptr); }
+
+// LIFO dispatch: try_push appends at count++, pop returns slot_states[--count].
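+// Example: try_push(&a) then try_push(&b) stores [a, b]; pop() returns b,
+// then a -- the most recently produced (cache-hot) task dispatches first.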
+TEST_F(LocalReadyBufferTest, LIFOOrdering) { + PTO2TaskSlotState a, b; + + ASSERT_TRUE(buffer.try_push(&a)); + ASSERT_TRUE(buffer.try_push(&b)); + + EXPECT_EQ(buffer.pop(), &b); + EXPECT_EQ(buffer.pop(), &a); + EXPECT_EQ(buffer.pop(), nullptr); +} + +TEST_F(LocalReadyBufferTest, TryPushFullReturnsFalse) { + PTO2TaskSlotState items[CAPACITY + 1]; + + for (int i = 0; i < CAPACITY; i++) { + ASSERT_TRUE(buffer.try_push(&items[i])); + } + + EXPECT_FALSE(buffer.try_push(&items[CAPACITY])); +} + +TEST_F(LocalReadyBufferTest, ResetSetsCleanState) { + EXPECT_EQ(buffer.pop(), nullptr) << "Fresh buffer is empty"; + + PTO2TaskSlotState a, b; + ASSERT_TRUE(buffer.try_push(&a)); + ASSERT_TRUE(buffer.try_push(&b)); + + buffer.reset(backing, CAPACITY); + EXPECT_EQ(buffer.pop(), nullptr) << "Buffer is empty after reset"; + + PTO2TaskSlotState items[CAPACITY]; + for (int i = 0; i < CAPACITY; i++) { + EXPECT_TRUE(buffer.try_push(&items[i])); + } + EXPECT_FALSE(buffer.try_push(&a)) << "Full after pushing capacity items post-reset"; +} + +// --- Boundary conditions --- + +TEST_F(LocalReadyBufferTest, NullBackingBuffer) { + PTO2LocalReadyBuffer buf; + buf.reset(nullptr, 0); + + PTO2TaskSlotState item{}; + EXPECT_FALSE(buf.try_push(&item)) << "Push fails with null backing"; + EXPECT_EQ(buf.pop(), nullptr) << "Pop returns null with null backing"; +} diff --git a/tests/ut/cpp/a5/test_scheduler_state.cpp b/tests/ut/cpp/a5/test_scheduler_state.cpp new file mode 100644 index 000000000..13647c320 --- /dev/null +++ b/tests/ut/cpp/a5/test_scheduler_state.cpp @@ -0,0 +1,197 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Unit tests for PTO2SchedulerState from pto_scheduler.h + * + * Tests task state transitions, fanin/fanout logic, subtask completion. 
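+ *
+ * Lifecycle exercised by these tests (ordering as implied by the assertions;
+ * the authoritative state enum lives in the runtime headers):
+ *
+ *   PENDING -> READY -> RUNNING -> COMPLETED -> CONSUMED
+ *
+ * where COMPLETED -> CONSUMED fires only once fanout_refcount reaches
+ * fanout_count, via a CAS on task_state in check_and_handle_consumed.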
+ */ + +#include + +#include +#include + +#include "scheduler/pto_scheduler.h" + +class SchedulerStateTest : public ::testing::Test { +protected: + PTO2SchedulerState sched; + PTO2SharedMemoryHandle *sm_handle = nullptr; + + void SetUp() override { + sm_handle = pto2_sm_create_default(); + ASSERT_NE(sm_handle, nullptr); + bool ok = pto2_scheduler_init(&sched, sm_handle->header); + ASSERT_TRUE(ok); + } + + void TearDown() override { + pto2_scheduler_destroy(&sched); + if (sm_handle) { + pto2_sm_destroy(sm_handle); + } + } + + void init_slot( + PTO2TaskSlotState &slot, PTO2TaskState state, int32_t fanin_count, int32_t fanout_count, uint8_t ring_id = 0 + ) { + memset(&slot, 0, sizeof(slot)); + slot.task_state.store(state); + slot.fanin_count = fanin_count; + slot.fanin_refcount.store(0); + slot.fanout_count = fanout_count; + slot.fanout_refcount.store(0); + slot.fanout_lock.store(0); + slot.fanout_head = nullptr; + slot.ring_id = ring_id; + slot.active_mask = PTO2_SUBTASK_MASK_AIC; + slot.completed_subtasks.store(0); + slot.total_required_subtasks = 1; + slot.logical_block_num = 1; + } +}; + +// ============================================================================= +// check_and_handle_consumed +// ============================================================================= + +TEST_F(SchedulerStateTest, ConsumedNotReady) { + alignas(64) PTO2TaskSlotState slot; + init_slot(slot, PTO2_TASK_COMPLETED, 1, 2); + slot.fanout_refcount.store(1); // 1 != 2 + + sched.check_and_handle_consumed(slot); + EXPECT_EQ(slot.task_state.load(), PTO2_TASK_COMPLETED); +} + +TEST_F(SchedulerStateTest, ConsumedTransition) { + alignas(64) PTO2TaskSlotState slot; + init_slot(slot, PTO2_TASK_COMPLETED, 1, 2); + slot.fanout_refcount.store(2); // matches fanout_count + + sched.check_and_handle_consumed(slot); + EXPECT_EQ(slot.task_state.load(), PTO2_TASK_CONSUMED); +} + +TEST_F(SchedulerStateTest, ConsumedNotCompletedState) { + alignas(64) PTO2TaskSlotState slot; + init_slot(slot, PTO2_TASK_RUNNING, 1, 1); + slot.fanout_refcount.store(1); + + sched.check_and_handle_consumed(slot); + // CAS fails because state is RUNNING, not COMPLETED + EXPECT_EQ(slot.task_state.load(), PTO2_TASK_RUNNING); +} + +TEST_F(SchedulerStateTest, ConsumedIdempotent) { + alignas(64) PTO2TaskSlotState slot; + init_slot(slot, PTO2_TASK_CONSUMED, 1, 1); + slot.fanout_refcount.store(1); + + sched.check_and_handle_consumed(slot); + EXPECT_EQ(slot.task_state.load(), PTO2_TASK_CONSUMED); +} + +// ============================================================================= +// release_producer +// ============================================================================= + +TEST_F(SchedulerStateTest, ReleaseProducerIncrements) { + alignas(64) PTO2TaskSlotState slot; + init_slot(slot, PTO2_TASK_COMPLETED, 1, 3); + + sched.release_producer(slot); + EXPECT_EQ(slot.fanout_refcount.load(), 1); + + sched.release_producer(slot); + EXPECT_EQ(slot.fanout_refcount.load(), 2); +} + +TEST_F(SchedulerStateTest, ReleaseProducerTriggersConsumed) { + alignas(64) PTO2TaskSlotState slot; + init_slot(slot, PTO2_TASK_COMPLETED, 1, 2); + slot.fanout_refcount.store(1); // One away + + sched.release_producer(slot); + EXPECT_EQ(slot.task_state.load(), PTO2_TASK_CONSUMED); +} + +// ============================================================================= +// on_subtask_complete +// ============================================================================= + +TEST_F(SchedulerStateTest, SubtaskCompleteSingle) { + alignas(64) PTO2TaskSlotState slot; + 
init_slot(slot, PTO2_TASK_RUNNING, 1, 1); + slot.total_required_subtasks = 1; + slot.completed_subtasks.store(0); + + EXPECT_TRUE(sched.on_subtask_complete(slot)); +} + +TEST_F(SchedulerStateTest, SubtaskCompleteMultiBlock) { + alignas(64) PTO2TaskSlotState slot; + init_slot(slot, PTO2_TASK_RUNNING, 1, 1); + slot.total_required_subtasks = 6; // 3 cores * 2 blocks + slot.completed_subtasks.store(0); + + for (int i = 0; i < 5; i++) { + EXPECT_FALSE(sched.on_subtask_complete(slot)); + } + EXPECT_TRUE(sched.on_subtask_complete(slot)); +} + +// ============================================================================= +// on_scope_end +// ============================================================================= + +TEST_F(SchedulerStateTest, ScopeEndBatchRelease) { + constexpr int N = 4; + alignas(64) PTO2TaskSlotState slots[N]; + PTO2TaskSlotState *ptrs[N]; + + for (int i = 0; i < N; i++) { + init_slot(slots[i], PTO2_TASK_COMPLETED, 1, 2); + ptrs[i] = &slots[i]; + } + + sched.on_scope_end(ptrs, N); + + for (int i = 0; i < N; i++) { + EXPECT_EQ(slots[i].fanout_refcount.load(), 1); + } +} + +// ============================================================================= +// get_ready_tasks_batch: local buffer first +// ============================================================================= + +TEST_F(SchedulerStateTest, GetReadyTasksBatchLocalFirst) { + alignas(64) PTO2TaskSlotState slot_a, slot_b; + init_slot(slot_a, PTO2_TASK_READY, 0, 1); + init_slot(slot_b, PTO2_TASK_PENDING, 1, 1); + + PTO2TaskSlotState *local_buf_storage[4]; + PTO2LocalReadyBuffer local_buf; + local_buf.reset(local_buf_storage, 4); + local_buf.try_push(&slot_a); + + // Use src API to route slot_b into the global ready queue + sched.release_fanin_and_check_ready(slot_b); + + PTO2TaskSlotState *out[4]; + int count = sched.get_ready_tasks_batch(PTO2ResourceShape::AIC, local_buf, out, 4); + + EXPECT_EQ(count, 2); + // Local buffer drains first (LIFO), so slot_a comes first + EXPECT_EQ(out[0], &slot_a); + EXPECT_EQ(out[1], &slot_b); +} diff --git a/tests/ut/cpp/a5/test_shared_memory.cpp b/tests/ut/cpp/a5/test_shared_memory.cpp new file mode 100644 index 000000000..ffcbb7821 --- /dev/null +++ b/tests/ut/cpp/a5/test_shared_memory.cpp @@ -0,0 +1,191 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Unit tests for PTO2SharedMemory layout from pto_shared_memory.h + * + * Tests creation, validation, per-ring independence, alignment, size + * calculation, and error handling. + * + * Design contracts: + * + * - pto2_sm_validate checks `top > heap_size`. top == heap_size is a + * legitimate "filled exactly to end" state, so strict > is correct. + * + * - Zero window size: if pto2_sm_calculate_size() is called with 0, all ring + * descriptors/payloads alias the same address. 
Current entry path + * (pto2_sm_create) is called only with valid sizes, but there is no + * explicit guard. pto2_sm_create should reject task_window_size==0. + * + * - Flow control heap_top validation: validate() does not verify + * heap_top <= heap_size. After a corruption, heap_top could exceed + * heap_size without detection. validate should check both bounds. + */ + +#include +#include +#include "pto_shared_memory.h" + +// ============================================================================= +// Fixture (default-created handle) +// ============================================================================= + +class SharedMemoryTest : public ::testing::Test { +protected: + PTO2SharedMemoryHandle *handle = nullptr; + + void SetUp() override { + handle = pto2_sm_create_default(); + ASSERT_NE(handle, nullptr); + } + + void TearDown() override { + if (handle) { + pto2_sm_destroy(handle); + handle = nullptr; + } + } +}; + +// ============================================================================= +// Normal path +// ============================================================================= + +TEST_F(SharedMemoryTest, CreateDefaultReturnsNonNull) { + EXPECT_NE(handle->sm_base, nullptr); + EXPECT_GT(handle->sm_size, 0u); +} + +TEST_F(SharedMemoryTest, IsOwner) { EXPECT_TRUE(handle->is_owner); } + +TEST_F(SharedMemoryTest, HeaderInitValues) { + auto *hdr = handle->header; + EXPECT_EQ(hdr->orchestrator_done.load(), 0); + EXPECT_EQ(hdr->orch_error_code.load(), 0); + EXPECT_EQ(hdr->sched_error_bitmap.load(), 0); + EXPECT_EQ(hdr->sched_error_code.load(), 0); + + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + auto &fc = hdr->rings[r].fc; + EXPECT_EQ(fc.current_task_index.load(), 0); + EXPECT_EQ(fc.last_task_alive.load(), 0); + } +} + +TEST_F(SharedMemoryTest, Validate) { EXPECT_TRUE(pto2_sm_validate(handle)); } + +TEST_F(SharedMemoryTest, PerRingIndependence) { + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + EXPECT_NE(handle->header->rings[r].task_descriptors, nullptr) << "Ring " << r; + EXPECT_NE(handle->header->rings[r].task_payloads, nullptr) << "Ring " << r; + } + for (int r = 1; r < PTO2_MAX_RING_DEPTH; r++) { + EXPECT_NE(handle->header->rings[r].task_descriptors, handle->header->rings[0].task_descriptors) << "Ring " << r; + } +} + +TEST_F(SharedMemoryTest, PointerAlignment) { + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + auto addr = reinterpret_cast(handle->header->rings[r].task_descriptors); + EXPECT_EQ(addr % PTO2_ALIGN_SIZE, 0u) << "Ring " << r << " descriptors not aligned"; + } +} + +TEST_F(SharedMemoryTest, HeaderAlignment) { + uintptr_t header_addr = (uintptr_t)handle->header; + EXPECT_EQ(header_addr % PTO2_ALIGN_SIZE, 0u) << "Header must be cache-line aligned"; +} + +// Descriptor and payload regions don't overlap within or across rings. 
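+// Layout assumed by the byte arithmetic below (inferred from these pointer
+// checks, not quoted from the allocator source):
+//
+//   | header | ring0 descriptors | ring0 payloads | ring1 descriptors | ... |
+//
+// with each region aligned up to PTO2_ALIGN_SIZE.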
+TEST_F(SharedMemoryTest, RegionsNonOverlapping) { + uint64_t ws = 64; // Use a known window size for byte arithmetic + PTO2SharedMemoryHandle *h = pto2_sm_create(ws, 4096); + ASSERT_NE(h, nullptr); + + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + uintptr_t desc_start = (uintptr_t)h->header->rings[r].task_descriptors; + uintptr_t desc_end = desc_start + ws * sizeof(PTO2TaskDescriptor); + uintptr_t payload_start = (uintptr_t)h->header->rings[r].task_payloads; + + EXPECT_GE(payload_start, desc_end) << "Ring " << r << ": payload region should not overlap descriptors"; + } + + for (int r = 0; r < PTO2_MAX_RING_DEPTH - 1; r++) { + uintptr_t this_payload_end = (uintptr_t)h->header->rings[r].task_payloads + ws * sizeof(PTO2TaskPayload); + uintptr_t next_desc_start = (uintptr_t)h->header->rings[r + 1].task_descriptors; + EXPECT_GE(next_desc_start, this_payload_end) << "Ring " << r << " and " << (r + 1) << " should not overlap"; + } + + pto2_sm_destroy(h); +} + +// ============================================================================= +// Size calculation +// ============================================================================= + +TEST(SharedMemoryCalcSize, NonZero) { + uint64_t size = pto2_sm_calculate_size(PTO2_TASK_WINDOW_SIZE); + EXPECT_GT(size, 0u); +} + +TEST(SharedMemoryCalcSize, LargerWindowGivesLargerSize) { + uint64_t small_size = pto2_sm_calculate_size(64); + uint64_t large_size = pto2_sm_calculate_size(256); + EXPECT_GT(large_size, small_size); +} + +TEST(SharedMemoryCalcSize, HeaderAligned) { EXPECT_EQ(sizeof(PTO2SharedMemoryHeader) % PTO2_ALIGN_SIZE, 0u); } + +TEST(SharedMemoryCalcSize, PerRingDifferentSizes) { + uint64_t ws[PTO2_MAX_RING_DEPTH] = {128, 256, 512, 1024}; + uint64_t size = pto2_sm_calculate_size_per_ring(ws); + + uint64_t uniform_size = pto2_sm_calculate_size(128); + EXPECT_GT(size, uniform_size); +} + +// ============================================================================= +// Boundary conditions +// ============================================================================= + +// Zero window size: all ring descriptors collapse to same address. +TEST(SharedMemoryBoundary, ZeroWindowSize) { + uint64_t size = pto2_sm_calculate_size(0); + uint64_t header_size = PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE); + EXPECT_EQ(size, header_size); + + PTO2SharedMemoryHandle *h = pto2_sm_create(0, 4096); + if (h) { + for (int r = 0; r < PTO2_MAX_RING_DEPTH - 1; r++) { + EXPECT_EQ(h->header->rings[r].task_descriptors, h->header->rings[r + 1].task_descriptors) + << "Zero window: all rings' descriptor pointers collapse to same address"; + } + pto2_sm_destroy(h); + } +} + +TEST(SharedMemoryBoundary, ValidateDetectsCorruption) { + PTO2SharedMemoryHandle *h = pto2_sm_create(256, 4096); + ASSERT_NE(h, nullptr); + EXPECT_TRUE(pto2_sm_validate(h)); + + h->header->rings[0].fc.current_task_index.store(-1); + EXPECT_FALSE(pto2_sm_validate(h)); + + pto2_sm_destroy(h); +} + +TEST(SharedMemoryBoundary, ValidateNullHandle) { EXPECT_FALSE(pto2_sm_validate(nullptr)); } + +TEST(SharedMemoryBoundary, CreateFromUndersizedBuffer) { + char buf[64]{}; + PTO2SharedMemoryHandle *h = pto2_sm_create_from_buffer(buf, 64, 256, 4096); + EXPECT_EQ(h, nullptr) << "Undersized buffer should fail"; +} diff --git a/tests/ut/cpp/a5/test_spsc_queue.cpp b/tests/ut/cpp/a5/test_spsc_queue.cpp new file mode 100644 index 000000000..a2c80ca05 --- /dev/null +++ b/tests/ut/cpp/a5/test_spsc_queue.cpp @@ -0,0 +1,293 @@ +/* + * Copyright (c) PyPTO Contributors. 
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Unit tests for PTO2SpscQueue from pto_scheduler.h + * + * Tests the Rigtorp cached-index SPSC queue used as the orchestrator → + * scheduler wiring channel: + * - Basic push / pop_batch correctness + * - Full / empty detection (including cached-index lazy refresh) + * - Wrap-around via modulo indexing + * - Capacity is capacity-1 (one sentinel slot) + * - pop_batch partial reads + * - size() accuracy + */ + +#include + +#include +#include +#include + +#include "scheduler/pto_scheduler.h" + +// ============================================================================= +// Fixture +// ============================================================================= + +class SpscQueueTest : public ::testing::Test { +protected: + static constexpr uint64_t CAPACITY = 16; // must be power of 2 + + PTO2SpscQueue queue{}; + // Dummy slot states used as push values + alignas(64) PTO2TaskSlotState slots[64]{}; + + void SetUp() override { + memset(&queue, 0, sizeof(queue)); + ASSERT_TRUE(queue.init(CAPACITY)); + } + + void TearDown() override { queue.destroy(); } +}; + +// ============================================================================= +// Initialization +// ============================================================================= + +TEST_F(SpscQueueTest, InitValidState) { + EXPECT_EQ(queue.size(), 0u); + EXPECT_EQ(queue.mask_, CAPACITY - 1); + EXPECT_NE(queue.buffer_, nullptr); +} + +TEST_F(SpscQueueTest, InitRejectsNonPowerOfTwo) { + PTO2SpscQueue bad{}; + EXPECT_FALSE(bad.init(3)); + EXPECT_FALSE(bad.init(7)); + EXPECT_FALSE(bad.init(0)); +} + +TEST_F(SpscQueueTest, InitAcceptsPowerOfTwo) { + PTO2SpscQueue q{}; + EXPECT_TRUE(q.init(4)); + q.destroy(); + EXPECT_TRUE(q.init(1024)); + q.destroy(); +} + +// ============================================================================= +// Basic push / pop +// ============================================================================= + +TEST_F(SpscQueueTest, PushPopSingle) { + EXPECT_TRUE(queue.push(&slots[0])); + EXPECT_EQ(queue.size(), 1u); + + PTO2TaskSlotState *out[1]; + int count = queue.pop_batch(out, 1); + ASSERT_EQ(count, 1); + EXPECT_EQ(out[0], &slots[0]); + EXPECT_EQ(queue.size(), 0u); +} + +TEST_F(SpscQueueTest, FIFOOrdering) { + for (int i = 0; i < 5; i++) { + ASSERT_TRUE(queue.push(&slots[i])); + } + + PTO2TaskSlotState *out[5]; + int count = queue.pop_batch(out, 5); + ASSERT_EQ(count, 5); + for (int i = 0; i < 5; i++) { + EXPECT_EQ(out[i], &slots[i]) << "FIFO order violated at i=" << i; + } +} + +TEST_F(SpscQueueTest, PopBatchPartial) { + for (int i = 0; i < 3; i++) { + queue.push(&slots[i]); + } + + // Request 5 but only 3 available + PTO2TaskSlotState *out[5]; + int count = queue.pop_batch(out, 5); + EXPECT_EQ(count, 3); +} + +TEST_F(SpscQueueTest, PopBatchEmpty) { + PTO2TaskSlotState *out[5]; + int count = 
queue.pop_batch(out, 5); + EXPECT_EQ(count, 0); +} + +// ============================================================================= +// Full detection +// ============================================================================= + +TEST_F(SpscQueueTest, FullReturnsFalse) { + // Usable capacity = CAPACITY - 1 = 15 + for (uint64_t i = 0; i < CAPACITY - 1; i++) { + ASSERT_TRUE(queue.push(&slots[i])) << "push failed at i=" << i; + } + EXPECT_EQ(queue.size(), CAPACITY - 1); + + // Queue full + EXPECT_FALSE(queue.push(&slots[CAPACITY - 1])) << "Push to full queue must return false"; +} + +TEST_F(SpscQueueTest, UsableCapacityIsCapacityMinusOne) { + int pushed = 0; + while (queue.push(&slots[pushed % 64])) { + pushed++; + if (pushed > 100) break; // safety + } + EXPECT_EQ(pushed, static_cast(CAPACITY - 1)); +} + +// ============================================================================= +// Full then recover +// ============================================================================= + +TEST_F(SpscQueueTest, FullThenPopThenPush) { + for (uint64_t i = 0; i < CAPACITY - 1; i++) { + queue.push(&slots[i]); + } + EXPECT_FALSE(queue.push(&slots[0])); + + // Pop one + PTO2TaskSlotState *out[1]; + int count = queue.pop_batch(out, 1); + ASSERT_EQ(count, 1); + + // Now push should succeed + EXPECT_TRUE(queue.push(&slots[0])); +} + +// ============================================================================= +// Wrap-around +// ============================================================================= + +TEST_F(SpscQueueTest, WrapAroundCorrectness) { + // Push-pop cycles to advance head/tail past capacity boundary + for (int cycle = 0; cycle < 100; cycle++) { + ASSERT_TRUE(queue.push(&slots[cycle % 64])) << "push failed at cycle=" << cycle; + PTO2TaskSlotState *out[1]; + int count = queue.pop_batch(out, 1); + ASSERT_EQ(count, 1) << "pop_batch failed at cycle=" << cycle; + EXPECT_EQ(out[0], &slots[cycle % 64]); + } + EXPECT_EQ(queue.size(), 0u); +} + +TEST_F(SpscQueueTest, WrapAroundBatchCorrectness) { + // Multiple cycles of batch push/pop across wrap boundary + for (int cycle = 0; cycle < 20; cycle++) { + int batch = 5; + for (int i = 0; i < batch; i++) { + ASSERT_TRUE(queue.push(&slots[(cycle * batch + i) % 64])); + } + PTO2TaskSlotState *out[5]; + int count = queue.pop_batch(out, batch); + ASSERT_EQ(count, batch); + for (int i = 0; i < batch; i++) { + EXPECT_EQ(out[i], &slots[(cycle * batch + i) % 64]); + } + } +} + +// ============================================================================= +// size() accuracy +// ============================================================================= + +TEST_F(SpscQueueTest, SizeTracksOperations) { + EXPECT_EQ(queue.size(), 0u); + + queue.push(&slots[0]); + EXPECT_EQ(queue.size(), 1u); + + queue.push(&slots[1]); + queue.push(&slots[2]); + EXPECT_EQ(queue.size(), 3u); + + PTO2TaskSlotState *out[2]; + queue.pop_batch(out, 2); + EXPECT_EQ(queue.size(), 1u); + + queue.pop_batch(out, 1); + EXPECT_EQ(queue.size(), 0u); +} + +// ============================================================================= +// Producer-consumer (two threads) +// ============================================================================= + +TEST_F(SpscQueueTest, TwoThreadProducerConsumer) { + constexpr int TOTAL = 10000; + std::vector consumed; + consumed.reserve(TOTAL); + + // Use a large pool of slot states for unique pointers + std::vector big_pool(TOTAL); + + std::thread producer([&]() { + for (int i = 0; i < TOTAL; i++) { + while 
(!queue.push(&big_pool[i])) { + // spin + } + } + }); + + std::thread consumer([&]() { + int total = 0; + PTO2TaskSlotState *out[32]; + while (total < TOTAL) { + int count = queue.pop_batch(out, 32); + for (int i = 0; i < count; i++) { + consumed.push_back(out[i]); + } + total += count; + } + }); + + producer.join(); + consumer.join(); + + ASSERT_EQ(consumed.size(), static_cast(TOTAL)); + // Verify FIFO order + for (int i = 0; i < TOTAL; i++) { + EXPECT_EQ(consumed[i], &big_pool[i]) << "FIFO violated at i=" << i; + } +} + +// ============================================================================= +// Cached index behavior +// ============================================================================= + +TEST_F(SpscQueueTest, CachedIndexLazyRefresh) { + // Fill queue + for (uint64_t i = 0; i < CAPACITY - 1; i++) { + queue.push(&slots[i]); + } + + // Consumer pops all + PTO2TaskSlotState *out[16]; + int count = queue.pop_batch(out, CAPACITY); + EXPECT_EQ(count, static_cast(CAPACITY - 1)); + + // Producer's tail_cached_ is stale (still thinks queue is full) + // Next push should refresh tail_cached_ and succeed + EXPECT_TRUE(queue.push(&slots[0])); +} + +TEST_F(SpscQueueTest, CachedIndexConsumerRefresh) { + // Consumer calls pop_batch on empty queue (head_cached_ is 0) + PTO2TaskSlotState *out[1]; + EXPECT_EQ(queue.pop_batch(out, 1), 0); + + // Producer pushes + queue.push(&slots[0]); + + // Consumer's head_cached_ is stale, pop_batch must refresh + int count = queue.pop_batch(out, 1); + EXPECT_EQ(count, 1); + EXPECT_EQ(out[0], &slots[0]); +} diff --git a/tests/ut/cpp/a5/test_task_allocator.cpp b/tests/ut/cpp/a5/test_task_allocator.cpp new file mode 100644 index 000000000..383003900 --- /dev/null +++ b/tests/ut/cpp/a5/test_task_allocator.cpp @@ -0,0 +1,407 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Unit tests for PTO2TaskAllocator from pto_ring_buffer.h + * + * Tests ring buffer allocation, heap bump logic, wrap-around, alignment, + * task window flow control, and heap_available semantics. + * + * The allocator is single-threaded (orchestrator thread), so no concurrency + * tests are needed. The unified PTO2TaskAllocator replaces the previous + * separate PTO2HeapRing + PTO2TaskRing. + * + * Design contracts (try_bump_heap): + * + * - Wrap-around guard uses `tail > alloc_size` (strict >). When + * tail == alloc_size the wrap branch returns nullptr. Allowing it + * would create top == tail (full/empty ambiguity). Strict > + * sacrifices one quantum of capacity. + * + * - heap_available() returns max(at_end, at_begin), not the sum. + * A single allocation cannot split across the wrap boundary. + * + * - Zero-size allocation is a no-op returning the current top. + * Two consecutive zero-size allocs return the SAME pointer. 
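+ *   A minimal sketch of that contract (result field names as used by the
+ *   tests below; purely illustrative):
+ *
+ *     auto a = allocator.alloc(0);
+ *     auto b = allocator.alloc(0);  // a.packed_base == b.packed_base and
+ *                                   // a.packed_base == a.packed_end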
+ * + * - Wrap path wasted space: space between old top and heap_size is not + * reclaimed. Inherent ring-buffer fragmentation cost. + */ + +#include + +#include +#include +#include +#include +#include + +#include "pto_ring_buffer.h" + +// ============================================================================= +// Helpers +// +// WHITE-BOX: consume_up_to simulates the scheduler consuming tasks by directly +// writing descriptor.packed_buffer_end and advancing last_alive. This binds +// to the internal tail-derivation mechanism. If the allocator's reclaim +// protocol changes (e.g. explicit tail field instead of packed_buffer_end), +// this helper and all wrap/reclaim tests must be updated. +// ============================================================================= + +static void consume_up_to( + std::vector &descriptors, std::atomic &last_alive, void *heap_base, + int32_t window_size, int32_t new_last_alive, uint64_t heap_tail_offset +) { + int32_t last_consumed = new_last_alive - 1; + descriptors[last_consumed & (window_size - 1)].packed_buffer_end = + static_cast(heap_base) + heap_tail_offset; + last_alive.store(new_last_alive, std::memory_order_release); +} + +// ============================================================================= +// Fixture +// ============================================================================= + +class TaskAllocatorTest : public ::testing::Test { +protected: + static constexpr int32_t WINDOW_SIZE = 16; + static constexpr uint64_t HEAP_SIZE = 4096; + + std::vector descriptors; + alignas(64) uint8_t heap_buf[HEAP_SIZE]{}; + std::atomic current_index{0}; + std::atomic last_alive{0}; + std::atomic error_code{PTO2_ERROR_NONE}; + PTO2TaskAllocator allocator{}; + + void SetUp() override { + descriptors.assign(WINDOW_SIZE, PTO2TaskDescriptor{}); + std::memset(heap_buf, 0, sizeof(heap_buf)); + current_index.store(0); + last_alive.store(0); + error_code.store(PTO2_ERROR_NONE); + allocator.init(descriptors.data(), WINDOW_SIZE, ¤t_index, &last_alive, heap_buf, HEAP_SIZE, &error_code); + } +}; + +// ============================================================================= +// Normal path +// ============================================================================= + +TEST_F(TaskAllocatorTest, InitialState) { + EXPECT_EQ(allocator.window_size(), WINDOW_SIZE); + EXPECT_EQ(allocator.active_count(), 0); + EXPECT_EQ(allocator.heap_top(), 0u); + EXPECT_EQ(allocator.heap_capacity(), HEAP_SIZE); + EXPECT_EQ(allocator.heap_available(), HEAP_SIZE); +} + +TEST_F(TaskAllocatorTest, AllocNonZeroSize) { + auto result = allocator.alloc(100); + ASSERT_FALSE(result.failed()); + EXPECT_EQ(result.task_id, 0); + EXPECT_EQ(result.slot, 0); + EXPECT_NE(result.packed_base, nullptr); + // 100 bytes aligned up to PTO2_ALIGN_SIZE (64) = 128 + uint64_t expected_aligned = PTO2_ALIGN_UP(100u, PTO2_ALIGN_SIZE); + EXPECT_EQ(expected_aligned, 128u); + EXPECT_EQ(allocator.heap_top(), expected_aligned); + EXPECT_EQ( + static_cast(result.packed_end) - static_cast(result.packed_base), + static_cast(expected_aligned) + ); +} + +TEST_F(TaskAllocatorTest, SequentialTaskIds) { + int32_t prev_id = -1; + for (int i = 0; i < 5; i++) { + auto result = allocator.alloc(0); + ASSERT_FALSE(result.failed()) << "Alloc failed at i=" << i; + EXPECT_EQ(result.task_id, prev_id + 1) << "Task IDs must be monotonically increasing"; + EXPECT_EQ(result.slot, result.task_id & (WINDOW_SIZE - 1)); + prev_id = result.task_id; + } + EXPECT_EQ(allocator.active_count(), 5); +} + +TEST_F(TaskAllocatorTest, 
OutputSizeAlignment) { + // 1 byte -> aligned to 64 + auto r1 = allocator.alloc(1); + ASSERT_FALSE(r1.failed()); + EXPECT_EQ(allocator.heap_top(), 64u); + + // Another 33 bytes -> aligned to 64, total 128 + auto r2 = allocator.alloc(33); + ASSERT_FALSE(r2.failed()); + EXPECT_EQ(allocator.heap_top(), 128u); + + // Exactly 64 bytes -> stays 64, total 192 + auto r3 = allocator.alloc(64); + ASSERT_FALSE(r3.failed()); + EXPECT_EQ(allocator.heap_top(), 192u); +} + +TEST_F(TaskAllocatorTest, SlotMappingPowerOfTwoWindow) { + std::set slots; + for (int i = 0; i < WINDOW_SIZE; i++) { + consume_up_to(descriptors, last_alive, heap_buf, WINDOW_SIZE, i, 0); + auto r = allocator.alloc(0); + ASSERT_FALSE(r.failed()); + EXPECT_EQ(r.slot, r.task_id & (WINDOW_SIZE - 1)); + slots.insert(r.slot); + } + EXPECT_EQ(slots.size(), static_cast(WINDOW_SIZE)) + << "Every slot should be visited exactly once over one window cycle"; +} + +TEST_F(TaskAllocatorTest, UpdateHeapTailFromConsumedTask) { + auto r1 = allocator.alloc(256); + ASSERT_FALSE(r1.failed()); + EXPECT_EQ(allocator.heap_top(), 256u); + + EXPECT_EQ(allocator.heap_available(), HEAP_SIZE - 256u); + + consume_up_to(descriptors, last_alive, heap_buf, WINDOW_SIZE, 1, 256); + + // Force the allocator to observe the new last_alive by doing another alloc + auto r2 = allocator.alloc(0); + ASSERT_FALSE(r2.failed()); + + // top=256, tail=256: at_end = 4096-256=3840, at_begin = 256 + EXPECT_EQ(allocator.heap_available(), HEAP_SIZE - 256u); +} + +TEST_F(TaskAllocatorTest, UpdateHeapTailAtTask0) { + auto r1 = allocator.alloc(64); + ASSERT_FALSE(r1.failed()); + EXPECT_EQ(r1.task_id, 0); + + descriptors[0].packed_buffer_end = static_cast(static_cast(heap_buf)) + 64; + last_alive.store(1, std::memory_order_release); + + auto r2 = allocator.alloc(0); + ASSERT_FALSE(r2.failed()); + EXPECT_EQ(r2.task_id, 1); +} + +TEST_F(TaskAllocatorTest, UpdateHeapTailIdempotent) { + auto r1 = allocator.alloc(128); + ASSERT_FALSE(r1.failed()); + + consume_up_to(descriptors, last_alive, heap_buf, WINDOW_SIZE, 1, 128); + + auto r2 = allocator.alloc(0); + ASSERT_FALSE(r2.failed()); + uint64_t avail_after_first = allocator.heap_available(); + + auto r3 = allocator.alloc(0); + ASSERT_FALSE(r3.failed()); + EXPECT_EQ(allocator.heap_available(), avail_after_first); +} + +TEST_F(TaskAllocatorTest, HeapAvailableTopGeTail) { + EXPECT_EQ(allocator.heap_available(), HEAP_SIZE); + + auto r1 = allocator.alloc(256); + ASSERT_FALSE(r1.failed()); + EXPECT_EQ(allocator.heap_available(), HEAP_SIZE - 256u); +} + +TEST_F(TaskAllocatorTest, HeapAvailableTopLtTail) { + auto r1 = allocator.alloc(HEAP_SIZE - 64); + ASSERT_FALSE(r1.failed()); + consume_up_to(descriptors, last_alive, heap_buf, WINDOW_SIZE, 1, HEAP_SIZE - 64); + + auto r2 = allocator.alloc(128); + ASSERT_FALSE(r2.failed()); + // top=128, tail=HEAP_SIZE-64: available = (HEAP_SIZE-64) - 128 + EXPECT_EQ(allocator.heap_available(), HEAP_SIZE - 64 - 128); +} + +// ============================================================================= +// Boundary conditions +// ============================================================================= + +TEST_F(TaskAllocatorTest, HeapExactFitAtEnd) { + // Allocate 4032 bytes to leave exactly 64 at end. 
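+    // HEAP_SIZE = 4096 and HEAP_SIZE - 64 = 4032 is already a multiple of
+    // PTO2_ALIGN_SIZE (64), so no alignment padding is added and exactly one
+    // 64-byte quantum remains free at the end of the heap.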
+// =============================================================================
+// Boundary conditions
+// =============================================================================
+
+TEST_F(TaskAllocatorTest, HeapExactFitAtEnd) {
+    // Allocate 4032 bytes to leave exactly 64 at the end.
+    auto r1 = allocator.alloc(HEAP_SIZE - 64);
+    ASSERT_FALSE(r1.failed());
+    EXPECT_EQ(allocator.heap_top(), HEAP_SIZE - 64u);
+
+    auto r2 = allocator.alloc(64);
+    ASSERT_FALSE(r2.failed());
+    EXPECT_EQ(allocator.heap_top(), HEAP_SIZE);
+    EXPECT_EQ(static_cast<uint8_t *>(r2.packed_base), reinterpret_cast<uint8_t *>(heap_buf) + HEAP_SIZE - 64);
+}
+
+// Wrap guard `tail > alloc_size` uses strict > to prevent full/empty ambiguity.
+// If the allocation were allowed, heap_top would advance to alloc_size == tail,
+// making top == tail. Because top == tail is the canonical "empty" state, the
+// ring could not distinguish "completely full" from "completely empty".
+TEST_F(TaskAllocatorTest, HeapWrapGuardRejectsTailEqualsAllocSize) {
+    auto r1 = allocator.alloc(HEAP_SIZE);
+    ASSERT_FALSE(r1.failed());
+    EXPECT_EQ(allocator.heap_top(), HEAP_SIZE);
+
+    consume_up_to(descriptors, last_alive, heap_buf, WINDOW_SIZE, 1, 64);
+
+    auto r2 = allocator.alloc(64);
+    EXPECT_TRUE(r2.failed()) << "wrap guard must reject when tail == alloc_size (full/empty ambiguity)";
+}
+
+TEST_F(TaskAllocatorTest, HeapWrapAroundSuccess) {
+    auto r1 = allocator.alloc(HEAP_SIZE);
+    ASSERT_FALSE(r1.failed());
+
+    consume_up_to(descriptors, last_alive, heap_buf, WINDOW_SIZE, 1, 128);
+
+    auto r2 = allocator.alloc(64);
+    ASSERT_FALSE(r2.failed());
+    EXPECT_EQ(r2.packed_base, static_cast<void *>(heap_buf));
+    EXPECT_EQ(allocator.heap_top(), 64u);
+}
+
+// Linear-gap guard `tail - top > alloc_size` uses strict > for the same reason.
+TEST_F(TaskAllocatorTest, HeapLinearGapGuardRejectsExactFit) {
+    // Fill most of the heap, leaving just 64 at the end so the next alloc wraps.
+    auto r1 = allocator.alloc(HEAP_SIZE - 64);
+    ASSERT_FALSE(r1.failed());
+    consume_up_to(descriptors, last_alive, heap_buf, WINDOW_SIZE, 1, HEAP_SIZE - 64);
+
+    // Allocate 128 bytes: space_at_end = 64, not enough -> wrap.
+    // tail = HEAP_SIZE-64, which is > 128 -> wraps to beginning.
+    auto r2 = allocator.alloc(128);
+    ASSERT_FALSE(r2.failed());
+    EXPECT_EQ(allocator.heap_top(), 128u);
+
+    // Now top=128, tail=HEAP_SIZE-64 (top < tail)
+    // gap = (HEAP_SIZE-64) - 128 = HEAP_SIZE-192
+    // Allocate exactly gap bytes: the guard requires gap > alloc_size, which
+    // fails for an exact fit -> reject.
+    uint64_t gap = (HEAP_SIZE - 64) - 128;
+    auto r3 = allocator.alloc(gap);
+    EXPECT_TRUE(r3.failed()) << "linear-gap guard must reject exact fit (full/empty ambiguity)";
+}
+
+TEST_F(TaskAllocatorTest, HeapTopLessThanTailInsufficientSpace) {
+    auto r1 = allocator.alloc(HEAP_SIZE - 64);
+    ASSERT_FALSE(r1.failed());
+    consume_up_to(descriptors, last_alive, heap_buf, WINDOW_SIZE, 1, HEAP_SIZE - 64);
+
+    auto r2 = allocator.alloc(128);
+    ASSERT_FALSE(r2.failed());
+
+    // gap = (HEAP_SIZE-64) - 128. Try to allocate more than gap.
+    auto r3 = allocator.alloc(HEAP_SIZE);
+    EXPECT_TRUE(r3.failed());
+    EXPECT_NE(error_code.load(), 0);
+}
+
+// heap_available reports max(at_end, at_begin), not the sum -- a single
+// allocation cannot split across the wrap boundary.
+TEST_F(TaskAllocatorTest, AvailableReportsMaxNotSum) {
+    auto r1 = allocator.alloc(3008);
+    ASSERT_FALSE(r1.failed());
+    uint64_t actual_top = allocator.heap_top();
+
+    consume_up_to(descriptors, last_alive, heap_buf, WINDOW_SIZE, 1, 1024);
+
+    auto r_probe = allocator.alloc(0);
+    ASSERT_FALSE(r_probe.failed());
+
+    uint64_t avail = allocator.heap_available();
+    uint64_t at_end = HEAP_SIZE - actual_top;
+    uint64_t at_begin = 1024;
+    EXPECT_EQ(avail, std::max(at_end, at_begin));
+    EXPECT_LT(avail, at_end + at_begin);
+}
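+// -----------------------------------------------------------------------------
+// Reviewer sketch (illustrative only, hypothetical names; the real logic lives
+// in pto_ring_buffer): the strict-inequality guards the tests above document.
+// Letting top land exactly on tail would make a full ring indistinguishable
+// from an empty one, since top == tail is the canonical "empty" encoding.
+static bool example_can_alloc(uint64_t top, uint64_t tail, uint64_t size, uint64_t n) {
+    if (top >= tail) {
+        if (size - top >= n) return true;  // fits linearly at the end
+        return tail > n;                   // wrap: strict >, never land on tail
+    }
+    return tail - top > n;                 // linear gap: strict > for same reason
+}
+// -----------------------------------------------------------------------------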
+// Zero-size allocs return the same address and don't advance the top.
+TEST_F(TaskAllocatorTest, ZeroSizeAllocationAliased) {
+    auto r1 = allocator.alloc(0);
+    auto r2 = allocator.alloc(0);
+    ASSERT_FALSE(r1.failed());
+    ASSERT_FALSE(r2.failed());
+
+    EXPECT_EQ(r1.packed_base, r2.packed_base) << "Zero-size allocs return same address";
+    EXPECT_EQ(r1.packed_base, r1.packed_end) << "packed_end == packed_base for zero-size";
+    EXPECT_EQ(allocator.heap_top(), 0u) << "top doesn't advance for zero-size allocs";
+}
+
+// Wrap path: wasted space between old top and heap_size is not reclaimed.
+TEST_F(TaskAllocatorTest, WrapPathWastedSpace) {
+    auto r1 = allocator.alloc(4000);
+    ASSERT_FALSE(r1.failed());
+    uint64_t top_after = allocator.heap_top();
+    EXPECT_GE(top_after, 4000u);
+    EXPECT_LT(top_after, HEAP_SIZE);
+
+    consume_up_to(descriptors, last_alive, heap_buf, WINDOW_SIZE, 1, top_after);
+
+    auto r2 = allocator.alloc(128);
+    ASSERT_FALSE(r2.failed());
+    EXPECT_EQ(r2.packed_base, static_cast<void *>(heap_buf)) << "Allocation wrapped to beginning";
+
+    uint64_t avail = allocator.heap_available();
+    EXPECT_LT(avail, HEAP_SIZE) << "Wasted space at end reduces available capacity";
+}
+
+TEST_F(TaskAllocatorTest, AllocExactlyHeapSize) {
+    auto r1 = allocator.alloc(HEAP_SIZE);
+    ASSERT_FALSE(r1.failed());
+    EXPECT_EQ(r1.packed_base, static_cast<void *>(heap_buf));
+    EXPECT_EQ(allocator.heap_top(), HEAP_SIZE);
+
+    auto r2 = allocator.alloc(64);
+    EXPECT_TRUE(r2.failed()) << "No space after full allocation";
+    EXPECT_EQ(error_code.load(), PTO2_ERROR_HEAP_RING_DEADLOCK);
+}
+
+TEST_F(TaskAllocatorTest, AllocLargerThanHeap) {
+    auto r = allocator.alloc(HEAP_SIZE * 2);
+    EXPECT_TRUE(r.failed()) << "Cannot allocate more than heap size";
+    EXPECT_EQ(error_code.load(), PTO2_ERROR_HEAP_RING_DEADLOCK);
+}
+
+TEST_F(TaskAllocatorTest, TaskWindowSaturates) {
+    for (int i = 0; i < WINDOW_SIZE - 1; i++) {
+        auto r = allocator.alloc(0);
+        ASSERT_FALSE(r.failed()) << "Alloc failed at i=" << i;
+        EXPECT_EQ(r.task_id, i);
+    }
+    EXPECT_EQ(allocator.active_count(), WINDOW_SIZE - 1);
+
+    auto overflow = allocator.alloc(0);
+    EXPECT_TRUE(overflow.failed());
+    EXPECT_EQ(error_code.load(), PTO2_ERROR_FLOW_CONTROL_DEADLOCK);
+}
+
+// Task IDs grow monotonically as int32_t. Near INT32_MAX the counter would
+// eventually overflow, but this is cosmetic for slot mapping, since indexing
+// uses task_id & window_mask and so always stays in range.
+TEST_F(TaskAllocatorTest, TaskIdNearInt32Max) {
+    current_index.store(INT32_MAX - 2);
+    last_alive.store(INT32_MAX - 2);
+    allocator.init(descriptors.data(), WINDOW_SIZE, &current_index, &last_alive, heap_buf, HEAP_SIZE, &error_code);
+
+    auto r1 = allocator.alloc(0);
+    ASSERT_FALSE(r1.failed());
+    EXPECT_EQ(r1.task_id, INT32_MAX - 2);
+    EXPECT_EQ(r1.slot, (INT32_MAX - 2) & (WINDOW_SIZE - 1));
+
+    auto r2 = allocator.alloc(0);
+    ASSERT_FALSE(r2.failed());
+    EXPECT_EQ(r2.task_id, INT32_MAX - 1);
+
+    auto r3 = allocator.alloc(0);
+    ASSERT_FALSE(r3.failed());
+    EXPECT_EQ(r3.task_id, INT32_MAX);
+    EXPECT_GE(r3.slot, 0);
+    EXPECT_LT(r3.slot, WINDOW_SIZE);
+}
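+// -----------------------------------------------------------------------------
+// Reviewer note (illustrative only): with a power-of-two window, slot mapping
+// needs only the low bits of the id, so it stays in [0, window) however large
+// the id grows -- the property TaskIdNearInt32Max relies on above.
+static_assert((INT32_MAX & (16 - 1)) == 15, "masking keeps slots in [0, window)");
+// -----------------------------------------------------------------------------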
diff --git a/tests/ut/cpp/a5/test_task_state.cpp b/tests/ut/cpp/a5/test_task_state.cpp
new file mode 100644
index 000000000..7c468a9e7
--- /dev/null
+++ b/tests/ut/cpp/a5/test_task_state.cpp
@@ -0,0 +1,201 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Unit tests for PTO2TaskSlotState lifecycle through the PTO2SchedulerState API.
+ *
+ * These tests drive state transitions via src methods (release_fanin,
+ * on_subtask_complete, check_and_handle_consumed) rather than manipulating
+ * atomic fields by hand. For concurrent exactly-once semantics of
+ * fanin/subtask/fanout, see test_scheduler_state.cpp, which already
+ * covers those paths via the same API.
+ *
+ * This file focuses on:
+ * - Full lifecycle through the src API
+ * - Non-profiling ready path behavior (task_state stays PENDING)
+ * - Double subtask completion (counter-model weakness)
+ */
+
+#include <gtest/gtest.h>
+
+#include <atomic>
+#include <cstdint>
+#include <cstring>
+#include <thread>
+
+#include "scheduler/pto_scheduler.h"
+
+class TaskStateTest : public ::testing::Test {
+protected:
+    PTO2SchedulerState sched;
+    PTO2SharedMemoryHandle *sm_handle = nullptr;
+
+    void SetUp() override {
+        sm_handle = pto2_sm_create_default();
+        ASSERT_NE(sm_handle, nullptr);
+        bool ok = pto2_scheduler_init(&sched, sm_handle->header);
+        ASSERT_TRUE(ok);
+    }
+
+    void TearDown() override {
+        pto2_scheduler_destroy(&sched);
+        if (sm_handle) {
+            pto2_sm_destroy(sm_handle);
+        }
+    }
+
+    void init_slot(PTO2TaskSlotState &slot, PTO2TaskState state, int32_t fanin_count, int32_t fanout_count) {
+        memset(&slot, 0, sizeof(slot));
+        slot.task_state.store(state);
+        slot.fanin_count = fanin_count;
+        slot.fanin_refcount.store(0);
+        slot.fanout_count = fanout_count;
+        slot.fanout_refcount.store(0);
+        slot.fanout_lock.store(0);
+        slot.fanout_head = nullptr;
+        slot.ring_id = 0;
+        slot.active_mask = PTO2_SUBTASK_MASK_AIC;
+        slot.completed_subtasks.store(0);
+        slot.total_required_subtasks = 1;
+        slot.logical_block_num = 1;
+    }
+};
+
+// =============================================================================
+// Full lifecycle through src API: PENDING -> (fanin) -> READY-equivalent
+// -> (subtask) -> COMPLETED -> (fanout) -> CONSUMED
+// =============================================================================
+TEST_F(TaskStateTest, FullLifecycleThroughAPI) {
+    alignas(64) PTO2TaskSlotState slot;
+    init_slot(slot, PTO2_TASK_PENDING, 1, 1);
+    slot.total_required_subtasks = 1;
+    slot.completed_subtasks.store(0);
+
+    // Fanin satisfied -> task becomes ready
+    bool ready = sched.release_fanin_and_check_ready(slot);
+    EXPECT_TRUE(ready);
+
+    // Subtask completes -> task done
+    bool done = sched.on_subtask_complete(slot);
+    EXPECT_TRUE(done);
+
+    // Manually transition to COMPLETED (normally done by the scheduler dispatch loop)
+    slot.task_state.store(PTO2_TASK_COMPLETED, std::memory_order_release);
+
+    // Fanout released -> CONSUMED
+    sched.release_producer(slot);
+    EXPECT_EQ(slot.task_state.load(), PTO2_TASK_CONSUMED);
+}
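+// -----------------------------------------------------------------------------
+// Reviewer sketch (illustrative only, hypothetical name -- not the scheduler's
+// code): the readiness contract the lifecycle above depends on. Readiness
+// fires for exactly one caller: the one whose increment makes the refcount
+// reach fanin_count.
+static bool example_release_and_check(std::atomic<int32_t> &rc, int32_t fanin_count) {
+    return rc.fetch_add(1, std::memory_order_acq_rel) + 1 == fanin_count;
+}
+// -----------------------------------------------------------------------------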
+// =============================================================================
+// Non-profiling release_fanin does not CAS task_state to READY.
+//
+// Readiness is determined solely by fanin_refcount reaching fanin_count.
+// task_state stays PENDING after the non-profiling ready path. This is
+// correct by design -- the profiling overload adds the CAS only to count
+// atomic operations.
+// =============================================================================
+TEST_F(TaskStateTest, NonProfilingReadyPathStaysPending) {
+    alignas(64) PTO2TaskSlotState slot;
+    init_slot(slot, PTO2_TASK_PENDING, 1, 1);
+
+    bool ready = sched.release_fanin_and_check_ready(slot);
+    ASSERT_TRUE(ready) << "Task should be detected as ready via refcount";
+
+    // task_state remains PENDING -- this is correct by design.
+    EXPECT_EQ(slot.task_state.load(), PTO2_TASK_PENDING)
+        << "Non-profiling path intentionally does not transition task_state to READY";
+}
+
+// =============================================================================
+// Multi-fanin: partial release does not trigger ready
+// =============================================================================
+TEST_F(TaskStateTest, MultiFaninPartialNotReady) {
+    alignas(64) PTO2TaskSlotState slot;
+    init_slot(slot, PTO2_TASK_PENDING, 3, 1);
+
+    EXPECT_FALSE(sched.release_fanin_and_check_ready(slot));
+    EXPECT_FALSE(sched.release_fanin_and_check_ready(slot));
+    EXPECT_TRUE(sched.release_fanin_and_check_ready(slot));
+}
+
+// =============================================================================
+// Concurrent fanin: exactly one thread detects ready (via src API)
+// =============================================================================
+TEST_F(TaskStateTest, ConcurrentFaninExactlyOneReady) {
+    constexpr int ROUNDS = 500;
+
+    for (int round = 0; round < ROUNDS; round++) {
+        alignas(64) PTO2TaskSlotState slot;
+        init_slot(slot, PTO2_TASK_PENDING, 3, 1);
+        std::atomic<int> ready_count{0};
+
+        auto release = [&]() {
+            if (sched.release_fanin_and_check_ready(slot)) {
+                ready_count.fetch_add(1);
+            }
+        };
+
+        std::thread t1(release), t2(release), t3(release);
+        t1.join();
+        t2.join();
+        t3.join();
+
+        EXPECT_EQ(ready_count.load(), 1) << "Round " << round;
+    }
+}
+
+// =============================================================================
+// Concurrent subtask completion: exactly one thread sees done (via src API)
+// =============================================================================
+TEST_F(TaskStateTest, ConcurrentSubtaskCompletion) {
+    constexpr int ROUNDS = 500;
+
+    for (int round = 0; round < ROUNDS; round++) {
+        alignas(64) PTO2TaskSlotState slot;
+        init_slot(slot, PTO2_TASK_RUNNING, 1, 1);
+        slot.total_required_subtasks = 3;
+        slot.completed_subtasks.store(0);
+        std::atomic<int> done_count{0};
+
+        auto complete = [&]() {
+            if (sched.on_subtask_complete(slot)) {
+                done_count.fetch_add(1);
+            }
+        };
+
+        std::thread t1(complete), t2(complete), t3(complete);
+        t1.join();
+        t2.join();
+        t3.join();
+
+        EXPECT_EQ(done_count.load(), 1) << "Round " << round;
+        EXPECT_EQ(slot.completed_subtasks.load(), 3);
+    }
+}
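+// -----------------------------------------------------------------------------
+// Reviewer sketch (illustrative only, hypothetical name): the subtask counter
+// contract exercised above. Like fanin release, completion fires for exactly
+// one thread, but the counter has no memory of WHICH subtask completed --
+// which is what the next test demonstrates.
+static bool example_on_subtask_complete(std::atomic<int32_t> &done, int32_t total) {
+    return done.fetch_add(1, std::memory_order_acq_rel) + 1 == total;
+}
+// -----------------------------------------------------------------------------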
+// =============================================================================
+// Double subtask completion (counter-model weakness).
+// With the counter model, double-completing the same subtask increments
+// completed_subtasks twice, potentially reaching total prematurely.
+// Unlike the old bitmask model, the counter cannot detect duplicates.
+// =============================================================================
+TEST_F(TaskStateTest, DoubleSubtaskCompletionCounterWeakness) {
+    alignas(64) PTO2TaskSlotState slot;
+    init_slot(slot, PTO2_TASK_RUNNING, 1, 1);
+    slot.total_required_subtasks = 2;
+    slot.completed_subtasks.store(0);
+
+    // First subtask completion
+    bool done1 = sched.on_subtask_complete(slot);
+    EXPECT_FALSE(done1) << "Single completion doesn't complete the task";
+
+    // Same subtask completes AGAIN (logic error at caller level)
+    bool done2 = sched.on_subtask_complete(slot);
+    EXPECT_TRUE(done2) << "Counter model: double-completion falsely triggers done";
+}
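+// -----------------------------------------------------------------------------
+// Reviewer sketch (illustrative only, hypothetical names): contrast with a
+// bitmask model of the kind the comment above references, which does detect
+// the duplicate, because fetch_or reports whether the bit was already set.
+static bool example_bitmask_complete(std::atomic<uint32_t> &mask, uint32_t bit, uint32_t all) {
+    uint32_t prev = mask.fetch_or(bit, std::memory_order_acq_rel);
+    if (prev & bit) return false;  // duplicate completion: detected and ignored
+    return (prev | bit) == all;
+}
+// -----------------------------------------------------------------------------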
diff --git a/tests/ut/cpp/a5/test_tensormap.cpp b/tests/ut/cpp/a5/test_tensormap.cpp
new file mode 100644
index 000000000..10eef0317
--- /dev/null
+++ b/tests/ut/cpp/a5/test_tensormap.cpp
@@ -0,0 +1,551 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Unit tests for PTO2TensorMap from pto_tensormap.h / pto_tensormap.cpp
+ *
+ * Tests hash-table-based producer lookup with overlap detection:
+ * - Hash function distribution (golden-ratio multiplicative hash)
+ * - Insert / lookup / cleanup lifecycle
+ * - Overlap detection: fast-path (is_all_offset_zero) and slow-path (offsets)
+ * - Lazy invalidation (stale entries skipped, not truncated)
+ * - Multi-ring isolation in the same hash chain
+ * - Lookup returns all matches (no silent 16-result cap post-#669)
+ * - Entry pool allocation and free-list recycling
+ * - cleanup_retired correctness across task windows
+ */
+
+#include <gtest/gtest.h>
+
+#include <cstdint>
+#include <set>
+#include <vector>
+
+#include "pto_orchestration_api.h"
+#include "pto_tensormap.h"
+
+// =============================================================================
+// Helpers
+// =============================================================================
+
+// Test-local mirror of the old stack-buffered lookup result. PR #669 removed
+// PTO2LookupResult in favor of a callback-based API; these tests collect
+// matches into a vector-like struct so assertions remain readable.
+struct TestLookupResult {
+    struct Entry {
+        PTO2TensorMapEntry *entry;
+        OverlapStatus overlap_status;
+    };
+    std::vector<Entry> entries;
+    int count = 0;
+};
+
+static void run_lookup(PTO2TensorMap &tmap, const Tensor &tensor, TestLookupResult &out) {
+    tmap.lookup(tensor, [&](PTO2TensorMapEntry &e, OverlapStatus s) -> bool {
+        out.entries.push_back({&e, s});
+        out.count++;
+        return true;
+    });
+}
+
+static Tensor make_test_tensor(uint64_t addr, uint32_t shape0, uint32_t ndims = 1, int32_t version = 0) {
+    uint32_t shapes[RUNTIME_MAX_TENSOR_DIMS] = {shape0};
+    return make_tensor_external(reinterpret_cast<void *>(addr), shapes, ndims, DataType::FLOAT32, false, version);
+}
+
+static Tensor make_test_tensor_2d(uint64_t addr, uint32_t s0, uint32_t s1, int32_t version = 0) {
+    uint32_t shapes[RUNTIME_MAX_TENSOR_DIMS] = {s0, s1};
+    return make_tensor_external(reinterpret_cast<void *>(addr), shapes, 2, DataType::FLOAT32, false, version);
+}
+
+// =============================================================================
+// Fixture
+// =============================================================================
+
+class TensorMapTest : public ::testing::Test {
+protected:
+    static constexpr int32_t NUM_BUCKETS = 16;
+    static constexpr int32_t POOL_SIZE = 64;
+    static constexpr int32_t WINDOW_SIZE = 32;
+
+    PTO2TensorMap tmap{};
+
+    void SetUp() override {
+        int32_t window_sizes[PTO2_MAX_RING_DEPTH] = {WINDOW_SIZE, WINDOW_SIZE, WINDOW_SIZE, WINDOW_SIZE};
+        ASSERT_TRUE(tmap.init(NUM_BUCKETS, POOL_SIZE, window_sizes));
+    }
+
+    void TearDown() override { tmap.destroy(); }
+};
+
+// =============================================================================
+// Initialization
+// =============================================================================
+
+TEST_F(TensorMapTest, InitValidState) {
+    EXPECT_EQ(tmap.num_buckets, NUM_BUCKETS);
+    EXPECT_EQ(tmap.pool_size, POOL_SIZE);
+    EXPECT_EQ(tmap.next_entry_idx, 0);
+    EXPECT_EQ(tmap.free_num, 0);
+    EXPECT_EQ(tmap.valid_count(), 0);
+}
+
+TEST_F(TensorMapTest, InitRequiresPowerOfTwoBuckets) {
+    PTO2TensorMap bad{};
+    int32_t ws[PTO2_MAX_RING_DEPTH] = {8, 8, 8, 8};
+    EXPECT_FALSE(bad.init(3, 64, ws)) << "non-power-of-2 bucket count must fail";
+    EXPECT_FALSE(bad.init(7, 64, ws));
+    EXPECT_TRUE(bad.init(8, 64, ws));
+    bad.destroy();
+}
+
+// =============================================================================
+// Hash function
+// =============================================================================
+
+TEST_F(TensorMapTest, HashDeterministic) {
+    uint64_t addr = 0x1000;
+    EXPECT_EQ(tmap.hash(addr), tmap.hash(addr));
+}
+
+TEST_F(TensorMapTest, HashDistributesAlignedAddresses) {
+    std::set<uint32_t> hit_buckets;
+    // Aligned addresses (64KB stride) should still distribute across buckets
+    for (uint64_t i = 0; i < 64; i++) {
+        uint64_t addr = i * 65536;
+        hit_buckets.insert(tmap.hash(addr));
+    }
+    // With the golden-ratio hash, 64 aligned addresses across 16 buckets
+    // should hit at least 12 distinct buckets
+    EXPECT_GE(hit_buckets.size(), 12u) << "Aligned addresses must distribute well";
+}
+
+TEST_F(TensorMapTest, HashBoundedByBucketCount) {
+    for (uint64_t addr = 0; addr < 1000; addr++) {
+        EXPECT_LT(tmap.hash(addr), static_cast<uint32_t>(NUM_BUCKETS));
+    }
+}
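+// -----------------------------------------------------------------------------
+// Reviewer sketch (illustrative only): a golden-ratio multiplicative hash of
+// the kind the file doc comment describes. The constant is the textbook
+// 64-bit Fibonacci-hashing multiplier, not necessarily the map's exact code;
+// high multiplier bits mix even when the low address bits are all zero, which
+// is why 64 KiB-aligned addresses still spread across buckets above.
+static uint32_t example_golden_hash(uint64_t addr, uint32_t num_buckets /* power of two */) {
+    return static_cast<uint32_t>((addr * 0x9E3779B97F4A7C15ull) >> 32) & (num_buckets - 1);
+}
+// -----------------------------------------------------------------------------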
+// =============================================================================
+// Insert and lookup: basic
+// =============================================================================
+
+TEST_F(TensorMapTest, InsertThenLookupFindsProducer) {
+    Tensor t = make_test_tensor(0x1000, 256);
+    PTO2TaskId tid = PTO2TaskId::make(0, 0);
+    tmap.insert(t, tid);
+
+    TestLookupResult result;
+    run_lookup(tmap, t, result);
+    ASSERT_EQ(result.count, 1);
+    EXPECT_EQ(result.entries[0].entry->producer_task_id, tid);
+}
+
+TEST_F(TensorMapTest, LookupEmptyReturnsZero) {
+    Tensor t = make_test_tensor(0x1000, 256);
+    TestLookupResult result;
+    run_lookup(tmap, t, result);
+    EXPECT_EQ(result.count, 0);
+}
+
+TEST_F(TensorMapTest, InsertMultipleSameBuffer) {
+    Tensor t1 = make_test_tensor(0x1000, 256);
+    Tensor t2 = make_test_tensor(0x1000, 128);
+    PTO2TaskId tid1 = PTO2TaskId::make(0, 0);
+    PTO2TaskId tid2 = PTO2TaskId::make(0, 1);
+
+    tmap.insert(t1, tid1);
+    tmap.insert(t2, tid2);
+
+    TestLookupResult result;
+    run_lookup(tmap, t1, result);
+    // Both entries share the same buffer_addr, so both should be found
+    EXPECT_EQ(result.count, 2);
+}
+
+TEST_F(TensorMapTest, InsertDifferentBuffersNoCollision) {
+    Tensor t1 = make_test_tensor(0x1000, 256);
+    Tensor t2 = make_test_tensor(0x2000, 256);
+    tmap.insert(t1, PTO2TaskId::make(0, 0));
+    tmap.insert(t2, PTO2TaskId::make(0, 1));
+
+    TestLookupResult r1;
+    run_lookup(tmap, t1, r1);
+    EXPECT_EQ(r1.count, 1);
+    EXPECT_EQ(r1.entries[0].entry->producer_task_id, PTO2TaskId::make(0, 0));
+
+    TestLookupResult r2;
+    run_lookup(tmap, t2, r2);
+    EXPECT_EQ(r2.count, 1);
+    EXPECT_EQ(r2.entries[0].entry->producer_task_id, PTO2TaskId::make(0, 1));
+}
+
+// =============================================================================
+// Overlap detection: fast path (is_all_offset_zero)
+// =============================================================================
+
+TEST_F(TensorMapTest, OverlapFastPathCovered) {
+    // Producer output: shape [256], consumer input: shape [512]
+    // Consumer covers producer -> COVERED
+    Tensor producer = make_test_tensor(0x1000, 256);
+    Tensor consumer = make_test_tensor(0x1000, 512);
+    tmap.insert(producer, PTO2TaskId::make(0, 0));
+
+    TestLookupResult result;
+    run_lookup(tmap, consumer, result);
+    ASSERT_EQ(result.count, 1);
+    EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::COVERED);
+}
+
+TEST_F(TensorMapTest, OverlapFastPathOther) {
+    // Producer output: shape [512], consumer input: shape [256]
+    // Consumer does NOT cover producer -> OTHER
+    Tensor producer = make_test_tensor(0x1000, 512);
+    Tensor consumer = make_test_tensor(0x1000, 256);
+    tmap.insert(producer, PTO2TaskId::make(0, 0));
+
+    TestLookupResult result;
+    run_lookup(tmap, consumer, result);
+    ASSERT_EQ(result.count, 1);
+    EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::OTHER);
+}
+
+TEST_F(TensorMapTest, OverlapFastPathExactMatch) {
+    Tensor t = make_test_tensor(0x1000, 256);
+    tmap.insert(t, PTO2TaskId::make(0, 0));
+
+    TestLookupResult result;
+    run_lookup(tmap, t, result);
+    ASSERT_EQ(result.count, 1);
+    EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::COVERED);
+}
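+// -----------------------------------------------------------------------------
+// Reviewer sketch (illustrative only, hypothetical name): the zero-offset fast
+// path exercised above. With both tensors starting at the same base address,
+// the consumer covers the producer iff its extent is at least as large;
+// otherwise the regions overlap but the status is OTHER.
+static bool example_fast_path_covered(uint64_t producer_len, uint64_t consumer_len) {
+    return consumer_len >= producer_len;
+}
+// -----------------------------------------------------------------------------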
+// =============================================================================
+// Overlap detection: slow path (offsets via view)
+// =============================================================================
+
+TEST_F(TensorMapTest, OverlapSlowPathNoOverlap) {
+    // Producer writes [0..128), consumer reads [128..256) -> NO_OVERLAP
+    Tensor base = make_test_tensor_2d(0x1000, 256, 1);
+    uint32_t prod_shapes[] = {128, 1};
+    uint32_t prod_offsets[] = {0, 0};
+    Tensor producer = base.view(prod_shapes, prod_offsets);
+
+    uint32_t con_shapes[] = {128, 1};
+    uint32_t con_offsets[] = {128, 0};
+    Tensor consumer = base.view(con_shapes, con_offsets);
+
+    tmap.insert(producer, PTO2TaskId::make(0, 0));
+
+    TestLookupResult result;
+    run_lookup(tmap, consumer, result);
+    EXPECT_EQ(result.count, 0) << "Non-overlapping regions must return no results";
+}
+
+TEST_F(TensorMapTest, OverlapSlowPathPartialOverlap) {
+    // Producer writes [0..192), consumer reads [64..256) -> overlapping, OTHER
+    Tensor base = make_test_tensor_2d(0x1000, 256, 1);
+    uint32_t prod_shapes[] = {192, 1};
+    uint32_t prod_offsets[] = {0, 0};
+    Tensor producer = base.view(prod_shapes, prod_offsets);
+
+    uint32_t con_shapes[] = {192, 1};
+    uint32_t con_offsets[] = {64, 0};
+    Tensor consumer = base.view(con_shapes, con_offsets);
+
+    tmap.insert(producer, PTO2TaskId::make(0, 0));
+
+    TestLookupResult result;
+    run_lookup(tmap, consumer, result);
+    ASSERT_EQ(result.count, 1);
+    EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::OTHER);
+}
+
+TEST_F(TensorMapTest, OverlapSlowPathCovered) {
+    // Producer writes [64..192), consumer reads [0..256) -> consumer covers producer
+    Tensor base = make_test_tensor_2d(0x1000, 256, 1);
+    uint32_t prod_shapes[] = {128, 1};
+    uint32_t prod_offsets[] = {64, 0};
+    Tensor producer = base.view(prod_shapes, prod_offsets);
+
+    uint32_t con_shapes[] = {256, 1};
+    uint32_t con_offsets[] = {0, 0};
+    Tensor consumer = base.view(con_shapes, con_offsets);
+
+    tmap.insert(producer, PTO2TaskId::make(0, 0));
+
+    TestLookupResult result;
+    run_lookup(tmap, consumer, result);
+    ASSERT_EQ(result.count, 1);
+    EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::COVERED);
+}
+
+// =============================================================================
+// Version-based overlap detection
+// =============================================================================
+
+TEST_F(TensorMapTest, VersionMismatchReturnsOther) {
+    // Producer v0, consumer v1 -> always OTHER regardless of shape match
+    Tensor producer = make_test_tensor(0x1000, 256, 1, 0);
+    Tensor consumer = make_test_tensor(0x1000, 256, 1, 1);
+
+    tmap.insert(producer, PTO2TaskId::make(0, 0));
+
+    TestLookupResult result;
+    run_lookup(tmap, consumer, result);
+    ASSERT_EQ(result.count, 1);
+    EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::OTHER);
+}
+
+// =============================================================================
+// Lazy invalidation
+// =============================================================================
+
+TEST_F(TensorMapTest, StaleEntriesSkippedDuringLookup) {
+    Tensor t = make_test_tensor(0x1000, 256);
+    tmap.insert(t, PTO2TaskId::make(0, 0));
+    tmap.insert(t, PTO2TaskId::make(0, 1));
+
+    // Advance validity to skip task 0
+    tmap.sync_validity(0, 1);
+
+    TestLookupResult result;
+    run_lookup(tmap, t, result);
+    ASSERT_EQ(result.count, 1);
+    EXPECT_EQ(result.entries[0].entry->producer_task_id, PTO2TaskId::make(0, 1));
+}
+
+TEST_F(TensorMapTest, StaleEntriesNotTruncatedAcrossRings) {
+    Tensor t = make_test_tensor(0x1000, 256);
+    // Ring 0, task 0 and Ring 1, task 0 -> same bucket
+    tmap.insert(t, PTO2TaskId::make(0, 0));
+    tmap.insert(t, PTO2TaskId::make(1, 0));
+
+    // Invalidate ring 0 only
+    tmap.sync_validity(0, 1);
+
+    TestLookupResult result;
+    run_lookup(tmap, t, result);
+    // Ring 1 task 0 still valid, ring 0 task 0 invalidated
+    ASSERT_EQ(result.count, 1);
+    EXPECT_EQ(result.entries[0].entry->producer_task_id, PTO2TaskId::make(1, 0));
+}
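+// -----------------------------------------------------------------------------
+// Reviewer sketch (illustrative only, hypothetical names; the real check works
+// on per-dimension offsets and shapes): a 1-D model of the slow-path
+// classification above, with producer writes [p0, p1) and consumer reads
+// [c0, c1).
+enum class ExampleOverlap { NONE, COVERED, OTHER };
+static ExampleOverlap example_classify(uint64_t p0, uint64_t p1, uint64_t c0, uint64_t c1) {
+    if (c1 <= p0 || p1 <= c0) return ExampleOverlap::NONE;     // disjoint intervals
+    if (c0 <= p0 && p1 <= c1) return ExampleOverlap::COVERED;  // consumer contains producer
+    return ExampleOverlap::OTHER;                              // partial overlap
+}
+// -----------------------------------------------------------------------------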
+// =============================================================================
+// cleanup_retired
+// =============================================================================
+
+TEST_F(TensorMapTest, CleanupRetiredRemovesEntriesForRetiredTasks) {
+    Tensor t = make_test_tensor(0x1000, 256);
+    tmap.insert(t, PTO2TaskId::make(0, 0));
+    tmap.insert(t, PTO2TaskId::make(0, 1));
+    tmap.insert(t, PTO2TaskId::make(0, 2));
+    EXPECT_EQ(tmap.valid_count(), 3);
+
+    // Cleanup tasks [0, 2) on ring 0
+    tmap.cleanup_retired(0, 0, 2);
+
+    EXPECT_EQ(tmap.valid_count(), 1);
+
+    TestLookupResult result;
+    run_lookup(tmap, t, result);
+    ASSERT_EQ(result.count, 1);
+    EXPECT_EQ(result.entries[0].entry->producer_task_id, PTO2TaskId::make(0, 2));
+}
+
+TEST_F(TensorMapTest, CleanupRetiredPreservesOtherRings) {
+    Tensor t = make_test_tensor(0x1000, 256);
+    tmap.insert(t, PTO2TaskId::make(0, 0));
+    tmap.insert(t, PTO2TaskId::make(1, 0));
+
+    tmap.cleanup_retired(0, 0, 1);
+
+    EXPECT_EQ(tmap.valid_count(), 1);
+
+    TestLookupResult result;
+    run_lookup(tmap, t, result);
+    ASSERT_EQ(result.count, 1);
+    EXPECT_EQ(result.entries[0].entry->producer_task_id, PTO2TaskId::make(1, 0));
+}
+
+TEST_F(TensorMapTest, CleanupRetiredFreesEntriesToPool) {
+    Tensor t = make_test_tensor(0x1000, 256);
+    tmap.insert(t, PTO2TaskId::make(0, 0));
+    EXPECT_EQ(tmap.free_num, 0);
+    EXPECT_EQ(tmap.next_entry_idx, 1);
+
+    tmap.cleanup_retired(0, 0, 1);
+
+    EXPECT_EQ(tmap.free_num, 1) << "Cleaned entry should be in free list";
+
+    // New insert should reuse the free entry instead of allocating fresh
+    tmap.insert(t, PTO2TaskId::make(0, 1));
+    EXPECT_EQ(tmap.free_num, 0);
+    EXPECT_EQ(tmap.next_entry_idx, 1) << "Should reuse freed entry, not allocate new";
+}
+
+// =============================================================================
+// Multi-ring isolation
+// =============================================================================
+
+TEST_F(TensorMapTest, MultiRingIndependentLookup) {
+    Tensor t = make_test_tensor(0x1000, 256);
+    tmap.insert(t, PTO2TaskId::make(0, 5));
+    tmap.insert(t, PTO2TaskId::make(1, 3));
+    tmap.insert(t, PTO2TaskId::make(2, 7));
+
+    TestLookupResult result;
+    run_lookup(tmap, t, result);
+    EXPECT_EQ(result.count, 3);
+
+    // Invalidate ring 0 up to task 6 and ring 2 up to task 8
+    tmap.sync_validity(0, 6);
+    tmap.sync_validity(2, 8);
+
+    TestLookupResult result2;
+    run_lookup(tmap, t, result2);
+    EXPECT_EQ(result2.count, 1);
+    EXPECT_EQ(result2.entries[0].entry->producer_task_id, PTO2TaskId::make(1, 3));
+}
+
+// =============================================================================
+// Lookup returns all matches (PR #669 removed the 16-slot cap)
+// =============================================================================
+
+TEST_F(TensorMapTest, LookupReturnsAllMatches) {
+    Tensor t = make_test_tensor(0x1000, 256);
+    // Insert 20 entries for the same buffer (was capped at 16 before #669)
+    for (int i = 0; i < 20; i++) {
+        tmap.insert(t, PTO2TaskId::make(0, i));
+    }
+
+    TestLookupResult result;
+    run_lookup(tmap, t, result);
+    EXPECT_EQ(result.count, 20) << "Lookup must return every overlapping entry, no silent cap";
+}
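+// -----------------------------------------------------------------------------
+// Reviewer sketch (illustrative only, hypothetical names): the pool-plus-
+// free-list discipline asserted above. Freed entries are recycled before
+// next_entry_idx bumps, which is exactly what CleanupRetiredFreesEntriesToPool
+// and FreeListRecycling check.
+static int32_t example_pool_alloc(int32_t &next_idx, std::vector<int32_t> &free_list) {
+    if (!free_list.empty()) {
+        int32_t idx = free_list.back();  // reuse the most recently freed entry
+        free_list.pop_back();
+        return idx;
+    }
+    return next_idx++;  // otherwise bump a fresh entry from the pool
+}
+// -----------------------------------------------------------------------------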
+// =============================================================================
+// Entry pool lifecycle
+// =============================================================================
+
+TEST_F(TensorMapTest, PoolExhaustionAsserts) {
+    // With pool_size=64, inserting 64 entries should work; the 65th should fail
+    for (int i = 0; i < POOL_SIZE; i++) {
+        Tensor t = make_test_tensor(0x1000 + i * 0x100, 256);
+        tmap.insert(t, PTO2TaskId::make(0, i));
+    }
+    EXPECT_EQ(tmap.next_entry_idx, POOL_SIZE);
+    EXPECT_EQ(tmap.free_num, 0);
+
+    // The 65th insert should trigger always_assert (pool overflow)
+    Tensor overflow = make_test_tensor(0x9000, 256);
+    EXPECT_THROW(tmap.insert(overflow, PTO2TaskId::make(0, POOL_SIZE)), std::runtime_error);
+}
+
+TEST_F(TensorMapTest, FreeListRecycling) {
+    Tensor t = make_test_tensor(0x1000, 256);
+    // Insert and cleanup 10 entries
+    for (int i = 0; i < 10; i++) {
+        tmap.insert(t, PTO2TaskId::make(0, i));
+    }
+    tmap.cleanup_retired(0, 0, 10);
+    EXPECT_EQ(tmap.free_num, 10);
+
+    // Re-insert should use the free list
+    for (int i = 10; i < 20; i++) {
+        tmap.insert(t, PTO2TaskId::make(0, i));
+    }
+    EXPECT_EQ(tmap.free_num, 0);
+    EXPECT_EQ(tmap.next_entry_idx, 10) << "No new pool entries consumed when free list available";
+}
+
+// =============================================================================
+// Task chain integrity (per-task entry list)
+// =============================================================================
+
+TEST_F(TensorMapTest, PerTaskEntryListTracksMultipleOutputs) {
+    Tensor t1 = make_test_tensor(0x1000, 256);
+    Tensor t2 = make_test_tensor(0x2000, 128);
+    PTO2TaskId tid = PTO2TaskId::make(0, 5);
+
+    tmap.insert(t1, tid);
+    tmap.insert(t2, tid);
+    EXPECT_EQ(tmap.valid_count(), 2);
+
+    // Cleanup of task 5 should remove both entries
+    tmap.cleanup_retired(0, 5, 6);
+    EXPECT_EQ(tmap.valid_count(), 0);
+    EXPECT_EQ(tmap.free_num, 2);
+}
+
+// =============================================================================
+// Bucket chain integrity (doubly-linked list)
+// =============================================================================
+
+TEST_F(TensorMapTest, RemoveMiddleEntryPreservesChain) {
+    Tensor t = make_test_tensor(0x1000, 256);
+    PTO2TaskId tid0 = PTO2TaskId::make(0, 0);
+    PTO2TaskId tid1 = PTO2TaskId::make(0, 1);
+    PTO2TaskId tid2 = PTO2TaskId::make(0, 2);
+
+    tmap.insert(t, tid0);
+    tmap.insert(t, tid1);
+    tmap.insert(t, tid2);
+
+    // Remove the middle entry (task 1)
+    tmap.cleanup_retired(0, 1, 2);
+
+    TestLookupResult result;
+    run_lookup(tmap, t, result);
+    EXPECT_EQ(result.count, 2);
+
+    std::set<uint32_t> found_locals;
+    for (int i = 0; i < result.count; i++) {
+        found_locals.insert(result.entries[i].entry->producer_task_id.local());
+    }
+    EXPECT_TRUE(found_locals.count(0));
+    EXPECT_TRUE(found_locals.count(2));
+}
+
+// =============================================================================
+// PTO2TaskId encoding/decoding
+// =============================================================================
+
+TEST(TaskIdTest, MakeAndDecode) {
+    auto tid = PTO2TaskId::make(3, 42);
+    EXPECT_EQ(tid.ring(), 3);
+    EXPECT_EQ(tid.local(), 42u);
+}
+
+TEST(TaskIdTest, InvalidSentinel) {
+    auto inv = PTO2TaskId::invalid();
+    EXPECT_FALSE(inv.is_valid());
+    EXPECT_EQ(inv.raw, UINT64_MAX);
+}
+
+TEST(TaskIdTest, Equality) {
+    auto a = PTO2TaskId::make(1, 100);
+    auto b = PTO2TaskId::make(1, 100);
+    auto c = PTO2TaskId::make(2, 100);
+    EXPECT_EQ(a, b);
+    EXPECT_NE(a, c);
+}
+
+TEST(TaskIdTest, RingIdMaxValue) {
+    auto tid = PTO2TaskId::make(255, 0);
+    EXPECT_EQ(tid.ring(), 255);
+    EXPECT_EQ(tid.local(), 0u);
+}
+
+TEST(TaskIdTest, LocalIdMaxValue) {
+    auto tid = PTO2TaskId::make(0, UINT32_MAX);
+    EXPECT_EQ(tid.ring(), 0);
+    EXPECT_EQ(tid.local(), UINT32_MAX);
+}
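+// -----------------------------------------------------------------------------
+// Reviewer sketch (illustrative only; the real PTO2TaskId layout may differ):
+// one encoding consistent with the tests above -- ring id in the byte at bits
+// [32, 40), local id in the low 32 bits, UINT64_MAX reserved as the invalid
+// sentinel (valid ids always have their top 24 bits clear, so no collision).
+static uint64_t example_make_task_id(uint8_t ring, uint32_t local) {
+    return (static_cast<uint64_t>(ring) << 32) | local;
+}
+// e.g. example_make_task_id(3, 42) decodes back to ring 3, local 42
+// -----------------------------------------------------------------------------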
diff --git a/tests/ut/cpp/a5/test_wiring.cpp b/tests/ut/cpp/a5/test_wiring.cpp
new file mode 100644
index 000000000..964e826f8
--- /dev/null
+++ b/tests/ut/cpp/a5/test_wiring.cpp
@@ -0,0 +1,448 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Unit tests for scheduler wiring and completion paths:
+ *
+ * 1. wire_task() — fanout wiring, early-finished detection,
+ *    fanin_count initialization, ready push
+ * 2. on_mixed_task_complete() — COMPLETED transition, fanout traversal,
+ *    consumer fanin release
+ * 3. on_task_release() — fanin traversal, producer release,
+ *    self-CONSUMED check
+ * 4. advance_ring_pointers() — CONSUMED slot scan, reset_for_reuse
+ *
+ * These tests exercise the core scheduling hot-paths that previously had
+ * no unit test coverage.
+ */
+
+#include <gtest/gtest.h>
+
+#include <atomic>
+#include <cstdint>
+#include <cstring>
+#include <vector>
+
+#include "scheduler/pto_scheduler.h"
+
+// =============================================================================
+// Fixture: sets up a scheduler with shared memory and provides helpers
+// =============================================================================
+
+class WiringTest : public ::testing::Test {
+protected:
+    PTO2SchedulerState sched{};
+    PTO2SharedMemoryHandle *sm_handle = nullptr;
+
+    void SetUp() override {
+        sm_handle = pto2_sm_create_default();
+        ASSERT_NE(sm_handle, nullptr);
+        bool ok = pto2_scheduler_init(&sched, sm_handle->header);
+        ASSERT_TRUE(ok);
+    }
+
+    void TearDown() override {
+        pto2_scheduler_destroy(&sched);
+        if (sm_handle) {
+            pto2_sm_destroy(sm_handle);
+        }
+    }
+
+    // Initialize a slot for testing wiring/completion
+    void init_slot(
+        PTO2TaskSlotState &slot, PTO2TaskState state, int32_t fanin_count, int32_t fanout_count, uint8_t ring_id = 0
+    ) {
+        memset(&slot, 0, sizeof(slot));
+        slot.task_state.store(state);
+        slot.fanin_count = fanin_count;
+        slot.fanin_refcount.store(0);
+        slot.fanout_count = fanout_count;
+        slot.fanout_refcount.store(0);
+        slot.fanout_lock.store(0);
+        slot.fanout_head = nullptr;
+        slot.ring_id = ring_id;
+        slot.active_mask = PTO2_SUBTASK_MASK_AIC;
+        slot.completed_subtasks.store(0);
+        slot.total_required_subtasks = 1;
+        slot.logical_block_num = 1;
+        slot.dep_pool_mark = 0;
+    }
+};
+
+// =============================================================================
+// wire_task: no fanin (independent task)
+// =============================================================================
+TEST_F(WiringTest, WireTaskNoFaninBecomesReady) {
+    // A task with 0 actual fanins should immediately be pushed to the ready queue
+    alignas(64) PTO2TaskSlotState task_slot;
+    alignas(64) PTO2TaskPayload payload;
+    memset(&payload, 0, sizeof(payload));
+    PTO2TaskDescriptor desc{};
+
+    init_slot(task_slot, PTO2_TASK_PENDING, 0, 1);
+    payload.fanin_actual_count = 0;
+    task_slot.payload = &payload;
+    task_slot.task = &desc;
+
+    auto &rss = sched.ring_sched_states[0];
+    sched.wire_task(rss, &task_slot, 0);
+
+    // fanin_count set to 0 + 1 = 1 (the wiring "+1" sentinel)
+    EXPECT_EQ(task_slot.fanin_count, 1);
+    // fanin_refcount should be 1 (the +1 from the no-fanin path)
+    EXPECT_EQ(task_slot.fanin_refcount.load(), 1);
+
+    // Task should be in the ready queue
+    PTO2ResourceShape shape = pto2_active_mask_to_shape(task_slot.active_mask);
+    auto *popped = sched.ready_queues[static_cast<size_t>(shape)].pop();
+    EXPECT_EQ(popped, &task_slot);
+}
+
+// =============================================================================
+// wire_task: with fanin, all producers already completed (early-finished)
+// =============================================================================
+
+TEST_F(WiringTest, WireTaskAllProducersEarlyFinished) {
+    alignas(64) PTO2TaskSlotState task_slot;
+    alignas(64) PTO2TaskSlotState producer_slots[2];
+    alignas(64) PTO2TaskPayload payload;
+    memset(&payload, 0, sizeof(payload));
+    PTO2TaskDescriptor desc{};
+
+    // Set up 2 producers that are already COMPLETED
+    for (int i = 0; i < 2; i++) {
+        init_slot(producer_slots[i], PTO2_TASK_COMPLETED, 1, 2);
+    }
+
+    // Consumer task with 2 fanins
+    init_slot(task_slot, PTO2_TASK_PENDING, 0, 1);
+    payload.fanin_actual_count = 2;
+    payload.fanin_inline_slot_states[0] = &producer_slots[0];
+    payload.fanin_inline_slot_states[1] = &producer_slots[1];
+
+    task_slot.payload = &payload;
+    task_slot.task = &desc;
+
+    auto &rss = sched.ring_sched_states[0];
+    sched.wire_task(rss, &task_slot, 2);
+
+    // fanin_count = 2 + 1 = 3
+    EXPECT_EQ(task_slot.fanin_count, 3);
+    // early_finished = 2, init_rc = 2 + 1 = 3, so refcount should hit fanin_count
+    EXPECT_GE(task_slot.fanin_refcount.load(), task_slot.fanin_count);
+
+    // Task should be in the ready queue
+    PTO2ResourceShape shape = pto2_active_mask_to_shape(task_slot.active_mask);
+    auto *popped = sched.ready_queues[static_cast<size_t>(shape)].pop();
+    EXPECT_EQ(popped, &task_slot);
+}
+
+// =============================================================================
+// wire_task: with fanin, producers still pending (task NOT ready)
+// =============================================================================
+
+TEST_F(WiringTest, WireTaskProducersPendingTaskNotReady) {
+    alignas(64) PTO2TaskSlotState task_slot;
+    alignas(64) PTO2TaskSlotState producer_slots[2];
+    alignas(64) PTO2TaskPayload payload;
+    memset(&payload, 0, sizeof(payload));
+    PTO2TaskDescriptor desc{};
+
+    // Producers are RUNNING (not yet completed)
+    for (int i = 0; i < 2; i++) {
+        init_slot(producer_slots[i], PTO2_TASK_RUNNING, 1, 2);
+    }
+
+    init_slot(task_slot, PTO2_TASK_PENDING, 0, 1);
+    payload.fanin_actual_count = 2;
+    payload.fanin_inline_slot_states[0] = &producer_slots[0];
+    payload.fanin_inline_slot_states[1] = &producer_slots[1];
+    task_slot.payload = &payload;
+    task_slot.task = &desc;
+
+    auto &rss = sched.ring_sched_states[0];
+    sched.wire_task(rss, &task_slot, 2);
+
+    // fanin_count = 3 (2 + 1)
+    EXPECT_EQ(task_slot.fanin_count, 3);
+    // early_finished = 0, init_rc = 1 -> not ready
+    EXPECT_EQ(task_slot.fanin_refcount.load(), 1);
+    EXPECT_LT(task_slot.fanin_refcount.load(), task_slot.fanin_count);
+
+    // Ready queue should be empty
+    PTO2ResourceShape shape = pto2_active_mask_to_shape(task_slot.active_mask);
+    auto *popped = sched.ready_queues[static_cast<size_t>(shape)].pop();
+    EXPECT_EQ(popped, nullptr);
+
+    // Producers should have fanout_head pointing to task_slot
+    EXPECT_NE(producer_slots[0].fanout_head, nullptr);
+    EXPECT_EQ(producer_slots[0].fanout_head->slot_state, &task_slot);
+    EXPECT_NE(producer_slots[1].fanout_head, nullptr);
+    EXPECT_EQ(producer_slots[1].fanout_head->slot_state, &task_slot);
+}
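+// -----------------------------------------------------------------------------
+// Reviewer sketch (illustrative only, hypothetical names): the "+1 sentinel"
+// arithmetic the three wire_task tests above assert. wire_task counts
+// fanin_count = actual + 1 and seeds the refcount with early_finished + 1, so
+// a task is ready exactly when every still-running producer has released its
+// edge.
+static bool example_ready_after_wiring(int32_t actual, int32_t early_finished) {
+    int32_t fanin_count = actual + 1;      // +1 guards against premature ready
+    int32_t init_rc = early_finished + 1;  // +1 released by the wiring step itself
+    return init_rc == fanin_count;         // ready iff no producer is still pending
+}
+// -----------------------------------------------------------------------------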
+// =============================================================================
+// wire_task: mixed early-finished and pending producers
+// =============================================================================
+
+TEST_F(WiringTest, WireTaskMixedProducerStates) {
+    alignas(64) PTO2TaskSlotState task_slot;
+    alignas(64) PTO2TaskSlotState producers[3];
+    alignas(64) PTO2TaskPayload payload;
+    memset(&payload, 0, sizeof(payload));
+    PTO2TaskDescriptor desc{};
+
+    init_slot(producers[0], PTO2_TASK_COMPLETED, 1, 2);  // early finished
+    init_slot(producers[1], PTO2_TASK_RUNNING, 1, 2);    // still running
+    init_slot(producers[2], PTO2_TASK_CONSUMED, 1, 2);   // early finished (>= COMPLETED)
+
+    init_slot(task_slot, PTO2_TASK_PENDING, 0, 1);
+    payload.fanin_actual_count = 3;
+    for (int i = 0; i < 3; i++) {
+        payload.fanin_inline_slot_states[i] = &producers[i];
+    }
+    task_slot.payload = &payload;
+    task_slot.task = &desc;
+
+    auto &rss = sched.ring_sched_states[0];
+    sched.wire_task(rss, &task_slot, 3);
+
+    // fanin_count = 4 (3 + 1)
+    EXPECT_EQ(task_slot.fanin_count, 4);
+    // early_finished = 2 (COMPLETED + CONSUMED), init_rc = 3
+    // Not yet 4 -> not ready (one producer still running)
+    EXPECT_EQ(task_slot.fanin_refcount.load(), 3);
+
+    // Only the running producer should have the consumer in its fanout chain
+    EXPECT_EQ(producers[0].fanout_head, nullptr);  // early finished, no dep entry added
+    EXPECT_NE(producers[1].fanout_head, nullptr);  // running, dep entry added
+    EXPECT_EQ(producers[2].fanout_head, nullptr);  // early finished
+}
+
+// =============================================================================
+// on_mixed_task_complete: notifies consumers via fanout chain
+// =============================================================================
+
+TEST_F(WiringTest, OnMixedTaskCompleteNotifiesConsumers) {
+    alignas(64) PTO2TaskSlotState producer;
+    alignas(64) PTO2TaskSlotState consumer1, consumer2;
+    alignas(64) PTO2TaskPayload prod_payload;
+    memset(&prod_payload, 0, sizeof(prod_payload));
+    PTO2TaskDescriptor desc{};
+
+    // Set up a producer in RUNNING state with 2 consumers in its fanout chain
+    init_slot(producer, PTO2_TASK_RUNNING, 1, 1);
+    producer.payload = &prod_payload;
+    producer.task = &desc;
+
+    // Consumer1: needs 1 more fanin to become ready
+    init_slot(consumer1, PTO2_TASK_PENDING, 2, 1);
+    consumer1.fanin_refcount.store(1);  // 1 of 2 satisfied
+    consumer1.active_mask = PTO2_SUBTASK_MASK_AIC;
+
+    // Consumer2: this release will make it ready
+    init_slot(consumer2, PTO2_TASK_PENDING, 2, 1);
+    consumer2.fanin_refcount.store(1);  // 1 of 2 satisfied
+    consumer2.active_mask = PTO2_SUBTASK_MASK_AIC;
+
+    // Build fanout chain: producer -> consumer2 -> consumer1
+    PTO2DepListEntry dep_entries[2];
+    dep_entries[0].slot_state = &consumer1;
+    dep_entries[0].next = nullptr;
+    dep_entries[1].slot_state = &consumer2;
+    dep_entries[1].next = &dep_entries[0];
+    producer.fanout_head = &dep_entries[1];
+
+    sched.on_mixed_task_complete(producer);
+
+    // Producer should be COMPLETED
+    EXPECT_EQ(producer.task_state.load(), PTO2_TASK_COMPLETED);
+
+    // Both consumers should have fanin_refcount incremented
+    EXPECT_EQ(consumer1.fanin_refcount.load(), 2);
+    EXPECT_EQ(consumer2.fanin_refcount.load(), 2);
+
+    // Both consumers should be ready (fanin_refcount == fanin_count)
+    PTO2ResourceShape shape = pto2_active_mask_to_shape(consumer1.active_mask);
+    auto *r1 = sched.ready_queues[static_cast<size_t>(shape)].pop();
+    auto *r2 = sched.ready_queues[static_cast<size_t>(shape)].pop();
+    EXPECT_TRUE((r1 == &consumer1 && r2 == &consumer2) || (r1 == &consumer2 && r2 == &consumer1));
+}
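+// -----------------------------------------------------------------------------
+// Reviewer sketch (illustrative only, hypothetical names, single-threaded):
+// the fanout traversal shape the test above builds by hand. Completion walks
+// the producer's singly-linked dep-entry chain and releases one fanin edge per
+// consumer, enqueueing those that become ready.
+struct ExampleDep { void *consumer; ExampleDep *next; };
+static int example_walk_fanout(ExampleDep *head) {
+    int released = 0;
+    for (ExampleDep *e = head; e != nullptr; e = e->next) {
+        released++;  // the real path calls release_fanin_and_check_ready here
+    }
+    return released;
+}
+// -----------------------------------------------------------------------------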
+// =============================================================================
+// on_task_release: releases producers via fanin traversal
+// =============================================================================
+
+TEST_F(WiringTest, OnTaskReleaseReleasesProducers) {
+    alignas(64) PTO2TaskSlotState task_slot;
+    alignas(64) PTO2TaskSlotState producers[2];
+    alignas(64) PTO2TaskPayload payload;
+    memset(&payload, 0, sizeof(payload));
+    PTO2TaskDescriptor desc{};
+
+    // 2 producers, each COMPLETED with fanout_count=1
+    for (int i = 0; i < 2; i++) {
+        init_slot(producers[i], PTO2_TASK_COMPLETED, 1, 1);
+    }
+
+    init_slot(task_slot, PTO2_TASK_COMPLETED, 3, 1);
+    payload.fanin_actual_count = 2;
+    payload.fanin_inline_slot_states[0] = &producers[0];
+    payload.fanin_inline_slot_states[1] = &producers[1];
+    // Need a valid fanin_spill_pool even though we don't spill
+    PTO2FaninPool dummy_pool{};
+    PTO2FaninSpillEntry dummy_entries[4];
+    std::atomic<int32_t> dummy_error{PTO2_ERROR_NONE};
+    dummy_pool.init(dummy_entries, 4, &dummy_error);
+    payload.fanin_spill_pool = &dummy_pool;
+    task_slot.payload = &payload;
+    task_slot.task = &desc;
+
+    int32_t fanin_count = sched.on_task_release(task_slot);
+    EXPECT_EQ(fanin_count, 2);
+
+    // Each producer should have fanout_refcount incremented
+    EXPECT_EQ(producers[0].fanout_refcount.load(), 1);
+    EXPECT_EQ(producers[1].fanout_refcount.load(), 1);
+
+    // Producers with fanout_refcount == fanout_count AND COMPLETED -> CONSUMED
+    EXPECT_EQ(producers[0].task_state.load(), PTO2_TASK_CONSUMED);
+    EXPECT_EQ(producers[1].task_state.load(), PTO2_TASK_CONSUMED);
+}
+
+// =============================================================================
+// advance_ring_pointers: scans CONSUMED slots, resets, advances last_alive
+// =============================================================================
+
+TEST_F(WiringTest, AdvanceRingPointersScansConsumed) {
+    auto &rss = sched.ring_sched_states[0];
+    auto *ring = rss.ring;
+
+    // Submit 3 tasks via flow control
+    ring->fc.current_task_index.store(3, std::memory_order_release);
+
+    // Mark all 3 as CONSUMED
+    for (int i = 0; i < 3; i++) {
+        auto &slot = ring->get_slot_state_by_task_id(i);
+        slot.task_state.store(PTO2_TASK_CONSUMED, std::memory_order_release);
+    }
+
+    EXPECT_EQ(rss.last_task_alive, 0);
+    rss.advance_ring_pointers();
+    EXPECT_EQ(rss.last_task_alive, 3);
+
+    // Verify the SM was synced
+    EXPECT_EQ(ring->fc.last_task_alive.load(), 3);
+}
+
+TEST_F(WiringTest, AdvanceRingPointersStopsAtNonConsumed) {
+    auto &rss = sched.ring_sched_states[0];
+    auto *ring = rss.ring;
+
+    ring->fc.current_task_index.store(5, std::memory_order_release);
+
+    // Tasks 0,1 CONSUMED; task 2 COMPLETED (not consumed)
+    ring->get_slot_state_by_task_id(0).task_state.store(PTO2_TASK_CONSUMED);
+    ring->get_slot_state_by_task_id(1).task_state.store(PTO2_TASK_CONSUMED);
+    ring->get_slot_state_by_task_id(2).task_state.store(PTO2_TASK_COMPLETED);
+
+    rss.advance_ring_pointers();
+    EXPECT_EQ(rss.last_task_alive, 2) << "Should stop at first non-CONSUMED slot";
+}
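+// -----------------------------------------------------------------------------
+// Reviewer sketch (illustrative only, hypothetical names): the pointer-advance
+// scan the two tests above pin down. last_alive advances over the contiguous
+// prefix of CONSUMED slots and stops at the first slot in any other state.
+static int32_t example_advance(const std::vector<bool> &consumed, int32_t last_alive, int32_t current) {
+    while (last_alive < current && consumed[last_alive]) {
+        last_alive++;  // the real scan also calls reset_for_reuse() on each slot
+    }
+    return last_alive;
+}
+// -----------------------------------------------------------------------------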
+TEST_F(WiringTest, AdvanceRingPointersResetsSlots) {
+    auto &rss = sched.ring_sched_states[0];
+    auto *ring = rss.ring;
+
+    ring->fc.current_task_index.store(1, std::memory_order_release);
+
+    auto &slot = ring->get_slot_state_by_task_id(0);
+    slot.task_state.store(PTO2_TASK_CONSUMED);
+    slot.fanout_count = 5;
+    slot.fanin_refcount.store(3);
+    slot.fanout_refcount.store(2);
+    slot.completed_subtasks.store(1);
+
+    rss.advance_ring_pointers();
+
+    // After reset_for_reuse: fanout_count=1, fanin_refcount=0, etc.
+    EXPECT_EQ(slot.fanout_count, 1);
+    EXPECT_EQ(slot.fanin_refcount.load(), 0);
+    EXPECT_EQ(slot.fanout_refcount.load(), 0);
+    EXPECT_EQ(slot.completed_subtasks.load(), 0);
+    EXPECT_EQ(slot.fanout_head, nullptr);
+}
+
+// =============================================================================
+// drain_wiring_queue: pushes tasks through the SPSC queue
+// =============================================================================
+
+TEST_F(WiringTest, DrainWiringQueueProcessesTasks) {
+    alignas(64) PTO2TaskSlotState task_slot;
+    alignas(64) PTO2TaskPayload payload;
+    memset(&payload, 0, sizeof(payload));
+    PTO2TaskDescriptor desc{};
+
+    init_slot(task_slot, PTO2_TASK_PENDING, 0, 1);
+    payload.fanin_actual_count = 0;
+    task_slot.payload = &payload;
+    task_slot.task = &desc;
+
+    // Push into the wiring SPSC queue (orchestrator side)
+    ASSERT_TRUE(sched.wiring.queue.push(&task_slot));
+
+    // Drain (scheduler thread 0 side)
+    int wired = sched.drain_wiring_queue(true /* force_drain */);
+    EXPECT_EQ(wired, 1);
+
+    // Task should be ready
+    PTO2ResourceShape shape = pto2_active_mask_to_shape(task_slot.active_mask);
+    auto *popped = sched.ready_queues[static_cast<size_t>(shape)].pop();
+    EXPECT_EQ(popped, &task_slot);
+}
+
+TEST_F(WiringTest, DrainWiringQueueBackoffDefers) {
+    alignas(64) PTO2TaskSlotState task_slot;
+    alignas(64) PTO2TaskPayload payload;
+    memset(&payload, 0, sizeof(payload));
+    PTO2TaskDescriptor desc{};
+
+    init_slot(task_slot, PTO2_TASK_PENDING, 0, 1);
+    payload.fanin_actual_count = 0;
+    task_slot.payload = &payload;
+    task_slot.task = &desc;
+
+    sched.wiring.queue.push(&task_slot);
+
+    // Without force_drain, a single item < BATCH_SIZE -> backoff
+    sched.wiring.backoff_counter = 0;
+    int wired = sched.drain_wiring_queue(false);
+    EXPECT_EQ(wired, 0) << "Backoff should defer when queue < BATCH_SIZE";
+    EXPECT_EQ(sched.wiring.backoff_counter, 1);
+}
+
+TEST_F(WiringTest, DrainWiringQueueBackoffLimitForcesProcess) {
+    alignas(64) PTO2TaskSlotState task_slot;
+    alignas(64) PTO2TaskPayload payload;
+    memset(&payload, 0, sizeof(payload));
+    PTO2TaskDescriptor desc{};
+
+    init_slot(task_slot, PTO2_TASK_PENDING, 0, 1);
+    payload.fanin_actual_count = 0;
+    task_slot.payload = &payload;
+    task_slot.task = &desc;
+
+    sched.wiring.queue.push(&task_slot);
+
+    // Set backoff at the limit -> should process
+    sched.wiring.backoff_counter = PTO2SchedulerState::WiringState::BACKOFF_LIMIT;
+    int wired = sched.drain_wiring_queue(false);
+    EXPECT_EQ(wired, 1) << "Backoff limit reached should force processing";
+}
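+// -----------------------------------------------------------------------------
+// Reviewer sketch (illustrative only; names and constants are hypothetical):
+// the drain-backoff policy the three tests above describe. Small batches are
+// deferred a bounded number of times, so drains are amortized into full
+// batches without ever starving a trickle of tasks.
+static bool example_should_drain(int queued, int batch_size, int &backoff, int backoff_limit, bool force) {
+    if (force || queued >= batch_size || backoff >= backoff_limit) {
+        backoff = 0;
+        return true;   // drain now
+    }
+    backoff++;         // defer: give the queue a chance to fill a full batch
+    return false;
+}
+// -----------------------------------------------------------------------------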