From 75a8a25cce1dc2f215963c89bfa6358952ab69ae Mon Sep 17 00:00:00 2001
From: chenshengxin
Date: Wed, 22 Apr 2026 14:34:36 +0800
Subject: [PATCH] Add: PTO2 runtime unit tests for A2A3 and A5 (tensormap, orchestrator, coupling, link isolation)

Refactor and extend PTO2 runtime unit test coverage for A2A3 and A5:

A2A3 changes:
- Remove outdated edge/stub tests: test_tensormap_edge, test_coupling,
  test_coupling_stub, test_boundary_edge
- Add link isolation tests (test_link_isolation) and runtime coupling
  tests (test_runtime_coupling)
- Rename test_orchestrator_fatal -> test_orchestrator_report_fatal
- Extend existing tests: tensormap, shared_memory, task_allocator,
  ready_queue, orchestrator_submit, runtime_lifecycle

A5 additions:
- Add runtime-linked tests: test_a5_runtime_coupling,
  test_a5_link_isolation, test_a5_orchestrator_submit,
  test_a5_orchestrator_report_fatal, test_a5_runtime_lifecycle
- Extend existing tests: ready_queue, shared_memory, task_allocator,
  tensormap

Build:
- Add add_a5_runtime_test() CMake helper and A5 runtime source sets
---
 tests/ut/cpp/CMakeLists.txt                   | 130 ++++++++
 tests/ut/cpp/a2a3/test_link_isolation.cpp     | 199 ++++++++++++
 .../a2a3/test_orchestrator_report_fatal.cpp   | 194 ++++++++++++
 .../ut/cpp/a2a3/test_orchestrator_submit.cpp  | 273 +++++++++++++++++
 tests/ut/cpp/a2a3/test_ready_queue.cpp        | 122 +++++++-
 tests/ut/cpp/a2a3/test_runtime_coupling.cpp   | 283 ++++++++++++++++++
 tests/ut/cpp/a2a3/test_runtime_lifecycle.cpp  | 189 ++++++++++++
 tests/ut/cpp/a2a3/test_shared_memory.cpp      | 157 ++++++++++
 tests/ut/cpp/a2a3/test_task_allocator.cpp     |  96 ++++++
 tests/ut/cpp/a2a3/test_tensormap.cpp          | 126 ++++++++
 tests/ut/cpp/a5/test_link_isolation.cpp       | 199 ++++++++++++
 .../cpp/a5/test_orchestrator_report_fatal.cpp | 194 ++++++++++++
 tests/ut/cpp/a5/test_orchestrator_submit.cpp  | 273 +++++++++++++++++
 tests/ut/cpp/a5/test_ready_queue.cpp          | 122 +++++++-
 tests/ut/cpp/a5/test_runtime_coupling.cpp     | 283 ++++++++++++++++++
 tests/ut/cpp/a5/test_runtime_lifecycle.cpp    | 189 ++++++++++++
 tests/ut/cpp/a5/test_shared_memory.cpp        | 170 ++++++++++-
 tests/ut/cpp/a5/test_task_allocator.cpp       |  96 ++++++
 tests/ut/cpp/a5/test_tensormap.cpp            | 125 ++++++++
 19 files changed, 3405 insertions(+), 15 deletions(-)
 create mode 100644 tests/ut/cpp/a2a3/test_link_isolation.cpp
 create mode 100644 tests/ut/cpp/a2a3/test_orchestrator_report_fatal.cpp
 create mode 100644 tests/ut/cpp/a2a3/test_orchestrator_submit.cpp
 create mode 100644 tests/ut/cpp/a2a3/test_runtime_coupling.cpp
 create mode 100644 tests/ut/cpp/a2a3/test_runtime_lifecycle.cpp
 create mode 100644 tests/ut/cpp/a5/test_link_isolation.cpp
 create mode 100644 tests/ut/cpp/a5/test_orchestrator_report_fatal.cpp
 create mode 100644 tests/ut/cpp/a5/test_orchestrator_submit.cpp
 create mode 100644 tests/ut/cpp/a5/test_runtime_coupling.cpp
 create mode 100644 tests/ut/cpp/a5/test_runtime_lifecycle.cpp

diff --git a/tests/ut/cpp/CMakeLists.txt b/tests/ut/cpp/CMakeLists.txt
index f923f290c..2c111d24d 100644
--- a/tests/ut/cpp/CMakeLists.txt
+++ b/tests/ut/cpp/CMakeLists.txt
@@ -94,6 +94,26 @@ set(A2A3_COMMON_INCLUDE_DIRS
   ${CMAKE_SOURCE_DIR}/../../../src/common/task_interface
 )
 
+# ---------------------------------------------------------------------------
+# A5 runtime sources and stubs for ring-buffer / tensormap tests
+# ---------------------------------------------------------------------------
+set(A5_RUNTIME_DIR ${CMAKE_SOURCE_DIR}/../../../src/a5/runtime/tensormap_and_ringbuffer/runtime)
+set(A5_STUB_SOURCES ${CMAKE_SOURCE_DIR}/stubs/test_stubs.cpp)
+set(A5_RUNTIME_SOURCES
+  ${A5_RUNTIME_DIR}/pto_ring_buffer.cpp
+  ${A5_RUNTIME_DIR}/shared/pto_shared_memory.cpp
+  ${A5_RUNTIME_DIR}/scheduler/pto_scheduler.cpp
+  ${A5_RUNTIME_DIR}/pto_tensormap.cpp
+)
+
+set(A5_COMMON_INCLUDE_DIRS
+  ${CMAKE_SOURCE_DIR}/../../../src/a5/runtime/tensormap_and_ringbuffer/orchestration
+  ${CMAKE_SOURCE_DIR}/../../../src/a5/runtime/tensormap_and_ringbuffer/runtime
+  ${CMAKE_SOURCE_DIR}/../../../src/a5/runtime/tensormap_and_ringbuffer/common
+  ${CMAKE_SOURCE_DIR}/../../../src/a5/platform/include
+  ${CMAKE_SOURCE_DIR}/../../../src/common/task_interface
+)
+
 function(add_a2a3_runtime_test name)
   cmake_parse_arguments(ARG "" "" "SOURCES;EXTRA_SOURCES" ${ARGN})
   set(_all_sources ${ARG_SOURCES} ${A2A3_STUB_SOURCES})
@@ -117,6 +137,29 @@ function(add_a2a3_runtime_test name)
   set_tests_properties(${name} PROPERTIES LABELS "no_hardware")
 endfunction()
 
+function(add_a5_runtime_test name)
+  cmake_parse_arguments(ARG "" "" "SOURCES;EXTRA_SOURCES" ${ARGN})
+  set(_all_sources ${ARG_SOURCES} ${A5_STUB_SOURCES})
+  foreach(src ${ARG_SOURCES} ${ARG_EXTRA_SOURCES})
+    if(EXISTS ${src})
+      list(APPEND _all_sources ${src})
+    endif()
+  endforeach()
+  add_executable(${name} ${_all_sources})
+  target_include_directories(${name} PRIVATE
+    ${GTEST_INCLUDE_DIRS}
+    ${A5_COMMON_INCLUDE_DIRS}
+  )
+  target_compile_options(${name} PRIVATE -D_GLIBCXX_USE_CXX11_ABI=0)
+  target_link_libraries(${name} PRIVATE
+    ${GTEST_MAIN_LIB}
+    ${GTEST_LIB}
+    pthread
+  )
+  add_test(NAME ${name} COMMAND ${name})
+  set_tests_properties(${name} PROPERTIES LABELS "no_hardware")
+endfunction()
+
 # ---------------------------------------------------------------------------
 # Distributed runtime sources under test
 # ---------------------------------------------------------------------------
@@ -248,6 +291,30 @@ add_task_interface_test(test_child_memory types/test_child_memory.cpp)
 add_common_utils_test(test_elf_build_id common/test_elf_build_id.cpp)
 add_common_utils_test(test_runtime_orch_so common/test_runtime_orch_so.cpp)
 
+# PTO2 runtime-linked tests (tensormap, orchestrator, coupling, boundary)
+add_a2a3_runtime_test(test_runtime_coupling
+  SOURCES a2a3/test_runtime_coupling.cpp
+  EXTRA_SOURCES ${A2A3_RUNTIME_SOURCES} ${A2A3_RUNTIME_DIR}/pto_orchestrator.cpp
+)
+add_a2a3_runtime_test(test_link_isolation
+  SOURCES a2a3/test_link_isolation.cpp
+  EXTRA_SOURCES ${A2A3_RUNTIME_SOURCES}
+)
+add_a2a3_runtime_test(test_orchestrator_submit
+  SOURCES a2a3/test_orchestrator_submit.cpp
+  EXTRA_SOURCES ${A2A3_RUNTIME_SOURCES} ${A2A3_RUNTIME_DIR}/pto_orchestrator.cpp
+)
+add_a2a3_runtime_test(test_orchestrator_report_fatal
+  SOURCES a2a3/test_orchestrator_report_fatal.cpp
+  EXTRA_SOURCES ${A2A3_RUNTIME_SOURCES} ${A2A3_RUNTIME_DIR}/pto_orchestrator.cpp
+)
+add_a2a3_runtime_test(test_runtime_lifecycle
+  SOURCES a2a3/test_runtime_lifecycle.cpp
+  EXTRA_SOURCES ${A2A3_RUNTIME_SOURCES}
+    ${A2A3_RUNTIME_DIR}/pto_runtime2.cpp
+    ${A2A3_RUNTIME_DIR}/pto_orchestrator.cpp
+)
+
 # ---------------------------------------------------------------------------
 # A2A3 tests (src/a2a3/runtime/tensormap_and_ringbuffer/)
 # ---------------------------------------------------------------------------
@@ -300,6 +367,69 @@ add_a2a3_runtime_test(test_wiring
 # ---------------------------------------------------------------------------
 add_a5_test(test_a5_fatal a5/test_a5_fatal.cpp)
 
+add_a5_runtime_test(test_a5_runtime_coupling
+  SOURCES a5/test_runtime_coupling.cpp
+  EXTRA_SOURCES ${A5_RUNTIME_SOURCES} ${A5_RUNTIME_DIR}/pto_orchestrator.cpp
+)
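+# test_a5_link_isolation (like its a2a3 counterpart) deliberately leaves
+# pto_orchestrator.cpp out of EXTRA_SOURCES: linking without the orchestrator
+# object is itself part of the test (see test_link_isolation.cpp).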
+add_a5_runtime_test(test_a5_link_isolation
+  SOURCES a5/test_link_isolation.cpp
+  EXTRA_SOURCES ${A5_RUNTIME_SOURCES}
+)
+add_a5_runtime_test(test_a5_orchestrator_submit
+  SOURCES a5/test_orchestrator_submit.cpp
+  EXTRA_SOURCES ${A5_RUNTIME_SOURCES} ${A5_RUNTIME_DIR}/pto_orchestrator.cpp
+)
+add_a5_runtime_test(test_a5_orchestrator_report_fatal
+  SOURCES a5/test_orchestrator_report_fatal.cpp
+  EXTRA_SOURCES ${A5_RUNTIME_SOURCES} ${A5_RUNTIME_DIR}/pto_orchestrator.cpp
+)
+add_a5_runtime_test(test_a5_runtime_lifecycle
+  SOURCES a5/test_runtime_lifecycle.cpp
+  EXTRA_SOURCES ${A5_RUNTIME_SOURCES}
+    ${A5_RUNTIME_DIR}/pto_runtime2.cpp
+    ${A5_RUNTIME_DIR}/pto_orchestrator.cpp
+)
+add_a5_runtime_test(test_a5_task_allocator
+  SOURCES a5/test_task_allocator.cpp
+  EXTRA_SOURCES ${A5_RUNTIME_SOURCES}
+)
+add_a5_runtime_test(test_a5_dep_list_pool
+  SOURCES a5/test_dep_list_pool.cpp
+  EXTRA_SOURCES ${A5_RUNTIME_SOURCES}
+)
+add_a5_runtime_test(test_a5_scheduler_state
+  SOURCES a5/test_scheduler_state.cpp
+  EXTRA_SOURCES ${A5_RUNTIME_SOURCES}
+)
+add_a5_runtime_test(test_a5_task_state
+  SOURCES a5/test_task_state.cpp
+  EXTRA_SOURCES ${A5_RUNTIME_SOURCES}
+)
+add_a5_runtime_test(test_a5_ready_queue
+  SOURCES a5/test_ready_queue.cpp
+  EXTRA_SOURCES ${A5_RUNTIME_SOURCES}
+)
+add_a5_runtime_test(test_a5_shared_memory
+  SOURCES a5/test_shared_memory.cpp
+  EXTRA_SOURCES ${A5_RUNTIME_SOURCES}
+)
+add_a5_runtime_test(test_a5_tensormap
+  SOURCES a5/test_tensormap.cpp
+  EXTRA_SOURCES ${A5_RUNTIME_SOURCES}
+)
+add_a5_runtime_test(test_a5_fanin_pool
+  SOURCES a5/test_fanin_pool.cpp
+  EXTRA_SOURCES ${A5_RUNTIME_SOURCES}
+)
+add_a5_runtime_test(test_a5_spsc_queue
+  SOURCES a5/test_spsc_queue.cpp
+  EXTRA_SOURCES ${A5_RUNTIME_SOURCES}
+)
+add_a5_runtime_test(test_a5_wiring
+  SOURCES a5/test_wiring.cpp
+  EXTRA_SOURCES ${A5_RUNTIME_SOURCES}
+)
+
 # Host logger silent/off behavior — no runtime deps, just compile host_log.cpp
 # alongside the test.
 set(A5_HOST_LOG_DIR ${CMAKE_SOURCE_DIR}/../../../src/a5/platform/src/host)
diff --git a/tests/ut/cpp/a2a3/test_link_isolation.cpp b/tests/ut/cpp/a2a3/test_link_isolation.cpp
new file mode 100644
index 000000000..78f1378a9
--- /dev/null
+++ b/tests/ut/cpp/a2a3/test_link_isolation.cpp
@@ -0,0 +1,199 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Behavior tests for runtime components built without pto_orchestrator.cpp.
+ *
+ * The CMake target for this file deliberately omits the orchestrator source.
+ * Passing build and runtime assertions here verifies that scheduler, ring
+ * buffer, shared memory, and TensorMap behavior does not require an
+ * orchestrator link dependency.
+ */
+
+#include <gtest/gtest.h>
+
+#include <atomic>
+#include <cstring>
+#include <vector>
+
+#include "pto_orchestration_api.h"
+#include "pto_ring_buffer.h"
+#include "pto_runtime2_types.h"
+#include "pto_shared_memory.h"
+#include "pto_tensormap.h"
+#include "scheduler/pto_scheduler.h"
+
+namespace {
+
+constexpr uint64_t kHeapSize = 64 * 1024;
+constexpr int32_t kWindowSize = 64;
+
+struct TestLookupResult {
+  struct Entry {
+    PTO2TensorMapEntry *entry;
+    OverlapStatus overlap_status;
+  };
+  std::vector<Entry> entries;
+  int count = 0;
+};
+
+void run_lookup(PTO2TensorMap &tmap, const Tensor &tensor, TestLookupResult &out) {
+  tmap.lookup(tensor, [&](PTO2TensorMapEntry &entry, OverlapStatus status) -> bool {
+    out.entries.push_back({&entry, status});
+    out.count++;
+    return true;
+  });
+}
+
+Tensor make_tensor(uint64_t addr, uint32_t shape0 = 100, int32_t version = 0) {
+  uint32_t shapes[RUNTIME_MAX_TENSOR_DIMS] = {shape0};
+  return make_tensor_external(reinterpret_cast<void *>(addr), shapes, 1, DataType::FLOAT32, false, version);
+}
+
+struct DepPoolFixture {
+  PTO2DepListEntry entries[512];
+  std::atomic<int32_t> error_code{PTO2_ERROR_NONE};
+  PTO2DepListPool pool{};
+
+  void Init() {
+    std::memset(entries, 0, sizeof(entries));
+    error_code.store(PTO2_ERROR_NONE, std::memory_order_relaxed);
+    pool.init(entries, 512, &error_code);
+  }
+
+  void AllocN(int count) {
+    for (int i = 0; i < count; i++) {
+      ASSERT_NE(pool.alloc(), nullptr);
+    }
+  }
+};
+
+} // namespace
+
+TEST(LinkIsolationDepPool, ReclaimBelowIntervalKeepsAllocatedEntries) {
+  DepPoolFixture fixture;
+  fixture.Init();
+  fixture.AllocN(100);
+  int32_t used_before = fixture.pool.used();
+
+  PTO2SharedMemoryRingHeader ring_header{};
+  fixture.pool.reclaim(ring_header, PTO2_DEP_POOL_CLEANUP_INTERVAL - 1);
+
+  EXPECT_EQ(fixture.pool.used(), used_before);
+  EXPECT_EQ(fixture.error_code.load(std::memory_order_acquire), PTO2_ERROR_NONE);
+}
+
+TEST(LinkIsolationDepPool, ReclaimAtIntervalUsesConsumedTaskMark) {
+  DepPoolFixture fixture;
+  fixture.Init();
+  fixture.AllocN(100);
+
+  std::vector<PTO2TaskSlotState> slots(kWindowSize);
+  PTO2SharedMemoryRingHeader ring_header{};
+  ring_header.slot_states = slots.data();
+  ring_header.task_window_size = kWindowSize;
+  ring_header.task_window_mask = kWindowSize - 1;
+
+  int32_t last_alive = PTO2_DEP_POOL_CLEANUP_INTERVAL;
+  int32_t mark_slot = (last_alive - 1) & ring_header.task_window_mask;
+  slots[mark_slot].dep_pool_mark = 50;
+
+  fixture.pool.reclaim(ring_header, last_alive);
+
+  EXPECT_EQ(fixture.pool.used(), 51);
+  EXPECT_EQ(fixture.error_code.load(std::memory_order_acquire), PTO2_ERROR_NONE);
+}
+
+TEST(LinkIsolationScheduler, ReleaseFaninPushesReadyTask) {
+  PTO2SharedMemoryHandle *sm = PTO2SharedMemoryHandle::create(kWindowSize, kHeapSize);
+  ASSERT_NE(sm, nullptr);
+
+  PTO2SchedulerState sched{};
+  ASSERT_TRUE(sched.init(sm->header));
+
+  alignas(64) PTO2TaskSlotState slot{};
+  slot.fanin_count = 1;
+  slot.fanin_refcount.store(0, std::memory_order_relaxed);
+  slot.task_state.store(PTO2_TASK_PENDING, std::memory_order_relaxed);
+  slot.active_mask = ActiveMask(PTO2_SUBTASK_MASK_AIV0);
+
+  EXPECT_TRUE(sched.release_fanin_and_check_ready(slot, nullptr));
+
+  PTO2ResourceShape shape = slot.active_mask.to_shape();
+  EXPECT_EQ(sched.ready_queues[static_cast<int>(shape)].pop(), &slot);
+
+  sched.destroy();
+  sm->destroy();
+}
+
+TEST(LinkIsolationScheduler, CompletedTaskCanBecomeConsumed) {
+  PTO2SharedMemoryHandle *sm = PTO2SharedMemoryHandle::create(kWindowSize, kHeapSize);
+  ASSERT_NE(sm, nullptr);
+
+  PTO2SchedulerState sched{};
+  ASSERT_TRUE(sched.init(sm->header));
+
+  PTO2TaskDescriptor desc{};
+  PTO2TaskSlotState &slot = sm->header->rings[0].get_slot_state_by_slot(0);
+  slot.task = &desc;
+  slot.ring_id = 0;
+  slot.fanout_count = 1;
+  slot.fanout_refcount.store(1, std::memory_order_relaxed);
+  slot.task_state.store(PTO2_TASK_COMPLETED, std::memory_order_relaxed);
+  sm->header->rings[0].fc.current_task_index.store(1, std::memory_order_relaxed);
+
+  sched.check_and_handle_consumed(slot);
+
+  EXPECT_EQ(slot.task_state.load(std::memory_order_acquire), PTO2_TASK_CONSUMED);
+
+  sched.destroy();
+  sm->destroy();
+}
+
+TEST(LinkIsolationReadyQueue, PushPopBatchWithoutOrchestrator) {
+  PTO2ReadyQueue queue{};
+  ASSERT_TRUE(ready_queue_init(&queue, 16));
+
+  PTO2TaskSlotState items[4]{};
+  PTO2TaskSlotState *in[4] = {&items[0], &items[1], &items[2], &items[3]};
+  queue.push_batch(in, 4);
+
+  PTO2TaskSlotState *out[4]{};
+  EXPECT_EQ(queue.pop_batch(out, 4), 4);
+  for (int i = 0; i < 4; i++) {
+    EXPECT_EQ(out[i], &items[i]);
+  }
+  EXPECT_EQ(queue.pop(), nullptr);
+
+  ready_queue_destroy(&queue);
+}
+
+TEST(LinkIsolationTensorMap, InsertLookupAndValidityWithoutOrchestrator) {
+  int32_t window_sizes[PTO2_MAX_RING_DEPTH] = {kWindowSize, kWindowSize, kWindowSize, kWindowSize};
+  PTO2TensorMap tmap{};
+  ASSERT_TRUE(tmap.init(256, 1024, window_sizes));
+
+  Tensor tensor = make_tensor(0x3000);
+  for (int i = 0; i < kWindowSize; i++) {
+    tmap.insert(tensor, PTO2TaskId::make(0, i));
+  }
+
+  tmap.sync_validity(0, kWindowSize / 2);
+
+  TestLookupResult result;
+  run_lookup(tmap, tensor, result);
+  EXPECT_EQ(result.count, kWindowSize / 2);
+  for (const auto &entry : result.entries) {
+    EXPECT_GE(entry.entry->producer_task_id.local(), static_cast<uint32_t>(kWindowSize / 2));
+    EXPECT_EQ(entry.overlap_status, OverlapStatus::COVERED);
+  }
+
+  tmap.destroy();
+}
diff --git a/tests/ut/cpp/a2a3/test_orchestrator_report_fatal.cpp b/tests/ut/cpp/a2a3/test_orchestrator_report_fatal.cpp
new file mode 100644
index 000000000..06fdd16d7
--- /dev/null
+++ b/tests/ut/cpp/a2a3/test_orchestrator_report_fatal.cpp
@@ -0,0 +1,194 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * UT for the orchestrator-side fatal reporting path.
+ *
+ * Targets orch_.report_fatal (pto_orchestrator.cpp) and verifies:
+ * - orch->fatal latches to true on any non-zero error code
+ * - the first non-zero code wins via CAS into sm_header->orch_error_code
+ * - subsequent fatal reports do NOT overwrite the first code
+ * - PTO2_ERROR_NONE never latches the shared-memory code (but still flips
+ *   the local fatal flag -- by design, callers may use it to mark fatal
+ *   without writing a code)
+ *
+ * This test exercises the real symbol against a fully-initialized
+ * orchestrator + shared memory pair, complementing the fake-runtime test
+ * (test_a2a3_pto2_fatal.cpp) that only validates the ops-table dispatch.
+ */
+
+#include <gtest/gtest.h>
+
+#include <atomic>
+#include <cstdlib>
+
+#include "pto_orchestrator.h"
+#include "pto_runtime_status.h"
+#include "scheduler/pto_scheduler.h"
+#include "pto_shared_memory.h"
+
+namespace {
+
+constexpr uint64_t kHeapSize = 64 * 1024;
+constexpr int32_t kWindowSize = 64;
+constexpr int32_t kDepPool = 256;
+
+class OrchestratorFatalTest : public ::testing::Test {
+protected:
+  PTO2SharedMemoryHandle *sm_ = nullptr;
+  PTO2SchedulerState sched_{};
+  PTO2OrchestratorState orch_{};
+  uint8_t *gm_heap_ = nullptr;
+  bool sched_ok_ = false;
+  bool orch_ok_ = false;
+
+  void SetUp() override {
+    sm_ = PTO2SharedMemoryHandle::create(kWindowSize, kHeapSize);
+    ASSERT_NE(sm_, nullptr);
+
+    gm_heap_ = static_cast<uint8_t *>(std::calloc(PTO2_MAX_RING_DEPTH, kHeapSize));
+    ASSERT_NE(gm_heap_, nullptr);
+
+    sched_ok_ = sched_.init(sm_->header, kDepPool);
+    ASSERT_TRUE(sched_ok_);
+
+    orch_ok_ = orch_.init(sm_->header, gm_heap_, kHeapSize, kDepPool);
+    ASSERT_TRUE(orch_ok_);
+  }
+
+  void TearDown() override {
+    if (orch_ok_) orch_.destroy();
+    if (sched_ok_) sched_.destroy();
+    if (gm_heap_) std::free(gm_heap_);
+    if (sm_) sm_->destroy();
+  }
+
+  int32_t shared_orch_code() const { return sm_->header->orch_error_code.load(std::memory_order_acquire); }
+};
+
+} // namespace
+
+// ---------- baseline ----------
+
+TEST_F(OrchestratorFatalTest, InitialState_NoFatalNoSharedCode) {
+  // Verify no fatal state via the observable shared memory output
+  EXPECT_FALSE(orch_.fatal);
+  EXPECT_EQ(shared_orch_code(), PTO2_ERROR_NONE);
+}
+
+// ---------- happy path: single fatal latches both local flag and shared code ----------
+
+TEST_F(OrchestratorFatalTest, ReportFatal_SetsLocalFlagAndSharedCode) {
+  orch_.report_fatal(PTO2_ERROR_HEAP_RING_DEADLOCK, "test", "deadlock at ring %d", 3);
+
+  EXPECT_TRUE(orch_.fatal);
+  EXPECT_EQ(shared_orch_code(), PTO2_ERROR_HEAP_RING_DEADLOCK);
+}
+
+// ---------- CAS first-writer-wins ----------
+
+TEST_F(OrchestratorFatalTest, SecondReportFatal_DoesNotOverwriteSharedCode) {
+  orch_.report_fatal(PTO2_ERROR_HEAP_RING_DEADLOCK, "test", nullptr);
+  orch_.report_fatal(PTO2_ERROR_DEP_POOL_OVERFLOW, "test", nullptr);
+
+  // Second report must NOT overwrite the first latched code.
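+  // (report_fatal publishes the code with a compare-exchange, so only the
+  // first transition away from PTO2_ERROR_NONE can succeed.)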
+  EXPECT_TRUE(orch_.fatal);
+  EXPECT_EQ(shared_orch_code(), PTO2_ERROR_HEAP_RING_DEADLOCK);
+}
+
+TEST_F(OrchestratorFatalTest, RepeatedSameCode_StaysLatched) {
+  orch_.report_fatal(PTO2_ERROR_INVALID_ARGS, "test", nullptr);
+  orch_.report_fatal(PTO2_ERROR_INVALID_ARGS, "test", nullptr);
+
+  EXPECT_TRUE(orch_.fatal);
+  EXPECT_EQ(shared_orch_code(), PTO2_ERROR_INVALID_ARGS);
+}
+
+// ---------- PTO2_ERROR_NONE: marks fatal locally, does NOT touch shared code ----------
+
+TEST_F(OrchestratorFatalTest, ReportFatalWithErrorNone_DoesNotWriteSharedCode) {
+  orch_.report_fatal(PTO2_ERROR_NONE, "test", nullptr);
+
+  EXPECT_TRUE(orch_.fatal);
+  EXPECT_EQ(shared_orch_code(), PTO2_ERROR_NONE);
+}
+
+// ---------- PTO2_ERROR_NONE first does not block a real code from latching ----------
+
+TEST_F(OrchestratorFatalTest, ErrorNoneFirst_RealCodeStillLatchesAfter) {
+  orch_.report_fatal(PTO2_ERROR_NONE, "test", nullptr);
+  EXPECT_TRUE(orch_.fatal);
+  EXPECT_EQ(shared_orch_code(), PTO2_ERROR_NONE);
+
+  orch_.report_fatal(PTO2_ERROR_SCOPE_DEADLOCK, "test", nullptr);
+  EXPECT_EQ(shared_orch_code(), PTO2_ERROR_SCOPE_DEADLOCK);
+}
+
+// ---------- coverage of every defined orchestrator code ----------
+
+TEST_F(OrchestratorFatalTest, EveryOrchCode_LatchesIntoSharedMemory) {
+  const int32_t codes[] = {
+      PTO2_ERROR_SCOPE_DEADLOCK,
+      PTO2_ERROR_HEAP_RING_DEADLOCK,
+      PTO2_ERROR_FLOW_CONTROL_DEADLOCK,
+      PTO2_ERROR_DEP_POOL_OVERFLOW,
+      PTO2_ERROR_INVALID_ARGS,
+      PTO2_ERROR_DEPENDENCY_OVERFLOW,
+      PTO2_ERROR_REQUIRE_SYNC_START_INVALID,
+      PTO2_ERROR_TENSOR_WAIT_TIMEOUT,
+      PTO2_ERROR_EXPLICIT_ORCH_FATAL,
+  };
+  for (int32_t code : codes) {
+    // Reset latches between iterations. Direct field access is unavoidable here
+    // since there is no public reset API for the orchestrator fatal state.
+    sm_->header->orch_error_code.store(PTO2_ERROR_NONE, std::memory_order_release);
+    orch_.fatal = false;
+
+    orch_.report_fatal(code, "test", "code=%d", code);
+
+    SCOPED_TRACE(testing::Message() << "code=" << code);
+    EXPECT_TRUE(orch_.fatal);
+    EXPECT_EQ(shared_orch_code(), code);
+  }
+}
+
+// ---------- format-string variants must not crash ----------
+
+TEST_F(OrchestratorFatalTest, NullFmt_DoesNotCrash) {
+  orch_.report_fatal(PTO2_ERROR_INVALID_ARGS, "func", nullptr);
+  EXPECT_TRUE(orch_.fatal);
+  EXPECT_EQ(shared_orch_code(), PTO2_ERROR_INVALID_ARGS);
+}
+
+TEST_F(OrchestratorFatalTest, EmptyFmt_DoesNotCrash) {
+  orch_.report_fatal(PTO2_ERROR_INVALID_ARGS, "func", "");
+  EXPECT_TRUE(orch_.fatal);
+  EXPECT_EQ(shared_orch_code(), PTO2_ERROR_INVALID_ARGS);
+}
+
+TEST_F(OrchestratorFatalTest, FmtWithVarArgs_DoesNotCrash) {
+  orch_.report_fatal(
+      PTO2_ERROR_TENSOR_WAIT_TIMEOUT, "func", "tensor=%p slot=%d msg=%s", reinterpret_cast<void *>(0xdeadbeef), 17,
+      "boom"
+  );
+  EXPECT_TRUE(orch_.fatal);
+  EXPECT_EQ(shared_orch_code(), PTO2_ERROR_TENSOR_WAIT_TIMEOUT);
+}
+
+// ---------- end-to-end: status helper sees latched code ----------
+
+TEST_F(OrchestratorFatalTest, StatusHelperReadsLatchedOrchCode) {
+  orch_.report_fatal(PTO2_ERROR_FLOW_CONTROL_DEADLOCK, "func", nullptr);
+
+  int32_t orch_code = shared_orch_code();
+  int32_t sched_code = sm_->header->sched_error_code.load(std::memory_order_acquire);
+  EXPECT_EQ(runtime_status_from_error_codes(orch_code, sched_code), -PTO2_ERROR_FLOW_CONTROL_DEADLOCK);
+}
diff --git a/tests/ut/cpp/a2a3/test_orchestrator_submit.cpp b/tests/ut/cpp/a2a3/test_orchestrator_submit.cpp
new file mode 100644
index 000000000..c01635f57
--- /dev/null
+++ b/tests/ut/cpp/a2a3/test_orchestrator_submit.cpp
@@ -0,0 +1,273 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * Orchestrator submit-path UT.
+ *
+ * Covers pto2_submit_mixed_task, pto2_alloc_tensors, pto2_orchestrator_done,
+ * and pto2_orchestrator_set_scheduler on a fully initialized TMR
+ * (tensormap_and_ringbuffer) system.
+ *
+ * Follows AAA and FIRST: each TEST_F builds a fresh TMRSystem, exercises
+ * one behavior, and tears the system down in TearDown().
+ */
+
+#include <gtest/gtest.h>
+
+#include <atomic>
+#include <cstdint>
+#include <cstdlib>
+
+#include "pto_orchestration_api.h" // make_tensor_external, TensorCreateInfo ctor
+#include "pto_orchestrator.h"
+#include "pto_ring_buffer.h"
+#include "scheduler/pto_scheduler.h"
+#include "pto_shared_memory.h"
+#include "pto_submit_types.h"
+#include "pto_tensormap.h"
+#include "tensor.h"
+
+namespace {
+
+constexpr uint64_t kHeapSize = 64 * 1024;
+constexpr int32_t kWindowSize = 64;
+constexpr int32_t kDepPool = 256;
+
+// -----------------------------------------------------------------------------
+// Fixture: minimal TMR system for orchestrator-level tests.
+// -----------------------------------------------------------------------------
+class OrchestratorSubmitTest : public ::testing::Test {
+protected:
+  PTO2SharedMemoryHandle *sm_ = nullptr;
+  PTO2SchedulerState sched_{};
+  PTO2OrchestratorState orch_{};
+  uint8_t *gm_heap_ = nullptr;
+  bool sched_ok_ = false;
+  bool orch_ok_ = false;
+
+  void SetUp() override {
+    sm_ = PTO2SharedMemoryHandle::create(kWindowSize, kHeapSize);
+    ASSERT_NE(sm_, nullptr);
+
+    gm_heap_ = static_cast<uint8_t *>(std::calloc(PTO2_MAX_RING_DEPTH, kHeapSize));
+    ASSERT_NE(gm_heap_, nullptr);
+
+    sched_ok_ = sched_.init(sm_->header, kDepPool);
+    ASSERT_TRUE(sched_ok_);
+
+    orch_ok_ = orch_.init(sm_->header, gm_heap_, kHeapSize, kDepPool);
+    ASSERT_TRUE(orch_ok_);
+
+    orch_.set_scheduler(&sched_);
+  }
+
+  void TearDown() override {
+    if (orch_ok_) orch_.destroy();
+    if (sched_ok_) sched_.destroy();
+    if (gm_heap_) std::free(gm_heap_);
+    if (sm_) sm_->destroy();
+  }
+
+  // Helper: build a minimal TensorCreateInfo owning one FP32 scalar output.
+  static TensorCreateInfo make_scalar_ci() {
+    static const uint32_t kShape[1] = {1};
+    return TensorCreateInfo(kShape, 1, DataType::FLOAT32);
+  }
+
+  bool has_orch_error() const {
+    return sm_->header->orch_error_code.load(std::memory_order_acquire) != PTO2_ERROR_NONE;
+  }
+};
+
+} // namespace
+
+// ---------- set_scheduler ----------
+
+TEST_F(OrchestratorSubmitTest, SetScheduler_StoresPointer) {
+  PTO2SchedulerState other{};
+  orch_.set_scheduler(&other);
+  // Direct field read: no public getter exists for the scheduler pointer.
+  EXPECT_EQ(orch_.scheduler, &other);
+
+  // Restore for TearDown.
+  orch_.set_scheduler(&sched_);
+}
+
+// ---------- alloc_tensors: argument validation ----------
+
+TEST_F(OrchestratorSubmitTest, AllocTensors_EmptyArgs_MarksFatal) {
+  Arg args; // no tensors, no scalars
+
+  TaskOutputTensors result = orch_.alloc_tensors(args);
+
+  EXPECT_TRUE(result.empty());
+  EXPECT_TRUE(has_orch_error());
+}
+
+TEST_F(OrchestratorSubmitTest, AllocTensors_WithScalars_MarksFatal) {
+  TensorCreateInfo ci = make_scalar_ci();
+  Arg args;
+  args.add_output(ci);
+  args.add_scalar(uint64_t{42});
+
+  TaskOutputTensors result = orch_.alloc_tensors(args);
+
+  EXPECT_TRUE(result.empty());
+  EXPECT_TRUE(has_orch_error());
+}
+
+TEST_F(OrchestratorSubmitTest, AllocTensors_InputArg_MarksFatal) {
+  // alloc_tensors only accepts OUTPUT TensorCreateInfo args.
+  uint32_t shape[1] = {1};
+  Tensor input = make_tensor_external(reinterpret_cast<void *>(0x1000), shape, 1);
+  Arg args;
+  args.add_input(input);
+
+  TaskOutputTensors result = orch_.alloc_tensors(args);
+
+  EXPECT_TRUE(result.empty());
+  EXPECT_TRUE(has_orch_error());
+}
+
+TEST_F(OrchestratorSubmitTest, AllocTensors_OutputOnly_ReturnsMaterializedTensors) {
+  // Arrange: two output CIs, inside an active scope.
+  TensorCreateInfo ci1 = make_scalar_ci();
+  TensorCreateInfo ci2 = make_scalar_ci();
+  Arg args;
+  args.add_output(ci1, ci2);
+
+  // Act
+  orch_.begin_scope();
+  TaskOutputTensors result = orch_.alloc_tensors(args);
+  orch_.end_scope();
+
+  // Assert
+  EXPECT_FALSE(has_orch_error());
+  EXPECT_EQ(result.size(), 2U);
+}
+
+TEST_F(OrchestratorSubmitTest, AllocTensors_AfterFatal_ReturnsEmpty) {
+  // Arrange: force fatal.
+  orch_.report_fatal(PTO2_ERROR_EXPLICIT_ORCH_FATAL, "UT", nullptr);
+  ASSERT_TRUE(has_orch_error());
+
+  TensorCreateInfo ci = make_scalar_ci();
+  Arg args;
+  args.add_output(ci);
+
+  // Act
+  TaskOutputTensors result = orch_.alloc_tensors(args);
+
+  // Assert
+  EXPECT_TRUE(result.empty());
+}
+
+// ---------- submit_mixed_task ----------
+
+TEST_F(OrchestratorSubmitTest, SubmitMixedTask_AfterFatal_ReturnsEmpty) {
+  // Arrange: pre-fatal state
+  orch_.report_fatal(PTO2_ERROR_EXPLICIT_ORCH_FATAL, "UT", nullptr);
+
+  MixedKernels mixed;
+  mixed.aic_kernel_id = 0;
+  Arg args;
+
+  // Act
+  TaskOutputTensors result = orch_.submit_task(mixed, args);
+
+  // Assert
+  EXPECT_TRUE(result.empty());
+}
+
+TEST_F(OrchestratorSubmitTest, SubmitMixedTask_ArgWithError_MarksFatalInvalidArgs) {
+  // Arrange: craft an Arg with has_error set.
+  // Calling add_input after add_scalar triggers the ordering error path.
+  uint32_t shape[1] = {1};
+  Tensor t = make_tensor_external(reinterpret_cast<void *>(0x1000), shape, 1);
+  Arg args;
+  args.add_scalar(uint64_t{1});
+  args.add_input(t); // illegal ordering -> has_error = true
+  ASSERT_TRUE(args.has_error);
+
+  MixedKernels mixed;
+  mixed.aic_kernel_id = 0;
+
+  // Act
+  orch_.begin_scope();
+  TaskOutputTensors result = orch_.submit_task(mixed, args);
+  orch_.end_scope();
+
+  // Assert
+  EXPECT_TRUE(result.empty());
+  EXPECT_TRUE(has_orch_error());
+}
+
+TEST_F(OrchestratorSubmitTest, SubmitMixedTask_PureInputOnly_Succeeds) {
+  // Arrange: one input tensor, one AIC kernel, within a scope.
+  uint32_t shape[1] = {1};
+  Tensor input = make_tensor_external(reinterpret_cast<void *>(0x2000), shape, 1);
+
+  Arg args;
+  args.add_input(input);
+  ASSERT_FALSE(args.has_error);
+
+  MixedKernels mixed;
+  mixed.aic_kernel_id = 7; // any non-invalid id
+
+  // Act
+  orch_.begin_scope();
+  TaskOutputTensors result = orch_.submit_task(mixed, args);
+  orch_.end_scope();
+
+  // Assert: submit returns (no outputs), and no fatal state was set.
+  EXPECT_TRUE(result.empty());
+  EXPECT_FALSE(has_orch_error());
+}
+
+TEST_F(OrchestratorSubmitTest, SubmitMixedTask_OutputTensor_MaterializesResult) {
+  // Arrange: one OUTPUT TensorCreateInfo -> task produces one tensor.
+  TensorCreateInfo ci = make_scalar_ci();
+  Arg args;
+  args.add_output(ci);
+
+  MixedKernels mixed;
+  mixed.aic_kernel_id = 1;
+
+  // Act
+  orch_.begin_scope();
+  TaskOutputTensors result = orch_.submit_task(mixed, args);
+  orch_.end_scope();
+
+  // Assert
+  EXPECT_FALSE(has_orch_error());
+  EXPECT_EQ(result.size(), 1U);
+}
+
+// ---------- orchestrator_done ----------
+
+TEST_F(OrchestratorSubmitTest, OrchestratorDone_SetsSharedMemoryFlag) {
+  // Arrange
+  ASSERT_EQ(sm_->header->orchestrator_done.load(), 0);
+
+  // Act
+  orch_.mark_done();
+
+  // Assert
+  EXPECT_EQ(sm_->header->orchestrator_done.load(std::memory_order_acquire), 1);
+}
+
+TEST_F(OrchestratorSubmitTest, OrchestratorDone_IsIdempotent) {
+  orch_.mark_done();
+  orch_.mark_done();
+
+  // Flag stays 1 -- store is release-set, not increment.
+  EXPECT_EQ(sm_->header->orchestrator_done.load(std::memory_order_acquire), 1);
+}
diff --git a/tests/ut/cpp/a2a3/test_ready_queue.cpp b/tests/ut/cpp/a2a3/test_ready_queue.cpp
index 9dea3ae94..bb2a6a101 100644
--- a/tests/ut/cpp/a2a3/test_ready_queue.cpp
+++ b/tests/ut/cpp/a2a3/test_ready_queue.cpp
@@ -371,9 +371,10 @@ TEST_P(ReadyQueueMPMCTest, NoDuplicateNoLoss) {
 INSTANTIATE_TEST_SUITE_P(
     MPMCVariants, ReadyQueueMPMCTest,
     ::testing::Values(
-        MPMCConfig{2, 2, 200},  // TwoProducersTwoConsumers
-        MPMCConfig{1, 4, 500},  // OneProducerNConsumers
-        MPMCConfig{4, 4, 1250}  // HighContentionStress
+        MPMCConfig{2, 2, 200},   // TwoProducersTwoConsumers
+        MPMCConfig{1, 4, 500},   // OneProducerNConsumers
+        MPMCConfig{4, 4, 1250},  // HighContentionStress
+        MPMCConfig{8, 8, 2000}   // EightProducersEightConsumers
     )
 );
 
@@ -444,3 +445,118 @@ TEST_F(LocalReadyBufferTest, NullBackingBuffer) {
   EXPECT_FALSE(buf.try_push(&item)) << "Push fails with null backing";
   EXPECT_EQ(buf.pop(), nullptr) << "Pop returns null with null backing";
 }
+
+// =============================================================================
+// High-contention stress tests
+// =============================================================================
+
+class ReadyQueueStressTest : public ::testing::Test {
+protected:
+  static constexpr uint64_t kCapacity = 512;
+  PTO2ReadyQueue queue;
+
+  void SetUp() override { ASSERT_TRUE(ready_queue_init(&queue, kCapacity)); }
+  void TearDown() override { ready_queue_destroy(&queue); }
+};
+
+TEST_F(ReadyQueueStressTest, RapidFillDrainCycles) {
+  constexpr int kCycles = 100;
+  constexpr int kItemsPerCycle = static_cast<int>(kCapacity / 2);
+
+  std::vector<PTO2TaskSlotState> items(kItemsPerCycle);
+  for (int i = 0; i < kItemsPerCycle; i++) {
+    items[i].fanin_count = i;
+  }
+
+  for (int cycle = 0; cycle < kCycles; cycle++) {
+    std::atomic<int> push_done{0};
+    std::atomic<int> popped{0};
+
+    auto producer = [&](int id) {
+      int per_thread = kItemsPerCycle / 4;
+      int base = id * per_thread;
+      for (int i = 0; i < per_thread; i++) {
+        while (!queue.push(&items[base + i])) {}
+      }
+      push_done.fetch_add(1, std::memory_order_release);
+    };
+
+    auto consumer = [&]() {
+      while (true) {
+        PTO2TaskSlotState *s = queue.pop();
+        if (s) {
+          popped.fetch_add(1, std::memory_order_relaxed);
+        } else if (push_done.load(std::memory_order_acquire) == 4) {
+          while ((s = queue.pop()) != nullptr) {
+            popped.fetch_add(1, std::memory_order_relaxed);
+          }
+          break;
+        }
+      }
+    };
+
+    std::vector<std::thread> threads;
+    for (int i = 0; i < 4; i++)
+      threads.emplace_back(producer, i);
+    for (int i = 0; i < 4; i++)
+      threads.emplace_back(consumer);
+    for (auto &t : threads)
+      t.join();
+
+    ASSERT_EQ(popped.load(), kItemsPerCycle) << "Cycle " << cycle << ": lost items";
+  }
+}
+
+TEST_F(ReadyQueueStressTest, PopBatchUnderContention) {
+  constexpr int kBatchSize = 8;
+  constexpr int kBatches = 500;
+  constexpr int kProducers = 4;
+  constexpr int kTotalItems = kBatchSize * kBatches * kProducers;
+
+  std::vector<PTO2TaskSlotState> items(kTotalItems);
+  for (int i = 0; i < kTotalItems; i++)
+    items[i].fanin_count = i;
+
+  std::atomic<int> total_consumed{0};
+  std::atomic<int> producers_done{0};
+
+  auto producer = [&](int id) {
+    int base = id * kBatchSize * kBatches;
+    for (int b = 0; b < kBatches; b++) {
+      PTO2TaskSlotState *ptrs[kBatchSize];
+      for (int i = 0; i < kBatchSize; i++) {
+        ptrs[i] = &items[base + b * kBatchSize + i];
+      }
+      for (int i = 0; i < kBatchSize; i++) {
+        while (!queue.push(ptrs[i])) {}
+      }
+    }
+    producers_done.fetch_add(1, std::memory_order_release);
+  };
+
+  auto consumer = [&]() {
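+    // Drain continuously; once every producer has signalled done, take one
+    // final sweep so items pushed during the race are not stranded.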
+    while (true) {
+      PTO2TaskSlotState *out[kBatchSize];
+      int n = queue.pop_batch(out, kBatchSize);
+      total_consumed.fetch_add(n, std::memory_order_relaxed);
+      if (n == 0 && producers_done.load(std::memory_order_acquire) == kProducers) {
+        while (true) {
+          n = queue.pop_batch(out, kBatchSize);
+          if (n == 0) break;
+          total_consumed.fetch_add(n, std::memory_order_relaxed);
+        }
+        break;
+      }
+    }
+  };
+
+  std::vector<std::thread> threads;
+  for (int i = 0; i < kProducers; i++)
+    threads.emplace_back(producer, i);
+  for (int i = 0; i < 4; i++)
+    threads.emplace_back(consumer);
+  for (auto &t : threads)
+    t.join();
+
+  EXPECT_EQ(total_consumed.load(), kTotalItems);
+}
diff --git a/tests/ut/cpp/a2a3/test_runtime_coupling.cpp b/tests/ut/cpp/a2a3/test_runtime_coupling.cpp
new file mode 100644
index 000000000..927a7b0fc
--- /dev/null
+++ b/tests/ut/cpp/a2a3/test_runtime_coupling.cpp
@@ -0,0 +1,283 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Runtime behavior tests for the combined TMR components.
+ *
+ * These tests keep the assertions on observable component behavior: lifecycle,
+ * scheduler state transitions, TensorMap lookup validity, and shared-memory
+ * coordination. Structural coupling checks belong in design review rather than
+ * in unit tests.
+ */
+
+#include <gtest/gtest.h>
+
+#include <atomic>
+#include <cstdlib>
+#include <cstring>
+#include <vector>
+
+#include "pto_orchestration_api.h"
+#include "pto_orchestrator.h"
+#include "pto_ring_buffer.h"
+#include "pto_runtime2_types.h"
+#include "pto_shared_memory.h"
+#include "pto_tensormap.h"
+#include "scheduler/pto_scheduler.h"
+
+namespace {
+
+constexpr uint64_t kHeapSize = 64 * 1024;
+constexpr int32_t kWindowSize = 64;
+constexpr int32_t kDepPoolSize = 256;
+
+struct TestLookupResult {
+  struct Entry {
+    PTO2TensorMapEntry *entry;
+    OverlapStatus overlap_status;
+  };
+  std::vector<Entry> entries;
+  int count = 0;
+};
+
+void run_lookup(PTO2TensorMap &tmap, const Tensor &tensor, TestLookupResult &out) {
+  tmap.lookup(tensor, [&](PTO2TensorMapEntry &entry, OverlapStatus status) -> bool {
+    out.entries.push_back({&entry, status});
+    out.count++;
+    return true;
+  });
+}
+
+Tensor make_tensor(uint64_t addr, uint32_t shape0 = 100, int32_t version = 0) {
+  uint32_t shapes[RUNTIME_MAX_TENSOR_DIMS] = {shape0};
+  return make_tensor_external(reinterpret_cast<void *>(addr), shapes, 1, DataType::FLOAT32, false, version);
+}
+
+struct TMRSystem {
+  PTO2SharedMemoryHandle *sm = nullptr;
+  PTO2SchedulerState sched{};
+  PTO2OrchestratorState orch{};
+  uint8_t *gm_heap = nullptr;
+  bool sched_ok = false;
+  bool orch_ok = false;
+
+  bool Init(uint64_t heap_size = kHeapSize, int32_t window_size = kWindowSize) {
+    sm = PTO2SharedMemoryHandle::create(window_size, heap_size);
+    if (sm == nullptr) return false;
+
+    gm_heap = static_cast<uint8_t *>(std::calloc(PTO2_MAX_RING_DEPTH, heap_size));
+    if (gm_heap == nullptr) return false;
+
+    sched_ok = sched.init(sm->header, kDepPoolSize);
+    if (!sched_ok) return false;
+
+    orch_ok = orch.init(sm->header, gm_heap, heap_size, kDepPoolSize);
+    if (!orch_ok) return false;
+
+    orch.set_scheduler(&sched);
+    return true;
+  }
+
+  void Destroy() {
+    if (orch_ok) {
+      orch.destroy();
+      orch_ok = false;
+    }
+    if (sched_ok) {
+      sched.destroy();
+      sched_ok = false;
+    }
+    if (gm_heap != nullptr) {
+      std::free(gm_heap);
+      gm_heap = nullptr;
+    }
+    if (sm != nullptr) {
+      sm->destroy();
+      sm = nullptr;
+    }
+  }
+};
+
+} // namespace
+
+TEST(RuntimeLifecycleBehavior, InitDestroyCanRepeat) {
+  for (int cycle = 0; cycle < 3; cycle++) {
+    TMRSystem sys;
+    ASSERT_TRUE(sys.Init()) << "cycle=" << cycle;
+    EXPECT_EQ(sys.sm->header->orch_error_code.load(std::memory_order_acquire), PTO2_ERROR_NONE);
+    EXPECT_EQ(sys.sm->header->sched_error_code.load(std::memory_order_acquire), PTO2_ERROR_NONE);
+    sys.Destroy();
+  }
+}
+
+TEST(RuntimeLifecycleBehavior, OrchestratorScopeWithoutSchedulerLeavesNoFatalCode) {
+  PTO2SharedMemoryHandle *sm = PTO2SharedMemoryHandle::create(kWindowSize, kHeapSize);
+  ASSERT_NE(sm, nullptr);
+  uint8_t *heap = static_cast<uint8_t *>(std::calloc(PTO2_MAX_RING_DEPTH, kHeapSize));
+  ASSERT_NE(heap, nullptr);
+
+  PTO2OrchestratorState orch{};
+  ASSERT_TRUE(orch.init(sm->header, heap, kHeapSize, kDepPoolSize));
+
+  orch.begin_scope();
+  orch.end_scope();
+
+  EXPECT_EQ(sm->header->orch_error_code.load(std::memory_order_acquire), PTO2_ERROR_NONE);
+  EXPECT_FALSE(orch.fatal);
+
+  orch.destroy();
+  std::free(heap);
+  sm->destroy();
+}
+
+TEST(RuntimeSchedulerBehavior, CompletedSlotWithSatisfiedFanoutBecomesConsumed) {
+  TMRSystem sys;
+  ASSERT_TRUE(sys.Init());
+
+  PTO2TaskDescriptor desc{};
+  PTO2TaskSlotState &slot = sys.sm->header->rings[0].get_slot_state_by_slot(0);
+  slot.task = &desc;
+  slot.ring_id = 0;
+  slot.fanout_count = 1;
+  slot.fanout_refcount.store(1, std::memory_order_relaxed);
+  slot.task_state.store(PTO2_TASK_COMPLETED, std::memory_order_relaxed);
+  sys.sm->header->rings[0].fc.current_task_index.store(1, std::memory_order_relaxed);
+
+  sys.sched.check_and_handle_consumed(slot);
+
+  EXPECT_EQ(slot.task_state.load(std::memory_order_acquire), PTO2_TASK_CONSUMED);
+
+  sys.Destroy();
+}
+
+TEST(RuntimeSchedulerBehavior, RingPointerStopsAtFirstUnconsumedTask) {
+  TMRSystem sys;
+  ASSERT_TRUE(sys.Init());
+
+  auto &ring_state = sys.sched.ring_sched_states[0];
+  PTO2TaskDescriptor descs[3]{};
+
+  PTO2TaskSlotState &slot0 = sys.sm->header->rings[0].get_slot_state_by_slot(0);
+  slot0.task = &descs[0];
+  slot0.task_state.store(PTO2_TASK_CONSUMED, std::memory_order_relaxed);
+
+  PTO2TaskSlotState &slot1 = sys.sm->header->rings[0].get_slot_state_by_slot(1);
+  slot1.task = &descs[1];
+  slot1.task_state.store(PTO2_TASK_COMPLETED, std::memory_order_relaxed);
+
+  PTO2TaskSlotState &slot2 = sys.sm->header->rings[0].get_slot_state_by_slot(2);
+  slot2.task = &descs[2];
+  slot2.task_state.store(PTO2_TASK_CONSUMED, std::memory_order_relaxed);
+
+  sys.sm->header->rings[0].fc.current_task_index.store(3, std::memory_order_relaxed);
+
+  ring_state.advance_ring_pointers();
+
+  EXPECT_EQ(ring_state.last_task_alive, 1);
+  EXPECT_EQ(sys.sm->header->rings[0].fc.last_task_alive.load(std::memory_order_acquire), static_cast<int32_t>(1));
+
+  sys.Destroy();
+}
+
+TEST(RuntimeSchedulerBehavior, ReadyQueuesAcceptEveryResourceShape) {
+  TMRSystem sys;
+  ASSERT_TRUE(sys.Init());
+
+  for (int shape = 0; shape < PTO2_NUM_RESOURCE_SHAPES; shape++) {
+    PTO2TaskSlotState slot{};
+    EXPECT_TRUE(sys.sched.ready_queues[shape].push(&slot)) << "shape=" << shape;
+    EXPECT_EQ(sys.sched.ready_queues[shape].pop(), &slot) << "shape=" << shape;
+  }
+
+  sys.Destroy();
+}
+
+TEST(RuntimeTensorMapBehavior, StandaloneInsertLookupDoesNotNeedOrchestratorPointer) {
+  int32_t window_sizes[PTO2_MAX_RING_DEPTH] = {16, 16, 16, 16};
+  PTO2TensorMap tmap{};
+  ASSERT_TRUE(tmap.init(256, 1024, window_sizes));
+
+  Tensor tensor = make_tensor(0x1000);
+  tmap.insert(tensor, PTO2TaskId::make(0, 0));
+
+  TestLookupResult result;
+  run_lookup(tmap, tensor, result);
+
+  ASSERT_EQ(result.count, 1);
+  EXPECT_EQ(result.entries[0].entry->producer_task_id, PTO2TaskId::make(0, 0));
+  EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::COVERED);
+
+  tmap.destroy();
+}
+
+TEST(RuntimeTensorMapBehavior, ValiditySkipsRetiredEntries) {
+  int32_t window_sizes[PTO2_MAX_RING_DEPTH] = {256, 256, 256, 256};
+  PTO2TensorMap tmap{};
+  ASSERT_TRUE(tmap.init(256, 4096, window_sizes));
+
+  Tensor tensor = make_tensor(0x2000);
+  for (int i = 0; i < 100; i++) {
+    tmap.insert(tensor, PTO2TaskId::make(0, i));
+  }
+
+  tmap.sync_validity(0, 80);
+
+  TestLookupResult result;
+  run_lookup(tmap, tensor, result);
+
+  EXPECT_EQ(result.count, 20);
+  for (const auto &entry : result.entries) {
+    EXPECT_GE(entry.entry->producer_task_id.local(), 80u);
+    EXPECT_EQ(entry.overlap_status, OverlapStatus::COVERED);
+  }
+
+  tmap.destroy();
+}
+
+TEST(RuntimeTensorMapBehavior, AllRingsCanProduceForSameTensor) {
+  int32_t window_sizes[PTO2_MAX_RING_DEPTH];
+  for (int i = 0; i < PTO2_MAX_RING_DEPTH; i++) {
+    window_sizes[i] = kWindowSize;
+  }
+  PTO2TensorMap tmap{};
+  ASSERT_TRUE(tmap.init(256, 1024, window_sizes));
+
+  Tensor tensor = make_tensor(0x3000);
+  for (int ring = 0; ring < PTO2_MAX_RING_DEPTH; ring++) {
+    tmap.insert(tensor, PTO2TaskId::make(ring, 0));
+  }
+
+  TestLookupResult result;
+  run_lookup(tmap, tensor, result);
+
+  EXPECT_EQ(result.count, PTO2_MAX_RING_DEPTH);
+
+  tmap.destroy();
+}
+
+TEST(RuntimeIntegrationBehavior, OrchestratorTensorMapUsesConfiguredWindow) {
+  TMRSystem sys;
+  ASSERT_TRUE(sys.Init());
+
+  Tensor tensor = make_tensor(0x4000);
+  sys.orch.begin_scope();
+  sys.orch.tensor_map.insert(tensor, PTO2TaskId::make(0, 0));
+
+  TestLookupResult result;
+  run_lookup(sys.orch.tensor_map, tensor, result);
+
+  sys.orch.end_scope();
+
+  ASSERT_EQ(result.count, 1);
+  EXPECT_EQ(result.entries[0].entry->producer_task_id, PTO2TaskId::make(0, 0));
+  EXPECT_EQ(sys.sm->header->orch_error_code.load(std::memory_order_acquire), PTO2_ERROR_NONE);
+
+  sys.Destroy();
+}
diff --git a/tests/ut/cpp/a2a3/test_runtime_lifecycle.cpp b/tests/ut/cpp/a2a3/test_runtime_lifecycle.cpp
new file mode 100644
index 000000000..b1987ed9f
--- /dev/null
+++ b/tests/ut/cpp/a2a3/test_runtime_lifecycle.cpp
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * PTO2 Runtime lifecycle UT.
+ *
+ * Covers runtime_create / _custom / _from_sm / _destroy / set_mode.
+ *
+ * Follows AAA and FIRST: no shared mutable state between tests, each test
+ * constructs its own runtime and tears it down.
+ */
+
+#include <gtest/gtest.h>
+
+#include <cstdlib>
+
+#include "pto_runtime2.h"
+#include "pto_shared_memory.h"
+
+namespace {
+
+constexpr uint64_t kSmallWindow = 64;
+constexpr uint64_t kSmallHeap = 64 * 1024;
+
+// -----------------------------------------------------------------------------
+// Fixture: each test gets a fresh, isolated runtime config.
+// -----------------------------------------------------------------------------
+class RuntimeLifecycleTest : public ::testing::Test {
+protected:
+  PTO2Runtime *rt_ = nullptr;
+
+  void TearDown() override {
+    if (rt_ != nullptr) {
+      runtime_destroy(rt_);
+      rt_ = nullptr;
+    }
+  }
+};
+
+} // namespace
+
+// ---------- Happy-path creation ----------
+
+TEST_F(RuntimeLifecycleTest, CreateCustom_ValidSizes_ReturnsInitializedRuntime) {
+  // Arrange + Act
+  rt_ = runtime_create_custom(PTO2_MODE_SIMULATE, kSmallWindow, kSmallHeap);
+
+  // Assert
+  ASSERT_NE(rt_, nullptr);
+  EXPECT_NE(rt_->ops, nullptr);
+  EXPECT_NE(rt_->sm_handle, nullptr);
+  EXPECT_NE(rt_->gm_heap, nullptr);
+  EXPECT_TRUE(rt_->gm_heap_owned);
+  EXPECT_EQ(rt_->mode, PTO2_MODE_SIMULATE);
+  EXPECT_EQ(rt_->gm_heap_size, kSmallHeap * PTO2_MAX_RING_DEPTH);
+}
+
+TEST_F(RuntimeLifecycleTest, CreateCustom_ConnectsOrchestratorToScheduler) {
+  rt_ = runtime_create_custom(PTO2_MODE_EXECUTE, kSmallWindow, kSmallHeap);
+
+  ASSERT_NE(rt_, nullptr);
+  // The orchestrator must hold a pointer to the scheduler after creation.
+  EXPECT_EQ(rt_->orchestrator.scheduler, &rt_->scheduler);
+}
+
+TEST_F(RuntimeLifecycleTest, CreateDefault_UsesDefaultSizes) {
+  // create() is a thin wrapper around create_custom with PTO2_TASK_WINDOW_SIZE / PTO2_HEAP_SIZE.
+  // Use GRAPH_ONLY to avoid executor threads. We don't allocate the full
+  // 256MB heap in this path -- keep the assertion restricted to mode.
+  rt_ = runtime_create(PTO2_MODE_GRAPH_ONLY);
+  ASSERT_NE(rt_, nullptr);
+  EXPECT_EQ(rt_->mode, PTO2_MODE_GRAPH_ONLY);
+}
+
+// ---------- From-SM creation ----------
+
+TEST_F(RuntimeLifecycleTest, CreateFromSM_NullHandle_ReturnsNull) {
+  // Act
+  PTO2Runtime *rt = runtime_create_from_sm(PTO2_MODE_SIMULATE, nullptr, nullptr, 0);
+
+  // Assert
+  EXPECT_EQ(rt, nullptr);
+}
+
+TEST_F(RuntimeLifecycleTest, CreateFromSM_RecordsCallerBuffers) {
+  // Arrange: caller-allocated sm + gm_heap.
+  PTO2SharedMemoryHandle *sm = PTO2SharedMemoryHandle::create(kSmallWindow, kSmallHeap);
+  ASSERT_NE(sm, nullptr);
+  uint8_t *heap = static_cast<uint8_t *>(std::calloc(PTO2_MAX_RING_DEPTH, kSmallHeap));
+  ASSERT_NE(heap, nullptr);
+
+  // Act
+  rt_ = runtime_create_from_sm(PTO2_MODE_EXECUTE, sm, heap, kSmallHeap);
+
+  // Assert: the returned runtime must NOT claim ownership of the gm_heap.
+  ASSERT_NE(rt_, nullptr);
+  EXPECT_EQ(rt_->sm_handle, sm);
+  EXPECT_EQ(rt_->gm_heap, heap);
+  EXPECT_FALSE(rt_->gm_heap_owned);
+
+  // Cleanup: runtime_destroy consumes sm via pto2_sm_destroy (observed
+  // behavior, see pto_runtime2.cpp:339), so only free the gm_heap here.
+  runtime_destroy(rt_);
+  rt_ = nullptr;
+  std::free(heap);
+}
+
+// ---------- Destroy ----------
+
+TEST_F(RuntimeLifecycleTest, Destroy_NullRuntime_NoCrash) {
+  // Documented contract: destroy(nullptr) is a no-op.
+  runtime_destroy(nullptr);
+  SUCCEED();
+}
+
+TEST_F(RuntimeLifecycleTest, Destroy_ReleasesOwnedHeap) {
+  rt_ = runtime_create_custom(PTO2_MODE_SIMULATE, kSmallWindow, kSmallHeap);
+  ASSERT_NE(rt_, nullptr);
+  // Act: explicitly destroy and null out so TearDown doesn't double-free.
+  runtime_destroy(rt_);
+  rt_ = nullptr;
+  // Assert: reaching here without asan/ubsan complaint is the test (leak-free).
+  SUCCEED();
+}
+
+// ---------- set_mode ----------
+
+TEST_F(RuntimeLifecycleTest, SetMode_UpdatesField) {
+  rt_ = runtime_create_custom(PTO2_MODE_EXECUTE, kSmallWindow, kSmallHeap);
+  ASSERT_NE(rt_, nullptr);
+  ASSERT_EQ(rt_->mode, PTO2_MODE_EXECUTE);
+
+  // Act
+  runtime_set_mode(rt_, PTO2_MODE_GRAPH_ONLY);
+
+  // Assert
+  EXPECT_EQ(rt_->mode, PTO2_MODE_GRAPH_ONLY);
+}
+
+TEST_F(RuntimeLifecycleTest, SetMode_NullRuntime_NoCrash) {
+  // Contract: defensive null check, mirrors destroy.
+  runtime_set_mode(nullptr, PTO2_MODE_SIMULATE);
+  SUCCEED();
+}
+
+// ---------- Ops table wiring ----------
+
+TEST_F(RuntimeLifecycleTest, OpsTable_AllFunctionPointersPopulated) {
+  rt_ = runtime_create_custom(PTO2_MODE_SIMULATE, kSmallWindow, kSmallHeap);
+  ASSERT_NE(rt_, nullptr);
+  const PTO2RuntimeOps *ops = rt_->ops;
+  ASSERT_NE(ops, nullptr);
+
+  // Hot-path ops called by the orchestration .so -- must never be null.
+  EXPECT_NE(ops->submit_task, nullptr);
+  EXPECT_NE(ops->alloc_tensors, nullptr);
+  EXPECT_NE(ops->scope_begin, nullptr);
+  EXPECT_NE(ops->scope_end, nullptr);
+  EXPECT_NE(ops->orchestration_done, nullptr);
+  EXPECT_NE(ops->is_fatal, nullptr);
+  EXPECT_NE(ops->report_fatal, nullptr);
+  EXPECT_NE(ops->get_tensor_data, nullptr);
+  EXPECT_NE(ops->set_tensor_data, nullptr);
+}
+
+TEST_F(RuntimeLifecycleTest, IsFatal_FreshRuntime_ReturnsFalse) {
+  rt_ = runtime_create_custom(PTO2_MODE_SIMULATE, kSmallWindow, kSmallHeap);
+  ASSERT_NE(rt_, nullptr);
+  EXPECT_FALSE(rt_->ops->is_fatal(rt_));
+}
+
+TEST_F(RuntimeLifecycleTest, ReportFatal_SetsFatalFlag) {
+  rt_ = runtime_create_custom(PTO2_MODE_SIMULATE, kSmallWindow, kSmallHeap);
+  ASSERT_NE(rt_, nullptr);
+
+  // Act
+  rt_->ops->report_fatal(rt_, PTO2_ERROR_EXPLICIT_ORCH_FATAL, "UT", "%s", "forced");
+
+  // Assert
+  EXPECT_TRUE(rt_->ops->is_fatal(rt_));
+}
diff --git a/tests/ut/cpp/a2a3/test_shared_memory.cpp b/tests/ut/cpp/a2a3/test_shared_memory.cpp
index 44182090a..cd9b4d526 100644
--- a/tests/ut/cpp/a2a3/test_shared_memory.cpp
+++ b/tests/ut/cpp/a2a3/test_shared_memory.cpp
@@ -31,6 +31,8 @@
 #include 
 #include 
+#include <thread>
+#include <vector>
 #include "pto_shared_memory.h"
 
 // =============================================================================
@@ -187,3 +189,158 @@ TEST(SharedMemoryBoundary, CreateFromUndersizedBuffer) {
   PTO2SharedMemoryHandle *h = PTO2SharedMemoryHandle::create_from_buffer(buf, 64, 256, 4096);
   EXPECT_EQ(h, nullptr) << "Undersized buffer should fail";
 }
+
+// =============================================================================
+// Concurrent read/write of per-ring flow control
+// =============================================================================
+
+class SharedMemoryConcurrentTest : public ::testing::Test {
+protected:
+  PTO2SharedMemoryHandle *handle = nullptr;
+
+  void SetUp() override {
+    handle = PTO2SharedMemoryHandle::create(256, 4096);
+    ASSERT_NE(handle, nullptr);
+  }
+
+  void TearDown() override {
+    if (handle) {
+      handle->destroy();
+      handle = nullptr;
+    }
+  }
+};
+
+TEST_F(SharedMemoryConcurrentTest, PerRingTaskIndexIsolation) {
+  constexpr int kIterations = 10000;
+
+  auto writer = [&](int ring) {
+    auto &fc = handle->header->rings[ring].fc;
+    int32_t base = ring * 100000;
+    for (int i = 1; i <= kIterations; i++) {
+      fc.current_task_index.store(base + i, std::memory_order_release);
+    }
+  };
+
+  struct Observation {
+    bool went_backward = false;
+    bool saw_other_ring_range = false;
+  };
+
+  auto reader = [&](int ring, Observation *obs) {
+    auto &fc = handle->header->rings[ring].fc;
+    int32_t base = ring * 100000;
+    int32_t prev = 0;
+    for (int i = 0; i < kIterations; i++) {
+      int32_t val = fc.current_task_index.load(std::memory_order_acquire);
+      if (val < prev) {
+        obs->went_backward = true;
+      }
+      if (val != 0 && (val <= base || val > base + kIterations)) {
+        obs->saw_other_ring_range = true;
+      }
+      prev = val;
+    }
+  };
+
+  Observation ring0;
+  Observation ring1;
+
+  std::thread w0(writer, 0);
+  std::thread w1(writer, 1);
+  std::thread r0(reader, 0, &ring0);
+  std::thread r1(reader, 1, &ring1);
+
+  w0.join();
+  w1.join();
+  r0.join();
+  r1.join();
+
+  EXPECT_FALSE(ring0.went_backward) << "Ring 0 current_task_index should be monotonic";
+  EXPECT_FALSE(ring1.went_backward) << "Ring 1 current_task_index should be monotonic";
+  EXPECT_FALSE(ring0.saw_other_ring_range) << "Ring 0 should not observe ring 1 values";
+  EXPECT_FALSE(ring1.saw_other_ring_range) << "Ring 1 should not observe ring 0 values";
+
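+  // Per-ring isolation also implies each ring ends at its own writer's
+  // final store: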
EXPECT_EQ(handle->header->rings[0].fc.current_task_index.load(), static_cast(kIterations)); + EXPECT_EQ(handle->header->rings[1].fc.current_task_index.load(), static_cast(100000 + kIterations)); +} + +TEST_F(SharedMemoryConcurrentTest, TaskIndexAtomicIncrement) { + constexpr int kIncrements = 5000; + constexpr int kThreads = 4; + + auto &fc = handle->header->rings[0].fc; + fc.current_task_index.store(0, std::memory_order_relaxed); + + auto incrementer = [&]() { + for (int i = 0; i < kIncrements; i++) { + fc.current_task_index.fetch_add(1, std::memory_order_acq_rel); + } + }; + + std::vector threads; + for (int i = 0; i < kThreads; i++) { + threads.emplace_back(incrementer); + } + for (auto &t : threads) + t.join(); + + EXPECT_EQ(fc.current_task_index.load(), kIncrements * kThreads) << "Concurrent increments should not lose updates"; +} + +TEST_F(SharedMemoryConcurrentTest, LastTaskAliveMonotonic) { + constexpr int kIterations = 10000; + constexpr int kThreads = 4; + + auto &fc = handle->header->rings[0].fc; + fc.last_task_alive.store(0, std::memory_order_relaxed); + + auto advancer = [&](int id) { + for (int i = 0; i < kIterations; i++) { + int32_t desired = id * kIterations + i + 1; + int32_t current = fc.last_task_alive.load(std::memory_order_acquire); + while (current < desired) { + if (fc.last_task_alive.compare_exchange_weak( + current, desired, std::memory_order_acq_rel, std::memory_order_acquire + )) { + break; + } + } + } + }; + + std::vector threads; + for (int i = 0; i < kThreads; i++) { + threads.emplace_back(advancer, i); + } + for (auto &t : threads) + t.join(); + + int32_t final_val = fc.last_task_alive.load(); + EXPECT_EQ(final_val, kIterations * kThreads) << "last_task_alive should advance to the largest published value"; +} + +TEST_F(SharedMemoryConcurrentTest, ValidateAfterConcurrentWrites) { + constexpr int kIterations = 1000; + + auto writer = [&](int ring) { + auto &fc = handle->header->rings[ring].fc; + for (int i = 0; i < kIterations; i++) { + fc.current_task_index.store(static_cast(i % 256), std::memory_order_release); + } + }; + + std::thread w0(writer, 0); + std::thread w1(writer, 1); + std::thread w2(writer, 2); + std::thread w3(writer, 3); + w0.join(); + w1.join(); + w2.join(); + w3.join(); + + EXPECT_TRUE(handle->validate()) << "Valid current_task_index values should pass validation"; + + handle->header->rings[2].fc.current_task_index.store(-1, std::memory_order_relaxed); + EXPECT_FALSE(handle->validate()) << "Corrupted current_task_index should fail validation"; +} diff --git a/tests/ut/cpp/a2a3/test_task_allocator.cpp b/tests/ut/cpp/a2a3/test_task_allocator.cpp index 383003900..443a2e5da 100644 --- a/tests/ut/cpp/a2a3/test_task_allocator.cpp +++ b/tests/ut/cpp/a2a3/test_task_allocator.cpp @@ -405,3 +405,99 @@ TEST_F(TaskAllocatorTest, TaskIdNearInt32Max) { EXPECT_GE(r3.slot, 0); EXPECT_LT(r3.slot, WINDOW_SIZE); } + +// ============================================================================= +// Re-init safety +// ============================================================================= + +class TaskAllocatorReinitTest : public ::testing::Test { +protected: + static constexpr int32_t WINDOW_SIZE = 16; + static constexpr uint64_t HEAP_SIZE = 1024; + + std::vector descriptors; + alignas(64) uint8_t heap_buf[1024]{}; + std::atomic current_index{0}; + std::atomic last_alive{0}; + std::atomic error_code{PTO2_ERROR_NONE}; + PTO2TaskAllocator allocator{}; + + void InitAllocator() { + descriptors.assign(WINDOW_SIZE, PTO2TaskDescriptor{}); + 
+        std::memset(heap_buf, 0, sizeof(heap_buf));
+        current_index.store(0);
+        last_alive.store(0);
+        error_code.store(PTO2_ERROR_NONE);
+        allocator.init(descriptors.data(), WINDOW_SIZE, &current_index, &last_alive, heap_buf, HEAP_SIZE, &error_code);
+    }
+};
+
+TEST_F(TaskAllocatorReinitTest, ReInitAfterUse) {
+    InitAllocator();
+
+    auto r1 = allocator.alloc(128);
+    ASSERT_FALSE(r1.failed());
+    auto r2 = allocator.alloc(128);
+    ASSERT_FALSE(r2.failed());
+    EXPECT_EQ(r2.task_id, 1);
+
+    InitAllocator();
+
+    auto r3 = allocator.alloc(64);
+    ASSERT_FALSE(r3.failed());
+    EXPECT_EQ(r3.task_id, 0) << "Re-init should reset task ID counter";
+    EXPECT_EQ(r3.slot, 0);
+}
+
+TEST_F(TaskAllocatorReinitTest, ReInitDifferentHeapSize) {
+    InitAllocator();
+
+    auto r1 = allocator.alloc(HEAP_SIZE);
+    ASSERT_FALSE(r1.failed());
+    EXPECT_EQ(allocator.heap_top(), HEAP_SIZE);
+
+    InitAllocator();
+    EXPECT_EQ(allocator.heap_top(), 0u) << "Re-init resets heap_top";
+    EXPECT_EQ(allocator.heap_available(), HEAP_SIZE) << "Re-init restores full capacity";
+}
+
+TEST_F(TaskAllocatorReinitTest, ReInitClearsErrorState) {
+    InitAllocator();
+
+    auto r = allocator.alloc(HEAP_SIZE * 2);
+    EXPECT_TRUE(r.failed());
+    EXPECT_NE(error_code.load(), PTO2_ERROR_NONE);
+
+    InitAllocator();
+    EXPECT_EQ(error_code.load(), PTO2_ERROR_NONE);
+
+    auto r2 = allocator.alloc(64);
+    EXPECT_FALSE(r2.failed());
+}
+
+TEST_F(TaskAllocatorReinitTest, MultipleReInitCycles) {
+    for (int cycle = 0; cycle < 10; cycle++) {
+        InitAllocator();
+
+        for (int i = 0; i < WINDOW_SIZE - 1; i++) {
+            auto r = allocator.alloc(0);
+            ASSERT_FALSE(r.failed()) << "Cycle " << cycle << " alloc " << i;
+            EXPECT_EQ(r.task_id, i);
+        }
+    }
+}
+
+TEST_F(TaskAllocatorReinitTest, ReInitIgnoresStaleLastAlive) {
+    InitAllocator();
+
+    auto r1 = allocator.alloc(64);
+    ASSERT_FALSE(r1.failed());
+    last_alive.store(5, std::memory_order_release);
+
+    InitAllocator();
+    EXPECT_EQ(last_alive.load(), 0);
+
+    auto r2 = allocator.alloc(64);
+    ASSERT_FALSE(r2.failed());
+    EXPECT_EQ(r2.task_id, 0);
+}
diff --git a/tests/ut/cpp/a2a3/test_tensormap.cpp b/tests/ut/cpp/a2a3/test_tensormap.cpp
index 10eef0317..b7ceb4ce5 100644
--- a/tests/ut/cpp/a2a3/test_tensormap.cpp
+++ b/tests/ut/cpp/a2a3/test_tensormap.cpp
@@ -65,6 +65,30 @@ static Tensor make_test_tensor_2d(uint64_t addr, uint32_t s0, uint32_t s1, int32
     return make_tensor_external(reinterpret_cast<void *>(addr), shapes, 2, DataType::FLOAT32, false, version);
 }
 
+static Tensor make_test_tensor_nd(
+    uint64_t addr, uint32_t ndims, const uint32_t shapes[], const uint32_t offsets[] = nullptr,
+    int32_t version = 0
+) {
+    uint32_t seed_shape[1] = {1};
+    Tensor t = make_tensor_external(reinterpret_cast<void *>(addr), seed_shape, 1, DataType::FLOAT32, false, 0);
+    uint32_t s[RUNTIME_MAX_TENSOR_DIMS]{};
+    uint32_t rs[RUNTIME_MAX_TENSOR_DIMS]{};
+    uint32_t o[RUNTIME_MAX_TENSOR_DIMS]{};
+    bool all_zero = true;
+    for (uint32_t i = 0; i < ndims && i < RUNTIME_MAX_TENSOR_DIMS; i++) {
+        s[i] = shapes[i];
+        rs[i] = shapes[i];
+        o[i] = offsets ? offsets[i] : 0;
+        if (o[i] != 0) all_zero = false;
+    }
+    uint64_t total = 4;
+    for (uint32_t i = 0; i < ndims; i++) {
+        total *= (rs[i] + (offsets ? offsets[i] : 0));
+    }
+    t.init(reinterpret_cast<void *>(addr), total, rs, s, o, ndims, DataType::FLOAT32, version, all_zero, true);
+    return t;
+}
+
 // =============================================================================
 // Fixture
 // =============================================================================
@@ -549,3 +573,105 @@ TEST(TaskIdTest, LocalIdMaxValue) {
     EXPECT_EQ(tid.ring(), 0);
     EXPECT_EQ(tid.local(), UINT32_MAX);
 }
+
+// =============================================================================
+// Edge cases merged from test_tensormap_overlap.cpp
+// =============================================================================
+
+TEST_F(TensorMapTest, ZeroDimensionTensor) {
+    // ndims=0: fast-path loop doesn't execute, contains=true -> COVERED
+    uint32_t seed_shape[1] = {1};
+    Tensor t = make_tensor_external(reinterpret_cast<void *>(0x2000), seed_shape, 1, DataType::FLOAT32, false, 0);
+    uint32_t s[RUNTIME_MAX_TENSOR_DIMS]{}, o[RUNTIME_MAX_TENSOR_DIMS]{};
+    t.init(reinterpret_cast<void *>(0x2000), 0, s, s, o, 0, DataType::FLOAT32, 0, true, true);
+
+    tmap.insert(t, PTO2TaskId::make(0, 0));
+
+    TestLookupResult result;
+    run_lookup(tmap, t, result);
+    ASSERT_EQ(result.count, 1);
+    EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::COVERED);
+}
+
+TEST_F(TensorMapTest, TwoZeroDimTensorsSameAddr) {
+    uint32_t seed_shape[1] = {1};
+    Tensor t1 = make_tensor_external(reinterpret_cast<void *>(0x2100), seed_shape, 1, DataType::FLOAT32, false, 0);
+    Tensor t2 = make_tensor_external(reinterpret_cast<void *>(0x2100), seed_shape, 1, DataType::FLOAT32, false, 0);
+    uint32_t s[RUNTIME_MAX_TENSOR_DIMS]{}, o[RUNTIME_MAX_TENSOR_DIMS]{};
+    t1.init(reinterpret_cast<void *>(0x2100), 0, s, s, o, 0, DataType::FLOAT32, 0, true, true);
+    t2.init(reinterpret_cast<void *>(0x2100), 0, s, s, o, 0, DataType::FLOAT32, 0, true, true);
+
+    tmap.insert(t1, PTO2TaskId::make(0, 0));
+    tmap.insert(t2, PTO2TaskId::make(0, 1));
+
+    TestLookupResult result;
+    run_lookup(tmap, t1, result);
+    EXPECT_EQ(result.count, 2);
+    for (int i = 0; i < result.count; i++) {
+        EXPECT_EQ(result.entries[i].overlap_status, OverlapStatus::COVERED)
+            << "0-dim tensors always report COVERED (empty loop -> contains=true)";
+    }
+}
+
+TEST_F(TensorMapTest, AdjacentNoOverlap) {
+    uint32_t prod_shapes[] = {100};
+    Tensor prod = make_test_tensor_nd(0x8000, 1, prod_shapes, nullptr, 0);
+    tmap.insert(prod, PTO2TaskId::make(0, 0));
+
+    uint32_t cons_shapes[] = {100};
+    uint32_t cons_offsets[] = {100};
+    Tensor cons = make_test_tensor_nd(0x8000, 1, cons_shapes, cons_offsets, 0);
+
+    TestLookupResult result;
+    run_lookup(tmap, cons, result);
+    EXPECT_EQ(result.count, 0) << "Adjacent regions [0,100) and [100,200) must not overlap";
+}
+
+TEST_F(TensorMapTest, OneElementOverlap) {
+    uint32_t prod_shapes[] = {100};
+    Tensor prod = make_test_tensor_nd(0x8100, 1, prod_shapes, nullptr, 0);
+    tmap.insert(prod, PTO2TaskId::make(0, 0));
+
+    uint32_t cons_shapes[] = {100};
+    uint32_t cons_offsets[] = {99};
+    Tensor cons = make_test_tensor_nd(0x8100, 1, cons_shapes, cons_offsets, 0);
+
+    TestLookupResult result;
+    run_lookup(tmap, cons, result);
+    ASSERT_EQ(result.count, 1);
+    EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::OTHER) << "Partial overlap (1 element) -> OTHER";
+}
+
+TEST_F(TensorMapTest, ZeroShapeInDimension) {
+    // Producer: 2D [10, 0] -- zero in dim 1
+    uint32_t prod_shapes[] = {10, 0};
+    Tensor prod = make_test_tensor_nd(0x8200, 2, prod_shapes, nullptr, 0);
+    tmap.insert(prod, PTO2TaskId::make(0, 0));
+
+    // Consumer: 2D [10, 20]
+    uint32_t cons_shapes[] = {10, 20};
+    Tensor cons = make_test_tensor_nd(0x8200, 2, cons_shapes, nullptr, 0);
+
+    TestLookupResult result;
+    run_lookup(tmap, cons, result);
+    ASSERT_EQ(result.count, 1);
+    // Fast path: input.shapes[1](20) >= entry.shapes[1](0) -> contains=true -> COVERED
+    EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::COVERED)
+        << "Zero-shape producer is COVERED by any consumer (empty production)";
+}
+
+TEST_F(TensorMapTest, FullFiveDimensionalOverlap) {
+    uint32_t prod_shapes[] = {2, 3, 4, 5, 6};
+    Tensor prod = make_test_tensor_nd(0x9200, 5, prod_shapes, nullptr, 0);
+    tmap.insert(prod, PTO2TaskId::make(0, 0));
+
+    // Consumer with larger shapes in all dims -> COVERED
+    uint32_t cons_shapes[] = {4, 6, 8, 10, 12};
+    Tensor cons = make_test_tensor_nd(0x9200, 5, cons_shapes, nullptr, 0);
+
+    TestLookupResult result;
+    run_lookup(tmap, cons, result);
+    ASSERT_EQ(result.count, 1);
+    EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::COVERED)
+        << "5D consumer covers 5D producer in all dimensions";
+}
diff --git a/tests/ut/cpp/a5/test_link_isolation.cpp b/tests/ut/cpp/a5/test_link_isolation.cpp
new file mode 100644
index 000000000..78f1378a9
--- /dev/null
+++ b/tests/ut/cpp/a5/test_link_isolation.cpp
@@ -0,0 +1,199 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Behavior tests for runtime components built without pto_orchestrator.cpp.
+ *
+ * The CMake target for this file deliberately omits the orchestrator source.
+ * Passing build and runtime assertions here verifies that scheduler, ring
+ * buffer, shared memory, and TensorMap behavior does not require an
+ * orchestrator link dependency.
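+ * If any component below accidentally grew an orchestrator dependency, this
+ * target would fail at link time -- that failure is the signal this suite exists for.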
+ */
+
+#include <gtest/gtest.h>
+
+#include <atomic>
+#include <cstring>
+#include <vector>
+
+#include "pto_orchestration_api.h"
+#include "pto_ring_buffer.h"
+#include "pto_runtime2_types.h"
+#include "pto_shared_memory.h"
+#include "pto_tensormap.h"
+#include "scheduler/pto_scheduler.h"
+
+namespace {
+
+constexpr uint64_t kHeapSize = 64 * 1024;
+constexpr int32_t kWindowSize = 64;
+
+struct TestLookupResult {
+    struct Entry {
+        PTO2TensorMapEntry *entry;
+        OverlapStatus overlap_status;
+    };
+    std::vector<Entry> entries;
+    int count = 0;
+};
+
+void run_lookup(PTO2TensorMap &tmap, const Tensor &tensor, TestLookupResult &out) {
+    tmap.lookup(tensor, [&](PTO2TensorMapEntry &entry, OverlapStatus status) -> bool {
+        out.entries.push_back({&entry, status});
+        out.count++;
+        return true;
+    });
+}
+
+Tensor make_tensor(uint64_t addr, uint32_t shape0 = 100, int32_t version = 0) {
+    uint32_t shapes[RUNTIME_MAX_TENSOR_DIMS] = {shape0};
+    return make_tensor_external(reinterpret_cast<void *>(addr), shapes, 1, DataType::FLOAT32, false, version);
+}
+
+struct DepPoolFixture {
+    PTO2DepListEntry entries[512];
+    std::atomic<int32_t> error_code{PTO2_ERROR_NONE};
+    PTO2DepListPool pool{};
+
+    void Init() {
+        std::memset(entries, 0, sizeof(entries));
+        error_code.store(PTO2_ERROR_NONE, std::memory_order_relaxed);
+        pool.init(entries, 512, &error_code);
+    }
+
+    void AllocN(int count) {
+        for (int i = 0; i < count; i++) {
+            ASSERT_NE(pool.alloc(), nullptr);
+        }
+    }
+};
+
+}  // namespace
+
+TEST(LinkIsolationDepPool, ReclaimBelowIntervalKeepsAllocatedEntries) {
+    DepPoolFixture fixture;
+    fixture.Init();
+    fixture.AllocN(100);
+    int32_t used_before = fixture.pool.used();
+
+    PTO2SharedMemoryRingHeader ring_header{};
+    fixture.pool.reclaim(ring_header, PTO2_DEP_POOL_CLEANUP_INTERVAL - 1);
+
+    EXPECT_EQ(fixture.pool.used(), used_before);
+    EXPECT_EQ(fixture.error_code.load(std::memory_order_acquire), PTO2_ERROR_NONE);
+}
+
+TEST(LinkIsolationDepPool, ReclaimAtIntervalUsesConsumedTaskMark) {
+    DepPoolFixture fixture;
+    fixture.Init();
+    fixture.AllocN(100);
+
+    std::vector<PTO2TaskSlotState> slots(kWindowSize);
+    PTO2SharedMemoryRingHeader ring_header{};
+    ring_header.slot_states = slots.data();
+    ring_header.task_window_size = kWindowSize;
+    ring_header.task_window_mask = kWindowSize - 1;
+
+    int32_t last_alive = PTO2_DEP_POOL_CLEANUP_INTERVAL;
+    int32_t mark_slot = (last_alive - 1) & ring_header.task_window_mask;
+    slots[mark_slot].dep_pool_mark = 50;
+
+    fixture.pool.reclaim(ring_header, last_alive);
+
+    EXPECT_EQ(fixture.pool.used(), 51);
+    EXPECT_EQ(fixture.error_code.load(std::memory_order_acquire), PTO2_ERROR_NONE);
+}
+
+TEST(LinkIsolationScheduler, ReleaseFaninPushesReadyTask) {
+    PTO2SharedMemoryHandle *sm = PTO2SharedMemoryHandle::create(kWindowSize, kHeapSize);
+    ASSERT_NE(sm, nullptr);
+
+    PTO2SchedulerState sched{};
+    ASSERT_TRUE(sched.init(sm->header));
+
+    alignas(64) PTO2TaskSlotState slot{};
+    slot.fanin_count = 1;
+    slot.fanin_refcount.store(0, std::memory_order_relaxed);
+    slot.task_state.store(PTO2_TASK_PENDING, std::memory_order_relaxed);
+    slot.active_mask = ActiveMask(PTO2_SUBTASK_MASK_AIV0);
+
+    EXPECT_TRUE(sched.release_fanin_and_check_ready(slot, nullptr));
+
+    PTO2ResourceShape shape = slot.active_mask.to_shape();
+    EXPECT_EQ(sched.ready_queues[static_cast<int>(shape)].pop(), &slot);
+
+    sched.destroy();
+    sm->destroy();
+}
+
+TEST(LinkIsolationScheduler, CompletedTaskCanBecomeConsumed) {
+    PTO2SharedMemoryHandle *sm = PTO2SharedMemoryHandle::create(kWindowSize, kHeapSize);
+    ASSERT_NE(sm, nullptr);
+
+    PTO2SchedulerState sched{};
+    ASSERT_TRUE(sched.init(sm->header));
+
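+    // Arrange: a COMPLETED slot whose fanout refcount already matches its fanout count.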
+    PTO2TaskDescriptor desc{};
+    PTO2TaskSlotState &slot = sm->header->rings[0].get_slot_state_by_slot(0);
+    slot.task = &desc;
+    slot.ring_id = 0;
+    slot.fanout_count = 1;
+    slot.fanout_refcount.store(1, std::memory_order_relaxed);
+    slot.task_state.store(PTO2_TASK_COMPLETED, std::memory_order_relaxed);
+    sm->header->rings[0].fc.current_task_index.store(1, std::memory_order_relaxed);
+
+    sched.check_and_handle_consumed(slot);
+
+    EXPECT_EQ(slot.task_state.load(std::memory_order_acquire), PTO2_TASK_CONSUMED);
+
+    sched.destroy();
+    sm->destroy();
+}
+
+TEST(LinkIsolationReadyQueue, PushPopBatchWithoutOrchestrator) {
+    PTO2ReadyQueue queue{};
+    ASSERT_TRUE(ready_queue_init(&queue, 16));
+
+    PTO2TaskSlotState items[4]{};
+    PTO2TaskSlotState *in[4] = {&items[0], &items[1], &items[2], &items[3]};
+    queue.push_batch(in, 4);
+
+    PTO2TaskSlotState *out[4]{};
+    EXPECT_EQ(queue.pop_batch(out, 4), 4);
+    for (int i = 0; i < 4; i++) {
+        EXPECT_EQ(out[i], &items[i]);
+    }
+    EXPECT_EQ(queue.pop(), nullptr);
+
+    ready_queue_destroy(&queue);
+}
+
+TEST(LinkIsolationTensorMap, InsertLookupAndValidityWithoutOrchestrator) {
+    int32_t window_sizes[PTO2_MAX_RING_DEPTH] = {kWindowSize, kWindowSize, kWindowSize, kWindowSize};
+    PTO2TensorMap tmap{};
+    ASSERT_TRUE(tmap.init(256, 1024, window_sizes));
+
+    Tensor tensor = make_tensor(0x3000);
+    for (int i = 0; i < kWindowSize; i++) {
+        tmap.insert(tensor, PTO2TaskId::make(0, i));
+    }
+
+    tmap.sync_validity(0, kWindowSize / 2);
+
+    TestLookupResult result;
+    run_lookup(tmap, tensor, result);
+    EXPECT_EQ(result.count, kWindowSize / 2);
+    for (const auto &entry : result.entries) {
+        EXPECT_GE(entry.entry->producer_task_id.local(), static_cast<uint32_t>(kWindowSize / 2));
+        EXPECT_EQ(entry.overlap_status, OverlapStatus::COVERED);
+    }
+
+    tmap.destroy();
+}
diff --git a/tests/ut/cpp/a5/test_orchestrator_report_fatal.cpp b/tests/ut/cpp/a5/test_orchestrator_report_fatal.cpp
new file mode 100644
index 000000000..24350c7c6
--- /dev/null
+++ b/tests/ut/cpp/a5/test_orchestrator_report_fatal.cpp
@@ -0,0 +1,194 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * UT for the orchestrator-side fatal reporting path.
+ *
+ * Targets orch_.report_fatal (pto_orchestrator.cpp) and verifies:
+ * - orch->fatal latches to true on any non-zero error code
+ * - the first non-zero code wins via CAS into sm_header->orch_error_code
+ * - subsequent fatal reports do NOT overwrite the first code
+ * - PTO2_ERROR_NONE never latches the shared-memory code (but still flips
+ *   the local fatal flag -- by design, callers may use it to mark fatal
+ *   without writing a code)
+ *
+ * This test exercises the real symbol against a fully-initialized
+ * orchestrator + shared memory pair, complementing the fake-runtime test
+ * (test_a5_fatal.cpp) that only validates the ops-table dispatch.
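+ * Put differently: orch_error_code behaves as a write-once cell seeded with PTO2_ERROR_NONE.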
+ */
+
+#include <gtest/gtest.h>
+
+#include <atomic>
+#include <cstdlib>
+
+#include "pto_orchestrator.h"
+#include "pto_runtime_status.h"
+#include "scheduler/pto_scheduler.h"
+#include "pto_shared_memory.h"
+
+namespace {
+
+constexpr uint64_t kHeapSize = 64 * 1024;
+constexpr int32_t kWindowSize = 64;
+constexpr int32_t kDepPool = 256;
+
+class OrchestratorFatalTest : public ::testing::Test {
+protected:
+    PTO2SharedMemoryHandle *sm_ = nullptr;
+    PTO2SchedulerState sched_{};
+    PTO2OrchestratorState orch_{};
+    uint8_t *gm_heap_ = nullptr;
+    bool sched_ok_ = false;
+    bool orch_ok_ = false;
+
+    void SetUp() override {
+        sm_ = PTO2SharedMemoryHandle::create(kWindowSize, kHeapSize);
+        ASSERT_NE(sm_, nullptr);
+
+        gm_heap_ = static_cast<uint8_t *>(std::calloc(PTO2_MAX_RING_DEPTH, kHeapSize));
+        ASSERT_NE(gm_heap_, nullptr);
+
+        sched_ok_ = sched_.init(sm_->header, kDepPool);
+        ASSERT_TRUE(sched_ok_);
+
+        orch_ok_ = orch_.init(sm_->header, gm_heap_, kHeapSize, kDepPool);
+        ASSERT_TRUE(orch_ok_);
+    }
+
+    void TearDown() override {
+        if (orch_ok_) orch_.destroy();
+        if (sched_ok_) sched_.destroy();
+        if (gm_heap_) std::free(gm_heap_);
+        if (sm_) sm_->destroy();
+    }
+
+    int32_t shared_orch_code() const { return sm_->header->orch_error_code.load(std::memory_order_acquire); }
+};
+
+}  // namespace
+
+// ---------- baseline ----------
+
+TEST_F(OrchestratorFatalTest, InitialState_NoFatalNoSharedCode) {
+    // Verify no fatal state via the observable shared memory output
+    EXPECT_FALSE(orch_.fatal);
+    EXPECT_EQ(shared_orch_code(), PTO2_ERROR_NONE);
+}
+
+// ---------- happy path: single fatal latches both local flag and shared code ----------
+
+TEST_F(OrchestratorFatalTest, ReportFatal_SetsLocalFlagAndSharedCode) {
+    orch_.report_fatal(PTO2_ERROR_HEAP_RING_DEADLOCK, "test", "deadlock at ring %d", 3);
+
+    EXPECT_TRUE(orch_.fatal);
+    EXPECT_EQ(shared_orch_code(), PTO2_ERROR_HEAP_RING_DEADLOCK);
+}
+
+// ---------- CAS first-writer-wins ----------
+
+TEST_F(OrchestratorFatalTest, SecondReportFatal_DoesNotOverwriteSharedCode) {
+    orch_.report_fatal(PTO2_ERROR_HEAP_RING_DEADLOCK, "test", nullptr);
+    orch_.report_fatal(PTO2_ERROR_DEP_POOL_OVERFLOW, "test", nullptr);
+
+    // Second report must NOT overwrite the first latched code.
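+    // The CAS from PTO2_ERROR_NONE can succeed only once, so the deadlock code must survive.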
+ EXPECT_TRUE(orch_.fatal); + EXPECT_EQ(shared_orch_code(), PTO2_ERROR_HEAP_RING_DEADLOCK); +} + +TEST_F(OrchestratorFatalTest, RepeatedSameCode_StaysLatched) { + orch_.report_fatal(PTO2_ERROR_INVALID_ARGS, "test", nullptr); + orch_.report_fatal(PTO2_ERROR_INVALID_ARGS, "test", nullptr); + + EXPECT_TRUE(orch_.fatal); + EXPECT_EQ(shared_orch_code(), PTO2_ERROR_INVALID_ARGS); +} + +// ---------- PTO2_ERROR_NONE: marks fatal locally, does NOT touch shared code ---------- + +TEST_F(OrchestratorFatalTest, ReportFatalWithErrorNone_DoesNotWriteSharedCode) { + orch_.report_fatal(PTO2_ERROR_NONE, "test", nullptr); + + EXPECT_TRUE(orch_.fatal); + EXPECT_EQ(shared_orch_code(), PTO2_ERROR_NONE); +} + +// ---------- PTO2_ERROR_NONE first does not block a real code from latching ---------- + +TEST_F(OrchestratorFatalTest, ErrorNoneFirst_RealCodeStillLatchesAfter) { + orch_.report_fatal(PTO2_ERROR_NONE, "test", nullptr); + EXPECT_TRUE(orch_.fatal); + EXPECT_EQ(shared_orch_code(), PTO2_ERROR_NONE); + + orch_.report_fatal(PTO2_ERROR_SCOPE_DEADLOCK, "test", nullptr); + EXPECT_EQ(shared_orch_code(), PTO2_ERROR_SCOPE_DEADLOCK); +} + +// ---------- coverage of every defined orchestrator code ---------- + +TEST_F(OrchestratorFatalTest, EveryOrchCode_LatchesIntoSharedMemory) { + const int32_t codes[] = { + PTO2_ERROR_SCOPE_DEADLOCK, + PTO2_ERROR_HEAP_RING_DEADLOCK, + PTO2_ERROR_FLOW_CONTROL_DEADLOCK, + PTO2_ERROR_DEP_POOL_OVERFLOW, + PTO2_ERROR_INVALID_ARGS, + PTO2_ERROR_DEPENDENCY_OVERFLOW, + PTO2_ERROR_REQUIRE_SYNC_START_INVALID, + PTO2_ERROR_TENSOR_WAIT_TIMEOUT, + PTO2_ERROR_EXPLICIT_ORCH_FATAL, + }; + for (int32_t code : codes) { + // Reset latches between iterations. Direct field access is unavoidable here + // since there is no public reset API for the orchestrator fatal state. 
+        sm_->header->orch_error_code.store(PTO2_ERROR_NONE, std::memory_order_release);
+        orch_.fatal = false;
+
+        orch_.report_fatal(code, "test", "code=%d", code);
+
+        SCOPED_TRACE(testing::Message() << "code=" << code);
+        EXPECT_TRUE(orch_.fatal);
+        EXPECT_EQ(shared_orch_code(), code);
+    }
+}
+
+// ---------- format-string variants must not crash ----------
+
+TEST_F(OrchestratorFatalTest, NullFmt_DoesNotCrash) {
+    orch_.report_fatal(PTO2_ERROR_INVALID_ARGS, "func", nullptr);
+    EXPECT_TRUE(orch_.fatal);
+    EXPECT_EQ(shared_orch_code(), PTO2_ERROR_INVALID_ARGS);
+}
+
+TEST_F(OrchestratorFatalTest, EmptyFmt_DoesNotCrash) {
+    orch_.report_fatal(PTO2_ERROR_INVALID_ARGS, "func", "");
+    EXPECT_TRUE(orch_.fatal);
+    EXPECT_EQ(shared_orch_code(), PTO2_ERROR_INVALID_ARGS);
+}
+
+TEST_F(OrchestratorFatalTest, FmtWithVarArgs_DoesNotCrash) {
+    orch_.report_fatal(
+        PTO2_ERROR_TENSOR_WAIT_TIMEOUT, "func", "tensor=%p slot=%d msg=%s", reinterpret_cast<void *>(0xdeadbeef), 17,
+        "boom"
+    );
+    EXPECT_TRUE(orch_.fatal);
+    EXPECT_EQ(shared_orch_code(), PTO2_ERROR_TENSOR_WAIT_TIMEOUT);
+}
+
+// ---------- end-to-end: status helper sees latched code ----------
+
+TEST_F(OrchestratorFatalTest, StatusHelperReadsLatchedOrchCode) {
+    orch_.report_fatal(PTO2_ERROR_FLOW_CONTROL_DEADLOCK, "func", nullptr);
+
+    int32_t orch_code = shared_orch_code();
+    int32_t sched_code = sm_->header->sched_error_code.load(std::memory_order_acquire);
+    EXPECT_EQ(runtime_status_from_error_codes(orch_code, sched_code), -PTO2_ERROR_FLOW_CONTROL_DEADLOCK);
+}
diff --git a/tests/ut/cpp/a5/test_orchestrator_submit.cpp b/tests/ut/cpp/a5/test_orchestrator_submit.cpp
new file mode 100644
index 000000000..c01635f57
--- /dev/null
+++ b/tests/ut/cpp/a5/test_orchestrator_submit.cpp
@@ -0,0 +1,273 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * Orchestrator submit-path UT.
+ *
+ * Covers pto2_submit_mixed_task, pto2_alloc_tensors, pto2_orchestrator_done,
+ * and pto2_orchestrator_set_scheduler on a fully initialized
+ * TensorMap-and-ring-buffer (TMR) system.
+ *
+ * Follows AAA and FIRST: each TEST_F builds a fresh TMR system, exercises
+ * one behavior, and tears the system down in TearDown().
+ */
+
+#include <gtest/gtest.h>
+
+#include <atomic>
+#include <cstdint>
+#include <cstdlib>
+
+#include "pto_orchestration_api.h"  // make_tensor_external, TensorCreateInfo ctor
+#include "pto_orchestrator.h"
+#include "pto_ring_buffer.h"
+#include "scheduler/pto_scheduler.h"
+#include "pto_shared_memory.h"
+#include "pto_submit_types.h"
+#include "pto_tensormap.h"
+#include "tensor.h"
+
+namespace {
+
+constexpr uint64_t kHeapSize = 64 * 1024;
+constexpr int32_t kWindowSize = 64;
+constexpr int32_t kDepPool = 256;
+
+// -----------------------------------------------------------------------------
+// Fixture: minimal TMR system for orchestrator-level tests.
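+// It shares one shared-memory block between scheduler and orchestrator and
+// connects the two in SetUp(), so submit paths see a fully wired system.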
+// -----------------------------------------------------------------------------
+class OrchestratorSubmitTest : public ::testing::Test {
+protected:
+    PTO2SharedMemoryHandle *sm_ = nullptr;
+    PTO2SchedulerState sched_{};
+    PTO2OrchestratorState orch_{};
+    uint8_t *gm_heap_ = nullptr;
+    bool sched_ok_ = false;
+    bool orch_ok_ = false;
+
+    void SetUp() override {
+        sm_ = PTO2SharedMemoryHandle::create(kWindowSize, kHeapSize);
+        ASSERT_NE(sm_, nullptr);
+
+        gm_heap_ = static_cast<uint8_t *>(std::calloc(PTO2_MAX_RING_DEPTH, kHeapSize));
+        ASSERT_NE(gm_heap_, nullptr);
+
+        sched_ok_ = sched_.init(sm_->header, kDepPool);
+        ASSERT_TRUE(sched_ok_);
+
+        orch_ok_ = orch_.init(sm_->header, gm_heap_, kHeapSize, kDepPool);
+        ASSERT_TRUE(orch_ok_);
+
+        orch_.set_scheduler(&sched_);
+    }
+
+    void TearDown() override {
+        if (orch_ok_) orch_.destroy();
+        if (sched_ok_) sched_.destroy();
+        if (gm_heap_) std::free(gm_heap_);
+        if (sm_) sm_->destroy();
+    }
+
+    // Helper: build a minimal TensorCreateInfo owning one FP32 scalar output.
+    static TensorCreateInfo make_scalar_ci() {
+        static const uint32_t kShape[1] = {1};
+        return TensorCreateInfo(kShape, 1, DataType::FLOAT32);
+    }
+
+    bool has_orch_error() const {
+        return sm_->header->orch_error_code.load(std::memory_order_acquire) != PTO2_ERROR_NONE;
+    }
+};
+
+}  // namespace
+
+// ---------- set_scheduler ----------
+
+TEST_F(OrchestratorSubmitTest, SetScheduler_StoresPointer) {
+    PTO2SchedulerState other{};
+    orch_.set_scheduler(&other);
+    // Direct field read: no public getter exists for the scheduler pointer.
+    EXPECT_EQ(orch_.scheduler, &other);
+
+    // Restore for TearDown.
+    orch_.set_scheduler(&sched_);
+}
+
+// ---------- alloc_tensors: argument validation ----------
+
+TEST_F(OrchestratorSubmitTest, AllocTensors_EmptyArgs_MarksFatal) {
+    Arg args;  // no tensors, no scalars
+
+    TaskOutputTensors result = orch_.alloc_tensors(args);
+
+    EXPECT_TRUE(result.empty());
+    EXPECT_TRUE(has_orch_error());
+}
+
+TEST_F(OrchestratorSubmitTest, AllocTensors_WithScalars_MarksFatal) {
+    TensorCreateInfo ci = make_scalar_ci();
+    Arg args;
+    args.add_output(ci);
+    args.add_scalar(uint64_t{42});
+
+    TaskOutputTensors result = orch_.alloc_tensors(args);
+
+    EXPECT_TRUE(result.empty());
+    EXPECT_TRUE(has_orch_error());
+}
+
+TEST_F(OrchestratorSubmitTest, AllocTensors_InputArg_MarksFatal) {
+    // alloc_tensors only accepts OUTPUT TensorCreateInfo args.
+    uint32_t shape[1] = {1};
+    Tensor input = make_tensor_external(reinterpret_cast<void *>(0x1000), shape, 1);
+    Arg args;
+    args.add_input(input);
+
+    TaskOutputTensors result = orch_.alloc_tensors(args);
+
+    EXPECT_TRUE(result.empty());
+    EXPECT_TRUE(has_orch_error());
+}
+
+TEST_F(OrchestratorSubmitTest, AllocTensors_OutputOnly_ReturnsMaterializedTensors) {
+    // Arrange: two output CIs, inside an active scope.
+    TensorCreateInfo ci1 = make_scalar_ci();
+    TensorCreateInfo ci2 = make_scalar_ci();
+    Arg args;
+    args.add_output(ci1, ci2);
+
+    // Act
+    orch_.begin_scope();
+    TaskOutputTensors result = orch_.alloc_tensors(args);
+    orch_.end_scope();
+
+    // Assert
+    EXPECT_FALSE(has_orch_error());
+    EXPECT_EQ(result.size(), 2U);
+}
+
+TEST_F(OrchestratorSubmitTest, AllocTensors_AfterFatal_ReturnsEmpty) {
+    // Arrange: force fatal.
+    orch_.report_fatal(PTO2_ERROR_EXPLICIT_ORCH_FATAL, "UT", nullptr);
+    ASSERT_TRUE(has_orch_error());
+
+    TensorCreateInfo ci = make_scalar_ci();
+    Arg args;
+    args.add_output(ci);
+
+    // Act
+    TaskOutputTensors result = orch_.alloc_tensors(args);
+
+    // Assert
+    EXPECT_TRUE(result.empty());
+}
+
+// ---------- submit_mixed_task ----------
+
+TEST_F(OrchestratorSubmitTest, SubmitMixedTask_AfterFatal_ReturnsEmpty) {
+    // Arrange: pre-fatal state
+    orch_.report_fatal(PTO2_ERROR_EXPLICIT_ORCH_FATAL, "UT", nullptr);
+
+    MixedKernels mixed;
+    mixed.aic_kernel_id = 0;
+    Arg args;
+
+    // Act
+    TaskOutputTensors result = orch_.submit_task(mixed, args);
+
+    // Assert
+    EXPECT_TRUE(result.empty());
+}
+
+TEST_F(OrchestratorSubmitTest, SubmitMixedTask_ArgWithError_MarksFatalInvalidArgs) {
+    // Arrange: craft an Arg with has_error set.
+    // Calling add_input after add_scalar triggers the ordering error path.
+    uint32_t shape[1] = {1};
+    Tensor t = make_tensor_external(reinterpret_cast<void *>(0x1000), shape, 1);
+    Arg args;
+    args.add_scalar(uint64_t{1});
+    args.add_input(t);  // illegal ordering -> has_error = true
+    ASSERT_TRUE(args.has_error);
+
+    MixedKernels mixed;
+    mixed.aic_kernel_id = 0;
+
+    // Act
+    orch_.begin_scope();
+    TaskOutputTensors result = orch_.submit_task(mixed, args);
+    orch_.end_scope();
+
+    // Assert
+    EXPECT_TRUE(result.empty());
+    EXPECT_TRUE(has_orch_error());
+}
+
+TEST_F(OrchestratorSubmitTest, SubmitMixedTask_PureInputOnly_Succeeds) {
+    // Arrange: one input tensor, one AIC kernel, within a scope.
+    uint32_t shape[1] = {1};
+    Tensor input = make_tensor_external(reinterpret_cast<void *>(0x2000), shape, 1);
+
+    Arg args;
+    args.add_input(input);
+    ASSERT_FALSE(args.has_error);
+
+    MixedKernels mixed;
+    mixed.aic_kernel_id = 7;  // any non-invalid id
+
+    // Act
+    orch_.begin_scope();
+    TaskOutputTensors result = orch_.submit_task(mixed, args);
+    orch_.end_scope();
+
+    // Assert: submit returns (no outputs), and no fatal state was set.
+    EXPECT_TRUE(result.empty());
+    EXPECT_FALSE(has_orch_error());
+}
+
+TEST_F(OrchestratorSubmitTest, SubmitMixedTask_OutputTensor_MaterializesResult) {
+    // Arrange: one OUTPUT TensorCreateInfo -> task produces one tensor.
+    TensorCreateInfo ci = make_scalar_ci();
+    Arg args;
+    args.add_output(ci);
+
+    MixedKernels mixed;
+    mixed.aic_kernel_id = 1;
+
+    // Act
+    orch_.begin_scope();
+    TaskOutputTensors result = orch_.submit_task(mixed, args);
+    orch_.end_scope();
+
+    // Assert
+    EXPECT_FALSE(has_orch_error());
+    EXPECT_EQ(result.size(), 1U);
+}
+
+// ---------- orchestrator_done ----------
+
+TEST_F(OrchestratorSubmitTest, OrchestratorDone_SetsSharedMemoryFlag) {
+    // Arrange
+    ASSERT_EQ(sm_->header->orchestrator_done.load(), 0);
+
+    // Act
+    orch_.mark_done();
+
+    // Assert
+    EXPECT_EQ(sm_->header->orchestrator_done.load(std::memory_order_acquire), 1);
+}
+
+TEST_F(OrchestratorSubmitTest, OrchestratorDone_IsIdempotent) {
+    orch_.mark_done();
+    orch_.mark_done();
+
+    // Flag stays 1 -- store is release-set, not increment.
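+    // A repeated mark_done() therefore can never bump the flag past 1.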
+    EXPECT_EQ(sm_->header->orchestrator_done.load(std::memory_order_acquire), 1);
+}
diff --git a/tests/ut/cpp/a5/test_ready_queue.cpp b/tests/ut/cpp/a5/test_ready_queue.cpp
index 9dea3ae94..bb2a6a101 100644
--- a/tests/ut/cpp/a5/test_ready_queue.cpp
+++ b/tests/ut/cpp/a5/test_ready_queue.cpp
@@ -371,9 +371,10 @@ TEST_P(ReadyQueueMPMCTest, NoDuplicateNoLoss) {
 INSTANTIATE_TEST_SUITE_P(
     MPMCVariants, ReadyQueueMPMCTest,
     ::testing::Values(
-        MPMCConfig{2, 2, 200},   // TwoProducersTwoConsumers
-        MPMCConfig{1, 4, 500},   // OneProducerNConsumers
-        MPMCConfig{4, 4, 1250}   // HighContentionStress
+        MPMCConfig{2, 2, 200},   // TwoProducersTwoConsumers
+        MPMCConfig{1, 4, 500},   // OneProducerNConsumers
+        MPMCConfig{4, 4, 1250},  // HighContentionStress
+        MPMCConfig{8, 8, 2000}   // EightProducersEightConsumers
     )
 );
 
@@ -444,3 +445,118 @@ TEST_F(LocalReadyBufferTest, NullBackingBuffer) {
     EXPECT_FALSE(buf.try_push(&item)) << "Push fails with null backing";
     EXPECT_EQ(buf.pop(), nullptr) << "Pop returns null with null backing";
 }
+
+// =============================================================================
+// High-contention stress tests
+// =============================================================================
+
+class ReadyQueueStressTest : public ::testing::Test {
+protected:
+    static constexpr uint64_t kCapacity = 512;
+    PTO2ReadyQueue queue;
+
+    void SetUp() override { ASSERT_TRUE(ready_queue_init(&queue, kCapacity)); }
+    void TearDown() override { ready_queue_destroy(&queue); }
+};
+
+TEST_F(ReadyQueueStressTest, RapidFillDrainCycles) {
+    constexpr int kCycles = 100;
+    constexpr int kItemsPerCycle = static_cast<int>(kCapacity / 2);
+
+    std::vector<PTO2TaskSlotState> items(kItemsPerCycle);
+    for (int i = 0; i < kItemsPerCycle; i++) {
+        items[i].fanin_count = i;
+    }
+
+    for (int cycle = 0; cycle < kCycles; cycle++) {
+        std::atomic<int> push_done{0};
+        std::atomic<int> popped{0};
+
+        auto producer = [&](int id) {
+            int per_thread = kItemsPerCycle / 4;
+            int base = id * per_thread;
+            for (int i = 0; i < per_thread; i++) {
+                while (!queue.push(&items[base + i])) {}
+            }
+            push_done.fetch_add(1, std::memory_order_release);
+        };
+
+        auto consumer = [&]() {
+            while (true) {
+                PTO2TaskSlotState *s = queue.pop();
+                if (s) {
+                    popped.fetch_add(1, std::memory_order_relaxed);
+                } else if (push_done.load(std::memory_order_acquire) == 4) {
+                    while ((s = queue.pop()) != nullptr) {
+                        popped.fetch_add(1, std::memory_order_relaxed);
+                    }
+                    break;
+                }
+            }
+        };
+
+        std::vector<std::thread> threads;
+        for (int i = 0; i < 4; i++)
+            threads.emplace_back(producer, i);
+        for (int i = 0; i < 4; i++)
+            threads.emplace_back(consumer);
+        for (auto &t : threads)
+            t.join();
+
+        ASSERT_EQ(popped.load(), kItemsPerCycle) << "Cycle " << cycle << ": lost items";
+    }
+}
+
+TEST_F(ReadyQueueStressTest, PopBatchUnderContention) {
+    constexpr int kBatchSize = 8;
+    constexpr int kBatches = 500;
+    constexpr int kProducers = 4;
+    constexpr int kTotalItems = kBatchSize * kBatches * kProducers;
+
+    std::vector<PTO2TaskSlotState> items(kTotalItems);
+    for (int i = 0; i < kTotalItems; i++)
+        items[i].fanin_count = i;
+
+    std::atomic<int> total_consumed{0};
+    std::atomic<int> producers_done{0};
+
+    auto producer = [&](int id) {
+        int base = id * kBatchSize * kBatches;
+        for (int b = 0; b < kBatches; b++) {
+            PTO2TaskSlotState *ptrs[kBatchSize];
+            for (int i = 0; i < kBatchSize; i++) {
+                ptrs[i] = &items[base + b * kBatchSize + i];
+            }
+            for (int i = 0; i < kBatchSize; i++) {
+                while (!queue.push(ptrs[i])) {}
+            }
+        }
+        producers_done.fetch_add(1, std::memory_order_release);
+    };
+
+    auto consumer = [&]() {
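+        // Consumers spin-pop; once every producer has signaled done, drain the remainder and exit.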
+        while (true) {
+            PTO2TaskSlotState *out[kBatchSize];
+            int n = queue.pop_batch(out, kBatchSize);
+            total_consumed.fetch_add(n, std::memory_order_relaxed);
+            if (n == 0 && producers_done.load(std::memory_order_acquire) == kProducers) {
+                while (true) {
+                    n = queue.pop_batch(out, kBatchSize);
+                    if (n == 0) break;
+                    total_consumed.fetch_add(n, std::memory_order_relaxed);
+                }
+                break;
+            }
+        }
+    };
+
+    std::vector<std::thread> threads;
+    for (int i = 0; i < kProducers; i++)
+        threads.emplace_back(producer, i);
+    for (int i = 0; i < 4; i++)
+        threads.emplace_back(consumer);
+    for (auto &t : threads)
+        t.join();
+
+    EXPECT_EQ(total_consumed.load(), kTotalItems);
+}
diff --git a/tests/ut/cpp/a5/test_runtime_coupling.cpp b/tests/ut/cpp/a5/test_runtime_coupling.cpp
new file mode 100644
index 000000000..927a7b0fc
--- /dev/null
+++ b/tests/ut/cpp/a5/test_runtime_coupling.cpp
@@ -0,0 +1,283 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Runtime behavior tests for the combined TMR components.
+ *
+ * These tests keep the assertions on observable component behavior: lifecycle,
+ * scheduler state transitions, TensorMap lookup validity, and shared-memory
+ * coordination. Structural coupling checks belong in design review rather than
+ * in unit tests.
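+ * Most tests stand the whole stack up through the TMRSystem helper below and
+ * assert only on state reachable through the shared-memory header.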
+ */
+
+#include <gtest/gtest.h>
+
+#include <atomic>
+#include <cstdint>
+#include <cstdlib>
+#include <vector>
+
+#include "pto_orchestration_api.h"
+#include "pto_orchestrator.h"
+#include "pto_ring_buffer.h"
+#include "pto_runtime2_types.h"
+#include "pto_shared_memory.h"
+#include "pto_tensormap.h"
+#include "scheduler/pto_scheduler.h"
+
+namespace {
+
+constexpr uint64_t kHeapSize = 64 * 1024;
+constexpr int32_t kWindowSize = 64;
+constexpr int32_t kDepPoolSize = 256;
+
+struct TestLookupResult {
+    struct Entry {
+        PTO2TensorMapEntry *entry;
+        OverlapStatus overlap_status;
+    };
+    std::vector<Entry> entries;
+    int count = 0;
+};
+
+void run_lookup(PTO2TensorMap &tmap, const Tensor &tensor, TestLookupResult &out) {
+    tmap.lookup(tensor, [&](PTO2TensorMapEntry &entry, OverlapStatus status) -> bool {
+        out.entries.push_back({&entry, status});
+        out.count++;
+        return true;
+    });
+}
+
+Tensor make_tensor(uint64_t addr, uint32_t shape0 = 100, int32_t version = 0) {
+    uint32_t shapes[RUNTIME_MAX_TENSOR_DIMS] = {shape0};
+    return make_tensor_external(reinterpret_cast<void *>(addr), shapes, 1, DataType::FLOAT32, false, version);
+}
+
+struct TMRSystem {
+    PTO2SharedMemoryHandle *sm = nullptr;
+    PTO2SchedulerState sched{};
+    PTO2OrchestratorState orch{};
+    uint8_t *gm_heap = nullptr;
+    bool sched_ok = false;
+    bool orch_ok = false;
+
+    bool Init(uint64_t heap_size = kHeapSize, int32_t window_size = kWindowSize) {
+        sm = PTO2SharedMemoryHandle::create(window_size, heap_size);
+        if (sm == nullptr) return false;
+
+        gm_heap = static_cast<uint8_t *>(std::calloc(PTO2_MAX_RING_DEPTH, heap_size));
+        if (gm_heap == nullptr) return false;
+
+        sched_ok = sched.init(sm->header, kDepPoolSize);
+        if (!sched_ok) return false;
+
+        orch_ok = orch.init(sm->header, gm_heap, heap_size, kDepPoolSize);
+        if (!orch_ok) return false;
+
+        orch.set_scheduler(&sched);
+        return true;
+    }
+
+    void Destroy() {
+        if (orch_ok) {
+            orch.destroy();
+            orch_ok = false;
+        }
+        if (sched_ok) {
+            sched.destroy();
+            sched_ok = false;
+        }
+        if (gm_heap != nullptr) {
+            std::free(gm_heap);
+            gm_heap = nullptr;
+        }
+        if (sm != nullptr) {
+            sm->destroy();
+            sm = nullptr;
+        }
+    }
+};
+
+}  // namespace
+
+TEST(RuntimeLifecycleBehavior, InitDestroyCanRepeat) {
+    for (int cycle = 0; cycle < 3; cycle++) {
+        TMRSystem sys;
+        ASSERT_TRUE(sys.Init()) << "cycle=" << cycle;
+        EXPECT_EQ(sys.sm->header->orch_error_code.load(std::memory_order_acquire), PTO2_ERROR_NONE);
+        EXPECT_EQ(sys.sm->header->sched_error_code.load(std::memory_order_acquire), PTO2_ERROR_NONE);
+        sys.Destroy();
+    }
+}
+
+TEST(RuntimeLifecycleBehavior, OrchestratorScopeWithoutSchedulerLeavesNoFatalCode) {
+    PTO2SharedMemoryHandle *sm = PTO2SharedMemoryHandle::create(kWindowSize, kHeapSize);
+    ASSERT_NE(sm, nullptr);
+    uint8_t *heap = static_cast<uint8_t *>(std::calloc(PTO2_MAX_RING_DEPTH, kHeapSize));
+    ASSERT_NE(heap, nullptr);
+
+    PTO2OrchestratorState orch{};
+    ASSERT_TRUE(orch.init(sm->header, heap, kHeapSize, kDepPoolSize));
+
+    orch.begin_scope();
+    orch.end_scope();
+
+    EXPECT_EQ(sm->header->orch_error_code.load(std::memory_order_acquire), PTO2_ERROR_NONE);
+    EXPECT_FALSE(orch.fatal);
+
+    orch.destroy();
+    std::free(heap);
+    sm->destroy();
+}
+
+TEST(RuntimeSchedulerBehavior, CompletedSlotWithSatisfiedFanoutBecomesConsumed) {
+    TMRSystem sys;
+    ASSERT_TRUE(sys.Init());
+
+    PTO2TaskDescriptor desc{};
+    PTO2TaskSlotState &slot = sys.sm->header->rings[0].get_slot_state_by_slot(0);
+    slot.task = &desc;
+    slot.ring_id = 0;
+    slot.fanout_count = 1;
+    slot.fanout_refcount.store(1, std::memory_order_relaxed);
+    slot.task_state.store(PTO2_TASK_COMPLETED, std::memory_order_relaxed);
+    sys.sm->header->rings[0].fc.current_task_index.store(1, std::memory_order_relaxed);
+
+    sys.sched.check_and_handle_consumed(slot);
+
+    EXPECT_EQ(slot.task_state.load(std::memory_order_acquire), PTO2_TASK_CONSUMED);
+
+    sys.Destroy();
+}
+
+TEST(RuntimeSchedulerBehavior, RingPointerStopsAtFirstUnconsumedTask) {
+    TMRSystem sys;
+    ASSERT_TRUE(sys.Init());
+
+    auto &ring_state = sys.sched.ring_sched_states[0];
+    PTO2TaskDescriptor descs[3]{};
+
+    PTO2TaskSlotState &slot0 = sys.sm->header->rings[0].get_slot_state_by_slot(0);
+    slot0.task = &descs[0];
+    slot0.task_state.store(PTO2_TASK_CONSUMED, std::memory_order_relaxed);
+
+    PTO2TaskSlotState &slot1 = sys.sm->header->rings[0].get_slot_state_by_slot(1);
+    slot1.task = &descs[1];
+    slot1.task_state.store(PTO2_TASK_COMPLETED, std::memory_order_relaxed);
+
+    PTO2TaskSlotState &slot2 = sys.sm->header->rings[0].get_slot_state_by_slot(2);
+    slot2.task = &descs[2];
+    slot2.task_state.store(PTO2_TASK_CONSUMED, std::memory_order_relaxed);
+
+    sys.sm->header->rings[0].fc.current_task_index.store(3, std::memory_order_relaxed);
+
+    ring_state.advance_ring_pointers();
+
+    EXPECT_EQ(ring_state.last_task_alive, 1);
+    EXPECT_EQ(sys.sm->header->rings[0].fc.last_task_alive.load(std::memory_order_acquire), static_cast<int32_t>(1));
+
+    sys.Destroy();
+}
+
+TEST(RuntimeSchedulerBehavior, ReadyQueuesAcceptEveryResourceShape) {
+    TMRSystem sys;
+    ASSERT_TRUE(sys.Init());
+
+    for (int shape = 0; shape < PTO2_NUM_RESOURCE_SHAPES; shape++) {
+        PTO2TaskSlotState slot{};
+        EXPECT_TRUE(sys.sched.ready_queues[shape].push(&slot)) << "shape=" << shape;
+        EXPECT_EQ(sys.sched.ready_queues[shape].pop(), &slot) << "shape=" << shape;
+    }
+
+    sys.Destroy();
+}
+
+TEST(RuntimeTensorMapBehavior, StandaloneInsertLookupDoesNotNeedOrchestratorPointer) {
+    int32_t window_sizes[PTO2_MAX_RING_DEPTH] = {16, 16, 16, 16};
+    PTO2TensorMap tmap{};
+    ASSERT_TRUE(tmap.init(256, 1024, window_sizes));
+
+    Tensor tensor = make_tensor(0x1000);
+    tmap.insert(tensor, PTO2TaskId::make(0, 0));
+
+    TestLookupResult result;
+    run_lookup(tmap, tensor, result);
+
+    ASSERT_EQ(result.count, 1);
+    EXPECT_EQ(result.entries[0].entry->producer_task_id, PTO2TaskId::make(0, 0));
+    EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::COVERED);
+
+    tmap.destroy();
+}
+
+TEST(RuntimeTensorMapBehavior, ValiditySkipsRetiredEntries) {
+    int32_t window_sizes[PTO2_MAX_RING_DEPTH] = {256, 256, 256, 256};
+    PTO2TensorMap tmap{};
+    ASSERT_TRUE(tmap.init(256, 4096, window_sizes));
+
+    Tensor tensor = make_tensor(0x2000);
+    for (int i = 0; i < 100; i++) {
+        tmap.insert(tensor, PTO2TaskId::make(0, i));
+    }
+
+    tmap.sync_validity(0, 80);
+
+    TestLookupResult result;
+    run_lookup(tmap, tensor, result);
+
+    EXPECT_EQ(result.count, 20);
+    for (const auto &entry : result.entries) {
+        EXPECT_GE(entry.entry->producer_task_id.local(), 80u);
+        EXPECT_EQ(entry.overlap_status, OverlapStatus::COVERED);
+    }
+
+    tmap.destroy();
+}
+
+TEST(RuntimeTensorMapBehavior, AllRingsCanProduceForSameTensor) {
+    int32_t window_sizes[PTO2_MAX_RING_DEPTH];
+    for (int i = 0; i < PTO2_MAX_RING_DEPTH; i++) {
+        window_sizes[i] = kWindowSize;
+    }
+    PTO2TensorMap tmap{};
+    ASSERT_TRUE(tmap.init(256, 1024, window_sizes));
+
+    Tensor tensor = make_tensor(0x3000);
+    for (int ring = 0; ring < PTO2_MAX_RING_DEPTH; ring++) {
+        tmap.insert(tensor, PTO2TaskId::make(ring, 0));
+    }
+
+    TestLookupResult result;
+    run_lookup(tmap, tensor, result);
+
+    EXPECT_EQ(result.count, PTO2_MAX_RING_DEPTH);
+
+    tmap.destroy();
+}
+
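+// End-to-end check: drive the orchestrator's own TensorMap member inside a
+// scope and confirm the lookup sees the inserted producer.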
+TEST(RuntimeIntegrationBehavior, OrchestratorTensorMapUsesConfiguredWindow) {
+    TMRSystem sys;
+    ASSERT_TRUE(sys.Init());
+
+    Tensor tensor = make_tensor(0x4000);
+    sys.orch.begin_scope();
+    sys.orch.tensor_map.insert(tensor, PTO2TaskId::make(0, 0));
+
+    TestLookupResult result;
+    run_lookup(sys.orch.tensor_map, tensor, result);
+
+    sys.orch.end_scope();
+
+    ASSERT_EQ(result.count, 1);
+    EXPECT_EQ(result.entries[0].entry->producer_task_id, PTO2TaskId::make(0, 0));
+    EXPECT_EQ(sys.sm->header->orch_error_code.load(std::memory_order_acquire), PTO2_ERROR_NONE);
+
+    sys.Destroy();
+}
diff --git a/tests/ut/cpp/a5/test_runtime_lifecycle.cpp b/tests/ut/cpp/a5/test_runtime_lifecycle.cpp
new file mode 100644
index 000000000..b1987ed9f
--- /dev/null
+++ b/tests/ut/cpp/a5/test_runtime_lifecycle.cpp
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * PTO2 Runtime lifecycle UT.
+ *
+ * Covers runtime_create / _custom / _from_sm / _destroy / set_mode.
+ *
+ * Follows AAA and FIRST: no shared mutable state between tests, each test
+ * constructs its own runtime and tears it down.
+ */
+
+#include <gtest/gtest.h>
+
+#include <cstdlib>
+
+#include "pto_runtime2.h"
+#include "pto_shared_memory.h"
+
+namespace {
+
+constexpr uint64_t kSmallWindow = 64;
+constexpr uint64_t kSmallHeap = 64 * 1024;
+
+// -----------------------------------------------------------------------------
+// Fixture: each test gets a fresh, isolated runtime config.
+// -----------------------------------------------------------------------------
+class RuntimeLifecycleTest : public ::testing::Test {
+protected:
+    PTO2Runtime *rt_ = nullptr;
+
+    void TearDown() override {
+        if (rt_ != nullptr) {
+            runtime_destroy(rt_);
+            rt_ = nullptr;
+        }
+    }
+};
+
+}  // namespace
+
+// ---------- Happy-path creation ----------
+
+TEST_F(RuntimeLifecycleTest, CreateCustom_ValidSizes_ReturnsInitializedRuntime) {
+    // Arrange + Act
+    rt_ = runtime_create_custom(PTO2_MODE_SIMULATE, kSmallWindow, kSmallHeap);
+
+    // Assert
+    ASSERT_NE(rt_, nullptr);
+    EXPECT_NE(rt_->ops, nullptr);
+    EXPECT_NE(rt_->sm_handle, nullptr);
+    EXPECT_NE(rt_->gm_heap, nullptr);
+    EXPECT_TRUE(rt_->gm_heap_owned);
+    EXPECT_EQ(rt_->mode, PTO2_MODE_SIMULATE);
+    EXPECT_EQ(rt_->gm_heap_size, kSmallHeap * PTO2_MAX_RING_DEPTH);
+}
+
+TEST_F(RuntimeLifecycleTest, CreateCustom_ConnectsOrchestratorToScheduler) {
+    rt_ = runtime_create_custom(PTO2_MODE_EXECUTE, kSmallWindow, kSmallHeap);
+
+    ASSERT_NE(rt_, nullptr);
+    // Whatever the mode, the created runtime must wire the orchestrator to its own scheduler.
+    EXPECT_EQ(rt_->orchestrator.scheduler, &rt_->scheduler);
+}
+
+TEST_F(RuntimeLifecycleTest, CreateDefault_UsesDefaultSizes) {
+    // create() is a thin wrapper around create_custom with PTO2_TASK_WINDOW_SIZE / PTO2_HEAP_SIZE.
+    // Use GRAPH_ONLY to avoid executor threads.
+    // We don't allocate the full 256MB heap in this path -- keep the assertion restricted to mode.
+    rt_ = runtime_create(PTO2_MODE_GRAPH_ONLY);
+    ASSERT_NE(rt_, nullptr);
+    EXPECT_EQ(rt_->mode, PTO2_MODE_GRAPH_ONLY);
+}
+
+// ---------- From-SM creation ----------
+
+TEST_F(RuntimeLifecycleTest, CreateFromSM_NullHandle_ReturnsNull) {
+    // Act
+    PTO2Runtime *rt = runtime_create_from_sm(PTO2_MODE_SIMULATE, nullptr, nullptr, 0);
+
+    // Assert
+    EXPECT_EQ(rt, nullptr);
+}
+
+TEST_F(RuntimeLifecycleTest, CreateFromSM_RecordsCallerBuffers) {
+    // Arrange: caller-allocated sm + gm_heap.
+    PTO2SharedMemoryHandle *sm = PTO2SharedMemoryHandle::create(kSmallWindow, kSmallHeap);
+    ASSERT_NE(sm, nullptr);
+    uint8_t *heap = static_cast<uint8_t *>(std::calloc(PTO2_MAX_RING_DEPTH, kSmallHeap));
+    ASSERT_NE(heap, nullptr);
+
+    // Act
+    rt_ = runtime_create_from_sm(PTO2_MODE_EXECUTE, sm, heap, kSmallHeap);
+
+    // Assert: the returned runtime must NOT claim ownership of the gm_heap.
+    ASSERT_NE(rt_, nullptr);
+    EXPECT_EQ(rt_->sm_handle, sm);
+    EXPECT_EQ(rt_->gm_heap, heap);
+    EXPECT_FALSE(rt_->gm_heap_owned);
+
+    // Cleanup: runtime_destroy consumes sm via pto2_sm_destroy (observed
+    // behavior, see pto_runtime2.cpp:339), so only free the gm_heap here.
+    runtime_destroy(rt_);
+    rt_ = nullptr;
+    std::free(heap);
+}
+
+// ---------- Destroy ----------
+
+TEST_F(RuntimeLifecycleTest, Destroy_NullRuntime_NoCrash) {
+    // Documented contract: destroy(nullptr) is a no-op.
+    runtime_destroy(nullptr);
+    SUCCEED();
+}
+
+TEST_F(RuntimeLifecycleTest, Destroy_ReleasesOwnedHeap) {
+    rt_ = runtime_create_custom(PTO2_MODE_SIMULATE, kSmallWindow, kSmallHeap);
+    ASSERT_NE(rt_, nullptr);
+    // Act: explicitly destroy and null out so TearDown doesn't double-free.
+    runtime_destroy(rt_);
+    rt_ = nullptr;
+    // Assert: reaching here without asan/ubsan complaint is the test (leak-free).
+    SUCCEED();
+}
+
+// ---------- set_mode ----------
+
+TEST_F(RuntimeLifecycleTest, SetMode_UpdatesField) {
+    rt_ = runtime_create_custom(PTO2_MODE_EXECUTE, kSmallWindow, kSmallHeap);
+    ASSERT_NE(rt_, nullptr);
+    ASSERT_EQ(rt_->mode, PTO2_MODE_EXECUTE);
+
+    // Act
+    runtime_set_mode(rt_, PTO2_MODE_GRAPH_ONLY);
+
+    // Assert
+    EXPECT_EQ(rt_->mode, PTO2_MODE_GRAPH_ONLY);
+}
+
+TEST_F(RuntimeLifecycleTest, SetMode_NullRuntime_NoCrash) {
+    // Contract: defensive null check, mirrors destroy.
+    runtime_set_mode(nullptr, PTO2_MODE_SIMULATE);
+    SUCCEED();
+}
+
+// ---------- Ops table wiring ----------
+
+TEST_F(RuntimeLifecycleTest, OpsTable_AllFunctionPointersPopulated) {
+    rt_ = runtime_create_custom(PTO2_MODE_SIMULATE, kSmallWindow, kSmallHeap);
+    ASSERT_NE(rt_, nullptr);
+    const PTO2RuntimeOps *ops = rt_->ops;
+    ASSERT_NE(ops, nullptr);
+
+    // Hot-path ops called by the orchestration .so -- must never be null.
+    EXPECT_NE(ops->submit_task, nullptr);
+    EXPECT_NE(ops->alloc_tensors, nullptr);
+    EXPECT_NE(ops->scope_begin, nullptr);
+    EXPECT_NE(ops->scope_end, nullptr);
+    EXPECT_NE(ops->orchestration_done, nullptr);
+    EXPECT_NE(ops->is_fatal, nullptr);
+    EXPECT_NE(ops->report_fatal, nullptr);
+    EXPECT_NE(ops->get_tensor_data, nullptr);
+    EXPECT_NE(ops->set_tensor_data, nullptr);
+}
+
+TEST_F(RuntimeLifecycleTest, IsFatal_FreshRuntime_ReturnsFalse) {
+    rt_ = runtime_create_custom(PTO2_MODE_SIMULATE, kSmallWindow, kSmallHeap);
+    ASSERT_NE(rt_, nullptr);
+    EXPECT_FALSE(rt_->ops->is_fatal(rt_));
+}
+
+TEST_F(RuntimeLifecycleTest, ReportFatal_SetsFatalFlag) {
+    rt_ = runtime_create_custom(PTO2_MODE_SIMULATE, kSmallWindow, kSmallHeap);
+    ASSERT_NE(rt_, nullptr);
+
+    // Act
+    rt_->ops->report_fatal(rt_, PTO2_ERROR_EXPLICIT_ORCH_FATAL, "UT", "%s", "forced");
+
+    // Assert
+    EXPECT_TRUE(rt_->ops->is_fatal(rt_));
+}
diff --git a/tests/ut/cpp/a5/test_shared_memory.cpp b/tests/ut/cpp/a5/test_shared_memory.cpp
index c45b4c8e4..cd9b4d526 100644
--- a/tests/ut/cpp/a5/test_shared_memory.cpp
+++ b/tests/ut/cpp/a5/test_shared_memory.cpp
@@ -16,13 +16,13 @@
  *
  * Design contracts:
  *
- * - PTO2SharedMemoryHandle::validate checks `top > heap_size`. top == heap_size is a
+ * - validate() checks `top > heap_size`. top == heap_size is a
  *   legitimate "filled exactly to end" state, so strict > is correct.
  *
- * - Zero window size: if PTO2SharedMemoryHandle::calculate_size() is called with 0, all ring
+ * - Zero window size: if calculate_size() is called with 0, all ring
  *   descriptors/payloads alias the same address. Current entry path
- *   (PTO2SharedMemoryHandle::create) is called only with valid sizes, but there is no
- *   explicit guard. PTO2SharedMemoryHandle::create should reject task_window_size==0.
+ *   (create) is called only with valid sizes, but there is no
+ *   explicit guard. create should reject task_window_size==0.
  *
  * - Flow control heap_top validation: validate() does not verify
  *   heap_top <= heap_size. After a corruption, heap_top could exceed
@@ -31,6 +31,8 @@
 #include <gtest/gtest.h>
 #include <cstring>
+#include <thread>
+#include <vector>
 #include "pto_shared_memory.h"
 
 // =============================================================================
@@ -182,13 +184,163 @@ TEST(SharedMemoryBoundary, ValidateDetectsCorruption) {
     h->destroy();
 }
 
-TEST(SharedMemoryBoundary, ValidateNullHandle) {
-    PTO2SharedMemoryHandle handle{};
-    EXPECT_FALSE(handle.validate());
-}
-
 TEST(SharedMemoryBoundary, CreateFromUndersizedBuffer) {
     char buf[64]{};
     PTO2SharedMemoryHandle *h = PTO2SharedMemoryHandle::create_from_buffer(buf, 64, 256, 4096);
     EXPECT_EQ(h, nullptr) << "Undersized buffer should fail";
 }
+
+// =============================================================================
+// Concurrent read/write of per-ring flow control
+// =============================================================================
+
+class SharedMemoryConcurrentTest : public ::testing::Test {
+protected:
+    PTO2SharedMemoryHandle *handle = nullptr;
+
+    void SetUp() override {
+        handle = PTO2SharedMemoryHandle::create(256, 4096);
+        ASSERT_NE(handle, nullptr);
+    }
+
+    void TearDown() override {
+        if (handle) {
+            handle->destroy();
+            handle = nullptr;
+        }
+    }
+};
+
+TEST_F(SharedMemoryConcurrentTest, PerRingTaskIndexIsolation) {
+    constexpr int kIterations = 10000;
+
+    auto writer = [&](int ring) {
+        auto &fc = handle->header->rings[ring].fc;
+        int32_t base = ring * 100000;
+        for (int i = 1; i <= kIterations; i++) {
+            fc.current_task_index.store(base + i, std::memory_order_release);
+        }
+    };
+
+    struct Observation {
+        bool went_backward = false;
+        bool saw_other_ring_range = false;
+    };
+
+    auto reader = [&](int ring, Observation *obs) {
+        auto &fc = handle->header->rings[ring].fc;
+        int32_t base = ring * 100000;
+        int32_t prev = 0;
+        for (int i = 0; i < kIterations; i++) {
+            int32_t val = fc.current_task_index.load(std::memory_order_acquire);
+            if (val < prev) {
+                obs->went_backward = true;
+            }
+            if (val != 0 && (val <= base || val > base + kIterations)) {
+                obs->saw_other_ring_range = true;
+            }
+            prev = val;
+        }
+    };
+
+    Observation ring0;
+    Observation ring1;
+
+    std::thread w0(writer, 0);
+    std::thread w1(writer, 1);
+    std::thread r0(reader, 0, &ring0);
+    std::thread r1(reader, 1, &ring1);
+
+    w0.join();
+    w1.join();
+    r0.join();
+    r1.join();
+
+    EXPECT_FALSE(ring0.went_backward) << "Ring 0 current_task_index should be monotonic";
+    EXPECT_FALSE(ring1.went_backward) << "Ring 1 current_task_index should be monotonic";
+    EXPECT_FALSE(ring0.saw_other_ring_range) << "Ring 0 should not observe ring 1 values";
+    EXPECT_FALSE(ring1.saw_other_ring_range) << "Ring 1 should not observe ring 0 values";
+
+    EXPECT_EQ(handle->header->rings[0].fc.current_task_index.load(), static_cast<int32_t>(kIterations));
+    EXPECT_EQ(handle->header->rings[1].fc.current_task_index.load(), static_cast<int32_t>(100000 + kIterations));
+}
+
+TEST_F(SharedMemoryConcurrentTest, TaskIndexAtomicIncrement) {
+    constexpr int kIncrements = 5000;
+    constexpr int kThreads = 4;
+
+    auto &fc = handle->header->rings[0].fc;
+    fc.current_task_index.store(0, std::memory_order_relaxed);
+
+    auto incrementer = [&]() {
+        for (int i = 0; i < kIncrements; i++) {
+            fc.current_task_index.fetch_add(1, std::memory_order_acq_rel);
+        }
+    };
+
+    std::vector<std::thread> threads;
+    for (int i = 0; i < kThreads; i++) {
+        threads.emplace_back(incrementer);
+    }
+    for (auto &t : threads)
+        t.join();
+
+    EXPECT_EQ(fc.current_task_index.load(), kIncrements * kThreads) << "Concurrent increments should not lose updates";
+}
+
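+// The CAS loop in the next test mimics a monotonic publish: last_task_alive
+// may only move forward, never regress, regardless of thread interleaving.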
+TEST_F(SharedMemoryConcurrentTest, LastTaskAliveMonotonic) {
+    constexpr int kIterations = 10000;
+    constexpr int kThreads = 4;
+
+    auto &fc = handle->header->rings[0].fc;
+    fc.last_task_alive.store(0, std::memory_order_relaxed);
+
+    auto advancer = [&](int id) {
+        for (int i = 0; i < kIterations; i++) {
+            int32_t desired = id * kIterations + i + 1;
+            int32_t current = fc.last_task_alive.load(std::memory_order_acquire);
+            while (current < desired) {
+                if (fc.last_task_alive.compare_exchange_weak(
+                        current, desired, std::memory_order_acq_rel, std::memory_order_acquire
+                    )) {
+                    break;
+                }
+            }
+        }
+    };
+
+    std::vector<std::thread> threads;
+    for (int i = 0; i < kThreads; i++) {
+        threads.emplace_back(advancer, i);
+    }
+    for (auto &t : threads)
+        t.join();
+
+    int32_t final_val = fc.last_task_alive.load();
+    EXPECT_EQ(final_val, kIterations * kThreads) << "last_task_alive should advance to the largest published value";
+}
+
+TEST_F(SharedMemoryConcurrentTest, ValidateAfterConcurrentWrites) {
+    constexpr int kIterations = 1000;
+
+    auto writer = [&](int ring) {
+        auto &fc = handle->header->rings[ring].fc;
+        for (int i = 0; i < kIterations; i++) {
+            fc.current_task_index.store(static_cast<int32_t>(i % 256), std::memory_order_release);
+        }
+    };
+
+    std::thread w0(writer, 0);
+    std::thread w1(writer, 1);
+    std::thread w2(writer, 2);
+    std::thread w3(writer, 3);
+    w0.join();
+    w1.join();
+    w2.join();
+    w3.join();
+
+    EXPECT_TRUE(handle->validate()) << "Valid current_task_index values should pass validation";
+
+    handle->header->rings[2].fc.current_task_index.store(-1, std::memory_order_relaxed);
+    EXPECT_FALSE(handle->validate()) << "Corrupted current_task_index should fail validation";
+}
diff --git a/tests/ut/cpp/a5/test_task_allocator.cpp b/tests/ut/cpp/a5/test_task_allocator.cpp
index 383003900..443a2e5da 100644
--- a/tests/ut/cpp/a5/test_task_allocator.cpp
+++ b/tests/ut/cpp/a5/test_task_allocator.cpp
@@ -405,3 +405,99 @@ TEST_F(TaskAllocatorTest, TaskIdNearInt32Max) {
     EXPECT_GE(r3.slot, 0);
     EXPECT_LT(r3.slot, WINDOW_SIZE);
 }
+
+// =============================================================================
+// Re-init safety
+// =============================================================================
+
+class TaskAllocatorReinitTest : public ::testing::Test {
+protected:
+    static constexpr int32_t WINDOW_SIZE = 16;
+    static constexpr uint64_t HEAP_SIZE = 1024;
+
+    std::vector<PTO2TaskDescriptor> descriptors;
+    alignas(64) uint8_t heap_buf[1024]{};
+    std::atomic<int32_t> current_index{0};
+    std::atomic<int32_t> last_alive{0};
+    std::atomic<int32_t> error_code{PTO2_ERROR_NONE};
+    PTO2TaskAllocator allocator{};
+
+    void InitAllocator() {
+        descriptors.assign(WINDOW_SIZE, PTO2TaskDescriptor{});
+        std::memset(heap_buf, 0, sizeof(heap_buf));
+        current_index.store(0);
+        last_alive.store(0);
+        error_code.store(PTO2_ERROR_NONE);
+        allocator.init(descriptors.data(), WINDOW_SIZE, &current_index, &last_alive, heap_buf, HEAP_SIZE, &error_code);
+    }
+};
+
+TEST_F(TaskAllocatorReinitTest, ReInitAfterUse) {
+    InitAllocator();
+
+    auto r1 = allocator.alloc(128);
+    ASSERT_FALSE(r1.failed());
+    auto r2 = allocator.alloc(128);
+    ASSERT_FALSE(r2.failed());
+    EXPECT_EQ(r2.task_id, 1);
+
+    InitAllocator();
+
+    auto r3 = allocator.alloc(64);
+    ASSERT_FALSE(r3.failed());
+    EXPECT_EQ(r3.task_id, 0) << "Re-init should reset task ID counter";
+    EXPECT_EQ(r3.slot, 0);
+}
+
+TEST_F(TaskAllocatorReinitTest, ReInitDifferentHeapSize) {
+    InitAllocator();
+
+    auto r1 = allocator.alloc(HEAP_SIZE);
+    ASSERT_FALSE(r1.failed());
+    EXPECT_EQ(allocator.heap_top(), HEAP_SIZE);
+
+    InitAllocator();
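+    // After re-init the allocator should look freshly constructed: empty heap, full capacity.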
+  EXPECT_EQ(allocator.heap_top(), 0u) << "Re-init resets heap_top";
+  EXPECT_EQ(allocator.heap_available(), HEAP_SIZE) << "Re-init restores full capacity";
+}
+
+TEST_F(TaskAllocatorReinitTest, ReInitClearsErrorState) {
+  InitAllocator();
+
+  auto r = allocator.alloc(HEAP_SIZE * 2);
+  EXPECT_TRUE(r.failed());
+  EXPECT_NE(error_code.load(), PTO2_ERROR_NONE);
+
+  InitAllocator();
+  EXPECT_EQ(error_code.load(), PTO2_ERROR_NONE);
+
+  auto r2 = allocator.alloc(64);
+  EXPECT_FALSE(r2.failed());
+}
+
+TEST_F(TaskAllocatorReinitTest, MultipleReInitCycles) {
+  for (int cycle = 0; cycle < 10; cycle++) {
+    InitAllocator();
+
+    for (int i = 0; i < WINDOW_SIZE - 1; i++) {
+      auto r = allocator.alloc(0);
+      ASSERT_FALSE(r.failed()) << "Cycle " << cycle << " alloc " << i;
+      EXPECT_EQ(r.task_id, i);
+    }
+  }
+}
+
+TEST_F(TaskAllocatorReinitTest, ReInitIgnoresStaleLastAlive) {
+  InitAllocator();
+
+  auto r1 = allocator.alloc(64);
+  ASSERT_FALSE(r1.failed());
+  last_alive.store(5, std::memory_order_release);
+
+  InitAllocator();
+  EXPECT_EQ(last_alive.load(), 0);
+
+  auto r2 = allocator.alloc(64);
+  ASSERT_FALSE(r2.failed());
+  EXPECT_EQ(r2.task_id, 0);
+}
diff --git a/tests/ut/cpp/a5/test_tensormap.cpp b/tests/ut/cpp/a5/test_tensormap.cpp
index 10eef0317..522cd0279 100644
--- a/tests/ut/cpp/a5/test_tensormap.cpp
+++ b/tests/ut/cpp/a5/test_tensormap.cpp
@@ -65,6 +65,29 @@ static Tensor make_test_tensor_2d(uint64_t addr, uint32_t s0, uint32_t s1, int32
   return make_tensor_external(reinterpret_cast<void *>(addr), shapes, 2, DataType::FLOAT32, false, version);
 }
 
+static Tensor make_test_tensor_nd(
+    uint64_t addr, uint32_t ndims, const uint32_t shapes[], const uint32_t offsets[] = nullptr, int32_t version = 0
+) {
+  uint32_t seed_shape[1] = {1};
+  Tensor t = make_tensor_external(reinterpret_cast<void *>(addr), seed_shape, 1, DataType::FLOAT32, false, 0);
+  uint32_t s[RUNTIME_MAX_TENSOR_DIMS]{};
+  uint32_t rs[RUNTIME_MAX_TENSOR_DIMS]{};
+  uint32_t o[RUNTIME_MAX_TENSOR_DIMS]{};
+  bool all_zero = true;
+  for (uint32_t i = 0; i < ndims && i < RUNTIME_MAX_TENSOR_DIMS; i++) {
+    s[i] = shapes[i];
+    rs[i] = shapes[i];
+    o[i] = offsets ? offsets[i] : 0;
+    if (o[i] != 0) all_zero = false;
+  }
+  uint64_t total = 4;
+  for (uint32_t i = 0; i < ndims; i++) {
+    total *= (rs[i] + (offsets ? offsets[i] : 0));
+  }
+  t.init(reinterpret_cast<void *>(addr), total, rs, s, o, ndims, DataType::FLOAT32, version, all_zero, true);
+  return t;
+}
+
 // =============================================================================
 // Fixture
 // =============================================================================
@@ -549,3 +572,105 @@ TEST(TaskIdTest, LocalIdMaxValue) {
   EXPECT_EQ(tid.ring(), 0);
   EXPECT_EQ(tid.local(), UINT32_MAX);
 }
+
+// =============================================================================
+// Edge cases merged from test_tensormap_overlap.cpp
+// =============================================================================
+
+TEST_F(TensorMapTest, ZeroDimensionTensor) {
+  // ndims=0: fast-path loop doesn't execute, contains=true -> COVERED
+  uint32_t seed_shape[1] = {1};
+  Tensor t = make_tensor_external(reinterpret_cast<void *>(0x2000), seed_shape, 1, DataType::FLOAT32, false, 0);
+  uint32_t s[RUNTIME_MAX_TENSOR_DIMS]{}, o[RUNTIME_MAX_TENSOR_DIMS]{};
+  t.init(reinterpret_cast<void *>(0x2000), 0, s, s, o, 0, DataType::FLOAT32, 0, true, true);
+
+  tmap.insert(t, PTO2TaskId::make(0, 0));
+
+  TestLookupResult result;
+  run_lookup(tmap, t, result);
+  ASSERT_EQ(result.count, 1);
+  EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::COVERED);
+}
+
+TEST_F(TensorMapTest, TwoZeroDimTensorsSameAddr) {
+  uint32_t seed_shape[1] = {1};
+  Tensor t1 = make_tensor_external(reinterpret_cast<void *>(0x2100), seed_shape, 1, DataType::FLOAT32, false, 0);
+  Tensor t2 = make_tensor_external(reinterpret_cast<void *>(0x2100), seed_shape, 1, DataType::FLOAT32, false, 0);
+  uint32_t s[RUNTIME_MAX_TENSOR_DIMS]{}, o[RUNTIME_MAX_TENSOR_DIMS]{};
+  t1.init(reinterpret_cast<void *>(0x2100), 0, s, s, o, 0, DataType::FLOAT32, 0, true, true);
+  t2.init(reinterpret_cast<void *>(0x2100), 0, s, s, o, 0, DataType::FLOAT32, 0, true, true);
+
+  tmap.insert(t1, PTO2TaskId::make(0, 0));
+  tmap.insert(t2, PTO2TaskId::make(0, 1));
+
+  TestLookupResult result;
+  run_lookup(tmap, t1, result);
+  EXPECT_EQ(result.count, 2);
+  for (int i = 0; i < result.count; i++) {
+    EXPECT_EQ(result.entries[i].overlap_status, OverlapStatus::COVERED)
+        << "0-dim tensors always report COVERED (empty loop -> contains=true)";
+  }
+}
+
+TEST_F(TensorMapTest, AdjacentNoOverlap) {
+  uint32_t prod_shapes[] = {100};
+  Tensor prod = make_test_tensor_nd(0x8000, 1, prod_shapes, nullptr, 0);
+  tmap.insert(prod, PTO2TaskId::make(0, 0));
+
+  uint32_t cons_shapes[] = {100};
+  uint32_t cons_offsets[] = {100};
+  Tensor cons = make_test_tensor_nd(0x8000, 1, cons_shapes, cons_offsets, 0);
+
+  TestLookupResult result;
+  run_lookup(tmap, cons, result);
+  EXPECT_EQ(result.count, 0) << "Adjacent regions [0,100) and [100,200) must not overlap";
+}
+
+TEST_F(TensorMapTest, OneElementOverlap) {
+  uint32_t prod_shapes[] = {100};
+  Tensor prod = make_test_tensor_nd(0x8100, 1, prod_shapes, nullptr, 0);
+  tmap.insert(prod, PTO2TaskId::make(0, 0));
+
+  uint32_t cons_shapes[] = {100};
+  uint32_t cons_offsets[] = {99};
+  Tensor cons = make_test_tensor_nd(0x8100, 1, cons_shapes, cons_offsets, 0);
+
+  TestLookupResult result;
+  run_lookup(tmap, cons, result);
+  ASSERT_EQ(result.count, 1);
+  EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::OTHER) << "Partial overlap (1 element) -> OTHER";
+}
+
+TEST_F(TensorMapTest, ZeroShapeInDimension) {
+  // Producer: 2D [10, 0] -- zero in dim 1
+  uint32_t prod_shapes[] = {10, 0};
+  Tensor prod = make_test_tensor_nd(0x8200, 2, prod_shapes, nullptr, 0);
+  tmap.insert(prod, PTO2TaskId::make(0, 0));
+
+  // Consumer: 2D [10, 20]
+  uint32_t cons_shapes[] = {10, 20};
+  Tensor cons = make_test_tensor_nd(0x8200, 2, cons_shapes, nullptr, 0);
+
+  TestLookupResult result;
+  run_lookup(tmap, cons, result);
+  ASSERT_EQ(result.count, 1);
+  // Fast path: input.shapes[1] (20) >= entry.shapes[1] (0) -> contains=true -> COVERED
+  EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::COVERED)
+      << "Zero-shape producer is COVERED by any consumer (empty production)";
+}
+
+TEST_F(TensorMapTest, FullFiveDimensionalOverlap) {
+  uint32_t prod_shapes[] = {2, 3, 4, 5, 6};
+  Tensor prod = make_test_tensor_nd(0x9200, 5, prod_shapes, nullptr, 0);
+  tmap.insert(prod, PTO2TaskId::make(0, 0));
+
+  // Consumer with larger shapes in all dims -> COVERED
+  uint32_t cons_shapes[] = {4, 6, 8, 10, 12};
+  Tensor cons = make_test_tensor_nd(0x9200, 5, cons_shapes, nullptr, 0);
+
+  TestLookupResult result;
+  run_lookup(tmap, cons, result);
+  ASSERT_EQ(result.count, 1);
+  EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::COVERED)
+      << "5D consumer covers 5D producer in all dimensions";
+}
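
Note for reviewers: LastTaskAliveMonotonic pins down an "advance-to-max" publication pattern, where concurrent writers may only move last_task_alive forward, never backwards. A minimal standalone restatement of that pattern follows; the helper name advance_to_max and the int32_t counter type are illustrative assumptions for this sketch, not part of the runtime API.

#include <atomic>
#include <cstdint>

// Illustrative sketch, not a runtime API: advance `counter` toward
// `desired`, but never backwards. If another thread has already
// published a value >= desired, the counter is left alone.
inline int32_t advance_to_max(std::atomic<int32_t> &counter, int32_t desired) {
  int32_t current = counter.load(std::memory_order_acquire);
  while (current < desired) {
    // compare_exchange_weak reloads `current` on failure, so the
    // monotonicity condition is re-checked before every retry.
    if (counter.compare_exchange_weak(current, desired, std::memory_order_acq_rel, std::memory_order_acquire)) {
      return desired;
    }
  }
  return current;
}

Seeding the counter at 0 and calling advance_to_max from several threads with disjoint increasing sequences must leave exactly the largest published value, which is what the test asserts.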
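Likewise, AdjacentNoOverlap and OneElementOverlap fix half-open interval semantics per dimension: touching regions do not overlap, and a non-containing intersection is OTHER. Below is a 1-D sketch of that classification, assuming COVERED means the consumer interval contains the producer interval; Overlap and classify_1d are hypothetical names, and the runtime's dimension-wise fast path additionally reports empty producers as COVERED (see ZeroDimensionTensor and ZeroShapeInDimension), which this naive sketch does not model.

#include <cstdint>

enum class Overlap { NONE, COVERED, OTHER };

// Producer occupies [p_begin, p_begin + p_len); consumer reads
// [c_begin, c_begin + c_len). Half-open, so [0,100) and [100,200)
// share an endpoint but no elements.
inline Overlap classify_1d(uint64_t p_begin, uint64_t p_len, uint64_t c_begin, uint64_t c_len) {
  uint64_t p_end = p_begin + p_len;
  uint64_t c_end = c_begin + c_len;
  if (c_begin >= p_end || p_begin >= c_end) {
    return Overlap::NONE;  // disjoint or merely adjacent
  }
  if (c_begin <= p_begin && c_end >= p_end) {
    return Overlap::COVERED;  // consumer fully contains producer
  }
  return Overlap::OTHER;  // partial overlap, e.g. one shared element
}

classify_1d(0, 100, 100, 100) yields NONE and classify_1d(0, 100, 99, 100) yields OTHER, matching the two 1-D tests above.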