From 75a8a25cce1dc2f215963c89bfa6358952ab69ae Mon Sep 17 00:00:00 2001
From: chenshengxin
Date: Wed, 22 Apr 2026 14:34:36 +0800
Subject: [PATCH] Add: PTO2 runtime unit tests for A2A3 and A5 (tensormap, orchestrator, coupling, link isolation)

Refactor and extend PTO2 runtime unit test coverage for A2A3 and A5:

A2A3 changes:
- Remove outdated edge/stub tests: test_tensormap_edge, test_coupling,
  test_coupling_stub, test_boundary_edge
- Add link isolation tests (test_link_isolation) and runtime coupling
  tests (test_runtime_coupling)
- Rename test_orchestrator_fatal -> test_orchestrator_report_fatal
- Extend existing tests: tensormap, shared_memory, task_allocator,
  ready_queue, orchestrator_submit, runtime_lifecycle

A5 additions:
- Add runtime-linked tests: test_a5_runtime_coupling,
  test_a5_link_isolation, test_a5_orchestrator_submit,
  test_a5_orchestrator_report_fatal, test_a5_runtime_lifecycle
- Extend existing tests: ready_queue, shared_memory, task_allocator,
  tensormap

Build:
- Add add_a5_runtime_test() CMake helper and A5 runtime source sets
---
 tests/ut/cpp/CMakeLists.txt                   | 130 ++++++++
 tests/ut/cpp/a2a3/test_link_isolation.cpp     | 199 ++++++++++++
 .../a2a3/test_orchestrator_report_fatal.cpp   | 194 ++++++++++++
 .../ut/cpp/a2a3/test_orchestrator_submit.cpp  | 273 +++++++++++++++++
 tests/ut/cpp/a2a3/test_ready_queue.cpp        | 122 +++++++-
 tests/ut/cpp/a2a3/test_runtime_coupling.cpp   | 283 ++++++++++++++++++
 tests/ut/cpp/a2a3/test_runtime_lifecycle.cpp  | 189 ++++++++++++
 tests/ut/cpp/a2a3/test_shared_memory.cpp      | 157 ++++++++++
 tests/ut/cpp/a2a3/test_task_allocator.cpp     |  96 ++++++
 tests/ut/cpp/a2a3/test_tensormap.cpp          | 126 ++++++++
 tests/ut/cpp/a5/test_link_isolation.cpp       | 199 ++++++++++++
 .../cpp/a5/test_orchestrator_report_fatal.cpp | 194 ++++++++++++
 tests/ut/cpp/a5/test_orchestrator_submit.cpp  | 273 +++++++++++++++++
 tests/ut/cpp/a5/test_ready_queue.cpp          | 122 +++++++-
 tests/ut/cpp/a5/test_runtime_coupling.cpp     | 283 ++++++++++++++++++
 tests/ut/cpp/a5/test_runtime_lifecycle.cpp    | 189 ++++++++++++
 tests/ut/cpp/a5/test_shared_memory.cpp        | 170 ++++++++++-
 tests/ut/cpp/a5/test_task_allocator.cpp       |  96 ++++++
 tests/ut/cpp/a5/test_tensormap.cpp            | 125 ++++++++
 19 files changed, 3405 insertions(+), 15 deletions(-)
 create mode 100644 tests/ut/cpp/a2a3/test_link_isolation.cpp
 create mode 100644 tests/ut/cpp/a2a3/test_orchestrator_report_fatal.cpp
 create mode 100644 tests/ut/cpp/a2a3/test_orchestrator_submit.cpp
 create mode 100644 tests/ut/cpp/a2a3/test_runtime_coupling.cpp
 create mode 100644 tests/ut/cpp/a2a3/test_runtime_lifecycle.cpp
 create mode 100644 tests/ut/cpp/a5/test_link_isolation.cpp
 create mode 100644 tests/ut/cpp/a5/test_orchestrator_report_fatal.cpp
 create mode 100644 tests/ut/cpp/a5/test_orchestrator_submit.cpp
 create mode 100644 tests/ut/cpp/a5/test_runtime_coupling.cpp
 create mode 100644 tests/ut/cpp/a5/test_runtime_lifecycle.cpp

diff --git a/tests/ut/cpp/CMakeLists.txt b/tests/ut/cpp/CMakeLists.txt
index f923f290c..2c111d24d 100644
--- a/tests/ut/cpp/CMakeLists.txt
+++ b/tests/ut/cpp/CMakeLists.txt
@@ -94,6 +94,26 @@ set(A2A3_COMMON_INCLUDE_DIRS
   ${CMAKE_SOURCE_DIR}/../../../src/common/task_interface
 )
 
+# ---------------------------------------------------------------------------
+# A5 runtime sources and stubs for ring-buffer / tensormap tests
+# ---------------------------------------------------------------------------
+set(A5_RUNTIME_DIR ${CMAKE_SOURCE_DIR}/../../../src/a5/runtime/tensormap_and_ringbuffer/runtime)
+set(A5_STUB_SOURCES ${CMAKE_SOURCE_DIR}/stubs/test_stubs.cpp)
+set(A5_RUNTIME_SOURCES
+  ${A5_RUNTIME_DIR}/pto_ring_buffer.cpp
+  ${A5_RUNTIME_DIR}/shared/pto_shared_memory.cpp
+  ${A5_RUNTIME_DIR}/scheduler/pto_scheduler.cpp
+  ${A5_RUNTIME_DIR}/pto_tensormap.cpp
+)
+
+set(A5_COMMON_INCLUDE_DIRS
+  ${CMAKE_SOURCE_DIR}/../../../src/a5/runtime/tensormap_and_ringbuffer/orchestration
+  ${CMAKE_SOURCE_DIR}/../../../src/a5/runtime/tensormap_and_ringbuffer/runtime
+  ${CMAKE_SOURCE_DIR}/../../../src/a5/runtime/tensormap_and_ringbuffer/common
+  ${CMAKE_SOURCE_DIR}/../../../src/a5/platform/include
+  ${CMAKE_SOURCE_DIR}/../../../src/common/task_interface
+)
+
 function(add_a2a3_runtime_test name)
   cmake_parse_arguments(ARG "" "" "SOURCES;EXTRA_SOURCES" ${ARGN})
   set(_all_sources ${ARG_SOURCES} ${A2A3_STUB_SOURCES})
@@ -117,6 +137,29 @@ function(add_a2a3_runtime_test name)
   set_tests_properties(${name} PROPERTIES LABELS "no_hardware")
 endfunction()
 
+function(add_a5_runtime_test name)
+  cmake_parse_arguments(ARG "" "" "SOURCES;EXTRA_SOURCES" ${ARGN})
+  set(_all_sources ${ARG_SOURCES} ${A5_STUB_SOURCES})
+  foreach(src ${ARG_SOURCES} ${ARG_EXTRA_SOURCES})
+    if(EXISTS ${src})
+      list(APPEND _all_sources ${src})
+    endif()
+  endforeach()
+  add_executable(${name} ${_all_sources})
+  target_include_directories(${name} PRIVATE
+    ${GTEST_INCLUDE_DIRS}
+    ${A5_COMMON_INCLUDE_DIRS}
+  )
+  target_compile_options(${name} PRIVATE -D_GLIBCXX_USE_CXX11_ABI=0)
+  target_link_libraries(${name} PRIVATE
+    ${GTEST_MAIN_LIB}
+    ${GTEST_LIB}
+    pthread
+  )
+  add_test(NAME ${name} COMMAND ${name})
+  set_tests_properties(${name} PROPERTIES LABELS "no_hardware")
+endfunction()
+
 # ---------------------------------------------------------------------------
 # Distributed runtime sources under test
 # ---------------------------------------------------------------------------
@@ -248,6 +291,30 @@ add_task_interface_test(test_child_memory types/test_child_memory.cpp)
 add_common_utils_test(test_elf_build_id common/test_elf_build_id.cpp)
 add_common_utils_test(test_runtime_orch_so common/test_runtime_orch_so.cpp)
 
+# PTO2 runtime-linked tests (tensormap, orchestrator, coupling, boundary)
+add_a2a3_runtime_test(test_runtime_coupling
+  SOURCES a2a3/test_runtime_coupling.cpp
+  EXTRA_SOURCES ${A2A3_RUNTIME_SOURCES} ${A2A3_RUNTIME_DIR}/pto_orchestrator.cpp
+)
+add_a2a3_runtime_test(test_link_isolation
+  SOURCES a2a3/test_link_isolation.cpp
+  EXTRA_SOURCES ${A2A3_RUNTIME_SOURCES}
+)
+add_a2a3_runtime_test(test_orchestrator_submit
+  SOURCES a2a3/test_orchestrator_submit.cpp
+  EXTRA_SOURCES ${A2A3_RUNTIME_SOURCES} ${A2A3_RUNTIME_DIR}/pto_orchestrator.cpp
+)
+add_a2a3_runtime_test(test_orchestrator_report_fatal
+  SOURCES a2a3/test_orchestrator_report_fatal.cpp
+  EXTRA_SOURCES ${A2A3_RUNTIME_SOURCES} ${A2A3_RUNTIME_DIR}/pto_orchestrator.cpp
+)
+add_a2a3_runtime_test(test_runtime_lifecycle
+  SOURCES a2a3/test_runtime_lifecycle.cpp
+  EXTRA_SOURCES ${A2A3_RUNTIME_SOURCES}
+    ${A2A3_RUNTIME_DIR}/pto_runtime2.cpp
+    ${A2A3_RUNTIME_DIR}/pto_orchestrator.cpp
+)
+
 # ---------------------------------------------------------------------------
 # A2A3 tests (src/a2a3/runtime/tensormap_and_ringbuffer/)
 # ---------------------------------------------------------------------------
@@ -300,6 +367,69 @@ add_a2a3_runtime_test(test_wiring
 # ---------------------------------------------------------------------------
 add_a5_test(test_a5_fatal a5/test_a5_fatal.cpp)
 
+add_a5_runtime_test(test_a5_runtime_coupling
+  SOURCES a5/test_runtime_coupling.cpp
+  EXTRA_SOURCES ${A5_RUNTIME_SOURCES} ${A5_RUNTIME_DIR}/pto_orchestrator.cpp
+)
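+# test_a5_link_isolation (like its a2a3 counterpart) deliberately leaves
+# pto_orchestrator.cpp out of EXTRA_SOURCES: linking without the orchestrator
+# object is itself part of the test (see test_link_isolation.cpp).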
+add_a5_runtime_test(test_a5_link_isolation
+  SOURCES a5/test_link_isolation.cpp
+  EXTRA_SOURCES ${A5_RUNTIME_SOURCES}
+)
+add_a5_runtime_test(test_a5_orchestrator_submit
+  SOURCES a5/test_orchestrator_submit.cpp
+  EXTRA_SOURCES ${A5_RUNTIME_SOURCES} ${A5_RUNTIME_DIR}/pto_orchestrator.cpp
+)
+add_a5_runtime_test(test_a5_orchestrator_report_fatal
+  SOURCES a5/test_orchestrator_report_fatal.cpp
+  EXTRA_SOURCES ${A5_RUNTIME_SOURCES} ${A5_RUNTIME_DIR}/pto_orchestrator.cpp
+)
+add_a5_runtime_test(test_a5_runtime_lifecycle
+  SOURCES a5/test_runtime_lifecycle.cpp
+  EXTRA_SOURCES ${A5_RUNTIME_SOURCES}
+    ${A5_RUNTIME_DIR}/pto_runtime2.cpp
+    ${A5_RUNTIME_DIR}/pto_orchestrator.cpp
+)
+add_a5_runtime_test(test_a5_task_allocator
+  SOURCES a5/test_task_allocator.cpp
+  EXTRA_SOURCES ${A5_RUNTIME_SOURCES}
+)
+add_a5_runtime_test(test_a5_dep_list_pool
+  SOURCES a5/test_dep_list_pool.cpp
+  EXTRA_SOURCES ${A5_RUNTIME_SOURCES}
+)
+add_a5_runtime_test(test_a5_scheduler_state
+  SOURCES a5/test_scheduler_state.cpp
+  EXTRA_SOURCES ${A5_RUNTIME_SOURCES}
+)
+add_a5_runtime_test(test_a5_task_state
+  SOURCES a5/test_task_state.cpp
+  EXTRA_SOURCES ${A5_RUNTIME_SOURCES}
+)
+add_a5_runtime_test(test_a5_ready_queue
+  SOURCES a5/test_ready_queue.cpp
+  EXTRA_SOURCES ${A5_RUNTIME_SOURCES}
+)
+add_a5_runtime_test(test_a5_shared_memory
+  SOURCES a5/test_shared_memory.cpp
+  EXTRA_SOURCES ${A5_RUNTIME_SOURCES}
+)
+add_a5_runtime_test(test_a5_tensormap
+  SOURCES a5/test_tensormap.cpp
+  EXTRA_SOURCES ${A5_RUNTIME_SOURCES}
+)
+add_a5_runtime_test(test_a5_fanin_pool
+  SOURCES a5/test_fanin_pool.cpp
+  EXTRA_SOURCES ${A5_RUNTIME_SOURCES}
+)
+add_a5_runtime_test(test_a5_spsc_queue
+  SOURCES a5/test_spsc_queue.cpp
+  EXTRA_SOURCES ${A5_RUNTIME_SOURCES}
+)
+add_a5_runtime_test(test_a5_wiring
+  SOURCES a5/test_wiring.cpp
+  EXTRA_SOURCES ${A5_RUNTIME_SOURCES}
+)
+
 # Host logger silent/off behavior — no runtime deps, just compile host_log.cpp
 # alongside the test.
 set(A5_HOST_LOG_DIR ${CMAKE_SOURCE_DIR}/../../../src/a5/platform/src/host)
diff --git a/tests/ut/cpp/a2a3/test_link_isolation.cpp b/tests/ut/cpp/a2a3/test_link_isolation.cpp
new file mode 100644
index 000000000..78f1378a9
--- /dev/null
+++ b/tests/ut/cpp/a2a3/test_link_isolation.cpp
@@ -0,0 +1,199 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Behavior tests for runtime components built without pto_orchestrator.cpp.
+ *
+ * The CMake target for this file deliberately omits the orchestrator source.
+ * Passing build and runtime assertions here verifies that scheduler, ring
+ * buffer, shared memory, and TensorMap behavior does not require an
+ * orchestrator link dependency.
+ */
+
+#include <gtest/gtest.h>
+
+#include <atomic>
+#include <cstring>
+#include <vector>
+
+#include "pto_orchestration_api.h"
+#include "pto_ring_buffer.h"
+#include "pto_runtime2_types.h"
+#include "pto_shared_memory.h"
+#include "pto_tensormap.h"
+#include "scheduler/pto_scheduler.h"
+
+namespace {
+
+constexpr uint64_t kHeapSize = 64 * 1024;
+constexpr int32_t kWindowSize = 64;
+
+struct TestLookupResult {
+  struct Entry {
+    PTO2TensorMapEntry *entry;
+    OverlapStatus overlap_status;
+  };
+  std::vector<Entry> entries;
+  int count = 0;
+};
+
+void run_lookup(PTO2TensorMap &tmap, const Tensor &tensor, TestLookupResult &out) {
+  tmap.lookup(tensor, [&](PTO2TensorMapEntry &entry, OverlapStatus status) -> bool {
+    out.entries.push_back({&entry, status});
+    out.count++;
+    return true;
+  });
+}
+
+Tensor make_tensor(uint64_t addr, uint32_t shape0 = 100, int32_t version = 0) {
+  uint32_t shapes[RUNTIME_MAX_TENSOR_DIMS] = {shape0};
+  return make_tensor_external(reinterpret_cast<void *>(addr), shapes, 1, DataType::FLOAT32, false, version);
+}
+
+struct DepPoolFixture {
+  PTO2DepListEntry entries[512];
+  std::atomic<int32_t> error_code{PTO2_ERROR_NONE};
+  PTO2DepListPool pool{};
+
+  void Init() {
+    std::memset(entries, 0, sizeof(entries));
+    error_code.store(PTO2_ERROR_NONE, std::memory_order_relaxed);
+    pool.init(entries, 512, &error_code);
+  }
+
+  void AllocN(int count) {
+    for (int i = 0; i < count; i++) {
+      ASSERT_NE(pool.alloc(), nullptr);
+    }
+  }
+};
+
+} // namespace
+
+TEST(LinkIsolationDepPool, ReclaimBelowIntervalKeepsAllocatedEntries) {
+  DepPoolFixture fixture;
+  fixture.Init();
+  fixture.AllocN(100);
+  int32_t used_before = fixture.pool.used();
+
+  PTO2SharedMemoryRingHeader ring_header{};
+  fixture.pool.reclaim(ring_header, PTO2_DEP_POOL_CLEANUP_INTERVAL - 1);
+
+  EXPECT_EQ(fixture.pool.used(), used_before);
+  EXPECT_EQ(fixture.error_code.load(std::memory_order_acquire), PTO2_ERROR_NONE);
+}
+
+TEST(LinkIsolationDepPool, ReclaimAtIntervalUsesConsumedTaskMark) {
+  DepPoolFixture fixture;
+  fixture.Init();
+  fixture.AllocN(100);
+
+  std::vector<PTO2TaskSlotState> slots(kWindowSize);
+  PTO2SharedMemoryRingHeader ring_header{};
+  ring_header.slot_states = slots.data();
+  ring_header.task_window_size = kWindowSize;
+  ring_header.task_window_mask = kWindowSize - 1;
+
+  int32_t last_alive = PTO2_DEP_POOL_CLEANUP_INTERVAL;
+  int32_t mark_slot = (last_alive - 1) & ring_header.task_window_mask;
+  slots[mark_slot].dep_pool_mark = 50;
+
+  fixture.pool.reclaim(ring_header, last_alive);
+
+  EXPECT_EQ(fixture.pool.used(), 51);
+  EXPECT_EQ(fixture.error_code.load(std::memory_order_acquire), PTO2_ERROR_NONE);
+}
+
+TEST(LinkIsolationScheduler, ReleaseFaninPushesReadyTask) {
+  PTO2SharedMemoryHandle *sm = PTO2SharedMemoryHandle::create(kWindowSize, kHeapSize);
+  ASSERT_NE(sm, nullptr);
+
+  PTO2SchedulerState sched{};
+  ASSERT_TRUE(sched.init(sm->header));
+
+  alignas(64) PTO2TaskSlotState slot{};
+  slot.fanin_count = 1;
+  slot.fanin_refcount.store(0, std::memory_order_relaxed);
+  slot.task_state.store(PTO2_TASK_PENDING, std::memory_order_relaxed);
+  slot.active_mask = ActiveMask(PTO2_SUBTASK_MASK_AIV0);
+
+  EXPECT_TRUE(sched.release_fanin_and_check_ready(slot, nullptr));
+
+  PTO2ResourceShape shape = slot.active_mask.to_shape();
+  EXPECT_EQ(sched.ready_queues[static_cast<int>(shape)].pop(), &slot);
+
+  sched.destroy();
+  sm->destroy();
+}
+
+TEST(LinkIsolationScheduler, CompletedTaskCanBecomeConsumed) {
+  PTO2SharedMemoryHandle *sm = PTO2SharedMemoryHandle::create(kWindowSize, kHeapSize);
+  ASSERT_NE(sm, nullptr);
+
+  PTO2SchedulerState sched{};
+  ASSERT_TRUE(sched.init(sm->header));
+
+  PTO2TaskDescriptor desc{};
+  PTO2TaskSlotState &slot = sm->header->rings[0].get_slot_state_by_slot(0);
+  slot.task = &desc;
+  slot.ring_id = 0;
+  slot.fanout_count = 1;
+  slot.fanout_refcount.store(1, std::memory_order_relaxed);
+  slot.task_state.store(PTO2_TASK_COMPLETED, std::memory_order_relaxed);
+  sm->header->rings[0].fc.current_task_index.store(1, std::memory_order_relaxed);
+
+  sched.check_and_handle_consumed(slot);
+
+  EXPECT_EQ(slot.task_state.load(std::memory_order_acquire), PTO2_TASK_CONSUMED);
+
+  sched.destroy();
+  sm->destroy();
+}
+
+TEST(LinkIsolationReadyQueue, PushPopBatchWithoutOrchestrator) {
+  PTO2ReadyQueue queue{};
+  ASSERT_TRUE(ready_queue_init(&queue, 16));
+
+  PTO2TaskSlotState items[4]{};
+  PTO2TaskSlotState *in[4] = {&items[0], &items[1], &items[2], &items[3]};
+  queue.push_batch(in, 4);
+
+  PTO2TaskSlotState *out[4]{};
+  EXPECT_EQ(queue.pop_batch(out, 4), 4);
+  for (int i = 0; i < 4; i++) {
+    EXPECT_EQ(out[i], &items[i]);
+  }
+  EXPECT_EQ(queue.pop(), nullptr);
+
+  ready_queue_destroy(&queue);
+}
+
+TEST(LinkIsolationTensorMap, InsertLookupAndValidityWithoutOrchestrator) {
+  int32_t window_sizes[PTO2_MAX_RING_DEPTH] = {kWindowSize, kWindowSize, kWindowSize, kWindowSize};
+  PTO2TensorMap tmap{};
+  ASSERT_TRUE(tmap.init(256, 1024, window_sizes));
+
+  Tensor tensor = make_tensor(0x3000);
+  for (int i = 0; i < kWindowSize; i++) {
+    tmap.insert(tensor, PTO2TaskId::make(0, i));
+  }
+
+  tmap.sync_validity(0, kWindowSize / 2);
+
+  TestLookupResult result;
+  run_lookup(tmap, tensor, result);
+  EXPECT_EQ(result.count, kWindowSize / 2);
+  for (const auto &entry : result.entries) {
+    EXPECT_GE(entry.entry->producer_task_id.local(), static_cast<uint32_t>(kWindowSize / 2));
+    EXPECT_EQ(entry.overlap_status, OverlapStatus::COVERED);
+  }
+
+  tmap.destroy();
+}
diff --git a/tests/ut/cpp/a2a3/test_orchestrator_report_fatal.cpp b/tests/ut/cpp/a2a3/test_orchestrator_report_fatal.cpp
new file mode 100644
index 000000000..06fdd16d7
--- /dev/null
+++ b/tests/ut/cpp/a2a3/test_orchestrator_report_fatal.cpp
@@ -0,0 +1,194 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * UT for the orchestrator-side fatal reporting path.
+ *
+ * Targets orch_.report_fatal (pto_orchestrator.cpp) and verifies:
+ * - orch->fatal latches to true on any non-zero error code
+ * - the first non-zero code wins via CAS into sm_header->orch_error_code
+ * - subsequent fatal reports do NOT overwrite the first code
+ * - PTO2_ERROR_NONE never latches the shared-memory code (but still flips
+ *   the local fatal flag -- by design, callers may use it to mark fatal
+ *   without writing a code)
+ *
+ * This test exercises the real symbol against a fully-initialized
+ * orchestrator + shared memory pair, complementing the fake-runtime test
+ * (test_a2a3_pto2_fatal.cpp) that only validates the ops-table dispatch.
+ */
+
+#include <gtest/gtest.h>
+
+#include <atomic>
+#include <cstdlib>
+
+#include "pto_orchestrator.h"
+#include "pto_runtime_status.h"
+#include "scheduler/pto_scheduler.h"
+#include "pto_shared_memory.h"
+
+namespace {
+
+constexpr uint64_t kHeapSize = 64 * 1024;
+constexpr int32_t kWindowSize = 64;
+constexpr int32_t kDepPool = 256;
+
+class OrchestratorFatalTest : public ::testing::Test {
+protected:
+  PTO2SharedMemoryHandle *sm_ = nullptr;
+  PTO2SchedulerState sched_{};
+  PTO2OrchestratorState orch_{};
+  uint8_t *gm_heap_ = nullptr;
+  bool sched_ok_ = false;
+  bool orch_ok_ = false;
+
+  void SetUp() override {
+    sm_ = PTO2SharedMemoryHandle::create(kWindowSize, kHeapSize);
+    ASSERT_NE(sm_, nullptr);
+
+    gm_heap_ = static_cast<uint8_t *>(std::calloc(PTO2_MAX_RING_DEPTH, kHeapSize));
+    ASSERT_NE(gm_heap_, nullptr);
+
+    sched_ok_ = sched_.init(sm_->header, kDepPool);
+    ASSERT_TRUE(sched_ok_);
+
+    orch_ok_ = orch_.init(sm_->header, gm_heap_, kHeapSize, kDepPool);
+    ASSERT_TRUE(orch_ok_);
+  }
+
+  void TearDown() override {
+    if (orch_ok_) orch_.destroy();
+    if (sched_ok_) sched_.destroy();
+    if (gm_heap_) std::free(gm_heap_);
+    if (sm_) sm_->destroy();
+  }
+
+  int32_t shared_orch_code() const { return sm_->header->orch_error_code.load(std::memory_order_acquire); }
+};
+
+} // namespace
+
+// ---------- baseline ----------
+
+TEST_F(OrchestratorFatalTest, InitialState_NoFatalNoSharedCode) {
+  // Verify no fatal state via the observable shared memory output
+  EXPECT_FALSE(orch_.fatal);
+  EXPECT_EQ(shared_orch_code(), PTO2_ERROR_NONE);
+}
+
+// ---------- happy path: single fatal latches both local flag and shared code ----------
+
+TEST_F(OrchestratorFatalTest, ReportFatal_SetsLocalFlagAndSharedCode) {
+  orch_.report_fatal(PTO2_ERROR_HEAP_RING_DEADLOCK, "test", "deadlock at ring %d", 3);
+
+  EXPECT_TRUE(orch_.fatal);
+  EXPECT_EQ(shared_orch_code(), PTO2_ERROR_HEAP_RING_DEADLOCK);
+}
+
+// ---------- CAS first-writer-wins ----------
+
+TEST_F(OrchestratorFatalTest, SecondReportFatal_DoesNotOverwriteSharedCode) {
+  orch_.report_fatal(PTO2_ERROR_HEAP_RING_DEADLOCK, "test", nullptr);
+  orch_.report_fatal(PTO2_ERROR_DEP_POOL_OVERFLOW, "test", nullptr);
+
+  // Second report must NOT overwrite the first latched code.
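+  // (report_fatal publishes the code with a compare-exchange, so only the
+  // first transition away from PTO2_ERROR_NONE can succeed.)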
+  EXPECT_TRUE(orch_.fatal);
+  EXPECT_EQ(shared_orch_code(), PTO2_ERROR_HEAP_RING_DEADLOCK);
+}
+
+TEST_F(OrchestratorFatalTest, RepeatedSameCode_StaysLatched) {
+  orch_.report_fatal(PTO2_ERROR_INVALID_ARGS, "test", nullptr);
+  orch_.report_fatal(PTO2_ERROR_INVALID_ARGS, "test", nullptr);
+
+  EXPECT_TRUE(orch_.fatal);
+  EXPECT_EQ(shared_orch_code(), PTO2_ERROR_INVALID_ARGS);
+}
+
+// ---------- PTO2_ERROR_NONE: marks fatal locally, does NOT touch shared code ----------
+
+TEST_F(OrchestratorFatalTest, ReportFatalWithErrorNone_DoesNotWriteSharedCode) {
+  orch_.report_fatal(PTO2_ERROR_NONE, "test", nullptr);
+
+  EXPECT_TRUE(orch_.fatal);
+  EXPECT_EQ(shared_orch_code(), PTO2_ERROR_NONE);
+}
+
+// ---------- PTO2_ERROR_NONE first does not block a real code from latching ----------
+
+TEST_F(OrchestratorFatalTest, ErrorNoneFirst_RealCodeStillLatchesAfter) {
+  orch_.report_fatal(PTO2_ERROR_NONE, "test", nullptr);
+  EXPECT_TRUE(orch_.fatal);
+  EXPECT_EQ(shared_orch_code(), PTO2_ERROR_NONE);
+
+  orch_.report_fatal(PTO2_ERROR_SCOPE_DEADLOCK, "test", nullptr);
+  EXPECT_EQ(shared_orch_code(), PTO2_ERROR_SCOPE_DEADLOCK);
+}
+
+// ---------- coverage of every defined orchestrator code ----------
+
+TEST_F(OrchestratorFatalTest, EveryOrchCode_LatchesIntoSharedMemory) {
+  const int32_t codes[] = {
+      PTO2_ERROR_SCOPE_DEADLOCK,
+      PTO2_ERROR_HEAP_RING_DEADLOCK,
+      PTO2_ERROR_FLOW_CONTROL_DEADLOCK,
+      PTO2_ERROR_DEP_POOL_OVERFLOW,
+      PTO2_ERROR_INVALID_ARGS,
+      PTO2_ERROR_DEPENDENCY_OVERFLOW,
+      PTO2_ERROR_REQUIRE_SYNC_START_INVALID,
+      PTO2_ERROR_TENSOR_WAIT_TIMEOUT,
+      PTO2_ERROR_EXPLICIT_ORCH_FATAL,
+  };
+  for (int32_t code : codes) {
+    // Reset latches between iterations. Direct field access is unavoidable here
+    // since there is no public reset API for the orchestrator fatal state.
+    sm_->header->orch_error_code.store(PTO2_ERROR_NONE, std::memory_order_release);
+    orch_.fatal = false;
+
+    orch_.report_fatal(code, "test", "code=%d", code);
+
+    SCOPED_TRACE(testing::Message() << "code=" << code);
+    EXPECT_TRUE(orch_.fatal);
+    EXPECT_EQ(shared_orch_code(), code);
+  }
+}
+
+// ---------- format-string variants must not crash ----------
+
+TEST_F(OrchestratorFatalTest, NullFmt_DoesNotCrash) {
+  orch_.report_fatal(PTO2_ERROR_INVALID_ARGS, "func", nullptr);
+  EXPECT_TRUE(orch_.fatal);
+  EXPECT_EQ(shared_orch_code(), PTO2_ERROR_INVALID_ARGS);
+}
+
+TEST_F(OrchestratorFatalTest, EmptyFmt_DoesNotCrash) {
+  orch_.report_fatal(PTO2_ERROR_INVALID_ARGS, "func", "");
+  EXPECT_TRUE(orch_.fatal);
+  EXPECT_EQ(shared_orch_code(), PTO2_ERROR_INVALID_ARGS);
+}
+
+TEST_F(OrchestratorFatalTest, FmtWithVarArgs_DoesNotCrash) {
+  orch_.report_fatal(
+      PTO2_ERROR_TENSOR_WAIT_TIMEOUT, "func", "tensor=%p slot=%d msg=%s", reinterpret_cast<void *>(0xdeadbeef), 17,
+      "boom"
+  );
+  EXPECT_TRUE(orch_.fatal);
+  EXPECT_EQ(shared_orch_code(), PTO2_ERROR_TENSOR_WAIT_TIMEOUT);
+}
+
+// ---------- end-to-end: status helper sees latched code ----------
+
+TEST_F(OrchestratorFatalTest, StatusHelperReadsLatchedOrchCode) {
+  orch_.report_fatal(PTO2_ERROR_FLOW_CONTROL_DEADLOCK, "func", nullptr);
+
+  int32_t orch_code = shared_orch_code();
+  int32_t sched_code = sm_->header->sched_error_code.load(std::memory_order_acquire);
+  EXPECT_EQ(runtime_status_from_error_codes(orch_code, sched_code), -PTO2_ERROR_FLOW_CONTROL_DEADLOCK);
+}
diff --git a/tests/ut/cpp/a2a3/test_orchestrator_submit.cpp b/tests/ut/cpp/a2a3/test_orchestrator_submit.cpp
new file mode 100644
index 000000000..c01635f57
--- /dev/null
+++ b/tests/ut/cpp/a2a3/test_orchestrator_submit.cpp
@@ -0,0 +1,273 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * Orchestrator submit-path UT.
+ *
+ * Covers pto2_submit_mixed_task, pto2_alloc_tensors, pto2_orchestrator_done,
+ * and pto2_orchestrator_set_scheduler on a fully initialized TMR
+ * (tensormap_and_ringbuffer) system.
+ *
+ * Follows AAA and FIRST: each TEST_F builds a fresh TMRSystem, exercises
+ * one behavior, and tears the system down in TearDown().
+ */
+
+#include <gtest/gtest.h>
+
+#include <atomic>
+#include <cstdint>
+#include <cstdlib>
+
+#include "pto_orchestration_api.h" // make_tensor_external, TensorCreateInfo ctor
+#include "pto_orchestrator.h"
+#include "pto_ring_buffer.h"
+#include "scheduler/pto_scheduler.h"
+#include "pto_shared_memory.h"
+#include "pto_submit_types.h"
+#include "pto_tensormap.h"
+#include "tensor.h"
+
+namespace {
+
+constexpr uint64_t kHeapSize = 64 * 1024;
+constexpr int32_t kWindowSize = 64;
+constexpr int32_t kDepPool = 256;
+
+// -----------------------------------------------------------------------------
+// Fixture: minimal TMR system for orchestrator-level tests.
+// -----------------------------------------------------------------------------
+class OrchestratorSubmitTest : public ::testing::Test {
+protected:
+  PTO2SharedMemoryHandle *sm_ = nullptr;
+  PTO2SchedulerState sched_{};
+  PTO2OrchestratorState orch_{};
+  uint8_t *gm_heap_ = nullptr;
+  bool sched_ok_ = false;
+  bool orch_ok_ = false;
+
+  void SetUp() override {
+    sm_ = PTO2SharedMemoryHandle::create(kWindowSize, kHeapSize);
+    ASSERT_NE(sm_, nullptr);
+
+    gm_heap_ = static_cast<uint8_t *>(std::calloc(PTO2_MAX_RING_DEPTH, kHeapSize));
+    ASSERT_NE(gm_heap_, nullptr);
+
+    sched_ok_ = sched_.init(sm_->header, kDepPool);
+    ASSERT_TRUE(sched_ok_);
+
+    orch_ok_ = orch_.init(sm_->header, gm_heap_, kHeapSize, kDepPool);
+    ASSERT_TRUE(orch_ok_);
+
+    orch_.set_scheduler(&sched_);
+  }
+
+  void TearDown() override {
+    if (orch_ok_) orch_.destroy();
+    if (sched_ok_) sched_.destroy();
+    if (gm_heap_) std::free(gm_heap_);
+    if (sm_) sm_->destroy();
+  }
+
+  // Helper: build a minimal TensorCreateInfo owning one FP32 scalar output.
+  static TensorCreateInfo make_scalar_ci() {
+    static const uint32_t kShape[1] = {1};
+    return TensorCreateInfo(kShape, 1, DataType::FLOAT32);
+  }
+
+  bool has_orch_error() const {
+    return sm_->header->orch_error_code.load(std::memory_order_acquire) != PTO2_ERROR_NONE;
+  }
+};
+
+} // namespace
+
+// ---------- set_scheduler ----------
+
+TEST_F(OrchestratorSubmitTest, SetScheduler_StoresPointer) {
+  PTO2SchedulerState other{};
+  orch_.set_scheduler(&other);
+  // Direct field read: no public getter exists for the scheduler pointer.
+  EXPECT_EQ(orch_.scheduler, &other);
+
+  // Restore for TearDown.
+  orch_.set_scheduler(&sched_);
+}
+
+// ---------- alloc_tensors: argument validation ----------
+
+TEST_F(OrchestratorSubmitTest, AllocTensors_EmptyArgs_MarksFatal) {
+  Arg args; // no tensors, no scalars
+
+  TaskOutputTensors result = orch_.alloc_tensors(args);
+
+  EXPECT_TRUE(result.empty());
+  EXPECT_TRUE(has_orch_error());
+}
+
+TEST_F(OrchestratorSubmitTest, AllocTensors_WithScalars_MarksFatal) {
+  TensorCreateInfo ci = make_scalar_ci();
+  Arg args;
+  args.add_output(ci);
+  args.add_scalar(uint64_t{42});
+
+  TaskOutputTensors result = orch_.alloc_tensors(args);
+
+  EXPECT_TRUE(result.empty());
+  EXPECT_TRUE(has_orch_error());
+}
+
+TEST_F(OrchestratorSubmitTest, AllocTensors_InputArg_MarksFatal) {
+  // alloc_tensors only accepts OUTPUT TensorCreateInfo args.
+  uint32_t shape[1] = {1};
+  Tensor input = make_tensor_external(reinterpret_cast<void *>(0x1000), shape, 1);
+  Arg args;
+  args.add_input(input);
+
+  TaskOutputTensors result = orch_.alloc_tensors(args);
+
+  EXPECT_TRUE(result.empty());
+  EXPECT_TRUE(has_orch_error());
+}
+
+TEST_F(OrchestratorSubmitTest, AllocTensors_OutputOnly_ReturnsMaterializedTensors) {
+  // Arrange: two output CIs, inside an active scope.
+  TensorCreateInfo ci1 = make_scalar_ci();
+  TensorCreateInfo ci2 = make_scalar_ci();
+  Arg args;
+  args.add_output(ci1, ci2);
+
+  // Act
+  orch_.begin_scope();
+  TaskOutputTensors result = orch_.alloc_tensors(args);
+  orch_.end_scope();
+
+  // Assert
+  EXPECT_FALSE(has_orch_error());
+  EXPECT_EQ(result.size(), 2U);
+}
+
+TEST_F(OrchestratorSubmitTest, AllocTensors_AfterFatal_ReturnsEmpty) {
+  // Arrange: force fatal.
+  orch_.report_fatal(PTO2_ERROR_EXPLICIT_ORCH_FATAL, "UT", nullptr);
+  ASSERT_TRUE(has_orch_error());
+
+  TensorCreateInfo ci = make_scalar_ci();
+  Arg args;
+  args.add_output(ci);
+
+  // Act
+  TaskOutputTensors result = orch_.alloc_tensors(args);
+
+  // Assert
+  EXPECT_TRUE(result.empty());
+}
+
+// ---------- submit_mixed_task ----------
+
+TEST_F(OrchestratorSubmitTest, SubmitMixedTask_AfterFatal_ReturnsEmpty) {
+  // Arrange: pre-fatal state
+  orch_.report_fatal(PTO2_ERROR_EXPLICIT_ORCH_FATAL, "UT", nullptr);
+
+  MixedKernels mixed;
+  mixed.aic_kernel_id = 0;
+  Arg args;
+
+  // Act
+  TaskOutputTensors result = orch_.submit_task(mixed, args);
+
+  // Assert
+  EXPECT_TRUE(result.empty());
+}
+
+TEST_F(OrchestratorSubmitTest, SubmitMixedTask_ArgWithError_MarksFatalInvalidArgs) {
+  // Arrange: craft an Arg with has_error set.
+  // Calling add_input after add_scalar triggers the ordering error path.
+  uint32_t shape[1] = {1};
+  Tensor t = make_tensor_external(reinterpret_cast<void *>(0x1000), shape, 1);
+  Arg args;
+  args.add_scalar(uint64_t{1});
+  args.add_input(t); // illegal ordering -> has_error = true
+  ASSERT_TRUE(args.has_error);
+
+  MixedKernels mixed;
+  mixed.aic_kernel_id = 0;
+
+  // Act
+  orch_.begin_scope();
+  TaskOutputTensors result = orch_.submit_task(mixed, args);
+  orch_.end_scope();
+
+  // Assert
+  EXPECT_TRUE(result.empty());
+  EXPECT_TRUE(has_orch_error());
+}
+
+TEST_F(OrchestratorSubmitTest, SubmitMixedTask_PureInputOnly_Succeeds) {
+  // Arrange: one input tensor, one AIC kernel, within a scope.
+  uint32_t shape[1] = {1};
+  Tensor input = make_tensor_external(reinterpret_cast<void *>(0x2000), shape, 1);
+
+  Arg args;
+  args.add_input(input);
+  ASSERT_FALSE(args.has_error);
+
+  MixedKernels mixed;
+  mixed.aic_kernel_id = 7; // any non-invalid id
+
+  // Act
+  orch_.begin_scope();
+  TaskOutputTensors result = orch_.submit_task(mixed, args);
+  orch_.end_scope();
+
+  // Assert: submit returns (no outputs), and no fatal state was set.
+  EXPECT_TRUE(result.empty());
+  EXPECT_FALSE(has_orch_error());
+}
+
+TEST_F(OrchestratorSubmitTest, SubmitMixedTask_OutputTensor_MaterializesResult) {
+  // Arrange: one OUTPUT TensorCreateInfo -> task produces one tensor.
+  TensorCreateInfo ci = make_scalar_ci();
+  Arg args;
+  args.add_output(ci);
+
+  MixedKernels mixed;
+  mixed.aic_kernel_id = 1;
+
+  // Act
+  orch_.begin_scope();
+  TaskOutputTensors result = orch_.submit_task(mixed, args);
+  orch_.end_scope();
+
+  // Assert
+  EXPECT_FALSE(has_orch_error());
+  EXPECT_EQ(result.size(), 1U);
+}
+
+// ---------- orchestrator_done ----------
+
+TEST_F(OrchestratorSubmitTest, OrchestratorDone_SetsSharedMemoryFlag) {
+  // Arrange
+  ASSERT_EQ(sm_->header->orchestrator_done.load(), 0);
+
+  // Act
+  orch_.mark_done();
+
+  // Assert
+  EXPECT_EQ(sm_->header->orchestrator_done.load(std::memory_order_acquire), 1);
+}
+
+TEST_F(OrchestratorSubmitTest, OrchestratorDone_IsIdempotent) {
+  orch_.mark_done();
+  orch_.mark_done();
+
+  // Flag stays 1 -- store is release-set, not increment.
+  EXPECT_EQ(sm_->header->orchestrator_done.load(std::memory_order_acquire), 1);
+}
diff --git a/tests/ut/cpp/a2a3/test_ready_queue.cpp b/tests/ut/cpp/a2a3/test_ready_queue.cpp
index 9dea3ae94..bb2a6a101 100644
--- a/tests/ut/cpp/a2a3/test_ready_queue.cpp
+++ b/tests/ut/cpp/a2a3/test_ready_queue.cpp
@@ -371,9 +371,10 @@ TEST_P(ReadyQueueMPMCTest, NoDuplicateNoLoss) {
 INSTANTIATE_TEST_SUITE_P(
     MPMCVariants, ReadyQueueMPMCTest,
     ::testing::Values(
-        MPMCConfig{2, 2, 200},  // TwoProducersTwoConsumers
-        MPMCConfig{1, 4, 500},  // OneProducerNConsumers
-        MPMCConfig{4, 4, 1250}  // HighContentionStress
+        MPMCConfig{2, 2, 200},   // TwoProducersTwoConsumers
+        MPMCConfig{1, 4, 500},   // OneProducerNConsumers
+        MPMCConfig{4, 4, 1250},  // HighContentionStress
+        MPMCConfig{8, 8, 2000}   // EightProducersEightConsumers
     )
 );
 
@@ -444,3 +445,118 @@ TEST_F(LocalReadyBufferTest, NullBackingBuffer) {
   EXPECT_FALSE(buf.try_push(&item)) << "Push fails with null backing";
   EXPECT_EQ(buf.pop(), nullptr) << "Pop returns null with null backing";
 }
+
+// =============================================================================
+// High-contention stress tests
+// =============================================================================
+
+class ReadyQueueStressTest : public ::testing::Test {
+protected:
+  static constexpr uint64_t kCapacity = 512;
+  PTO2ReadyQueue queue;
+
+  void SetUp() override { ASSERT_TRUE(ready_queue_init(&queue, kCapacity)); }
+  void TearDown() override { ready_queue_destroy(&queue); }
+};
+
+TEST_F(ReadyQueueStressTest, RapidFillDrainCycles) {
+  constexpr int kCycles = 100;
+  constexpr int kItemsPerCycle = static_cast<int>(kCapacity / 2);
+
+  std::vector<PTO2TaskSlotState> items(kItemsPerCycle);
+  for (int i = 0; i < kItemsPerCycle; i++) {
+    items[i].fanin_count = i;
+  }
+
+  for (int cycle = 0; cycle < kCycles; cycle++) {
+    std::atomic<int> push_done{0};
+    std::atomic<int> popped{0};
+
+    auto producer = [&](int id) {
+      int per_thread = kItemsPerCycle / 4;
+      int base = id * per_thread;
+      for (int i = 0; i < per_thread; i++) {
+        while (!queue.push(&items[base + i])) {}
+      }
+      push_done.fetch_add(1, std::memory_order_release);
+    };
+
+    auto consumer = [&]() {
+      while (true) {
+        PTO2TaskSlotState *s = queue.pop();
+        if (s) {
+          popped.fetch_add(1, std::memory_order_relaxed);
+        } else if (push_done.load(std::memory_order_acquire) == 4) {
+          while ((s = queue.pop()) != nullptr) {
+            popped.fetch_add(1, std::memory_order_relaxed);
+          }
+          break;
+        }
+      }
+    };
+
+    std::vector<std::thread> threads;
+    for (int i = 0; i < 4; i++)
+      threads.emplace_back(producer, i);
+    for (int i = 0; i < 4; i++)
+      threads.emplace_back(consumer);
+    for (auto &t : threads)
+      t.join();
+
+    ASSERT_EQ(popped.load(), kItemsPerCycle) << "Cycle " << cycle << ": lost items";
+  }
+}
+
+TEST_F(ReadyQueueStressTest, PopBatchUnderContention) {
+  constexpr int kBatchSize = 8;
+  constexpr int kBatches = 500;
+  constexpr int kProducers = 4;
+  constexpr int kTotalItems = kBatchSize * kBatches * kProducers;
+
+  std::vector<PTO2TaskSlotState> items(kTotalItems);
+  for (int i = 0; i < kTotalItems; i++)
+    items[i].fanin_count = i;
+
+  std::atomic<int> total_consumed{0};
+  std::atomic<int> producers_done{0};
+
+  auto producer = [&](int id) {
+    int base = id * kBatchSize * kBatches;
+    for (int b = 0; b < kBatches; b++) {
+      PTO2TaskSlotState *ptrs[kBatchSize];
+      for (int i = 0; i < kBatchSize; i++) {
+        ptrs[i] = &items[base + b * kBatchSize + i];
+      }
+      for (int i = 0; i < kBatchSize; i++) {
+        while (!queue.push(ptrs[i])) {}
+      }
+    }
+    producers_done.fetch_add(1, std::memory_order_release);
+  };
+
+  auto consumer = [&]() {
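+    // Drain continuously; once every producer has signalled done, take one
+    // final sweep so items pushed during the race are not stranded.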
+    while (true) {
+      PTO2TaskSlotState *out[kBatchSize];
+      int n = queue.pop_batch(out, kBatchSize);
+      total_consumed.fetch_add(n, std::memory_order_relaxed);
+      if (n == 0 && producers_done.load(std::memory_order_acquire) == kProducers) {
+        while (true) {
+          n = queue.pop_batch(out, kBatchSize);
+          if (n == 0) break;
+          total_consumed.fetch_add(n, std::memory_order_relaxed);
+        }
+        break;
+      }
+    }
+  };
+
+  std::vector<std::thread> threads;
+  for (int i = 0; i < kProducers; i++)
+    threads.emplace_back(producer, i);
+  for (int i = 0; i < 4; i++)
+    threads.emplace_back(consumer);
+  for (auto &t : threads)
+    t.join();
+
+  EXPECT_EQ(total_consumed.load(), kTotalItems);
+}
diff --git a/tests/ut/cpp/a2a3/test_runtime_coupling.cpp b/tests/ut/cpp/a2a3/test_runtime_coupling.cpp
new file mode 100644
index 000000000..927a7b0fc
--- /dev/null
+++ b/tests/ut/cpp/a2a3/test_runtime_coupling.cpp
@@ -0,0 +1,283 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Runtime behavior tests for the combined TMR components.
+ *
+ * These tests keep the assertions on observable component behavior: lifecycle,
+ * scheduler state transitions, TensorMap lookup validity, and shared-memory
+ * coordination. Structural coupling checks belong in design review rather than
+ * in unit tests.
+ */
+
+#include <gtest/gtest.h>
+
+#include <atomic>
+#include <cstdlib>
+#include <cstring>
+#include <vector>
+
+#include "pto_orchestration_api.h"
+#include "pto_orchestrator.h"
+#include "pto_ring_buffer.h"
+#include "pto_runtime2_types.h"
+#include "pto_shared_memory.h"
+#include "pto_tensormap.h"
+#include "scheduler/pto_scheduler.h"
+
+namespace {
+
+constexpr uint64_t kHeapSize = 64 * 1024;
+constexpr int32_t kWindowSize = 64;
+constexpr int32_t kDepPoolSize = 256;
+
+struct TestLookupResult {
+  struct Entry {
+    PTO2TensorMapEntry *entry;
+    OverlapStatus overlap_status;
+  };
+  std::vector<Entry> entries;
+  int count = 0;
+};
+
+void run_lookup(PTO2TensorMap &tmap, const Tensor &tensor, TestLookupResult &out) {
+  tmap.lookup(tensor, [&](PTO2TensorMapEntry &entry, OverlapStatus status) -> bool {
+    out.entries.push_back({&entry, status});
+    out.count++;
+    return true;
+  });
+}
+
+Tensor make_tensor(uint64_t addr, uint32_t shape0 = 100, int32_t version = 0) {
+  uint32_t shapes[RUNTIME_MAX_TENSOR_DIMS] = {shape0};
+  return make_tensor_external(reinterpret_cast<void *>(addr), shapes, 1, DataType::FLOAT32, false, version);
+}
+
+struct TMRSystem {
+  PTO2SharedMemoryHandle *sm = nullptr;
+  PTO2SchedulerState sched{};
+  PTO2OrchestratorState orch{};
+  uint8_t *gm_heap = nullptr;
+  bool sched_ok = false;
+  bool orch_ok = false;
+
+  bool Init(uint64_t heap_size = kHeapSize, int32_t window_size = kWindowSize) {
+    sm = PTO2SharedMemoryHandle::create(window_size, heap_size);
+    if (sm == nullptr) return false;
+
+    gm_heap = static_cast<uint8_t *>(std::calloc(PTO2_MAX_RING_DEPTH, heap_size));
+    if (gm_heap == nullptr) return false;
+
+    sched_ok = sched.init(sm->header, kDepPoolSize);
+    if (!sched_ok) return false;
+
+    orch_ok = orch.init(sm->header, gm_heap, heap_size, kDepPoolSize);
+    if (!orch_ok) return false;
+
+    orch.set_scheduler(&sched);
+    return true;
+  }
+
+  void Destroy() {
+    if (orch_ok) {
+      orch.destroy();
+      orch_ok = false;
+    }
+    if (sched_ok) {
+      sched.destroy();
+      sched_ok = false;
+    }
+    if (gm_heap != nullptr) {
+      std::free(gm_heap);
+      gm_heap = nullptr;
+    }
+    if (sm != nullptr) {
+      sm->destroy();
+      sm = nullptr;
+    }
+  }
+};
+
+} // namespace
+
+TEST(RuntimeLifecycleBehavior, InitDestroyCanRepeat) {
+  for (int cycle = 0; cycle < 3; cycle++) {
+    TMRSystem sys;
+    ASSERT_TRUE(sys.Init()) << "cycle=" << cycle;
+    EXPECT_EQ(sys.sm->header->orch_error_code.load(std::memory_order_acquire), PTO2_ERROR_NONE);
+    EXPECT_EQ(sys.sm->header->sched_error_code.load(std::memory_order_acquire), PTO2_ERROR_NONE);
+    sys.Destroy();
+  }
+}
+
+TEST(RuntimeLifecycleBehavior, OrchestratorScopeWithoutSchedulerLeavesNoFatalCode) {
+  PTO2SharedMemoryHandle *sm = PTO2SharedMemoryHandle::create(kWindowSize, kHeapSize);
+  ASSERT_NE(sm, nullptr);
+  uint8_t *heap = static_cast<uint8_t *>(std::calloc(PTO2_MAX_RING_DEPTH, kHeapSize));
+  ASSERT_NE(heap, nullptr);
+
+  PTO2OrchestratorState orch{};
+  ASSERT_TRUE(orch.init(sm->header, heap, kHeapSize, kDepPoolSize));
+
+  orch.begin_scope();
+  orch.end_scope();
+
+  EXPECT_EQ(sm->header->orch_error_code.load(std::memory_order_acquire), PTO2_ERROR_NONE);
+  EXPECT_FALSE(orch.fatal);
+
+  orch.destroy();
+  std::free(heap);
+  sm->destroy();
+}
+
+TEST(RuntimeSchedulerBehavior, CompletedSlotWithSatisfiedFanoutBecomesConsumed) {
+  TMRSystem sys;
+  ASSERT_TRUE(sys.Init());
+
+  PTO2TaskDescriptor desc{};
+  PTO2TaskSlotState &slot = sys.sm->header->rings[0].get_slot_state_by_slot(0);
+  slot.task = &desc;
+  slot.ring_id = 0;
+  slot.fanout_count = 1;
+  slot.fanout_refcount.store(1, std::memory_order_relaxed);
+  slot.task_state.store(PTO2_TASK_COMPLETED, std::memory_order_relaxed);
+  sys.sm->header->rings[0].fc.current_task_index.store(1, std::memory_order_relaxed);
+
+  sys.sched.check_and_handle_consumed(slot);
+
+  EXPECT_EQ(slot.task_state.load(std::memory_order_acquire), PTO2_TASK_CONSUMED);
+
+  sys.Destroy();
+}
+
+TEST(RuntimeSchedulerBehavior, RingPointerStopsAtFirstUnconsumedTask) {
+  TMRSystem sys;
+  ASSERT_TRUE(sys.Init());
+
+  auto &ring_state = sys.sched.ring_sched_states[0];
+  PTO2TaskDescriptor descs[3]{};
+
+  PTO2TaskSlotState &slot0 = sys.sm->header->rings[0].get_slot_state_by_slot(0);
+  slot0.task = &descs[0];
+  slot0.task_state.store(PTO2_TASK_CONSUMED, std::memory_order_relaxed);
+
+  PTO2TaskSlotState &slot1 = sys.sm->header->rings[0].get_slot_state_by_slot(1);
+  slot1.task = &descs[1];
+  slot1.task_state.store(PTO2_TASK_COMPLETED, std::memory_order_relaxed);
+
+  PTO2TaskSlotState &slot2 = sys.sm->header->rings[0].get_slot_state_by_slot(2);
+  slot2.task = &descs[2];
+  slot2.task_state.store(PTO2_TASK_CONSUMED, std::memory_order_relaxed);
+
+  sys.sm->header->rings[0].fc.current_task_index.store(3, std::memory_order_relaxed);
+
+  ring_state.advance_ring_pointers();
+
+  EXPECT_EQ(ring_state.last_task_alive, 1);
+  EXPECT_EQ(sys.sm->header->rings[0].fc.last_task_alive.load(std::memory_order_acquire), static_cast<int32_t>(1));
+
+  sys.Destroy();
+}
+
+TEST(RuntimeSchedulerBehavior, ReadyQueuesAcceptEveryResourceShape) {
+  TMRSystem sys;
+  ASSERT_TRUE(sys.Init());
+
+  for (int shape = 0; shape < PTO2_NUM_RESOURCE_SHAPES; shape++) {
+    PTO2TaskSlotState slot{};
+    EXPECT_TRUE(sys.sched.ready_queues[shape].push(&slot)) << "shape=" << shape;
+    EXPECT_EQ(sys.sched.ready_queues[shape].pop(), &slot) << "shape=" << shape;
+  }
+
+  sys.Destroy();
+}
+
+TEST(RuntimeTensorMapBehavior, StandaloneInsertLookupDoesNotNeedOrchestratorPointer) {
+  int32_t window_sizes[PTO2_MAX_RING_DEPTH] = {16, 16, 16, 16};
+  PTO2TensorMap tmap{};
+  ASSERT_TRUE(tmap.init(256, 1024, window_sizes));
+
+  Tensor tensor = make_tensor(0x1000);
+  tmap.insert(tensor, PTO2TaskId::make(0, 0));
+
+  TestLookupResult result;
+  run_lookup(tmap, tensor, result);
+
+  ASSERT_EQ(result.count, 1);
+  EXPECT_EQ(result.entries[0].entry->producer_task_id, PTO2TaskId::make(0, 0));
+  EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::COVERED);
+
+  tmap.destroy();
+}
+
+TEST(RuntimeTensorMapBehavior, ValiditySkipsRetiredEntries) {
+  int32_t window_sizes[PTO2_MAX_RING_DEPTH] = {256, 256, 256, 256};
+  PTO2TensorMap tmap{};
+  ASSERT_TRUE(tmap.init(256, 4096, window_sizes));
+
+  Tensor tensor = make_tensor(0x2000);
+  for (int i = 0; i < 100; i++) {
+    tmap.insert(tensor, PTO2TaskId::make(0, i));
+  }
+
+  tmap.sync_validity(0, 80);
+
+  TestLookupResult result;
+  run_lookup(tmap, tensor, result);
+
+  EXPECT_EQ(result.count, 20);
+  for (const auto &entry : result.entries) {
+    EXPECT_GE(entry.entry->producer_task_id.local(), 80u);
+    EXPECT_EQ(entry.overlap_status, OverlapStatus::COVERED);
+  }
+
+  tmap.destroy();
+}
+
+TEST(RuntimeTensorMapBehavior, AllRingsCanProduceForSameTensor) {
+  int32_t window_sizes[PTO2_MAX_RING_DEPTH];
+  for (int i = 0; i < PTO2_MAX_RING_DEPTH; i++) {
+    window_sizes[i] = kWindowSize;
+  }
+  PTO2TensorMap tmap{};
+  ASSERT_TRUE(tmap.init(256, 1024, window_sizes));
+
+  Tensor tensor = make_tensor(0x3000);
+  for (int ring = 0; ring < PTO2_MAX_RING_DEPTH; ring++) {
+    tmap.insert(tensor, PTO2TaskId::make(ring, 0));
+  }
+
+  TestLookupResult result;
+  run_lookup(tmap, tensor, result);
+
+  EXPECT_EQ(result.count, PTO2_MAX_RING_DEPTH);
+
+  tmap.destroy();
+}
+
+TEST(RuntimeIntegrationBehavior, OrchestratorTensorMapUsesConfiguredWindow) {
+  TMRSystem sys;
+  ASSERT_TRUE(sys.Init());
+
+  Tensor tensor = make_tensor(0x4000);
+  sys.orch.begin_scope();
+  sys.orch.tensor_map.insert(tensor, PTO2TaskId::make(0, 0));
+
+  TestLookupResult result;
+  run_lookup(sys.orch.tensor_map, tensor, result);
+
+  sys.orch.end_scope();
+
+  ASSERT_EQ(result.count, 1);
+  EXPECT_EQ(result.entries[0].entry->producer_task_id, PTO2TaskId::make(0, 0));
+  EXPECT_EQ(sys.sm->header->orch_error_code.load(std::memory_order_acquire), PTO2_ERROR_NONE);
+
+  sys.Destroy();
+}
diff --git a/tests/ut/cpp/a2a3/test_runtime_lifecycle.cpp b/tests/ut/cpp/a2a3/test_runtime_lifecycle.cpp
new file mode 100644
index 000000000..b1987ed9f
--- /dev/null
+++ b/tests/ut/cpp/a2a3/test_runtime_lifecycle.cpp
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * PTO2 Runtime lifecycle UT.
+ *
+ * Covers runtime_create / _custom / _from_sm / _destroy / set_mode.
+ *
+ * Follows AAA and FIRST: no shared mutable state between tests, each test
+ * constructs its own runtime and tears it down.
+ */
+
+#include <gtest/gtest.h>
+
+#include <cstdlib>
+
+#include "pto_runtime2.h"
+#include "pto_shared_memory.h"
+
+namespace {
+
+constexpr uint64_t kSmallWindow = 64;
+constexpr uint64_t kSmallHeap = 64 * 1024;
+
+// -----------------------------------------------------------------------------
+// Fixture: each test gets a fresh, isolated runtime config.
+// -----------------------------------------------------------------------------
+class RuntimeLifecycleTest : public ::testing::Test {
+protected:
+  PTO2Runtime *rt_ = nullptr;
+
+  void TearDown() override {
+    if (rt_ != nullptr) {
+      runtime_destroy(rt_);
+      rt_ = nullptr;
+    }
+  }
+};
+
+} // namespace
+
+// ---------- Happy-path creation ----------
+
+TEST_F(RuntimeLifecycleTest, CreateCustom_ValidSizes_ReturnsInitializedRuntime) {
+  // Arrange + Act
+  rt_ = runtime_create_custom(PTO2_MODE_SIMULATE, kSmallWindow, kSmallHeap);
+
+  // Assert
+  ASSERT_NE(rt_, nullptr);
+  EXPECT_NE(rt_->ops, nullptr);
+  EXPECT_NE(rt_->sm_handle, nullptr);
+  EXPECT_NE(rt_->gm_heap, nullptr);
+  EXPECT_TRUE(rt_->gm_heap_owned);
+  EXPECT_EQ(rt_->mode, PTO2_MODE_SIMULATE);
+  EXPECT_EQ(rt_->gm_heap_size, kSmallHeap * PTO2_MAX_RING_DEPTH);
+}
+
+TEST_F(RuntimeLifecycleTest, CreateCustom_ConnectsOrchestratorToScheduler) {
+  rt_ = runtime_create_custom(PTO2_MODE_EXECUTE, kSmallWindow, kSmallHeap);
+
+  ASSERT_NE(rt_, nullptr);
+  // The orchestrator must hold a pointer to the scheduler after creation.
+  EXPECT_EQ(rt_->orchestrator.scheduler, &rt_->scheduler);
+}
+
+TEST_F(RuntimeLifecycleTest, CreateDefault_UsesDefaultSizes) {
+  // create() is a thin wrapper around create_custom with PTO2_TASK_WINDOW_SIZE / PTO2_HEAP_SIZE.
+  // Use GRAPH_ONLY to avoid executor threads. We don't allocate the full
+  // 256MB heap in this path -- keep the assertion restricted to mode.
+  rt_ = runtime_create(PTO2_MODE_GRAPH_ONLY);
+  ASSERT_NE(rt_, nullptr);
+  EXPECT_EQ(rt_->mode, PTO2_MODE_GRAPH_ONLY);
+}
+
+// ---------- From-SM creation ----------
+
+TEST_F(RuntimeLifecycleTest, CreateFromSM_NullHandle_ReturnsNull) {
+  // Act
+  PTO2Runtime *rt = runtime_create_from_sm(PTO2_MODE_SIMULATE, nullptr, nullptr, 0);
+
+  // Assert
+  EXPECT_EQ(rt, nullptr);
+}
+
+TEST_F(RuntimeLifecycleTest, CreateFromSM_RecordsCallerBuffers) {
+  // Arrange: caller-allocated sm + gm_heap.
+  PTO2SharedMemoryHandle *sm = PTO2SharedMemoryHandle::create(kSmallWindow, kSmallHeap);
+  ASSERT_NE(sm, nullptr);
+  uint8_t *heap = static_cast<uint8_t *>(std::calloc(PTO2_MAX_RING_DEPTH, kSmallHeap));
+  ASSERT_NE(heap, nullptr);
+
+  // Act
+  rt_ = runtime_create_from_sm(PTO2_MODE_EXECUTE, sm, heap, kSmallHeap);
+
+  // Assert: the returned runtime must NOT claim ownership of the gm_heap.
+  ASSERT_NE(rt_, nullptr);
+  EXPECT_EQ(rt_->sm_handle, sm);
+  EXPECT_EQ(rt_->gm_heap, heap);
+  EXPECT_FALSE(rt_->gm_heap_owned);
+
+  // Cleanup: runtime_destroy consumes sm via pto2_sm_destroy (observed
+  // behavior, see pto_runtime2.cpp:339), so only free the gm_heap here.
+  runtime_destroy(rt_);
+  rt_ = nullptr;
+  std::free(heap);
+}
+
+// ---------- Destroy ----------
+
+TEST_F(RuntimeLifecycleTest, Destroy_NullRuntime_NoCrash) {
+  // Documented contract: destroy(nullptr) is a no-op.
+  runtime_destroy(nullptr);
+  SUCCEED();
+}
+
+TEST_F(RuntimeLifecycleTest, Destroy_ReleasesOwnedHeap) {
+  rt_ = runtime_create_custom(PTO2_MODE_SIMULATE, kSmallWindow, kSmallHeap);
+  ASSERT_NE(rt_, nullptr);
+  // Act: explicitly destroy and null out so TearDown doesn't double-free.
+  runtime_destroy(rt_);
+  rt_ = nullptr;
+  // Assert: reaching here without asan/ubsan complaint is the test (leak-free).
+  SUCCEED();
+}
+
+// ---------- set_mode ----------
+
+TEST_F(RuntimeLifecycleTest, SetMode_UpdatesField) {
+  rt_ = runtime_create_custom(PTO2_MODE_EXECUTE, kSmallWindow, kSmallHeap);
+  ASSERT_NE(rt_, nullptr);
+  ASSERT_EQ(rt_->mode, PTO2_MODE_EXECUTE);
+
+  // Act
+  runtime_set_mode(rt_, PTO2_MODE_GRAPH_ONLY);
+
+  // Assert
+  EXPECT_EQ(rt_->mode, PTO2_MODE_GRAPH_ONLY);
+}
+
+TEST_F(RuntimeLifecycleTest, SetMode_NullRuntime_NoCrash) {
+  // Contract: defensive null check, mirrors destroy.
+  runtime_set_mode(nullptr, PTO2_MODE_SIMULATE);
+  SUCCEED();
+}
+
+// ---------- Ops table wiring ----------
+
+TEST_F(RuntimeLifecycleTest, OpsTable_AllFunctionPointersPopulated) {
+  rt_ = runtime_create_custom(PTO2_MODE_SIMULATE, kSmallWindow, kSmallHeap);
+  ASSERT_NE(rt_, nullptr);
+  const PTO2RuntimeOps *ops = rt_->ops;
+  ASSERT_NE(ops, nullptr);
+
+  // Hot-path ops called by the orchestration .so -- must never be null.
+  EXPECT_NE(ops->submit_task, nullptr);
+  EXPECT_NE(ops->alloc_tensors, nullptr);
+  EXPECT_NE(ops->scope_begin, nullptr);
+  EXPECT_NE(ops->scope_end, nullptr);
+  EXPECT_NE(ops->orchestration_done, nullptr);
+  EXPECT_NE(ops->is_fatal, nullptr);
+  EXPECT_NE(ops->report_fatal, nullptr);
+  EXPECT_NE(ops->get_tensor_data, nullptr);
+  EXPECT_NE(ops->set_tensor_data, nullptr);
+}
+
+TEST_F(RuntimeLifecycleTest, IsFatal_FreshRuntime_ReturnsFalse) {
+  rt_ = runtime_create_custom(PTO2_MODE_SIMULATE, kSmallWindow, kSmallHeap);
+  ASSERT_NE(rt_, nullptr);
+  EXPECT_FALSE(rt_->ops->is_fatal(rt_));
+}
+
+TEST_F(RuntimeLifecycleTest, ReportFatal_SetsFatalFlag) {
+  rt_ = runtime_create_custom(PTO2_MODE_SIMULATE, kSmallWindow, kSmallHeap);
+  ASSERT_NE(rt_, nullptr);
+
+  // Act
+  rt_->ops->report_fatal(rt_, PTO2_ERROR_EXPLICIT_ORCH_FATAL, "UT", "%s", "forced");
+
+  // Assert
+  EXPECT_TRUE(rt_->ops->is_fatal(rt_));
+}
diff --git a/tests/ut/cpp/a2a3/test_shared_memory.cpp b/tests/ut/cpp/a2a3/test_shared_memory.cpp
index 44182090a..cd9b4d526 100644
--- a/tests/ut/cpp/a2a3/test_shared_memory.cpp
+++ b/tests/ut/cpp/a2a3/test_shared_memory.cpp
@@ -31,6 +31,8 @@
 #include 
 #include 
+#include <thread>
+#include <vector>
 #include "pto_shared_memory.h"
 
 // =============================================================================
@@ -187,3 +189,158 @@ TEST(SharedMemoryBoundary, CreateFromUndersizedBuffer) {
   PTO2SharedMemoryHandle *h = PTO2SharedMemoryHandle::create_from_buffer(buf, 64, 256, 4096);
   EXPECT_EQ(h, nullptr) << "Undersized buffer should fail";
 }
+
+// =============================================================================
+// Concurrent read/write of per-ring flow control
+// =============================================================================
+
+class SharedMemoryConcurrentTest : public ::testing::Test {
+protected:
+  PTO2SharedMemoryHandle *handle = nullptr;
+
+  void SetUp() override {
+    handle = PTO2SharedMemoryHandle::create(256, 4096);
+    ASSERT_NE(handle, nullptr);
+  }
+
+  void TearDown() override {
+    if (handle) {
+      handle->destroy();
+      handle = nullptr;
+    }
+  }
+};
+
+TEST_F(SharedMemoryConcurrentTest, PerRingTaskIndexIsolation) {
+  constexpr int kIterations = 10000;
+
+  auto writer = [&](int ring) {
+    auto &fc = handle->header->rings[ring].fc;
+    int32_t base = ring * 100000;
+    for (int i = 1; i <= kIterations; i++) {
+      fc.current_task_index.store(base + i, std::memory_order_release);
+    }
+  };
+
+  struct Observation {
+    bool went_backward = false;
+    bool saw_other_ring_range = false;
+  };
+
+  auto reader = [&](int ring, Observation *obs) {
+    auto &fc = handle->header->rings[ring].fc;
+    int32_t base = ring * 100000;
+    int32_t prev = 0;
+    for (int i = 0; i < kIterations; i++) {
+      int32_t val = fc.current_task_index.load(std::memory_order_acquire);
+      if (val < prev) {
+        obs->went_backward = true;
+      }
+      if (val != 0 && (val <= base || val > base + kIterations)) {
+        obs->saw_other_ring_range = true;
+      }
+      prev = val;
+    }
+  };
+
+  Observation ring0;
+  Observation ring1;
+
+  std::thread w0(writer, 0);
+  std::thread w1(writer, 1);
+  std::thread r0(reader, 0, &ring0);
+  std::thread r1(reader, 1, &ring1);
+
+  w0.join();
+  w1.join();
+  r0.join();
+  r1.join();
+
+  EXPECT_FALSE(ring0.went_backward) << "Ring 0 current_task_index should be monotonic";
+  EXPECT_FALSE(ring1.went_backward) << "Ring 1 current_task_index should be monotonic";
+  EXPECT_FALSE(ring0.saw_other_ring_range) << "Ring 0 should not observe ring 1 values";
+  EXPECT_FALSE(ring1.saw_other_ring_range) << "Ring 1 should not observe ring 0 values";
+
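+  // Per-ring isolation also implies each ring ends at its own writer's
+  // final store: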
EXPECT_EQ(handle->header->rings[0].fc.current_task_index.load(), static_cast(kIterations)); + EXPECT_EQ(handle->header->rings[1].fc.current_task_index.load(), static_cast(100000 + kIterations)); +} + +TEST_F(SharedMemoryConcurrentTest, TaskIndexAtomicIncrement) { + constexpr int kIncrements = 5000; + constexpr int kThreads = 4; + + auto &fc = handle->header->rings[0].fc; + fc.current_task_index.store(0, std::memory_order_relaxed); + + auto incrementer = [&]() { + for (int i = 0; i < kIncrements; i++) { + fc.current_task_index.fetch_add(1, std::memory_order_acq_rel); + } + }; + + std::vector threads; + for (int i = 0; i < kThreads; i++) { + threads.emplace_back(incrementer); + } + for (auto &t : threads) + t.join(); + + EXPECT_EQ(fc.current_task_index.load(), kIncrements * kThreads) << "Concurrent increments should not lose updates"; +} + +TEST_F(SharedMemoryConcurrentTest, LastTaskAliveMonotonic) { + constexpr int kIterations = 10000; + constexpr int kThreads = 4; + + auto &fc = handle->header->rings[0].fc; + fc.last_task_alive.store(0, std::memory_order_relaxed); + + auto advancer = [&](int id) { + for (int i = 0; i < kIterations; i++) { + int32_t desired = id * kIterations + i + 1; + int32_t current = fc.last_task_alive.load(std::memory_order_acquire); + while (current < desired) { + if (fc.last_task_alive.compare_exchange_weak( + current, desired, std::memory_order_acq_rel, std::memory_order_acquire + )) { + break; + } + } + } + }; + + std::vector threads; + for (int i = 0; i < kThreads; i++) { + threads.emplace_back(advancer, i); + } + for (auto &t : threads) + t.join(); + + int32_t final_val = fc.last_task_alive.load(); + EXPECT_EQ(final_val, kIterations * kThreads) << "last_task_alive should advance to the largest published value"; +} + +TEST_F(SharedMemoryConcurrentTest, ValidateAfterConcurrentWrites) { + constexpr int kIterations = 1000; + + auto writer = [&](int ring) { + auto &fc = handle->header->rings[ring].fc; + for (int i = 0; i < kIterations; i++) { + fc.current_task_index.store(static_cast(i % 256), std::memory_order_release); + } + }; + + std::thread w0(writer, 0); + std::thread w1(writer, 1); + std::thread w2(writer, 2); + std::thread w3(writer, 3); + w0.join(); + w1.join(); + w2.join(); + w3.join(); + + EXPECT_TRUE(handle->validate()) << "Valid current_task_index values should pass validation"; + + handle->header->rings[2].fc.current_task_index.store(-1, std::memory_order_relaxed); + EXPECT_FALSE(handle->validate()) << "Corrupted current_task_index should fail validation"; +} diff --git a/tests/ut/cpp/a2a3/test_task_allocator.cpp b/tests/ut/cpp/a2a3/test_task_allocator.cpp index 383003900..443a2e5da 100644 --- a/tests/ut/cpp/a2a3/test_task_allocator.cpp +++ b/tests/ut/cpp/a2a3/test_task_allocator.cpp @@ -405,3 +405,99 @@ TEST_F(TaskAllocatorTest, TaskIdNearInt32Max) { EXPECT_GE(r3.slot, 0); EXPECT_LT(r3.slot, WINDOW_SIZE); } + +// ============================================================================= +// Re-init safety +// ============================================================================= + +class TaskAllocatorReinitTest : public ::testing::Test { +protected: + static constexpr int32_t WINDOW_SIZE = 16; + static constexpr uint64_t HEAP_SIZE = 1024; + + std::vector descriptors; + alignas(64) uint8_t heap_buf[1024]{}; + std::atomic current_index{0}; + std::atomic last_alive{0}; + std::atomic error_code{PTO2_ERROR_NONE}; + PTO2TaskAllocator allocator{}; + + void InitAllocator() { + descriptors.assign(WINDOW_SIZE, PTO2TaskDescriptor{}); + 
+        std::memset(heap_buf, 0, sizeof(heap_buf));
+        current_index.store(0);
+        last_alive.store(0);
+        error_code.store(PTO2_ERROR_NONE);
+        allocator.init(descriptors.data(), WINDOW_SIZE, &current_index, &last_alive, heap_buf, HEAP_SIZE, &error_code);
+    }
+};
+
+TEST_F(TaskAllocatorReinitTest, ReInitAfterUse) {
+    InitAllocator();
+
+    auto r1 = allocator.alloc(128);
+    ASSERT_FALSE(r1.failed());
+    auto r2 = allocator.alloc(128);
+    ASSERT_FALSE(r2.failed());
+    EXPECT_EQ(r2.task_id, 1);
+
+    InitAllocator();
+
+    auto r3 = allocator.alloc(64);
+    ASSERT_FALSE(r3.failed());
+    EXPECT_EQ(r3.task_id, 0) << "Re-init should reset task ID counter";
+    EXPECT_EQ(r3.slot, 0);
+}
+
+TEST_F(TaskAllocatorReinitTest, ReInitDifferentHeapSize) {
+    InitAllocator();
+
+    auto r1 = allocator.alloc(HEAP_SIZE);
+    ASSERT_FALSE(r1.failed());
+    EXPECT_EQ(allocator.heap_top(), HEAP_SIZE);
+
+    InitAllocator();
+    EXPECT_EQ(allocator.heap_top(), 0u) << "Re-init resets heap_top";
+    EXPECT_EQ(allocator.heap_available(), HEAP_SIZE) << "Re-init restores full capacity";
+}
+
+TEST_F(TaskAllocatorReinitTest, ReInitClearsErrorState) {
+    InitAllocator();
+
+    auto r = allocator.alloc(HEAP_SIZE * 2);
+    EXPECT_TRUE(r.failed());
+    EXPECT_NE(error_code.load(), PTO2_ERROR_NONE);
+
+    InitAllocator();
+    EXPECT_EQ(error_code.load(), PTO2_ERROR_NONE);
+
+    auto r2 = allocator.alloc(64);
+    EXPECT_FALSE(r2.failed());
+}
+
+TEST_F(TaskAllocatorReinitTest, MultipleReInitCycles) {
+    for (int cycle = 0; cycle < 10; cycle++) {
+        InitAllocator();
+
+        for (int i = 0; i < WINDOW_SIZE - 1; i++) {
+            auto r = allocator.alloc(0);
+            ASSERT_FALSE(r.failed()) << "Cycle " << cycle << " alloc " << i;
+            EXPECT_EQ(r.task_id, i);
+        }
+    }
+}
+
+TEST_F(TaskAllocatorReinitTest, ReInitIgnoresStaleLastAlive) {
+    InitAllocator();
+
+    auto r1 = allocator.alloc(64);
+    ASSERT_FALSE(r1.failed());
+    last_alive.store(5, std::memory_order_release);
+
+    InitAllocator();
+    EXPECT_EQ(last_alive.load(), 0);
+
+    auto r2 = allocator.alloc(64);
+    ASSERT_FALSE(r2.failed());
+    EXPECT_EQ(r2.task_id, 0);
+}
diff --git a/tests/ut/cpp/a2a3/test_tensormap.cpp b/tests/ut/cpp/a2a3/test_tensormap.cpp
index 10eef0317..b7ceb4ce5 100644
--- a/tests/ut/cpp/a2a3/test_tensormap.cpp
+++ b/tests/ut/cpp/a2a3/test_tensormap.cpp
@@ -65,6 +65,30 @@ static Tensor make_test_tensor_2d(uint64_t addr, uint32_t s0, uint32_t s1, int32
     return make_tensor_external(reinterpret_cast<void *>(addr), shapes, 2, DataType::FLOAT32, false, version);
 }
 
+static Tensor make_test_tensor_nd(
+    uint64_t addr, uint32_t ndims, const uint32_t shapes[], const uint32_t offsets[] = nullptr,
+    int32_t version = 0
+) {
+    uint32_t seed_shape[1] = {1};
+    Tensor t = make_tensor_external(reinterpret_cast<void *>(addr), seed_shape, 1, DataType::FLOAT32, false, 0);
+    uint32_t s[RUNTIME_MAX_TENSOR_DIMS]{};
+    uint32_t rs[RUNTIME_MAX_TENSOR_DIMS]{};
+    uint32_t o[RUNTIME_MAX_TENSOR_DIMS]{};
+    bool all_zero = true;
+    for (uint32_t i = 0; i < ndims && i < RUNTIME_MAX_TENSOR_DIMS; i++) {
+        s[i] = shapes[i];
+        rs[i] = shapes[i];
+        o[i] = offsets ? offsets[i] : 0;
+        if (o[i] != 0) all_zero = false;
+    }
+    uint64_t total = 4;
+    for (uint32_t i = 0; i < ndims; i++) {
+        total *= (rs[i] + (offsets ? offsets[i] : 0));
+    }
+    t.init(reinterpret_cast<void *>(addr), total, rs, s, o, ndims, DataType::FLOAT32, version, all_zero, true);
+    return t;
+}
+
 // =============================================================================
 // Fixture
 // =============================================================================
@@ -549,3 +573,105 @@ TEST(TaskIdTest, LocalIdMaxValue) {
     EXPECT_EQ(tid.ring(), 0);
     EXPECT_EQ(tid.local(), UINT32_MAX);
 }
+
+// =============================================================================
+// Edge cases merged from test_tensormap_overlap.cpp
+// =============================================================================
+
+TEST_F(TensorMapTest, ZeroDimensionTensor) {
+    // ndims=0: fast-path loop doesn't execute, contains=true -> COVERED
+    uint32_t seed_shape[1] = {1};
+    Tensor t = make_tensor_external(reinterpret_cast<void *>(0x2000), seed_shape, 1, DataType::FLOAT32, false, 0);
+    uint32_t s[RUNTIME_MAX_TENSOR_DIMS]{}, o[RUNTIME_MAX_TENSOR_DIMS]{};
+    t.init(reinterpret_cast<void *>(0x2000), 0, s, s, o, 0, DataType::FLOAT32, 0, true, true);
+
+    tmap.insert(t, PTO2TaskId::make(0, 0));
+
+    TestLookupResult result;
+    run_lookup(tmap, t, result);
+    ASSERT_EQ(result.count, 1);
+    EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::COVERED);
+}
+
+TEST_F(TensorMapTest, TwoZeroDimTensorsSameAddr) {
+    uint32_t seed_shape[1] = {1};
+    Tensor t1 = make_tensor_external(reinterpret_cast<void *>(0x2100), seed_shape, 1, DataType::FLOAT32, false, 0);
+    Tensor t2 = make_tensor_external(reinterpret_cast<void *>(0x2100), seed_shape, 1, DataType::FLOAT32, false, 0);
+    uint32_t s[RUNTIME_MAX_TENSOR_DIMS]{}, o[RUNTIME_MAX_TENSOR_DIMS]{};
+    t1.init(reinterpret_cast<void *>(0x2100), 0, s, s, o, 0, DataType::FLOAT32, 0, true, true);
+    t2.init(reinterpret_cast<void *>(0x2100), 0, s, s, o, 0, DataType::FLOAT32, 0, true, true);
+
+    tmap.insert(t1, PTO2TaskId::make(0, 0));
+    tmap.insert(t2, PTO2TaskId::make(0, 1));
+
+    TestLookupResult result;
+    run_lookup(tmap, t1, result);
+    EXPECT_EQ(result.count, 2);
+    for (int i = 0; i < result.count; i++) {
+        EXPECT_EQ(result.entries[i].overlap_status, OverlapStatus::COVERED)
+            << "0-dim tensors always report COVERED (empty loop -> contains=true)";
+    }
+}
+
+TEST_F(TensorMapTest, AdjacentNoOverlap) {
+    uint32_t prod_shapes[] = {100};
+    Tensor prod = make_test_tensor_nd(0x8000, 1, prod_shapes, nullptr, 0);
+    tmap.insert(prod, PTO2TaskId::make(0, 0));
+
+    uint32_t cons_shapes[] = {100};
+    uint32_t cons_offsets[] = {100};
+    Tensor cons = make_test_tensor_nd(0x8000, 1, cons_shapes, cons_offsets, 0);
+
+    TestLookupResult result;
+    run_lookup(tmap, cons, result);
+    EXPECT_EQ(result.count, 0) << "Adjacent regions [0,100) and [100,200) must not overlap";
+}
+
+TEST_F(TensorMapTest, OneElementOverlap) {
+    uint32_t prod_shapes[] = {100};
+    Tensor prod = make_test_tensor_nd(0x8100, 1, prod_shapes, nullptr, 0);
+    tmap.insert(prod, PTO2TaskId::make(0, 0));
+
+    uint32_t cons_shapes[] = {100};
+    uint32_t cons_offsets[] = {99};
+    Tensor cons = make_test_tensor_nd(0x8100, 1, cons_shapes, cons_offsets, 0);
+
+    TestLookupResult result;
+    run_lookup(tmap, cons, result);
+    ASSERT_EQ(result.count, 1);
+    EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::OTHER) << "Partial overlap (1 element) -> OTHER";
+}
+
+TEST_F(TensorMapTest, ZeroShapeInDimension) {
+    // Producer: 2D [10, 0] -- zero in dim 1
+    uint32_t prod_shapes[] = {10, 0};
+    Tensor prod = make_test_tensor_nd(0x8200, 2, prod_shapes, nullptr, 0);
+    tmap.insert(prod, PTO2TaskId::make(0, 0));
+
+    // Consumer: 2D [10, 20]
+    uint32_t cons_shapes[] = {10, 20};
+    Tensor cons = make_test_tensor_nd(0x8200, 2, cons_shapes, nullptr, 0);
+
+    TestLookupResult result;
+    run_lookup(tmap, cons, result);
+    ASSERT_EQ(result.count, 1);
+    // Fast path: input.shapes[1](20) >= entry.shapes[1](0) -> contains=true -> COVERED
+    EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::COVERED)
+        << "Zero-shape producer is COVERED by any consumer (empty production)";
+}
+
+TEST_F(TensorMapTest, FullFiveDimensionalOverlap) {
+    uint32_t prod_shapes[] = {2, 3, 4, 5, 6};
+    Tensor prod = make_test_tensor_nd(0x9200, 5, prod_shapes, nullptr, 0);
+    tmap.insert(prod, PTO2TaskId::make(0, 0));
+
+    // Consumer with larger shapes in all dims -> COVERED
+    uint32_t cons_shapes[] = {4, 6, 8, 10, 12};
+    Tensor cons = make_test_tensor_nd(0x9200, 5, cons_shapes, nullptr, 0);
+
+    TestLookupResult result;
+    run_lookup(tmap, cons, result);
+    ASSERT_EQ(result.count, 1);
+    EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::COVERED)
+        << "5D consumer covers 5D producer in all dimensions";
+}
diff --git a/tests/ut/cpp/a5/test_link_isolation.cpp b/tests/ut/cpp/a5/test_link_isolation.cpp
new file mode 100644
index 000000000..78f1378a9
--- /dev/null
+++ b/tests/ut/cpp/a5/test_link_isolation.cpp
@@ -0,0 +1,199 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Behavior tests for runtime components built without pto_orchestrator.cpp.
+ *
+ * The CMake target for this file deliberately omits the orchestrator source.
+ * Passing build and runtime assertions here verifies that scheduler, ring
+ * buffer, shared memory, and TensorMap behavior does not require an
+ * orchestrator link dependency.
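+ * If any component below accidentally grew an orchestrator dependency, this
+ * target would fail at link time -- that failure is the signal this suite exists for.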
+ */
+
+#include <gtest/gtest.h>
+
+#include <atomic>
+#include <cstring>
+#include <vector>
+
+#include "pto_orchestration_api.h"
+#include "pto_ring_buffer.h"
+#include "pto_runtime2_types.h"
+#include "pto_shared_memory.h"
+#include "pto_tensormap.h"
+#include "scheduler/pto_scheduler.h"
+
+namespace {
+
+constexpr uint64_t kHeapSize = 64 * 1024;
+constexpr int32_t kWindowSize = 64;
+
+struct TestLookupResult {
+    struct Entry {
+        PTO2TensorMapEntry *entry;
+        OverlapStatus overlap_status;
+    };
+    std::vector<Entry> entries;
+    int count = 0;
+};
+
+void run_lookup(PTO2TensorMap &tmap, const Tensor &tensor, TestLookupResult &out) {
+    tmap.lookup(tensor, [&](PTO2TensorMapEntry &entry, OverlapStatus status) -> bool {
+        out.entries.push_back({&entry, status});
+        out.count++;
+        return true;
+    });
+}
+
+Tensor make_tensor(uint64_t addr, uint32_t shape0 = 100, int32_t version = 0) {
+    uint32_t shapes[RUNTIME_MAX_TENSOR_DIMS] = {shape0};
+    return make_tensor_external(reinterpret_cast<void *>(addr), shapes, 1, DataType::FLOAT32, false, version);
+}
+
+struct DepPoolFixture {
+    PTO2DepListEntry entries[512];
+    std::atomic<int32_t> error_code{PTO2_ERROR_NONE};
+    PTO2DepListPool pool{};
+
+    void Init() {
+        std::memset(entries, 0, sizeof(entries));
+        error_code.store(PTO2_ERROR_NONE, std::memory_order_relaxed);
+        pool.init(entries, 512, &error_code);
+    }
+
+    void AllocN(int count) {
+        for (int i = 0; i < count; i++) {
+            ASSERT_NE(pool.alloc(), nullptr);
+        }
+    }
+};
+
+}  // namespace
+
+TEST(LinkIsolationDepPool, ReclaimBelowIntervalKeepsAllocatedEntries) {
+    DepPoolFixture fixture;
+    fixture.Init();
+    fixture.AllocN(100);
+    int32_t used_before = fixture.pool.used();
+
+    PTO2SharedMemoryRingHeader ring_header{};
+    fixture.pool.reclaim(ring_header, PTO2_DEP_POOL_CLEANUP_INTERVAL - 1);
+
+    EXPECT_EQ(fixture.pool.used(), used_before);
+    EXPECT_EQ(fixture.error_code.load(std::memory_order_acquire), PTO2_ERROR_NONE);
+}
+
+TEST(LinkIsolationDepPool, ReclaimAtIntervalUsesConsumedTaskMark) {
+    DepPoolFixture fixture;
+    fixture.Init();
+    fixture.AllocN(100);
+
+    std::vector<PTO2TaskSlotState> slots(kWindowSize);
+    PTO2SharedMemoryRingHeader ring_header{};
+    ring_header.slot_states = slots.data();
+    ring_header.task_window_size = kWindowSize;
+    ring_header.task_window_mask = kWindowSize - 1;
+
+    int32_t last_alive = PTO2_DEP_POOL_CLEANUP_INTERVAL;
+    int32_t mark_slot = (last_alive - 1) & ring_header.task_window_mask;
+    slots[mark_slot].dep_pool_mark = 50;
+
+    fixture.pool.reclaim(ring_header, last_alive);
+
+    EXPECT_EQ(fixture.pool.used(), 51);
+    EXPECT_EQ(fixture.error_code.load(std::memory_order_acquire), PTO2_ERROR_NONE);
+}
+
+TEST(LinkIsolationScheduler, ReleaseFaninPushesReadyTask) {
+    PTO2SharedMemoryHandle *sm = PTO2SharedMemoryHandle::create(kWindowSize, kHeapSize);
+    ASSERT_NE(sm, nullptr);
+
+    PTO2SchedulerState sched{};
+    ASSERT_TRUE(sched.init(sm->header));
+
+    alignas(64) PTO2TaskSlotState slot{};
+    slot.fanin_count = 1;
+    slot.fanin_refcount.store(0, std::memory_order_relaxed);
+    slot.task_state.store(PTO2_TASK_PENDING, std::memory_order_relaxed);
+    slot.active_mask = ActiveMask(PTO2_SUBTASK_MASK_AIV0);
+
+    EXPECT_TRUE(sched.release_fanin_and_check_ready(slot, nullptr));
+
+    PTO2ResourceShape shape = slot.active_mask.to_shape();
+    EXPECT_EQ(sched.ready_queues[static_cast<int>(shape)].pop(), &slot);
+
+    sched.destroy();
+    sm->destroy();
+}
+
+TEST(LinkIsolationScheduler, CompletedTaskCanBecomeConsumed) {
+    PTO2SharedMemoryHandle *sm = PTO2SharedMemoryHandle::create(kWindowSize, kHeapSize);
+    ASSERT_NE(sm, nullptr);
+
+    PTO2SchedulerState sched{};
+    ASSERT_TRUE(sched.init(sm->header));
+
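+    // Arrange: a COMPLETED slot whose fanout refcount already matches its fanout count.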
+    PTO2TaskDescriptor desc{};
+    PTO2TaskSlotState &slot = sm->header->rings[0].get_slot_state_by_slot(0);
+    slot.task = &desc;
+    slot.ring_id = 0;
+    slot.fanout_count = 1;
+    slot.fanout_refcount.store(1, std::memory_order_relaxed);
+    slot.task_state.store(PTO2_TASK_COMPLETED, std::memory_order_relaxed);
+    sm->header->rings[0].fc.current_task_index.store(1, std::memory_order_relaxed);
+
+    sched.check_and_handle_consumed(slot);
+
+    EXPECT_EQ(slot.task_state.load(std::memory_order_acquire), PTO2_TASK_CONSUMED);
+
+    sched.destroy();
+    sm->destroy();
+}
+
+TEST(LinkIsolationReadyQueue, PushPopBatchWithoutOrchestrator) {
+    PTO2ReadyQueue queue{};
+    ASSERT_TRUE(ready_queue_init(&queue, 16));
+
+    PTO2TaskSlotState items[4]{};
+    PTO2TaskSlotState *in[4] = {&items[0], &items[1], &items[2], &items[3]};
+    queue.push_batch(in, 4);
+
+    PTO2TaskSlotState *out[4]{};
+    EXPECT_EQ(queue.pop_batch(out, 4), 4);
+    for (int i = 0; i < 4; i++) {
+        EXPECT_EQ(out[i], &items[i]);
+    }
+    EXPECT_EQ(queue.pop(), nullptr);
+
+    ready_queue_destroy(&queue);
+}
+
+TEST(LinkIsolationTensorMap, InsertLookupAndValidityWithoutOrchestrator) {
+    int32_t window_sizes[PTO2_MAX_RING_DEPTH] = {kWindowSize, kWindowSize, kWindowSize, kWindowSize};
+    PTO2TensorMap tmap{};
+    ASSERT_TRUE(tmap.init(256, 1024, window_sizes));
+
+    Tensor tensor = make_tensor(0x3000);
+    for (int i = 0; i < kWindowSize; i++) {
+        tmap.insert(tensor, PTO2TaskId::make(0, i));
+    }
+
+    tmap.sync_validity(0, kWindowSize / 2);
+
+    TestLookupResult result;
+    run_lookup(tmap, tensor, result);
+    EXPECT_EQ(result.count, kWindowSize / 2);
+    for (const auto &entry : result.entries) {
+        EXPECT_GE(entry.entry->producer_task_id.local(), static_cast<uint32_t>(kWindowSize / 2));
+        EXPECT_EQ(entry.overlap_status, OverlapStatus::COVERED);
+    }
+
+    tmap.destroy();
+}
diff --git a/tests/ut/cpp/a5/test_orchestrator_report_fatal.cpp b/tests/ut/cpp/a5/test_orchestrator_report_fatal.cpp
new file mode 100644
index 000000000..24350c7c6
--- /dev/null
+++ b/tests/ut/cpp/a5/test_orchestrator_report_fatal.cpp
@@ -0,0 +1,194 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * UT for the orchestrator-side fatal reporting path.
+ *
+ * Targets orch_.report_fatal (pto_orchestrator.cpp) and verifies:
+ * - orch->fatal latches to true on any non-zero error code
+ * - the first non-zero code wins via CAS into sm_header->orch_error_code
+ * - subsequent fatal reports do NOT overwrite the first code
+ * - PTO2_ERROR_NONE never latches the shared-memory code (but still flips
+ *   the local fatal flag -- by design, callers may use it to mark fatal
+ *   without writing a code)
+ *
+ * This test exercises the real symbol against a fully-initialized
+ * orchestrator + shared memory pair, complementing the fake-runtime test
+ * (test_a5_fatal.cpp) that only validates the ops-table dispatch.
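+ * Put differently: orch_error_code behaves as a write-once cell seeded with PTO2_ERROR_NONE.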
+ */
+
+#include <gtest/gtest.h>
+
+#include <atomic>
+#include <cstdlib>
+
+#include "pto_orchestrator.h"
+#include "pto_runtime_status.h"
+#include "scheduler/pto_scheduler.h"
+#include "pto_shared_memory.h"
+
+namespace {
+
+constexpr uint64_t kHeapSize = 64 * 1024;
+constexpr int32_t kWindowSize = 64;
+constexpr int32_t kDepPool = 256;
+
+class OrchestratorFatalTest : public ::testing::Test {
+protected:
+    PTO2SharedMemoryHandle *sm_ = nullptr;
+    PTO2SchedulerState sched_{};
+    PTO2OrchestratorState orch_{};
+    uint8_t *gm_heap_ = nullptr;
+    bool sched_ok_ = false;
+    bool orch_ok_ = false;
+
+    void SetUp() override {
+        sm_ = PTO2SharedMemoryHandle::create(kWindowSize, kHeapSize);
+        ASSERT_NE(sm_, nullptr);
+
+        gm_heap_ = static_cast<uint8_t *>(std::calloc(PTO2_MAX_RING_DEPTH, kHeapSize));
+        ASSERT_NE(gm_heap_, nullptr);
+
+        sched_ok_ = sched_.init(sm_->header, kDepPool);
+        ASSERT_TRUE(sched_ok_);
+
+        orch_ok_ = orch_.init(sm_->header, gm_heap_, kHeapSize, kDepPool);
+        ASSERT_TRUE(orch_ok_);
+    }
+
+    void TearDown() override {
+        if (orch_ok_) orch_.destroy();
+        if (sched_ok_) sched_.destroy();
+        if (gm_heap_) std::free(gm_heap_);
+        if (sm_) sm_->destroy();
+    }
+
+    int32_t shared_orch_code() const { return sm_->header->orch_error_code.load(std::memory_order_acquire); }
+};
+
+}  // namespace
+
+// ---------- baseline ----------
+
+TEST_F(OrchestratorFatalTest, InitialState_NoFatalNoSharedCode) {
+    // Verify no fatal state via the observable shared memory output
+    EXPECT_FALSE(orch_.fatal);
+    EXPECT_EQ(shared_orch_code(), PTO2_ERROR_NONE);
+}
+
+// ---------- happy path: single fatal latches both local flag and shared code ----------
+
+TEST_F(OrchestratorFatalTest, ReportFatal_SetsLocalFlagAndSharedCode) {
+    orch_.report_fatal(PTO2_ERROR_HEAP_RING_DEADLOCK, "test", "deadlock at ring %d", 3);
+
+    EXPECT_TRUE(orch_.fatal);
+    EXPECT_EQ(shared_orch_code(), PTO2_ERROR_HEAP_RING_DEADLOCK);
+}
+
+// ---------- CAS first-writer-wins ----------
+
+TEST_F(OrchestratorFatalTest, SecondReportFatal_DoesNotOverwriteSharedCode) {
+    orch_.report_fatal(PTO2_ERROR_HEAP_RING_DEADLOCK, "test", nullptr);
+    orch_.report_fatal(PTO2_ERROR_DEP_POOL_OVERFLOW, "test", nullptr);
+
+    // Second report must NOT overwrite the first latched code.
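+    // The CAS from PTO2_ERROR_NONE can succeed only once, so the deadlock code must survive.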
+ EXPECT_TRUE(orch_.fatal); + EXPECT_EQ(shared_orch_code(), PTO2_ERROR_HEAP_RING_DEADLOCK); +} + +TEST_F(OrchestratorFatalTest, RepeatedSameCode_StaysLatched) { + orch_.report_fatal(PTO2_ERROR_INVALID_ARGS, "test", nullptr); + orch_.report_fatal(PTO2_ERROR_INVALID_ARGS, "test", nullptr); + + EXPECT_TRUE(orch_.fatal); + EXPECT_EQ(shared_orch_code(), PTO2_ERROR_INVALID_ARGS); +} + +// ---------- PTO2_ERROR_NONE: marks fatal locally, does NOT touch shared code ---------- + +TEST_F(OrchestratorFatalTest, ReportFatalWithErrorNone_DoesNotWriteSharedCode) { + orch_.report_fatal(PTO2_ERROR_NONE, "test", nullptr); + + EXPECT_TRUE(orch_.fatal); + EXPECT_EQ(shared_orch_code(), PTO2_ERROR_NONE); +} + +// ---------- PTO2_ERROR_NONE first does not block a real code from latching ---------- + +TEST_F(OrchestratorFatalTest, ErrorNoneFirst_RealCodeStillLatchesAfter) { + orch_.report_fatal(PTO2_ERROR_NONE, "test", nullptr); + EXPECT_TRUE(orch_.fatal); + EXPECT_EQ(shared_orch_code(), PTO2_ERROR_NONE); + + orch_.report_fatal(PTO2_ERROR_SCOPE_DEADLOCK, "test", nullptr); + EXPECT_EQ(shared_orch_code(), PTO2_ERROR_SCOPE_DEADLOCK); +} + +// ---------- coverage of every defined orchestrator code ---------- + +TEST_F(OrchestratorFatalTest, EveryOrchCode_LatchesIntoSharedMemory) { + const int32_t codes[] = { + PTO2_ERROR_SCOPE_DEADLOCK, + PTO2_ERROR_HEAP_RING_DEADLOCK, + PTO2_ERROR_FLOW_CONTROL_DEADLOCK, + PTO2_ERROR_DEP_POOL_OVERFLOW, + PTO2_ERROR_INVALID_ARGS, + PTO2_ERROR_DEPENDENCY_OVERFLOW, + PTO2_ERROR_REQUIRE_SYNC_START_INVALID, + PTO2_ERROR_TENSOR_WAIT_TIMEOUT, + PTO2_ERROR_EXPLICIT_ORCH_FATAL, + }; + for (int32_t code : codes) { + // Reset latches between iterations. Direct field access is unavoidable here + // since there is no public reset API for the orchestrator fatal state. 
+        sm_->header->orch_error_code.store(PTO2_ERROR_NONE, std::memory_order_release);
+        orch_.fatal = false;
+
+        orch_.report_fatal(code, "test", "code=%d", code);
+
+        SCOPED_TRACE(testing::Message() << "code=" << code);
+        EXPECT_TRUE(orch_.fatal);
+        EXPECT_EQ(shared_orch_code(), code);
+    }
+}
+
+// ---------- format-string variants must not crash ----------
+
+TEST_F(OrchestratorFatalTest, NullFmt_DoesNotCrash) {
+    orch_.report_fatal(PTO2_ERROR_INVALID_ARGS, "func", nullptr);
+    EXPECT_TRUE(orch_.fatal);
+    EXPECT_EQ(shared_orch_code(), PTO2_ERROR_INVALID_ARGS);
+}
+
+TEST_F(OrchestratorFatalTest, EmptyFmt_DoesNotCrash) {
+    orch_.report_fatal(PTO2_ERROR_INVALID_ARGS, "func", "");
+    EXPECT_TRUE(orch_.fatal);
+    EXPECT_EQ(shared_orch_code(), PTO2_ERROR_INVALID_ARGS);
+}
+
+TEST_F(OrchestratorFatalTest, FmtWithVarArgs_DoesNotCrash) {
+    orch_.report_fatal(
+        PTO2_ERROR_TENSOR_WAIT_TIMEOUT, "func", "tensor=%p slot=%d msg=%s", reinterpret_cast<void *>(0xdeadbeef), 17,
+        "boom"
+    );
+    EXPECT_TRUE(orch_.fatal);
+    EXPECT_EQ(shared_orch_code(), PTO2_ERROR_TENSOR_WAIT_TIMEOUT);
+}
+
+// ---------- end-to-end: status helper sees latched code ----------
+
+TEST_F(OrchestratorFatalTest, StatusHelperReadsLatchedOrchCode) {
+    orch_.report_fatal(PTO2_ERROR_FLOW_CONTROL_DEADLOCK, "func", nullptr);
+
+    int32_t orch_code = shared_orch_code();
+    int32_t sched_code = sm_->header->sched_error_code.load(std::memory_order_acquire);
+    EXPECT_EQ(runtime_status_from_error_codes(orch_code, sched_code), -PTO2_ERROR_FLOW_CONTROL_DEADLOCK);
+}
diff --git a/tests/ut/cpp/a5/test_orchestrator_submit.cpp b/tests/ut/cpp/a5/test_orchestrator_submit.cpp
new file mode 100644
index 000000000..c01635f57
--- /dev/null
+++ b/tests/ut/cpp/a5/test_orchestrator_submit.cpp
@@ -0,0 +1,273 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * Orchestrator submit-path UT.
+ *
+ * Covers pto2_submit_mixed_task, pto2_alloc_tensors, pto2_orchestrator_done,
+ * and pto2_orchestrator_set_scheduler on a fully initialized
+ * TensorMap-and-ring-buffer (TMR) system.
+ *
+ * Follows AAA and FIRST: each TEST_F builds a fresh TMR system, exercises
+ * one behavior, and tears the system down in TearDown().
+ */
+
+#include <gtest/gtest.h>
+
+#include <atomic>
+#include <cstdint>
+#include <cstdlib>
+
+#include "pto_orchestration_api.h"  // make_tensor_external, TensorCreateInfo ctor
+#include "pto_orchestrator.h"
+#include "pto_ring_buffer.h"
+#include "scheduler/pto_scheduler.h"
+#include "pto_shared_memory.h"
+#include "pto_submit_types.h"
+#include "pto_tensormap.h"
+#include "tensor.h"
+
+namespace {
+
+constexpr uint64_t kHeapSize = 64 * 1024;
+constexpr int32_t kWindowSize = 64;
+constexpr int32_t kDepPool = 256;
+
+// -----------------------------------------------------------------------------
+// Fixture: minimal TMR system for orchestrator-level tests.
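+// It shares one shared-memory block between scheduler and orchestrator and
+// connects the two in SetUp(), so submit paths see a fully wired system.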
+// -----------------------------------------------------------------------------
+class OrchestratorSubmitTest : public ::testing::Test {
+protected:
+    PTO2SharedMemoryHandle *sm_ = nullptr;
+    PTO2SchedulerState sched_{};
+    PTO2OrchestratorState orch_{};
+    uint8_t *gm_heap_ = nullptr;
+    bool sched_ok_ = false;
+    bool orch_ok_ = false;
+
+    void SetUp() override {
+        sm_ = PTO2SharedMemoryHandle::create(kWindowSize, kHeapSize);
+        ASSERT_NE(sm_, nullptr);
+
+        gm_heap_ = static_cast<uint8_t *>(std::calloc(PTO2_MAX_RING_DEPTH, kHeapSize));
+        ASSERT_NE(gm_heap_, nullptr);
+
+        sched_ok_ = sched_.init(sm_->header, kDepPool);
+        ASSERT_TRUE(sched_ok_);
+
+        orch_ok_ = orch_.init(sm_->header, gm_heap_, kHeapSize, kDepPool);
+        ASSERT_TRUE(orch_ok_);
+
+        orch_.set_scheduler(&sched_);
+    }
+
+    void TearDown() override {
+        if (orch_ok_) orch_.destroy();
+        if (sched_ok_) sched_.destroy();
+        if (gm_heap_) std::free(gm_heap_);
+        if (sm_) sm_->destroy();
+    }
+
+    // Helper: build a minimal TensorCreateInfo owning one FP32 scalar output.
+    static TensorCreateInfo make_scalar_ci() {
+        static const uint32_t kShape[1] = {1};
+        return TensorCreateInfo(kShape, 1, DataType::FLOAT32);
+    }
+
+    bool has_orch_error() const {
+        return sm_->header->orch_error_code.load(std::memory_order_acquire) != PTO2_ERROR_NONE;
+    }
+};
+
+}  // namespace
+
+// ---------- set_scheduler ----------
+
+TEST_F(OrchestratorSubmitTest, SetScheduler_StoresPointer) {
+    PTO2SchedulerState other{};
+    orch_.set_scheduler(&other);
+    // Direct field read: no public getter exists for the scheduler pointer.
+    EXPECT_EQ(orch_.scheduler, &other);
+
+    // Restore for TearDown.
+    orch_.set_scheduler(&sched_);
+}
+
+// ---------- alloc_tensors: argument validation ----------
+
+TEST_F(OrchestratorSubmitTest, AllocTensors_EmptyArgs_MarksFatal) {
+    Arg args;  // no tensors, no scalars
+
+    TaskOutputTensors result = orch_.alloc_tensors(args);
+
+    EXPECT_TRUE(result.empty());
+    EXPECT_TRUE(has_orch_error());
+}
+
+TEST_F(OrchestratorSubmitTest, AllocTensors_WithScalars_MarksFatal) {
+    TensorCreateInfo ci = make_scalar_ci();
+    Arg args;
+    args.add_output(ci);
+    args.add_scalar(uint64_t{42});
+
+    TaskOutputTensors result = orch_.alloc_tensors(args);
+
+    EXPECT_TRUE(result.empty());
+    EXPECT_TRUE(has_orch_error());
+}
+
+TEST_F(OrchestratorSubmitTest, AllocTensors_InputArg_MarksFatal) {
+    // alloc_tensors only accepts OUTPUT TensorCreateInfo args.
+    uint32_t shape[1] = {1};
+    Tensor input = make_tensor_external(reinterpret_cast<void *>(0x1000), shape, 1);
+    Arg args;
+    args.add_input(input);
+
+    TaskOutputTensors result = orch_.alloc_tensors(args);
+
+    EXPECT_TRUE(result.empty());
+    EXPECT_TRUE(has_orch_error());
+}
+
+TEST_F(OrchestratorSubmitTest, AllocTensors_OutputOnly_ReturnsMaterializedTensors) {
+    // Arrange: two output CIs, inside an active scope.
+    TensorCreateInfo ci1 = make_scalar_ci();
+    TensorCreateInfo ci2 = make_scalar_ci();
+    Arg args;
+    args.add_output(ci1, ci2);
+
+    // Act
+    orch_.begin_scope();
+    TaskOutputTensors result = orch_.alloc_tensors(args);
+    orch_.end_scope();
+
+    // Assert
+    EXPECT_FALSE(has_orch_error());
+    EXPECT_EQ(result.size(), 2U);
+}
+
+TEST_F(OrchestratorSubmitTest, AllocTensors_AfterFatal_ReturnsEmpty) {
+    // Arrange: force fatal.
+    orch_.report_fatal(PTO2_ERROR_EXPLICIT_ORCH_FATAL, "UT", nullptr);
+    ASSERT_TRUE(has_orch_error());
+
+    TensorCreateInfo ci = make_scalar_ci();
+    Arg args;
+    args.add_output(ci);
+
+    // Act
+    TaskOutputTensors result = orch_.alloc_tensors(args);
+
+    // Assert
+    EXPECT_TRUE(result.empty());
+}
+
+// ---------- submit_mixed_task ----------
+
+TEST_F(OrchestratorSubmitTest, SubmitMixedTask_AfterFatal_ReturnsEmpty) {
+    // Arrange: pre-fatal state
+    orch_.report_fatal(PTO2_ERROR_EXPLICIT_ORCH_FATAL, "UT", nullptr);
+
+    MixedKernels mixed;
+    mixed.aic_kernel_id = 0;
+    Arg args;
+
+    // Act
+    TaskOutputTensors result = orch_.submit_task(mixed, args);
+
+    // Assert
+    EXPECT_TRUE(result.empty());
+}
+
+TEST_F(OrchestratorSubmitTest, SubmitMixedTask_ArgWithError_MarksFatalInvalidArgs) {
+    // Arrange: craft an Arg with has_error set.
+    // Calling add_input after add_scalar triggers the ordering error path.
+    uint32_t shape[1] = {1};
+    Tensor t = make_tensor_external(reinterpret_cast<void *>(0x1000), shape, 1);
+    Arg args;
+    args.add_scalar(uint64_t{1});
+    args.add_input(t);  // illegal ordering -> has_error = true
+    ASSERT_TRUE(args.has_error);
+
+    MixedKernels mixed;
+    mixed.aic_kernel_id = 0;
+
+    // Act
+    orch_.begin_scope();
+    TaskOutputTensors result = orch_.submit_task(mixed, args);
+    orch_.end_scope();
+
+    // Assert
+    EXPECT_TRUE(result.empty());
+    EXPECT_TRUE(has_orch_error());
+}
+
+TEST_F(OrchestratorSubmitTest, SubmitMixedTask_PureInputOnly_Succeeds) {
+    // Arrange: one input tensor, one AIC kernel, within a scope.
+    uint32_t shape[1] = {1};
+    Tensor input = make_tensor_external(reinterpret_cast<void *>(0x2000), shape, 1);
+
+    Arg args;
+    args.add_input(input);
+    ASSERT_FALSE(args.has_error);
+
+    MixedKernels mixed;
+    mixed.aic_kernel_id = 7;  // any non-invalid id
+
+    // Act
+    orch_.begin_scope();
+    TaskOutputTensors result = orch_.submit_task(mixed, args);
+    orch_.end_scope();
+
+    // Assert: submit returns (no outputs), and no fatal state was set.
+    EXPECT_TRUE(result.empty());
+    EXPECT_FALSE(has_orch_error());
+}
+
+TEST_F(OrchestratorSubmitTest, SubmitMixedTask_OutputTensor_MaterializesResult) {
+    // Arrange: one OUTPUT TensorCreateInfo -> task produces one tensor.
+    TensorCreateInfo ci = make_scalar_ci();
+    Arg args;
+    args.add_output(ci);
+
+    MixedKernels mixed;
+    mixed.aic_kernel_id = 1;
+
+    // Act
+    orch_.begin_scope();
+    TaskOutputTensors result = orch_.submit_task(mixed, args);
+    orch_.end_scope();
+
+    // Assert
+    EXPECT_FALSE(has_orch_error());
+    EXPECT_EQ(result.size(), 1U);
+}
+
+// ---------- orchestrator_done ----------
+
+TEST_F(OrchestratorSubmitTest, OrchestratorDone_SetsSharedMemoryFlag) {
+    // Arrange
+    ASSERT_EQ(sm_->header->orchestrator_done.load(), 0);
+
+    // Act
+    orch_.mark_done();
+
+    // Assert
+    EXPECT_EQ(sm_->header->orchestrator_done.load(std::memory_order_acquire), 1);
+}
+
+TEST_F(OrchestratorSubmitTest, OrchestratorDone_IsIdempotent) {
+    orch_.mark_done();
+    orch_.mark_done();
+
+    // Flag stays 1 -- store is release-set, not increment.
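+    // A repeated mark_done() therefore can never bump the flag past 1.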
+    EXPECT_EQ(sm_->header->orchestrator_done.load(std::memory_order_acquire), 1);
+}
diff --git a/tests/ut/cpp/a5/test_ready_queue.cpp b/tests/ut/cpp/a5/test_ready_queue.cpp
index 9dea3ae94..bb2a6a101 100644
--- a/tests/ut/cpp/a5/test_ready_queue.cpp
+++ b/tests/ut/cpp/a5/test_ready_queue.cpp
@@ -371,9 +371,10 @@ TEST_P(ReadyQueueMPMCTest, NoDuplicateNoLoss) {
 INSTANTIATE_TEST_SUITE_P(
     MPMCVariants, ReadyQueueMPMCTest,
     ::testing::Values(
-        MPMCConfig{2, 2, 200},   // TwoProducersTwoConsumers
-        MPMCConfig{1, 4, 500},   // OneProducerNConsumers
-        MPMCConfig{4, 4, 1250}   // HighContentionStress
+        MPMCConfig{2, 2, 200},   // TwoProducersTwoConsumers
+        MPMCConfig{1, 4, 500},   // OneProducerNConsumers
+        MPMCConfig{4, 4, 1250},  // HighContentionStress
+        MPMCConfig{8, 8, 2000}   // EightProducersEightConsumers
     )
 );
 
@@ -444,3 +445,118 @@ TEST_F(LocalReadyBufferTest, NullBackingBuffer) {
     EXPECT_FALSE(buf.try_push(&item)) << "Push fails with null backing";
     EXPECT_EQ(buf.pop(), nullptr) << "Pop returns null with null backing";
 }
+
+// =============================================================================
+// High-contention stress tests
+// =============================================================================
+
+class ReadyQueueStressTest : public ::testing::Test {
+protected:
+    static constexpr uint64_t kCapacity = 512;
+    PTO2ReadyQueue queue;
+
+    void SetUp() override { ASSERT_TRUE(ready_queue_init(&queue, kCapacity)); }
+    void TearDown() override { ready_queue_destroy(&queue); }
+};
+
+TEST_F(ReadyQueueStressTest, RapidFillDrainCycles) {
+    constexpr int kCycles = 100;
+    constexpr int kItemsPerCycle = static_cast<int>(kCapacity / 2);
+
+    std::vector<PTO2TaskSlotState> items(kItemsPerCycle);
+    for (int i = 0; i < kItemsPerCycle; i++) {
+        items[i].fanin_count = i;
+    }
+
+    for (int cycle = 0; cycle < kCycles; cycle++) {
+        std::atomic<int> push_done{0};
+        std::atomic<int> popped{0};
+
+        auto producer = [&](int id) {
+            int per_thread = kItemsPerCycle / 4;
+            int base = id * per_thread;
+            for (int i = 0; i < per_thread; i++) {
+                while (!queue.push(&items[base + i])) {}
+            }
+            push_done.fetch_add(1, std::memory_order_release);
+        };
+
+        auto consumer = [&]() {
+            while (true) {
+                PTO2TaskSlotState *s = queue.pop();
+                if (s) {
+                    popped.fetch_add(1, std::memory_order_relaxed);
+                } else if (push_done.load(std::memory_order_acquire) == 4) {
+                    while ((s = queue.pop()) != nullptr) {
+                        popped.fetch_add(1, std::memory_order_relaxed);
+                    }
+                    break;
+                }
+            }
+        };
+
+        std::vector<std::thread> threads;
+        for (int i = 0; i < 4; i++)
+            threads.emplace_back(producer, i);
+        for (int i = 0; i < 4; i++)
+            threads.emplace_back(consumer);
+        for (auto &t : threads)
+            t.join();
+
+        ASSERT_EQ(popped.load(), kItemsPerCycle) << "Cycle " << cycle << ": lost items";
+    }
+}
+
+TEST_F(ReadyQueueStressTest, PopBatchUnderContention) {
+    constexpr int kBatchSize = 8;
+    constexpr int kBatches = 500;
+    constexpr int kProducers = 4;
+    constexpr int kTotalItems = kBatchSize * kBatches * kProducers;
+
+    std::vector<PTO2TaskSlotState> items(kTotalItems);
+    for (int i = 0; i < kTotalItems; i++)
+        items[i].fanin_count = i;
+
+    std::atomic<int> total_consumed{0};
+    std::atomic<int> producers_done{0};
+
+    auto producer = [&](int id) {
+        int base = id * kBatchSize * kBatches;
+        for (int b = 0; b < kBatches; b++) {
+            PTO2TaskSlotState *ptrs[kBatchSize];
+            for (int i = 0; i < kBatchSize; i++) {
+                ptrs[i] = &items[base + b * kBatchSize + i];
+            }
+            for (int i = 0; i < kBatchSize; i++) {
+                while (!queue.push(ptrs[i])) {}
+            }
+        }
+        producers_done.fetch_add(1, std::memory_order_release);
+    };
+
+    auto consumer = [&]() {
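+        // Consumers spin-pop; once every producer has signaled done, drain the remainder and exit.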
+        while (true) {
+            PTO2TaskSlotState *out[kBatchSize];
+            int n = queue.pop_batch(out, kBatchSize);
+            total_consumed.fetch_add(n, std::memory_order_relaxed);
+            if (n == 0 && producers_done.load(std::memory_order_acquire) == kProducers) {
+                while (true) {
+                    n = queue.pop_batch(out, kBatchSize);
+                    if (n == 0) break;
+                    total_consumed.fetch_add(n, std::memory_order_relaxed);
+                }
+                break;
+            }
+        }
+    };
+
+    std::vector<std::thread> threads;
+    for (int i = 0; i < kProducers; i++)
+        threads.emplace_back(producer, i);
+    for (int i = 0; i < 4; i++)
+        threads.emplace_back(consumer);
+    for (auto &t : threads)
+        t.join();
+
+    EXPECT_EQ(total_consumed.load(), kTotalItems);
+}
diff --git a/tests/ut/cpp/a5/test_runtime_coupling.cpp b/tests/ut/cpp/a5/test_runtime_coupling.cpp
new file mode 100644
index 000000000..927a7b0fc
--- /dev/null
+++ b/tests/ut/cpp/a5/test_runtime_coupling.cpp
@@ -0,0 +1,283 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Runtime behavior tests for the combined TMR components.
+ *
+ * These tests keep the assertions on observable component behavior: lifecycle,
+ * scheduler state transitions, TensorMap lookup validity, and shared-memory
+ * coordination. Structural coupling checks belong in design review rather than
+ * in unit tests.
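+ * Most tests stand the whole stack up through the TMRSystem helper below and
+ * assert only on state reachable through the shared-memory header.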
+ */
+
+#include <gtest/gtest.h>
+
+#include <atomic>
+#include <cstdint>
+#include <cstdlib>
+#include <vector>
+
+#include "pto_orchestration_api.h"
+#include "pto_orchestrator.h"
+#include "pto_ring_buffer.h"
+#include "pto_runtime2_types.h"
+#include "pto_shared_memory.h"
+#include "pto_tensormap.h"
+#include "scheduler/pto_scheduler.h"
+
+namespace {
+
+constexpr uint64_t kHeapSize = 64 * 1024;
+constexpr int32_t kWindowSize = 64;
+constexpr int32_t kDepPoolSize = 256;
+
+struct TestLookupResult {
+    struct Entry {
+        PTO2TensorMapEntry *entry;
+        OverlapStatus overlap_status;
+    };
+    std::vector<Entry> entries;
+    int count = 0;
+};
+
+void run_lookup(PTO2TensorMap &tmap, const Tensor &tensor, TestLookupResult &out) {
+    tmap.lookup(tensor, [&](PTO2TensorMapEntry &entry, OverlapStatus status) -> bool {
+        out.entries.push_back({&entry, status});
+        out.count++;
+        return true;
+    });
+}
+
+Tensor make_tensor(uint64_t addr, uint32_t shape0 = 100, int32_t version = 0) {
+    uint32_t shapes[RUNTIME_MAX_TENSOR_DIMS] = {shape0};
+    return make_tensor_external(reinterpret_cast<void *>(addr), shapes, 1, DataType::FLOAT32, false, version);
+}
+
+struct TMRSystem {
+    PTO2SharedMemoryHandle *sm = nullptr;
+    PTO2SchedulerState sched{};
+    PTO2OrchestratorState orch{};
+    uint8_t *gm_heap = nullptr;
+    bool sched_ok = false;
+    bool orch_ok = false;
+
+    bool Init(uint64_t heap_size = kHeapSize, int32_t window_size = kWindowSize) {
+        sm = PTO2SharedMemoryHandle::create(window_size, heap_size);
+        if (sm == nullptr) return false;
+
+        gm_heap = static_cast<uint8_t *>(std::calloc(PTO2_MAX_RING_DEPTH, heap_size));
+        if (gm_heap == nullptr) return false;
+
+        sched_ok = sched.init(sm->header, kDepPoolSize);
+        if (!sched_ok) return false;
+
+        orch_ok = orch.init(sm->header, gm_heap, heap_size, kDepPoolSize);
+        if (!orch_ok) return false;
+
+        orch.set_scheduler(&sched);
+        return true;
+    }
+
+    void Destroy() {
+        if (orch_ok) {
+            orch.destroy();
+            orch_ok = false;
+        }
+        if (sched_ok) {
+            sched.destroy();
+            sched_ok = false;
+        }
+        if (gm_heap != nullptr) {
+            std::free(gm_heap);
+            gm_heap = nullptr;
+        }
+        if (sm != nullptr) {
+            sm->destroy();
+            sm = nullptr;
+        }
+    }
+};
+
+}  // namespace
+
+TEST(RuntimeLifecycleBehavior, InitDestroyCanRepeat) {
+    for (int cycle = 0; cycle < 3; cycle++) {
+        TMRSystem sys;
+        ASSERT_TRUE(sys.Init()) << "cycle=" << cycle;
+        EXPECT_EQ(sys.sm->header->orch_error_code.load(std::memory_order_acquire), PTO2_ERROR_NONE);
+        EXPECT_EQ(sys.sm->header->sched_error_code.load(std::memory_order_acquire), PTO2_ERROR_NONE);
+        sys.Destroy();
+    }
+}
+
+TEST(RuntimeLifecycleBehavior, OrchestratorScopeWithoutSchedulerLeavesNoFatalCode) {
+    PTO2SharedMemoryHandle *sm = PTO2SharedMemoryHandle::create(kWindowSize, kHeapSize);
+    ASSERT_NE(sm, nullptr);
+    uint8_t *heap = static_cast<uint8_t *>(std::calloc(PTO2_MAX_RING_DEPTH, kHeapSize));
+    ASSERT_NE(heap, nullptr);
+
+    PTO2OrchestratorState orch{};
+    ASSERT_TRUE(orch.init(sm->header, heap, kHeapSize, kDepPoolSize));
+
+    orch.begin_scope();
+    orch.end_scope();
+
+    EXPECT_EQ(sm->header->orch_error_code.load(std::memory_order_acquire), PTO2_ERROR_NONE);
+    EXPECT_FALSE(orch.fatal);
+
+    orch.destroy();
+    std::free(heap);
+    sm->destroy();
+}
+
+TEST(RuntimeSchedulerBehavior, CompletedSlotWithSatisfiedFanoutBecomesConsumed) {
+    TMRSystem sys;
+    ASSERT_TRUE(sys.Init());
+
+    PTO2TaskDescriptor desc{};
+    PTO2TaskSlotState &slot = sys.sm->header->rings[0].get_slot_state_by_slot(0);
+    slot.task = &desc;
+    slot.ring_id = 0;
+    slot.fanout_count = 1;
+    slot.fanout_refcount.store(1, std::memory_order_relaxed);
+    slot.task_state.store(PTO2_TASK_COMPLETED, std::memory_order_relaxed);
+    sys.sm->header->rings[0].fc.current_task_index.store(1, std::memory_order_relaxed);
+
+    sys.sched.check_and_handle_consumed(slot);
+
+    EXPECT_EQ(slot.task_state.load(std::memory_order_acquire), PTO2_TASK_CONSUMED);
+
+    sys.Destroy();
+}
+
+TEST(RuntimeSchedulerBehavior, RingPointerStopsAtFirstUnconsumedTask) {
+    TMRSystem sys;
+    ASSERT_TRUE(sys.Init());
+
+    auto &ring_state = sys.sched.ring_sched_states[0];
+    PTO2TaskDescriptor descs[3]{};
+
+    PTO2TaskSlotState &slot0 = sys.sm->header->rings[0].get_slot_state_by_slot(0);
+    slot0.task = &descs[0];
+    slot0.task_state.store(PTO2_TASK_CONSUMED, std::memory_order_relaxed);
+
+    PTO2TaskSlotState &slot1 = sys.sm->header->rings[0].get_slot_state_by_slot(1);
+    slot1.task = &descs[1];
+    slot1.task_state.store(PTO2_TASK_COMPLETED, std::memory_order_relaxed);
+
+    PTO2TaskSlotState &slot2 = sys.sm->header->rings[0].get_slot_state_by_slot(2);
+    slot2.task = &descs[2];
+    slot2.task_state.store(PTO2_TASK_CONSUMED, std::memory_order_relaxed);
+
+    sys.sm->header->rings[0].fc.current_task_index.store(3, std::memory_order_relaxed);
+
+    ring_state.advance_ring_pointers();
+
+    EXPECT_EQ(ring_state.last_task_alive, 1);
+    EXPECT_EQ(sys.sm->header->rings[0].fc.last_task_alive.load(std::memory_order_acquire), static_cast<int32_t>(1));
+
+    sys.Destroy();
+}
+
+TEST(RuntimeSchedulerBehavior, ReadyQueuesAcceptEveryResourceShape) {
+    TMRSystem sys;
+    ASSERT_TRUE(sys.Init());
+
+    for (int shape = 0; shape < PTO2_NUM_RESOURCE_SHAPES; shape++) {
+        PTO2TaskSlotState slot{};
+        EXPECT_TRUE(sys.sched.ready_queues[shape].push(&slot)) << "shape=" << shape;
+        EXPECT_EQ(sys.sched.ready_queues[shape].pop(), &slot) << "shape=" << shape;
+    }
+
+    sys.Destroy();
+}
+
+TEST(RuntimeTensorMapBehavior, StandaloneInsertLookupDoesNotNeedOrchestratorPointer) {
+    int32_t window_sizes[PTO2_MAX_RING_DEPTH] = {16, 16, 16, 16};
+    PTO2TensorMap tmap{};
+    ASSERT_TRUE(tmap.init(256, 1024, window_sizes));
+
+    Tensor tensor = make_tensor(0x1000);
+    tmap.insert(tensor, PTO2TaskId::make(0, 0));
+
+    TestLookupResult result;
+    run_lookup(tmap, tensor, result);
+
+    ASSERT_EQ(result.count, 1);
+    EXPECT_EQ(result.entries[0].entry->producer_task_id, PTO2TaskId::make(0, 0));
+    EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::COVERED);
+
+    tmap.destroy();
+}
+
+TEST(RuntimeTensorMapBehavior, ValiditySkipsRetiredEntries) {
+    int32_t window_sizes[PTO2_MAX_RING_DEPTH] = {256, 256, 256, 256};
+    PTO2TensorMap tmap{};
+    ASSERT_TRUE(tmap.init(256, 4096, window_sizes));
+
+    Tensor tensor = make_tensor(0x2000);
+    for (int i = 0; i < 100; i++) {
+        tmap.insert(tensor, PTO2TaskId::make(0, i));
+    }
+
+    tmap.sync_validity(0, 80);
+
+    TestLookupResult result;
+    run_lookup(tmap, tensor, result);
+
+    EXPECT_EQ(result.count, 20);
+    for (const auto &entry : result.entries) {
+        EXPECT_GE(entry.entry->producer_task_id.local(), 80u);
+        EXPECT_EQ(entry.overlap_status, OverlapStatus::COVERED);
+    }
+
+    tmap.destroy();
+}
+
+TEST(RuntimeTensorMapBehavior, AllRingsCanProduceForSameTensor) {
+    int32_t window_sizes[PTO2_MAX_RING_DEPTH];
+    for (int i = 0; i < PTO2_MAX_RING_DEPTH; i++) {
+        window_sizes[i] = kWindowSize;
+    }
+    PTO2TensorMap tmap{};
+    ASSERT_TRUE(tmap.init(256, 1024, window_sizes));
+
+    Tensor tensor = make_tensor(0x3000);
+    for (int ring = 0; ring < PTO2_MAX_RING_DEPTH; ring++) {
+        tmap.insert(tensor, PTO2TaskId::make(ring, 0));
+    }
+
+    TestLookupResult result;
+    run_lookup(tmap, tensor, result);
+
+    EXPECT_EQ(result.count, PTO2_MAX_RING_DEPTH);
+
+    tmap.destroy();
+}
+
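+// End-to-end check: drive the orchestrator's own TensorMap member inside a
+// scope and confirm the lookup sees the inserted producer.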
+TEST(RuntimeIntegrationBehavior, OrchestratorTensorMapUsesConfiguredWindow) {
+    TMRSystem sys;
+    ASSERT_TRUE(sys.Init());
+
+    Tensor tensor = make_tensor(0x4000);
+    sys.orch.begin_scope();
+    sys.orch.tensor_map.insert(tensor, PTO2TaskId::make(0, 0));
+
+    TestLookupResult result;
+    run_lookup(sys.orch.tensor_map, tensor, result);
+
+    sys.orch.end_scope();
+
+    ASSERT_EQ(result.count, 1);
+    EXPECT_EQ(result.entries[0].entry->producer_task_id, PTO2TaskId::make(0, 0));
+    EXPECT_EQ(sys.sm->header->orch_error_code.load(std::memory_order_acquire), PTO2_ERROR_NONE);
+
+    sys.Destroy();
+}
diff --git a/tests/ut/cpp/a5/test_runtime_lifecycle.cpp b/tests/ut/cpp/a5/test_runtime_lifecycle.cpp
new file mode 100644
index 000000000..b1987ed9f
--- /dev/null
+++ b/tests/ut/cpp/a5/test_runtime_lifecycle.cpp
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * PTO2 Runtime lifecycle UT.
+ *
+ * Covers runtime_create / _custom / _from_sm / _destroy / set_mode.
+ *
+ * Follows AAA and FIRST: no shared mutable state between tests, each test
+ * constructs its own runtime and tears it down.
+ */
+
+#include <gtest/gtest.h>
+
+#include <cstdlib>
+
+#include "pto_runtime2.h"
+#include "pto_shared_memory.h"
+
+namespace {
+
+constexpr uint64_t kSmallWindow = 64;
+constexpr uint64_t kSmallHeap = 64 * 1024;
+
+// -----------------------------------------------------------------------------
+// Fixture: each test gets a fresh, isolated runtime config.
+// -----------------------------------------------------------------------------
+class RuntimeLifecycleTest : public ::testing::Test {
+protected:
+    PTO2Runtime *rt_ = nullptr;
+
+    void TearDown() override {
+        if (rt_ != nullptr) {
+            runtime_destroy(rt_);
+            rt_ = nullptr;
+        }
+    }
+};
+
+}  // namespace
+
+// ---------- Happy-path creation ----------
+
+TEST_F(RuntimeLifecycleTest, CreateCustom_ValidSizes_ReturnsInitializedRuntime) {
+    // Arrange + Act
+    rt_ = runtime_create_custom(PTO2_MODE_SIMULATE, kSmallWindow, kSmallHeap);
+
+    // Assert
+    ASSERT_NE(rt_, nullptr);
+    EXPECT_NE(rt_->ops, nullptr);
+    EXPECT_NE(rt_->sm_handle, nullptr);
+    EXPECT_NE(rt_->gm_heap, nullptr);
+    EXPECT_TRUE(rt_->gm_heap_owned);
+    EXPECT_EQ(rt_->mode, PTO2_MODE_SIMULATE);
+    EXPECT_EQ(rt_->gm_heap_size, kSmallHeap * PTO2_MAX_RING_DEPTH);
+}
+
+TEST_F(RuntimeLifecycleTest, CreateCustom_ConnectsOrchestratorToScheduler) {
+    rt_ = runtime_create_custom(PTO2_MODE_EXECUTE, kSmallWindow, kSmallHeap);
+
+    ASSERT_NE(rt_, nullptr);
+    // Whatever the mode, the created runtime must wire the orchestrator to its own scheduler.
+    EXPECT_EQ(rt_->orchestrator.scheduler, &rt_->scheduler);
+}
+
+TEST_F(RuntimeLifecycleTest, CreateDefault_UsesDefaultSizes) {
+    // create() is a thin wrapper around create_custom with PTO2_TASK_WINDOW_SIZE / PTO2_HEAP_SIZE.
+    // Use GRAPH_ONLY to avoid executor threads.
+    // We don't allocate the full 256MB heap in this path -- keep the assertion restricted to mode.
+    rt_ = runtime_create(PTO2_MODE_GRAPH_ONLY);
+    ASSERT_NE(rt_, nullptr);
+    EXPECT_EQ(rt_->mode, PTO2_MODE_GRAPH_ONLY);
+}
+
+// ---------- From-SM creation ----------
+
+TEST_F(RuntimeLifecycleTest, CreateFromSM_NullHandle_ReturnsNull) {
+    // Act
+    PTO2Runtime *rt = runtime_create_from_sm(PTO2_MODE_SIMULATE, nullptr, nullptr, 0);
+
+    // Assert
+    EXPECT_EQ(rt, nullptr);
+}
+
+TEST_F(RuntimeLifecycleTest, CreateFromSM_RecordsCallerBuffers) {
+    // Arrange: caller-allocated sm + gm_heap.
+    PTO2SharedMemoryHandle *sm = PTO2SharedMemoryHandle::create(kSmallWindow, kSmallHeap);
+    ASSERT_NE(sm, nullptr);
+    uint8_t *heap = static_cast<uint8_t *>(std::calloc(PTO2_MAX_RING_DEPTH, kSmallHeap));
+    ASSERT_NE(heap, nullptr);
+
+    // Act
+    rt_ = runtime_create_from_sm(PTO2_MODE_EXECUTE, sm, heap, kSmallHeap);
+
+    // Assert: the returned runtime must NOT claim ownership of the gm_heap.
+    ASSERT_NE(rt_, nullptr);
+    EXPECT_EQ(rt_->sm_handle, sm);
+    EXPECT_EQ(rt_->gm_heap, heap);
+    EXPECT_FALSE(rt_->gm_heap_owned);
+
+    // Cleanup: runtime_destroy consumes sm via pto2_sm_destroy (observed
+    // behavior, see pto_runtime2.cpp:339), so only free the gm_heap here.
+    runtime_destroy(rt_);
+    rt_ = nullptr;
+    std::free(heap);
+}
+
+// ---------- Destroy ----------
+
+TEST_F(RuntimeLifecycleTest, Destroy_NullRuntime_NoCrash) {
+    // Documented contract: destroy(nullptr) is a no-op.
+    runtime_destroy(nullptr);
+    SUCCEED();
+}
+
+TEST_F(RuntimeLifecycleTest, Destroy_ReleasesOwnedHeap) {
+    rt_ = runtime_create_custom(PTO2_MODE_SIMULATE, kSmallWindow, kSmallHeap);
+    ASSERT_NE(rt_, nullptr);
+    // Act: explicitly destroy and null out so TearDown doesn't double-free.
+    runtime_destroy(rt_);
+    rt_ = nullptr;
+    // Assert: reaching here without asan/ubsan complaint is the test (leak-free).
+    SUCCEED();
+}
+
+// ---------- set_mode ----------
+
+TEST_F(RuntimeLifecycleTest, SetMode_UpdatesField) {
+    rt_ = runtime_create_custom(PTO2_MODE_EXECUTE, kSmallWindow, kSmallHeap);
+    ASSERT_NE(rt_, nullptr);
+    ASSERT_EQ(rt_->mode, PTO2_MODE_EXECUTE);
+
+    // Act
+    runtime_set_mode(rt_, PTO2_MODE_GRAPH_ONLY);
+
+    // Assert
+    EXPECT_EQ(rt_->mode, PTO2_MODE_GRAPH_ONLY);
+}
+
+TEST_F(RuntimeLifecycleTest, SetMode_NullRuntime_NoCrash) {
+    // Contract: defensive null check, mirrors destroy.
+    runtime_set_mode(nullptr, PTO2_MODE_SIMULATE);
+    SUCCEED();
+}
+
+// ---------- Ops table wiring ----------
+
+TEST_F(RuntimeLifecycleTest, OpsTable_AllFunctionPointersPopulated) {
+    rt_ = runtime_create_custom(PTO2_MODE_SIMULATE, kSmallWindow, kSmallHeap);
+    ASSERT_NE(rt_, nullptr);
+    const PTO2RuntimeOps *ops = rt_->ops;
+    ASSERT_NE(ops, nullptr);
+
+    // Hot-path ops called by the orchestration .so -- must never be null.
+    EXPECT_NE(ops->submit_task, nullptr);
+    EXPECT_NE(ops->alloc_tensors, nullptr);
+    EXPECT_NE(ops->scope_begin, nullptr);
+    EXPECT_NE(ops->scope_end, nullptr);
+    EXPECT_NE(ops->orchestration_done, nullptr);
+    EXPECT_NE(ops->is_fatal, nullptr);
+    EXPECT_NE(ops->report_fatal, nullptr);
+    EXPECT_NE(ops->get_tensor_data, nullptr);
+    EXPECT_NE(ops->set_tensor_data, nullptr);
+}
+
+TEST_F(RuntimeLifecycleTest, IsFatal_FreshRuntime_ReturnsFalse) {
+    rt_ = runtime_create_custom(PTO2_MODE_SIMULATE, kSmallWindow, kSmallHeap);
+    ASSERT_NE(rt_, nullptr);
+    EXPECT_FALSE(rt_->ops->is_fatal(rt_));
+}
+
+TEST_F(RuntimeLifecycleTest, ReportFatal_SetsFatalFlag) {
+    rt_ = runtime_create_custom(PTO2_MODE_SIMULATE, kSmallWindow, kSmallHeap);
+    ASSERT_NE(rt_, nullptr);
+
+    // Act
+    rt_->ops->report_fatal(rt_, PTO2_ERROR_EXPLICIT_ORCH_FATAL, "UT", "%s", "forced");
+
+    // Assert
+    EXPECT_TRUE(rt_->ops->is_fatal(rt_));
+}
diff --git a/tests/ut/cpp/a5/test_shared_memory.cpp b/tests/ut/cpp/a5/test_shared_memory.cpp
index c45b4c8e4..cd9b4d526 100644
--- a/tests/ut/cpp/a5/test_shared_memory.cpp
+++ b/tests/ut/cpp/a5/test_shared_memory.cpp
@@ -16,13 +16,13 @@
  *
  * Design contracts:
  *
- * - PTO2SharedMemoryHandle::validate checks `top > heap_size`. top == heap_size is a
+ * - validate() checks `top > heap_size`. top == heap_size is a
  *   legitimate "filled exactly to end" state, so strict > is correct.
  *
- * - Zero window size: if PTO2SharedMemoryHandle::calculate_size() is called with 0, all ring
+ * - Zero window size: if calculate_size() is called with 0, all ring
  *   descriptors/payloads alias the same address. Current entry path
- *   (PTO2SharedMemoryHandle::create) is called only with valid sizes, but there is no
- *   explicit guard. PTO2SharedMemoryHandle::create should reject task_window_size==0.
+ *   (create) is called only with valid sizes, but there is no
+ *   explicit guard. create should reject task_window_size==0.
  *
  * - Flow control heap_top validation: validate() does not verify
  *   heap_top <= heap_size. After a corruption, heap_top could exceed
@@ -31,6 +31,8 @@
 #include <gtest/gtest.h>
 #include <cstring>
+#include <thread>
+#include <vector>
 #include "pto_shared_memory.h"
 
 // =============================================================================
@@ -182,13 +184,163 @@ TEST(SharedMemoryBoundary, ValidateDetectsCorruption) {
     h->destroy();
 }
 
-TEST(SharedMemoryBoundary, ValidateNullHandle) {
-    PTO2SharedMemoryHandle handle{};
-    EXPECT_FALSE(handle.validate());
-}
-
 TEST(SharedMemoryBoundary, CreateFromUndersizedBuffer) {
     char buf[64]{};
     PTO2SharedMemoryHandle *h = PTO2SharedMemoryHandle::create_from_buffer(buf, 64, 256, 4096);
     EXPECT_EQ(h, nullptr) << "Undersized buffer should fail";
 }
+
+// =============================================================================
+// Concurrent read/write of per-ring flow control
+// =============================================================================
+
+class SharedMemoryConcurrentTest : public ::testing::Test {
+protected:
+    PTO2SharedMemoryHandle *handle = nullptr;
+
+    void SetUp() override {
+        handle = PTO2SharedMemoryHandle::create(256, 4096);
+        ASSERT_NE(handle, nullptr);
+    }
+
+    void TearDown() override {
+        if (handle) {
+            handle->destroy();
+            handle = nullptr;
+        }
+    }
+};
+
+TEST_F(SharedMemoryConcurrentTest, PerRingTaskIndexIsolation) {
+    constexpr int kIterations = 10000;
+
+    auto writer = [&](int ring) {
+        auto &fc = handle->header->rings[ring].fc;
+        int32_t base = ring * 100000;
+        for (int i = 1; i <= kIterations; i++) {
+            fc.current_task_index.store(base + i, std::memory_order_release);
+        }
+    };
+
+    struct Observation {
+        bool went_backward = false;
+        bool saw_other_ring_range = false;
+    };
+
+    auto reader = [&](int ring, Observation *obs) {
+        auto &fc = handle->header->rings[ring].fc;
+        int32_t base = ring * 100000;
+        int32_t prev = 0;
+        for (int i = 0; i < kIterations; i++) {
+            int32_t val = fc.current_task_index.load(std::memory_order_acquire);
+            if (val < prev) {
+                obs->went_backward = true;
+            }
+            if (val != 0 && (val <= base || val > base + kIterations)) {
+                obs->saw_other_ring_range = true;
+            }
+            prev = val;
+        }
+    };
+
+    Observation ring0;
+    Observation ring1;
+
+    std::thread w0(writer, 0);
+    std::thread w1(writer, 1);
+    std::thread r0(reader, 0, &ring0);
+    std::thread r1(reader, 1, &ring1);
+
+    w0.join();
+    w1.join();
+    r0.join();
+    r1.join();
+
+    EXPECT_FALSE(ring0.went_backward) << "Ring 0 current_task_index should be monotonic";
+    EXPECT_FALSE(ring1.went_backward) << "Ring 1 current_task_index should be monotonic";
+    EXPECT_FALSE(ring0.saw_other_ring_range) << "Ring 0 should not observe ring 1 values";
+    EXPECT_FALSE(ring1.saw_other_ring_range) << "Ring 1 should not observe ring 0 values";
+
+    EXPECT_EQ(handle->header->rings[0].fc.current_task_index.load(), static_cast<int32_t>(kIterations));
+    EXPECT_EQ(handle->header->rings[1].fc.current_task_index.load(), static_cast<int32_t>(100000 + kIterations));
+}
+
+TEST_F(SharedMemoryConcurrentTest, TaskIndexAtomicIncrement) {
+    constexpr int kIncrements = 5000;
+    constexpr int kThreads = 4;
+
+    auto &fc = handle->header->rings[0].fc;
+    fc.current_task_index.store(0, std::memory_order_relaxed);
+
+    auto incrementer = [&]() {
+        for (int i = 0; i < kIncrements; i++) {
+            fc.current_task_index.fetch_add(1, std::memory_order_acq_rel);
+        }
+    };
+
+    std::vector<std::thread> threads;
+    for (int i = 0; i < kThreads; i++) {
+        threads.emplace_back(incrementer);
+    }
+    for (auto &t : threads)
+        t.join();
+
+    EXPECT_EQ(fc.current_task_index.load(), kIncrements * kThreads) << "Concurrent increments should not lose updates";
+}
+
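+// The CAS loop in the next test mimics a monotonic publish: last_task_alive
+// may only move forward, never regress, regardless of thread interleaving.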
+TEST_F(SharedMemoryConcurrentTest, LastTaskAliveMonotonic) {
+    constexpr int kIterations = 10000;
+    constexpr int kThreads = 4;
+
+    auto &fc = handle->header->rings[0].fc;
+    fc.last_task_alive.store(0, std::memory_order_relaxed);
+
+    auto advancer = [&](int id) {
+        for (int i = 0; i < kIterations; i++) {
+            int32_t desired = id * kIterations + i + 1;
+            int32_t current = fc.last_task_alive.load(std::memory_order_acquire);
+            while (current < desired) {
+                if (fc.last_task_alive.compare_exchange_weak(
+                        current, desired, std::memory_order_acq_rel, std::memory_order_acquire
+                    )) {
+                    break;
+                }
+            }
+        }
+    };
+
+    std::vector<std::thread> threads;
+    for (int i = 0; i < kThreads; i++) {
+        threads.emplace_back(advancer, i);
+    }
+    for (auto &t : threads)
+        t.join();
+
+    int32_t final_val = fc.last_task_alive.load();
+    EXPECT_EQ(final_val, kIterations * kThreads) << "last_task_alive should advance to the largest published value";
+}
+
+TEST_F(SharedMemoryConcurrentTest, ValidateAfterConcurrentWrites) {
+    constexpr int kIterations = 1000;
+
+    auto writer = [&](int ring) {
+        auto &fc = handle->header->rings[ring].fc;
+        for (int i = 0; i < kIterations; i++) {
+            fc.current_task_index.store(static_cast<int32_t>(i % 256), std::memory_order_release);
+        }
+    };
+
+    std::thread w0(writer, 0);
+    std::thread w1(writer, 1);
+    std::thread w2(writer, 2);
+    std::thread w3(writer, 3);
+    w0.join();
+    w1.join();
+    w2.join();
+    w3.join();
+
+    EXPECT_TRUE(handle->validate()) << "Valid current_task_index values should pass validation";
+
+    handle->header->rings[2].fc.current_task_index.store(-1, std::memory_order_relaxed);
+    EXPECT_FALSE(handle->validate()) << "Corrupted current_task_index should fail validation";
+}
diff --git a/tests/ut/cpp/a5/test_task_allocator.cpp b/tests/ut/cpp/a5/test_task_allocator.cpp
index 383003900..443a2e5da 100644
--- a/tests/ut/cpp/a5/test_task_allocator.cpp
+++ b/tests/ut/cpp/a5/test_task_allocator.cpp
@@ -405,3 +405,99 @@ TEST_F(TaskAllocatorTest, TaskIdNearInt32Max) {
     EXPECT_GE(r3.slot, 0);
     EXPECT_LT(r3.slot, WINDOW_SIZE);
 }
+
+// =============================================================================
+// Re-init safety
+// =============================================================================
+
+class TaskAllocatorReinitTest : public ::testing::Test {
+protected:
+    static constexpr int32_t WINDOW_SIZE = 16;
+    static constexpr uint64_t HEAP_SIZE = 1024;
+
+    std::vector<PTO2TaskDescriptor> descriptors;
+    alignas(64) uint8_t heap_buf[1024]{};
+    std::atomic<int32_t> current_index{0};
+    std::atomic<int32_t> last_alive{0};
+    std::atomic<int32_t> error_code{PTO2_ERROR_NONE};
+    PTO2TaskAllocator allocator{};
+
+    void InitAllocator() {
+        descriptors.assign(WINDOW_SIZE, PTO2TaskDescriptor{});
+        std::memset(heap_buf, 0, sizeof(heap_buf));
+        current_index.store(0);
+        last_alive.store(0);
+        error_code.store(PTO2_ERROR_NONE);
+        allocator.init(descriptors.data(), WINDOW_SIZE, &current_index, &last_alive, heap_buf, HEAP_SIZE, &error_code);
+    }
+};
+
+TEST_F(TaskAllocatorReinitTest, ReInitAfterUse) {
+    InitAllocator();
+
+    auto r1 = allocator.alloc(128);
+    ASSERT_FALSE(r1.failed());
+    auto r2 = allocator.alloc(128);
+    ASSERT_FALSE(r2.failed());
+    EXPECT_EQ(r2.task_id, 1);
+
+    InitAllocator();
+
+    auto r3 = allocator.alloc(64);
+    ASSERT_FALSE(r3.failed());
+    EXPECT_EQ(r3.task_id, 0) << "Re-init should reset task ID counter";
+    EXPECT_EQ(r3.slot, 0);
+}
+
+TEST_F(TaskAllocatorReinitTest, ReInitDifferentHeapSize) {
+    InitAllocator();
+
+    auto r1 = allocator.alloc(HEAP_SIZE);
+    ASSERT_FALSE(r1.failed());
+    EXPECT_EQ(allocator.heap_top(), HEAP_SIZE);
+
+    InitAllocator();
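+    // After re-init the allocator should look freshly constructed: empty heap, full capacity.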
+  EXPECT_EQ(allocator.heap_top(), 0u) << "Re-init resets heap_top";
+  EXPECT_EQ(allocator.heap_available(), HEAP_SIZE) << "Re-init restores full capacity";
+}
+
+TEST_F(TaskAllocatorReinitTest, ReInitClearsErrorState) {
+  InitAllocator();
+
+  auto r = allocator.alloc(HEAP_SIZE * 2);
+  EXPECT_TRUE(r.failed());
+  EXPECT_NE(error_code.load(), PTO2_ERROR_NONE);
+
+  InitAllocator();
+  EXPECT_EQ(error_code.load(), PTO2_ERROR_NONE);
+
+  auto r2 = allocator.alloc(64);
+  EXPECT_FALSE(r2.failed());
+}
+
+TEST_F(TaskAllocatorReinitTest, MultipleReInitCycles) {
+  for (int cycle = 0; cycle < 10; cycle++) {
+    InitAllocator();
+
+    for (int i = 0; i < WINDOW_SIZE - 1; i++) {
+      auto r = allocator.alloc(0);
+      ASSERT_FALSE(r.failed()) << "Cycle " << cycle << " alloc " << i;
+      EXPECT_EQ(r.task_id, i);
+    }
+  }
+}
+
+TEST_F(TaskAllocatorReinitTest, ReInitIgnoresStaleLastAlive) {
+  InitAllocator();
+
+  auto r1 = allocator.alloc(64);
+  ASSERT_FALSE(r1.failed());
+  last_alive.store(5, std::memory_order_release);
+
+  InitAllocator();
+  EXPECT_EQ(last_alive.load(), 0);
+
+  auto r2 = allocator.alloc(64);
+  ASSERT_FALSE(r2.failed());
+  EXPECT_EQ(r2.task_id, 0);
+}
diff --git a/tests/ut/cpp/a5/test_tensormap.cpp b/tests/ut/cpp/a5/test_tensormap.cpp
index 10eef0317..522cd0279 100644
--- a/tests/ut/cpp/a5/test_tensormap.cpp
+++ b/tests/ut/cpp/a5/test_tensormap.cpp
@@ -65,6 +65,29 @@ static Tensor make_test_tensor_2d(uint64_t addr, uint32_t s0, uint32_t s1, int32
   return make_tensor_external(reinterpret_cast<void *>(addr), shapes, 2, DataType::FLOAT32, false, version);
 }
 
+static Tensor make_test_tensor_nd(
+    uint64_t addr, uint32_t ndims, const uint32_t shapes[], const uint32_t offsets[] = nullptr, int32_t version = 0
+) {
+  uint32_t seed_shape[1] = {1};
+  Tensor t = make_tensor_external(reinterpret_cast<void *>(addr), seed_shape, 1, DataType::FLOAT32, false, 0);
+  uint32_t s[RUNTIME_MAX_TENSOR_DIMS]{};
+  uint32_t rs[RUNTIME_MAX_TENSOR_DIMS]{};
+  uint32_t o[RUNTIME_MAX_TENSOR_DIMS]{};
+  bool all_zero = true;
+  for (uint32_t i = 0; i < ndims && i < RUNTIME_MAX_TENSOR_DIMS; i++) {
+    s[i] = shapes[i];
+    rs[i] = shapes[i];
+    o[i] = offsets ? offsets[i] : 0;
+    if (o[i] != 0) all_zero = false;
+  }
+  uint64_t total = 4;
+  for (uint32_t i = 0; i < ndims; i++) {
+    total *= (rs[i] + (offsets ? offsets[i] : 0));
+  }
+  t.init(reinterpret_cast<void *>(addr), total, rs, s, o, ndims, DataType::FLOAT32, version, all_zero, true);
+  return t;
+}
+
 // =============================================================================
 // Fixture
 // =============================================================================
@@ -549,3 +572,105 @@ TEST(TaskIdTest, LocalIdMaxValue) {
   EXPECT_EQ(tid.ring(), 0);
   EXPECT_EQ(tid.local(), UINT32_MAX);
 }
+
+// =============================================================================
+// Edge cases merged from test_tensormap_overlap.cpp
+// =============================================================================
+
+TEST_F(TensorMapTest, ZeroDimensionTensor) {
+  // ndims=0: fast-path loop doesn't execute, contains=true -> COVERED
+  uint32_t seed_shape[1] = {1};
+  Tensor t = make_tensor_external(reinterpret_cast<void *>(0x2000), seed_shape, 1, DataType::FLOAT32, false, 0);
+  uint32_t s[RUNTIME_MAX_TENSOR_DIMS]{}, o[RUNTIME_MAX_TENSOR_DIMS]{};
+  t.init(reinterpret_cast<void *>(0x2000), 0, s, s, o, 0, DataType::FLOAT32, 0, true, true);
+
+  tmap.insert(t, PTO2TaskId::make(0, 0));
+
+  TestLookupResult result;
+  run_lookup(tmap, t, result);
+  ASSERT_EQ(result.count, 1);
+  EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::COVERED);
+}
+
+TEST_F(TensorMapTest, TwoZeroDimTensorsSameAddr) {
+  uint32_t seed_shape[1] = {1};
+  Tensor t1 = make_tensor_external(reinterpret_cast<void *>(0x2100), seed_shape, 1, DataType::FLOAT32, false, 0);
+  Tensor t2 = make_tensor_external(reinterpret_cast<void *>(0x2100), seed_shape, 1, DataType::FLOAT32, false, 0);
+  uint32_t s[RUNTIME_MAX_TENSOR_DIMS]{}, o[RUNTIME_MAX_TENSOR_DIMS]{};
+  t1.init(reinterpret_cast<void *>(0x2100), 0, s, s, o, 0, DataType::FLOAT32, 0, true, true);
+  t2.init(reinterpret_cast<void *>(0x2100), 0, s, s, o, 0, DataType::FLOAT32, 0, true, true);
+
+  tmap.insert(t1, PTO2TaskId::make(0, 0));
+  tmap.insert(t2, PTO2TaskId::make(0, 1));
+
+  TestLookupResult result;
+  run_lookup(tmap, t1, result);
+  EXPECT_EQ(result.count, 2);
+  for (int i = 0; i < result.count; i++) {
+    EXPECT_EQ(result.entries[i].overlap_status, OverlapStatus::COVERED)
+        << "0-dim tensors always report COVERED (empty loop -> contains=true)";
+  }
+}
+
+TEST_F(TensorMapTest, AdjacentNoOverlap) {
+  uint32_t prod_shapes[] = {100};
+  Tensor prod = make_test_tensor_nd(0x8000, 1, prod_shapes, nullptr, 0);
+  tmap.insert(prod, PTO2TaskId::make(0, 0));
+
+  uint32_t cons_shapes[] = {100};
+  uint32_t cons_offsets[] = {100};
+  Tensor cons = make_test_tensor_nd(0x8000, 1, cons_shapes, cons_offsets, 0);
+
+  TestLookupResult result;
+  run_lookup(tmap, cons, result);
+  EXPECT_EQ(result.count, 0) << "Adjacent regions [0,100) and [100,200) must not overlap";
+}
+
+TEST_F(TensorMapTest, OneElementOverlap) {
+  uint32_t prod_shapes[] = {100};
+  Tensor prod = make_test_tensor_nd(0x8100, 1, prod_shapes, nullptr, 0);
+  tmap.insert(prod, PTO2TaskId::make(0, 0));
+
+  uint32_t cons_shapes[] = {100};
+  uint32_t cons_offsets[] = {99};
+  Tensor cons = make_test_tensor_nd(0x8100, 1, cons_shapes, cons_offsets, 0);
+
+  TestLookupResult result;
+  run_lookup(tmap, cons, result);
+  ASSERT_EQ(result.count, 1);
+  EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::OTHER) << "Partial overlap (1 element) -> OTHER";
+}
+
+TEST_F(TensorMapTest, ZeroShapeInDimension) {
+  // Producer: 2D [10, 0] -- zero in dim 1
+  uint32_t prod_shapes[] = {10, 0};
+  Tensor prod = make_test_tensor_nd(0x8200, 2, prod_shapes, nullptr, 0);
+  tmap.insert(prod, PTO2TaskId::make(0, 0));
+
+  // Consumer: 2D [10, 20]
+  uint32_t cons_shapes[] = {10, 20};
+  Tensor cons = make_test_tensor_nd(0x8200, 2, cons_shapes, nullptr, 0);
+
+  TestLookupResult result;
+  run_lookup(tmap, cons, result);
+  ASSERT_EQ(result.count, 1);
+  // Fast path: input.shapes[1] (20) >= entry.shapes[1] (0) -> contains=true -> COVERED
+  EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::COVERED)
+      << "Zero-shape producer is COVERED by any consumer (empty production)";
+}
+
+TEST_F(TensorMapTest, FullFiveDimensionalOverlap) {
+  uint32_t prod_shapes[] = {2, 3, 4, 5, 6};
+  Tensor prod = make_test_tensor_nd(0x9200, 5, prod_shapes, nullptr, 0);
+  tmap.insert(prod, PTO2TaskId::make(0, 0));
+
+  // Consumer with larger shapes in all dims -> COVERED
+  uint32_t cons_shapes[] = {4, 6, 8, 10, 12};
+  Tensor cons = make_test_tensor_nd(0x9200, 5, cons_shapes, nullptr, 0);
+
+  TestLookupResult result;
+  run_lookup(tmap, cons, result);
+  ASSERT_EQ(result.count, 1);
+  EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::COVERED)
+      << "5D consumer covers 5D producer in all dimensions";
+}
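
Note for reviewers: LastTaskAliveMonotonic pins down an "advance-to-max" publication pattern, where concurrent writers may only move last_task_alive forward, never backwards. A minimal standalone restatement of that pattern follows; the helper name advance_to_max and the int32_t counter type are illustrative assumptions for this sketch, not part of the runtime API.

#include <atomic>
#include <cstdint>

// Illustrative sketch, not a runtime API: advance `counter` toward
// `desired`, but never backwards. If another thread has already
// published a value >= desired, the counter is left alone.
inline int32_t advance_to_max(std::atomic<int32_t> &counter, int32_t desired) {
  int32_t current = counter.load(std::memory_order_acquire);
  while (current < desired) {
    // compare_exchange_weak reloads `current` on failure, so the
    // monotonicity condition is re-checked before every retry.
    if (counter.compare_exchange_weak(current, desired, std::memory_order_acq_rel, std::memory_order_acquire)) {
      return desired;
    }
  }
  return current;
}

Seeding the counter at 0 and calling advance_to_max from several threads with disjoint increasing sequences must leave exactly the largest published value, which is what the test asserts.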
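Likewise, AdjacentNoOverlap and OneElementOverlap fix half-open interval semantics per dimension: touching regions do not overlap, and a non-containing intersection is OTHER. Below is a 1-D sketch of that classification, assuming COVERED means the consumer interval contains the producer interval; Overlap and classify_1d are hypothetical names, and the runtime's dimension-wise fast path additionally reports empty producers as COVERED (see ZeroDimensionTensor and ZeroShapeInDimension), which this naive sketch does not model.

#include <cstdint>

enum class Overlap { NONE, COVERED, OTHER };

// Producer occupies [p_begin, p_begin + p_len); consumer reads
// [c_begin, c_begin + c_len). Half-open, so [0,100) and [100,200)
// share an endpoint but no elements.
inline Overlap classify_1d(uint64_t p_begin, uint64_t p_len, uint64_t c_begin, uint64_t c_len) {
  uint64_t p_end = p_begin + p_len;
  uint64_t c_end = c_begin + c_len;
  if (c_begin >= p_end || p_begin >= c_end) {
    return Overlap::NONE;  // disjoint or merely adjacent
  }
  if (c_begin <= p_begin && c_end >= p_end) {
    return Overlap::COVERED;  // consumer fully contains producer
  }
  return Overlap::OTHER;  // partial overlap, e.g. one shared element
}

classify_1d(0, 100, 100, 100) yields NONE and classify_1d(0, 100, 99, 100) yields OTHER, matching the two 1-D tests above.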