From e230799fc3cd0284e0b9e8d58dc010f88feb0126 Mon Sep 17 00:00:00 2001 From: chenshengxin Date: Sat, 25 Apr 2026 10:55:23 +0800 Subject: [PATCH] Add: unit tests for PTO2 scheduler core data structures and hot-paths MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PTO2 scheduler (A2A3 and A5) relies on several tightly coupled data structures (task allocator, dep-list pool, fanin pool, SPSC queue, tensor map) and hot-path functions (wire_task, on_mixed_task_complete, on_task_release, advance_ring_pointers) that previously had no unit test coverage. Bugs in these paths — off-by-one in wrap-around, stale fanin references, lost dependency edges — surface only under specific task-graph topologies and are extremely hard to diagnose at the system-test level. This change adds per-component tests, covering both A2A3 and A5 runtimes, that exercise: - task_allocator: heap bump, wrap-around guard, flow-control window - task_state: slot lifecycle through src API, profiling CAS semantics - dep_list_pool / fanin_pool: ring allocation, overflow, tail advance - spsc_queue: cached-index SPSC correctness, wrap, capacity semantics - tensormap: hash distribution, overlap detection, lookup saturation - wiring: end-to-end wire → complete → release → advance cycle These tests also serve as executable documentation of design contracts (e.g. heap_available reports max-not-sum, LIFO dispatch for cache locality, relaxed size() as a hint) that would otherwise exist only as implicit assumptions in the source. --- tests/ut/cpp/CMakeLists.txt | 46 +- tests/ut/cpp/a2a3/test_a2a3_fatal.cpp | 6 + tests/ut/cpp/a2a3/test_dep_list_pool.cpp | 168 +++++++ tests/ut/cpp/a2a3/test_fanin_pool.cpp | 311 ++++++++++++ tests/ut/cpp/a2a3/test_ready_queue.cpp | 446 +++++++++++++++++ tests/ut/cpp/a2a3/test_scheduler_state.cpp | 197 ++++++++ tests/ut/cpp/a2a3/test_shared_memory.cpp | 191 +++++++ tests/ut/cpp/a2a3/test_spsc_queue.cpp | 293 +++++++++++ tests/ut/cpp/a2a3/test_task_allocator.cpp | 407 +++++++++++++++ tests/ut/cpp/a2a3/test_task_state.cpp | 201 ++++++++ tests/ut/cpp/a2a3/test_tensormap.cpp | 551 +++++++++++++++++++++ tests/ut/cpp/a2a3/test_wiring.cpp | 448 +++++++++++++++++ tests/ut/cpp/a5/test_dep_list_pool.cpp | 168 +++++++ tests/ut/cpp/a5/test_fanin_pool.cpp | 311 ++++++++++++ tests/ut/cpp/a5/test_ready_queue.cpp | 446 +++++++++++++++++ tests/ut/cpp/a5/test_scheduler_state.cpp | 197 ++++++++ tests/ut/cpp/a5/test_shared_memory.cpp | 191 +++++++ tests/ut/cpp/a5/test_spsc_queue.cpp | 293 +++++++++++ tests/ut/cpp/a5/test_task_allocator.cpp | 407 +++++++++++++++ tests/ut/cpp/a5/test_task_state.cpp | 201 ++++++++ tests/ut/cpp/a5/test_tensormap.cpp | 551 +++++++++++++++++++++ tests/ut/cpp/a5/test_wiring.cpp | 448 +++++++++++++++++ 22 files changed, 6476 insertions(+), 2 deletions(-) create mode 100644 tests/ut/cpp/a2a3/test_dep_list_pool.cpp create mode 100644 tests/ut/cpp/a2a3/test_fanin_pool.cpp create mode 100644 tests/ut/cpp/a2a3/test_ready_queue.cpp create mode 100644 tests/ut/cpp/a2a3/test_scheduler_state.cpp create mode 100644 tests/ut/cpp/a2a3/test_shared_memory.cpp create mode 100644 tests/ut/cpp/a2a3/test_spsc_queue.cpp create mode 100644 tests/ut/cpp/a2a3/test_task_allocator.cpp create mode 100644 tests/ut/cpp/a2a3/test_task_state.cpp create mode 100644 tests/ut/cpp/a2a3/test_tensormap.cpp create mode 100644 tests/ut/cpp/a2a3/test_wiring.cpp create mode 100644 tests/ut/cpp/a5/test_dep_list_pool.cpp create mode 100644 tests/ut/cpp/a5/test_fanin_pool.cpp create mode 100644 
tests/ut/cpp/a5/test_ready_queue.cpp create mode 100644 tests/ut/cpp/a5/test_scheduler_state.cpp create mode 100644 tests/ut/cpp/a5/test_shared_memory.cpp create mode 100644 tests/ut/cpp/a5/test_spsc_queue.cpp create mode 100644 tests/ut/cpp/a5/test_task_allocator.cpp create mode 100644 tests/ut/cpp/a5/test_task_state.cpp create mode 100644 tests/ut/cpp/a5/test_tensormap.cpp create mode 100644 tests/ut/cpp/a5/test_wiring.cpp diff --git a/tests/ut/cpp/CMakeLists.txt b/tests/ut/cpp/CMakeLists.txt index 06f83d1de..195c601ce 100644 --- a/tests/ut/cpp/CMakeLists.txt +++ b/tests/ut/cpp/CMakeLists.txt @@ -81,8 +81,8 @@ set(A2A3_RUNTIME_DIR ${CMAKE_SOURCE_DIR}/../../../src/a2a3/runtime/tensormap_and set(A2A3_STUB_SOURCES ${CMAKE_SOURCE_DIR}/stubs/test_stubs.cpp) set(A2A3_RUNTIME_SOURCES ${A2A3_RUNTIME_DIR}/pto_ring_buffer.cpp - ${A2A3_RUNTIME_DIR}/pto_shared_memory.cpp - ${A2A3_RUNTIME_DIR}/pto_scheduler.cpp + ${A2A3_RUNTIME_DIR}/shared/pto_shared_memory.cpp + ${A2A3_RUNTIME_DIR}/scheduler/pto_scheduler.cpp ${A2A3_RUNTIME_DIR}/pto_tensormap.cpp ) @@ -230,6 +230,48 @@ add_task_interface_test(test_child_memory types/test_child_memory.cpp) # --------------------------------------------------------------------------- add_a2a3_test(test_a2a3_fatal a2a3/test_a2a3_fatal.cpp) +# PTO2 runtime-linked tests +add_a2a3_runtime_test(test_task_allocator + SOURCES a2a3/test_task_allocator.cpp + EXTRA_SOURCES ${A2A3_RUNTIME_SOURCES} +) +add_a2a3_runtime_test(test_dep_list_pool + SOURCES a2a3/test_dep_list_pool.cpp + EXTRA_SOURCES ${A2A3_RUNTIME_SOURCES} +) +add_a2a3_runtime_test(test_scheduler_state + SOURCES a2a3/test_scheduler_state.cpp + EXTRA_SOURCES ${A2A3_RUNTIME_SOURCES} +) +add_a2a3_runtime_test(test_task_state + SOURCES a2a3/test_task_state.cpp + EXTRA_SOURCES ${A2A3_RUNTIME_SOURCES} +) +add_a2a3_runtime_test(test_ready_queue + SOURCES a2a3/test_ready_queue.cpp + EXTRA_SOURCES ${A2A3_RUNTIME_SOURCES} +) +add_a2a3_runtime_test(test_shared_memory + SOURCES a2a3/test_shared_memory.cpp + EXTRA_SOURCES ${A2A3_RUNTIME_SOURCES} +) +add_a2a3_runtime_test(test_a2a3_tensormap + SOURCES a2a3/test_tensormap.cpp + EXTRA_SOURCES ${A2A3_RUNTIME_SOURCES} +) +add_a2a3_runtime_test(test_fanin_pool + SOURCES a2a3/test_fanin_pool.cpp + EXTRA_SOURCES ${A2A3_RUNTIME_SOURCES} +) +add_a2a3_runtime_test(test_spsc_queue + SOURCES a2a3/test_spsc_queue.cpp + EXTRA_SOURCES ${A2A3_RUNTIME_SOURCES} +) +add_a2a3_runtime_test(test_wiring + SOURCES a2a3/test_wiring.cpp + EXTRA_SOURCES ${A2A3_RUNTIME_SOURCES} +) + # --------------------------------------------------------------------------- # A5 tests (src/a5/runtime/tensormap_and_ringbuffer/) # --------------------------------------------------------------------------- diff --git a/tests/ut/cpp/a2a3/test_a2a3_fatal.cpp b/tests/ut/cpp/a2a3/test_a2a3_fatal.cpp index 4d55788d7..588136f66 100644 --- a/tests/ut/cpp/a2a3/test_a2a3_fatal.cpp +++ b/tests/ut/cpp/a2a3/test_a2a3_fatal.cpp @@ -8,6 +8,12 @@ * See LICENSE in the root of the software repository for the full text of the License. * ----------------------------------------------------------------------------------------------------------- */ +/** + * Unit tests for PTO2 A2A3 fatal error handling. + * + * Tests API short-circuit after fatal state, explicit fatal routing, + * and allocation with invalid arguments. 
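+ * + * Illustrative shape of the short-circuit contract (hypothetical names, + * illustrative only; the real flag and entry points live in the A2A3 + * runtime headers): + * + * if (state->fatal.load(std::memory_order_acquire)) { + * return nullptr; // every public API returns early once fatal is set + * }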
+ */ #include <gtest/gtest.h> diff --git a/tests/ut/cpp/a2a3/test_dep_list_pool.cpp b/tests/ut/cpp/a2a3/test_dep_list_pool.cpp new file mode 100644 index 000000000..a86a393d1 --- /dev/null +++ b/tests/ut/cpp/a2a3/test_dep_list_pool.cpp @@ -0,0 +1,168 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Unit tests for PTO2DepListPool from pto_ring_buffer.h + * + * Tests dependency list pool allocation, prepend chaining, overflow detection, + * tail advancement, and high-water mark tracking. + * + * Design contracts: + * + * - advance_tail(new_tail) only advances if new_tail > tail; it does + * not validate new_tail <= top. Caller contract (monotonic, + * top-bounded). + * + * - The list terminator is literal nullptr. base[0] is a normal pool entry; + * init clearing it is incidental, not an invariant. + */ + +#include <gtest/gtest.h> + +#include <atomic> +#include <cstdint> +#include <cstring> + +#include "pto_ring_buffer.h" + +// ============================================================================= +// Fixture +// ============================================================================= + +class DepListPoolTest : public ::testing::Test { +protected: + static constexpr int32_t POOL_CAP = 8; + PTO2DepListEntry entries[POOL_CAP]{}; + std::atomic<int32_t> error_code{PTO2_ERROR_NONE}; + PTO2DepListPool pool{}; + + void SetUp() override { + std::memset(entries, 0, sizeof(entries)); + error_code.store(PTO2_ERROR_NONE); + pool.init(entries, POOL_CAP, &error_code); + } +}; + +// ============================================================================= +// Normal path +// ============================================================================= + +TEST_F(DepListPoolTest, InitialState) { + EXPECT_EQ(pool.used(), 0); + EXPECT_EQ(pool.available(), POOL_CAP); +} + +TEST_F(DepListPoolTest, SingleAlloc) { + PTO2DepListEntry *entry = pool.alloc(); + ASSERT_NE(entry, nullptr); + EXPECT_EQ(pool.used(), 1); + EXPECT_EQ(pool.available(), POOL_CAP - 1); +} + +TEST_F(DepListPoolTest, OverflowDetection) { + for (int i = 0; i < POOL_CAP; i++) { + PTO2DepListEntry *e = pool.alloc(); + ASSERT_NE(e, nullptr) << "Unexpected failure at alloc " << i; + } + EXPECT_EQ(pool.used(), POOL_CAP); + EXPECT_EQ(pool.available(), 0); + + PTO2DepListEntry *overflow = pool.alloc(); + EXPECT_EQ(overflow, nullptr); + EXPECT_EQ(error_code.load(), PTO2_ERROR_DEP_POOL_OVERFLOW); +} + +// Prepend builds LIFO linked list: verify each slot_state pointer. +TEST_F(DepListPoolTest, PrependChainCorrectness) { + PTO2TaskSlotState slots[5]{}; + PTO2DepListEntry *head = nullptr; + + for (int i = 0; i < 5; i++) { + head = pool.prepend(head, &slots[i]); + ASSERT_NE(head, nullptr); + } + + // LIFO order: head -> slots[4] -> slots[3] -> ... -> slots[0] -> nullptr. 
+ PTO2DepListEntry *cur = head; + for (int i = 4; i >= 0; i--) { + ASSERT_NE(cur, nullptr); + EXPECT_EQ(cur->slot_state, &slots[i]) << "Entry " << (4 - i) << " should point to slots[" << i << "]"; + cur = cur->next; + } + EXPECT_EQ(cur, nullptr) << "Chain should terminate with nullptr"; +} + +TEST_F(DepListPoolTest, AdvanceTail) { + for (int i = 0; i < 4; i++) { + pool.alloc(); + } + EXPECT_EQ(pool.used(), 4); + EXPECT_EQ(pool.available(), POOL_CAP - 4); + + pool.advance_tail(4); + EXPECT_EQ(pool.used(), 1); + EXPECT_EQ(pool.available(), POOL_CAP - 1); +} + +TEST_F(DepListPoolTest, AdvanceTailBackwardsNoop) { + pool.alloc(); + pool.alloc(); + pool.advance_tail(3); + int32_t used_after = pool.used(); + + pool.advance_tail(2); + EXPECT_EQ(pool.used(), used_after); + + pool.advance_tail(3); + EXPECT_EQ(pool.used(), used_after); +} + +TEST_F(DepListPoolTest, HighWaterAccuracy) { + for (int i = 0; i < 5; i++) + pool.alloc(); + EXPECT_EQ(pool.high_water, 5); + + pool.advance_tail(4); + EXPECT_EQ(pool.high_water, 5) << "High water never decreases"; + + for (int i = 0; i < 3; i++) + pool.alloc(); + EXPECT_GE(pool.high_water, 5); +} + +// ============================================================================= +// Boundary conditions +// ============================================================================= + +// Prepend chain integrity under pool exhaustion: chain must be walkable. +TEST_F(DepListPoolTest, PrependUnderExhaustion) { + PTO2TaskSlotState slots[POOL_CAP]{}; + PTO2DepListEntry *head = nullptr; + + int count = 0; + while (count < POOL_CAP + 5) { + PTO2DepListEntry *new_head = pool.prepend(head, &slots[count % POOL_CAP]); + if (!new_head) break; + head = new_head; + count++; + } + + int walk = 0; + PTO2DepListEntry *cur = head; + while (cur) { + walk++; + cur = cur->next; + if (walk > count + 1) { + FAIL() << "Chain has cycle -- walked more entries than allocated"; + break; + } + } + EXPECT_EQ(walk, count); +} diff --git a/tests/ut/cpp/a2a3/test_fanin_pool.cpp b/tests/ut/cpp/a2a3/test_fanin_pool.cpp new file mode 100644 index 000000000..29199ae2e --- /dev/null +++ b/tests/ut/cpp/a2a3/test_fanin_pool.cpp @@ -0,0 +1,311 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Unit tests for PTO2FaninPool and pto2_for_each_fanin_storage/slot_state + * from pto_ring_buffer.h / pto_ring_buffer.cpp + * + * Tests: + * 1. PTO2FaninPool — ring buffer allocation, overflow, tail advance, + * high-water tracking + * 2. 
pto2_for_each_fanin_storage — inline-only, spill without wrap, + * spill with wrap, callback early return + */ + +#include <gtest/gtest.h> + +#include <atomic> +#include <cstring> +#include <vector> + +#include "pto_ring_buffer.h" +#include "pto_shared_memory.h" + +// ============================================================================= +// FaninPool fixture +// ============================================================================= + +class FaninPoolTest : public ::testing::Test { +protected: + static constexpr int32_t POOL_CAP = 32; + + std::vector<PTO2FaninSpillEntry> entries; + std::atomic<int32_t> error_code{PTO2_ERROR_NONE}; + PTO2FaninPool pool{}; + + void SetUp() override { + entries.assign(POOL_CAP, PTO2FaninSpillEntry{nullptr}); + error_code.store(PTO2_ERROR_NONE); + pool.init(entries.data(), POOL_CAP, &error_code); + } +}; + +// ============================================================================= +// FaninPool: basic operations +// ============================================================================= + +TEST_F(FaninPoolTest, InitialState) { + EXPECT_EQ(pool.used(), 0); + EXPECT_EQ(pool.available(), POOL_CAP); + EXPECT_EQ(pool.top, 1); + EXPECT_EQ(pool.tail, 1); + EXPECT_EQ(pool.high_water, 0); +} + +TEST_F(FaninPoolTest, AllocReturnsCorrectModuloIndex) { + // First alloc at index top%cap = 1%32 = 1 + auto *e1 = pool.alloc(); + EXPECT_EQ(e1, &entries[1]); + + auto *e2 = pool.alloc(); + EXPECT_EQ(e2, &entries[2]); +} + +TEST_F(FaninPoolTest, AllocFillsPool) { + for (int i = 0; i < POOL_CAP; i++) { + auto *e = pool.alloc(); + ASSERT_NE(e, nullptr) << "Alloc failed at i=" << i; + } + EXPECT_EQ(pool.used(), POOL_CAP); + EXPECT_EQ(pool.available(), 0); +} + +TEST_F(FaninPoolTest, OverflowReturnsNullptr) { + for (int i = 0; i < POOL_CAP; i++) { + pool.alloc(); + } + auto *overflow = pool.alloc(); + EXPECT_EQ(overflow, nullptr); + EXPECT_EQ(error_code.load(), PTO2_ERROR_DEP_POOL_OVERFLOW); +} + +TEST_F(FaninPoolTest, AdvanceTailFreesSpace) { + for (int i = 0; i < 10; i++) { + pool.alloc(); + } + EXPECT_EQ(pool.used(), 10); + + pool.advance_tail(pool.tail + 5); + EXPECT_EQ(pool.used(), 5); + EXPECT_EQ(pool.available(), POOL_CAP - 5); +} + +TEST_F(FaninPoolTest, AdvanceTailBackwardsIsNoop) { + for (int i = 0; i < 10; i++) { + pool.alloc(); + } + int32_t old_tail = pool.tail; + pool.advance_tail(old_tail - 1); + EXPECT_EQ(pool.tail, old_tail); + EXPECT_EQ(pool.used(), 10); +} + +TEST_F(FaninPoolTest, HighWaterNeverDecreases) { + for (int i = 0; i < 10; i++) { + pool.alloc(); + } + EXPECT_EQ(pool.high_water, 10); + + pool.advance_tail(pool.tail + 5); + EXPECT_EQ(pool.high_water, 10) << "high_water must never decrease"; +} + +TEST_F(FaninPoolTest, WrapAroundAllocation) { + // Fill and drain, then fill again to wrap + for (int i = 0; i < POOL_CAP; i++) { + pool.alloc(); + } + pool.advance_tail(pool.top); + EXPECT_EQ(pool.used(), 0); + + // New allocations wrap around + for (int i = 0; i < 5; i++) { + auto *e = pool.alloc(); + ASSERT_NE(e, nullptr); + // Verify modulo indexing + int32_t expected_idx = (pool.top - 1) % POOL_CAP; + EXPECT_EQ(e, &entries[expected_idx]); + } + EXPECT_EQ(pool.used(), 5); +} + +// ============================================================================= +// pto2_for_each_fanin_storage: inline only +// ============================================================================= + +class ForEachFaninTest : public ::testing::Test { +protected: + static constexpr int32_t POOL_CAP = 32; + + std::vector<PTO2FaninSpillEntry> spill_entries; + std::atomic<int32_t> error_code{PTO2_ERROR_NONE}; + PTO2FaninPool spill_pool{}; + + alignas(64) 
PTO2TaskSlotState slots[64]; + + void SetUp() override { + spill_entries.assign(POOL_CAP, PTO2FaninSpillEntry{nullptr}); + error_code.store(PTO2_ERROR_NONE); + spill_pool.init(spill_entries.data(), POOL_CAP, &error_code); + memset(slots, 0, sizeof(slots)); + } +}; + +TEST_F(ForEachFaninTest, InlineOnlyVoid) { + PTO2TaskSlotState *inline_slots[PTO2_FANIN_INLINE_CAP] = {}; + for (int i = 0; i < 5; i++) { + inline_slots[i] = &slots[i]; + } + + std::vector<PTO2TaskSlotState *> visited; + pto2_for_each_fanin_storage(inline_slots, 5, 0, spill_pool, [&](PTO2TaskSlotState *s) { + visited.push_back(s); + }); + + ASSERT_EQ(visited.size(), 5u); + for (int i = 0; i < 5; i++) { + EXPECT_EQ(visited[i], &slots[i]); + } +} + +TEST_F(ForEachFaninTest, InlineOnlyBoolEarlyReturn) { + PTO2TaskSlotState *inline_slots[PTO2_FANIN_INLINE_CAP] = {}; + for (int i = 0; i < 5; i++) { + inline_slots[i] = &slots[i]; + } + + int count = 0; + bool result = pto2_for_each_fanin_storage(inline_slots, 5, 0, spill_pool, [&](PTO2TaskSlotState *) -> bool { + count++; + return count < 3; // stop after 3rd + }); + + EXPECT_FALSE(result) << "Should return false when callback returns false"; + EXPECT_EQ(count, 3); +} + +TEST_F(ForEachFaninTest, InlineOnlyBoolAllTrue) { + PTO2TaskSlotState *inline_slots[PTO2_FANIN_INLINE_CAP] = {}; + for (int i = 0; i < 3; i++) { + inline_slots[i] = &slots[i]; + } + + bool result = pto2_for_each_fanin_storage(inline_slots, 3, 0, spill_pool, [](PTO2TaskSlotState *) -> bool { + return true; + }); + + EXPECT_TRUE(result); +} + +TEST_F(ForEachFaninTest, ZeroFanin) { + PTO2TaskSlotState *inline_slots[PTO2_FANIN_INLINE_CAP] = {}; + int count = 0; + pto2_for_each_fanin_storage(inline_slots, 0, 0, spill_pool, [&](PTO2TaskSlotState *) { + count++; + }); + EXPECT_EQ(count, 0); +} + +// ============================================================================= +// pto2_for_each_fanin_storage: spill without wrap +// ============================================================================= + +TEST_F(ForEachFaninTest, SpillNoWrap) { + // 18 fanins = 16 inline + 2 spill + PTO2TaskSlotState *inline_slots[PTO2_FANIN_INLINE_CAP] = {}; + for (int i = 0; i < PTO2_FANIN_INLINE_CAP; i++) { + inline_slots[i] = &slots[i]; + } + + // Allocate 2 spill entries + auto *s0 = spill_pool.alloc(); + int32_t spill_start = spill_pool.top - 1; + s0->slot_state = &slots[16]; + auto *s1 = spill_pool.alloc(); + s1->slot_state = &slots[17]; + + std::vector<PTO2TaskSlotState *> visited; + pto2_for_each_fanin_storage(inline_slots, 18, spill_start, spill_pool, [&](PTO2TaskSlotState *s) { + visited.push_back(s); + }); + + ASSERT_EQ(visited.size(), 18u); + for (int i = 0; i < 16; i++) { + EXPECT_EQ(visited[i], &slots[i]) << "Inline slot " << i; + } + EXPECT_EQ(visited[16], &slots[16]); + EXPECT_EQ(visited[17], &slots[17]); +} + +// ============================================================================= +// pto2_for_each_fanin_storage: spill with wrap +// ============================================================================= + +TEST_F(ForEachFaninTest, SpillWithWrap) { + // Push pool near end so spill wraps around + // Pool cap = 32, advance top to 30 so next alloc is at index 30 + spill_pool.top = POOL_CAP - 2; + spill_pool.tail = POOL_CAP - 2; + + PTO2TaskSlotState *inline_slots[PTO2_FANIN_INLINE_CAP] = {}; + for (int i = 0; i < PTO2_FANIN_INLINE_CAP; i++) { + inline_slots[i] = &slots[i]; + } + + // 4 spill entries: indices 30, 31, 0, 1 (wraps around) + int32_t spill_start = spill_pool.top; + for (int i = 0; i < 4; i++) { + auto *e = spill_pool.alloc(); + 
ASSERT_NE(e, nullptr); + e->slot_state = &slots[16 + i]; + } + + std::vector<PTO2TaskSlotState *> visited; + pto2_for_each_fanin_storage(inline_slots, 20, spill_start, spill_pool, [&](PTO2TaskSlotState *s) { + visited.push_back(s); + }); + + ASSERT_EQ(visited.size(), 20u); + // Inline + for (int i = 0; i < 16; i++) { + EXPECT_EQ(visited[i], &slots[i]); + } + // Spill (wrapped) + for (int i = 0; i < 4; i++) { + EXPECT_EQ(visited[16 + i], &slots[16 + i]); + } +} + +// ============================================================================= +// pto2_for_each_fanin_storage: spill with bool callback early return +// ============================================================================= + +TEST_F(ForEachFaninTest, SpillBoolEarlyReturnInSpillRegion) { + PTO2TaskSlotState *inline_slots[PTO2_FANIN_INLINE_CAP] = {}; + for (int i = 0; i < PTO2_FANIN_INLINE_CAP; i++) { + inline_slots[i] = &slots[i]; + } + + int32_t spill_start = spill_pool.top; + for (int i = 0; i < 4; i++) { + auto *e = spill_pool.alloc(); + e->slot_state = &slots[16 + i]; + } + + int count = 0; + bool result = + pto2_for_each_fanin_storage(inline_slots, 20, spill_start, spill_pool, [&](PTO2TaskSlotState *) -> bool { + count++; + return count < 17; // stop on 17th (first spill entry) + }); + + EXPECT_FALSE(result); + EXPECT_EQ(count, 17); +} diff --git a/tests/ut/cpp/a2a3/test_ready_queue.cpp b/tests/ut/cpp/a2a3/test_ready_queue.cpp new file mode 100644 index 000000000..1a139a8f1 --- /dev/null +++ b/tests/ut/cpp/a2a3/test_ready_queue.cpp @@ -0,0 +1,446 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Unit tests for PTO2ReadyQueue and PTO2LocalReadyBuffer from pto_scheduler.h + * + * Tests the lock-free bounded MPMC queue (Vyukov design) and the thread-local + * ready buffer used for local-first dispatch optimization. + * + * Design contracts: + * + * - Sequence wrap: The sequence counter is int64_t. Practically unreachable + * wrap at 2^63; two's-complement comparisons still work. + * + * - Pop fast-path: pop() checks enqueue_pos == dequeue_pos as an early-empty + * hint. A push between the hint and the CAS can race; standard TOCTOU of + * Vyukov MPMC, acceptable. + * + * - Push near full: All producers that see a full slot return false + * simultaneously even if a pop happens right after. Acceptable + * back-pressure. + * + * - size() relaxed ordering: size() reads both positions with + * memory_order_relaxed and is a hint, not a snapshot. If a stale read + * produces d > e the guard returns 0. + * + * - LocalReadyBuffer LIFO dispatch: try_push appends at count++, pop returns + * slot_states[--count]. LIFO reversal is intentional for cache-locality + * when a producer immediately dispatches its fanout. 
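+ * + * Minimal sketch of that LIFO contract (mirrors the LIFOOrdering test + * below; illustrative only): + * + * buf.try_push(&a); // count: 0 -> 1 + * buf.try_push(&b); // count: 1 -> 2 + * buf.pop(); // returns &b (count: 2 -> 1) + * buf.pop(); // returns &a (count: 1 -> 0)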
+ */ + +#include <gtest/gtest.h> + +#include <atomic> +#include <cstdint> +#include <cstring> +#include <thread> +#include <vector> + +#include "scheduler/pto_scheduler.h" + +// ============================================================================= +// ReadyQueue: Single-threaded fixture (malloc-backed) +// ============================================================================= + +class ReadyQueueTest : public ::testing::Test { +protected: + static constexpr uint64_t CAPACITY = 16; // Power of 2 + + PTO2ReadyQueue queue; + + void SetUp() override { ASSERT_TRUE(pto2_ready_queue_init(&queue, CAPACITY)); } + + void TearDown() override { pto2_ready_queue_destroy(&queue); } +}; + +// ============================================================================= +// Normal path +// ============================================================================= + +TEST_F(ReadyQueueTest, EmptyPopReturnsNullptr) { EXPECT_EQ(queue.pop(), nullptr); } + +TEST_F(ReadyQueueTest, SinglePushPop) { + PTO2TaskSlotState item; + ASSERT_TRUE(queue.push(&item)); + + PTO2TaskSlotState *result = queue.pop(); + EXPECT_EQ(result, &item); +} + +TEST_F(ReadyQueueTest, FIFOOrdering) { + PTO2TaskSlotState a, b, c; + + ASSERT_TRUE(queue.push(&a)); + ASSERT_TRUE(queue.push(&b)); + ASSERT_TRUE(queue.push(&c)); + + EXPECT_EQ(queue.pop(), &a); + EXPECT_EQ(queue.pop(), &b); + EXPECT_EQ(queue.pop(), &c); + EXPECT_EQ(queue.pop(), nullptr); +} + +TEST_F(ReadyQueueTest, QueueFullReturnsFalse) { + std::vector<PTO2TaskSlotState> items(CAPACITY); + + for (uint64_t i = 0; i < CAPACITY; i++) { + ASSERT_TRUE(queue.push(&items[i])); + } + + PTO2TaskSlotState extra; + EXPECT_FALSE(queue.push(&extra)); +} + +TEST_F(ReadyQueueTest, SlotReuseAfterFullDrain) { + std::vector<PTO2TaskSlotState> items(CAPACITY); + + for (uint64_t i = 0; i < CAPACITY; i++) { + ASSERT_TRUE(queue.push(&items[i])); + } + for (uint64_t i = 0; i < CAPACITY; i++) { + EXPECT_EQ(queue.pop(), &items[i]); + } + EXPECT_EQ(queue.pop(), nullptr); + + for (uint64_t i = 0; i < CAPACITY; i++) { + ASSERT_TRUE(queue.push(&items[i])); + } + for (uint64_t i = 0; i < CAPACITY; i++) { + EXPECT_EQ(queue.pop(), &items[i]); + } + EXPECT_EQ(queue.pop(), nullptr); +} + +TEST_F(ReadyQueueTest, PushBatchThenIndividualPop) { + constexpr int BATCH_SIZE = 5; + PTO2TaskSlotState items[BATCH_SIZE]; + PTO2TaskSlotState *ptrs[BATCH_SIZE]; + for (int i = 0; i < BATCH_SIZE; i++) { + ptrs[i] = &items[i]; + } + + queue.push_batch(ptrs, BATCH_SIZE); + + for (int i = 0; i < BATCH_SIZE; i++) { + EXPECT_EQ(queue.pop(), &items[i]); + } + EXPECT_EQ(queue.pop(), nullptr); +} + +TEST_F(ReadyQueueTest, PushBatchZeroIsNoop) { + queue.push_batch(nullptr, 0); + + EXPECT_EQ(queue.size(), 0u); + EXPECT_EQ(queue.pop(), nullptr); +} + +TEST_F(ReadyQueueTest, PopBatchReturnsFive) { + constexpr int PUSH_COUNT = 10; + PTO2TaskSlotState items[PUSH_COUNT]; + + for (int i = 0; i < PUSH_COUNT; i++) { + ASSERT_TRUE(queue.push(&items[i])); + } + + PTO2TaskSlotState *out[5]; + int popped = queue.pop_batch(out, 5); + EXPECT_EQ(popped, 5); + + for (int i = 0; i < 5; i++) { + EXPECT_EQ(out[i], &items[i]); + } +} + +TEST_F(ReadyQueueTest, PopBatchPartial) { + constexpr int PUSH_COUNT = 3; + PTO2TaskSlotState items[PUSH_COUNT]; + + for (int i = 0; i < PUSH_COUNT; i++) { + ASSERT_TRUE(queue.push(&items[i])); + } + + PTO2TaskSlotState *out[5]; + int popped = queue.pop_batch(out, 5); + EXPECT_EQ(popped, PUSH_COUNT); + + for (int i = 0; i < PUSH_COUNT; i++) { + EXPECT_EQ(out[i], &items[i]); + } +} + +TEST_F(ReadyQueueTest, PopBatchEmpty) { + PTO2TaskSlotState *out[5]; + int popped = queue.pop_batch(out, 5); + 
EXPECT_EQ(popped, 0); +} + +TEST_F(ReadyQueueTest, SizeAccuracy) { + EXPECT_EQ(queue.size(), 0u); + + PTO2TaskSlotState items[8]; + + queue.push(&items[0]); + EXPECT_EQ(queue.size(), 1u); + + queue.push(&items[1]); + queue.push(&items[2]); + EXPECT_EQ(queue.size(), 3u); + + queue.pop(); + EXPECT_EQ(queue.size(), 2u); + + queue.pop(); + queue.pop(); + EXPECT_EQ(queue.size(), 0u); + + for (int i = 0; i < 5; i++) { + queue.push(&items[i]); + } + EXPECT_EQ(queue.size(), 5u); +} + +// ============================================================================= +// Boundary conditions (small capacity for precise boundary testing) +// ============================================================================= + +class ReadyQueueBoundaryTest : public ::testing::Test { +protected: + static constexpr uint64_t QUEUE_CAP = 8; // Small for boundary testing + PTO2ReadyQueue queue{}; + PTO2TaskSlotState dummy[8]{}; + + void SetUp() override { ASSERT_TRUE(pto2_ready_queue_init(&queue, QUEUE_CAP)); } + void TearDown() override { pto2_ready_queue_destroy(&queue); } +}; + +TEST_F(ReadyQueueBoundaryTest, ExactCapacityFillDrain) { + int pushed = 0; + for (uint64_t i = 0; i < QUEUE_CAP; i++) { + if (queue.push(&dummy[i % 8])) pushed++; + else break; + } + EXPECT_GE(pushed, (int)(QUEUE_CAP - 1)); + + for (int i = 0; i < pushed; i++) { + EXPECT_NE(queue.pop(), nullptr); + } + EXPECT_EQ(queue.pop(), nullptr); +} + +TEST_F(ReadyQueueBoundaryTest, PushToFullThenRecover) { + int pushed = 0; + while (queue.push(&dummy[0])) + pushed++; + + EXPECT_FALSE(queue.push(&dummy[1])) << "Push to full queue returns false"; + + EXPECT_NE(queue.pop(), nullptr); + EXPECT_TRUE(queue.push(&dummy[1])) << "Push succeeds after pop from full queue"; +} + +// size() with relaxed ordering: exact in single-threaded context. +TEST_F(ReadyQueueBoundaryTest, SizeRelaxedOrdering) { + queue.push(&dummy[0]); + queue.push(&dummy[1]); + queue.push(&dummy[2]); + EXPECT_EQ(queue.size(), 3u); + + queue.pop(); + EXPECT_EQ(queue.size(), 2u); + + queue.pop(); + queue.pop(); + EXPECT_EQ(queue.size(), 0u); +} + +// size() guard: after many push/pop cycles, never goes negative. +TEST_F(ReadyQueueBoundaryTest, SizeNeverNegative) { + for (int i = 0; i < 100; i++) { + ASSERT_TRUE(queue.push(&dummy[0])); + queue.pop(); + } + EXPECT_EQ(queue.size(), 0u) << "size() returns 0 after balanced push/pop cycles"; +} + +TEST_F(ReadyQueueBoundaryTest, RepeatedEmptyPop) { + for (int i = 0; i < 100; i++) { + EXPECT_EQ(queue.pop(), nullptr); + } + EXPECT_EQ(queue.size(), 0u); +} + +// Sequence numbers grow large after many cycles but remain correct. 
+TEST_F(ReadyQueueBoundaryTest, ManyPushPopCycles) { + for (int i = 0; i < 10000; i++) { + ASSERT_TRUE(queue.push(&dummy[0])); + PTO2TaskSlotState *s = queue.pop(); + ASSERT_NE(s, nullptr); + EXPECT_EQ(s, &dummy[0]); + } + + EXPECT_EQ(queue.size(), 0u); + EXPECT_TRUE(queue.push(&dummy[1])); + EXPECT_EQ(queue.pop(), &dummy[1]); +} + +// ============================================================================= +// Concurrency +// ============================================================================= + +// Parameterized MPMC stress test: {producers, consumers, items_per_producer} +struct MPMCConfig { + int producers; + int consumers; + int items_per_producer; +}; + +class ReadyQueueMPMCTest : public ::testing::TestWithParam<MPMCConfig> { +protected: + static constexpr uint64_t CAPACITY = 1024; + PTO2ReadyQueue queue; + + void SetUp() override { ASSERT_TRUE(pto2_ready_queue_init(&queue, CAPACITY)); } + void TearDown() override { pto2_ready_queue_destroy(&queue); } +}; + +TEST_P(ReadyQueueMPMCTest, NoDuplicateNoLoss) { + auto cfg = GetParam(); + int total = cfg.producers * cfg.items_per_producer; + + std::vector<PTO2TaskSlotState> items(total); + std::vector<std::atomic<int>> consumed_count(total); + for (int i = 0; i < total; i++) { + consumed_count[i].store(0, std::memory_order_relaxed); + } + + auto item_index = [&](PTO2TaskSlotState *s) -> int { + return static_cast<int>(s - items.data()); + }; + + std::atomic<int> producers_done{0}; + + auto producer = [&](int id) { + for (int i = id; i < total; i += cfg.producers) { + while (!queue.push(&items[i])) {} + } + producers_done.fetch_add(1, std::memory_order_release); + }; + + std::atomic<int> total_consumed{0}; + + auto consumer = [&]() { + while (true) { + PTO2TaskSlotState *item = queue.pop(); + if (item != nullptr) { + consumed_count[item_index(item)].fetch_add(1, std::memory_order_relaxed); + total_consumed.fetch_add(1, std::memory_order_relaxed); + } else if (producers_done.load(std::memory_order_acquire) == cfg.producers) { + // Drain remaining + while ((item = queue.pop()) != nullptr) { + consumed_count[item_index(item)].fetch_add(1, std::memory_order_relaxed); + total_consumed.fetch_add(1, std::memory_order_relaxed); + } + break; + } + } + }; + + std::vector<std::thread> threads; + for (int i = 0; i < cfg.producers; i++) + threads.emplace_back(producer, i); + for (int i = 0; i < cfg.consumers; i++) + threads.emplace_back(consumer); + for (auto &t : threads) + t.join(); + + EXPECT_EQ(total_consumed.load(), total); + for (int i = 0; i < total; i++) { + EXPECT_EQ(consumed_count[i].load(), 1) + << "Item " << i << " consumed " << consumed_count[i].load() << " times (expected 1)"; + } +} + +INSTANTIATE_TEST_SUITE_P( + MPMCVariants, ReadyQueueMPMCTest, + ::testing::Values( + MPMCConfig{2, 2, 200}, // TwoProducersTwoConsumers + MPMCConfig{1, 4, 500}, // OneProducerNConsumers + MPMCConfig{4, 4, 1250} // HighContentionStress + ) +); + +// ============================================================================= +// LocalReadyBuffer +// ============================================================================= + +class LocalReadyBufferTest : public ::testing::Test { +protected: + static constexpr int CAPACITY = 8; + + PTO2LocalReadyBuffer buffer; + PTO2TaskSlotState *backing[CAPACITY]; + + void SetUp() override { buffer.reset(backing, CAPACITY); } +}; + +// --- Normal path --- + +TEST_F(LocalReadyBufferTest, PopEmptyReturnsNullptr) { EXPECT_EQ(buffer.pop(), nullptr); } + +// LIFO dispatch: try_push appends at count++, pop returns slot_states[--count]. 
+TEST_F(LocalReadyBufferTest, LIFOOrdering) { + PTO2TaskSlotState a, b; + + ASSERT_TRUE(buffer.try_push(&a)); + ASSERT_TRUE(buffer.try_push(&b)); + + EXPECT_EQ(buffer.pop(), &b); + EXPECT_EQ(buffer.pop(), &a); + EXPECT_EQ(buffer.pop(), nullptr); +} + +TEST_F(LocalReadyBufferTest, TryPushFullReturnsFalse) { + PTO2TaskSlotState items[CAPACITY + 1]; + + for (int i = 0; i < CAPACITY; i++) { + ASSERT_TRUE(buffer.try_push(&items[i])); + } + + EXPECT_FALSE(buffer.try_push(&items[CAPACITY])); +} + +TEST_F(LocalReadyBufferTest, ResetSetsCleanState) { + EXPECT_EQ(buffer.pop(), nullptr) << "Fresh buffer is empty"; + + PTO2TaskSlotState a, b; + ASSERT_TRUE(buffer.try_push(&a)); + ASSERT_TRUE(buffer.try_push(&b)); + + buffer.reset(backing, CAPACITY); + EXPECT_EQ(buffer.pop(), nullptr) << "Buffer is empty after reset"; + + PTO2TaskSlotState items[CAPACITY]; + for (int i = 0; i < CAPACITY; i++) { + EXPECT_TRUE(buffer.try_push(&items[i])); + } + EXPECT_FALSE(buffer.try_push(&a)) << "Full after pushing capacity items post-reset"; +} + +// --- Boundary conditions --- + +TEST_F(LocalReadyBufferTest, NullBackingBuffer) { + PTO2LocalReadyBuffer buf; + buf.reset(nullptr, 0); + + PTO2TaskSlotState item{}; + EXPECT_FALSE(buf.try_push(&item)) << "Push fails with null backing"; + EXPECT_EQ(buf.pop(), nullptr) << "Pop returns null with null backing"; +} diff --git a/tests/ut/cpp/a2a3/test_scheduler_state.cpp b/tests/ut/cpp/a2a3/test_scheduler_state.cpp new file mode 100644 index 000000000..13647c320 --- /dev/null +++ b/tests/ut/cpp/a2a3/test_scheduler_state.cpp @@ -0,0 +1,197 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Unit tests for PTO2SchedulerState from pto_scheduler.h + * + * Tests task state transitions, fanin/fanout logic, subtask completion. 
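+ * + * Slot lifecycle exercised below (state names from pto_scheduler.h): + * + * PENDING -> READY -> RUNNING -> COMPLETED -> CONSUMED + * + * The COMPLETED -> CONSUMED edge is a CAS that fires only once + * fanout_refcount reaches fanout_count (see the check_and_handle_consumed + * tests).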
+ */ + +#include <gtest/gtest.h> + +#include <cstdint> +#include <cstring> + +#include "scheduler/pto_scheduler.h" + +class SchedulerStateTest : public ::testing::Test { +protected: + PTO2SchedulerState sched; + PTO2SharedMemoryHandle *sm_handle = nullptr; + + void SetUp() override { + sm_handle = pto2_sm_create_default(); + ASSERT_NE(sm_handle, nullptr); + bool ok = pto2_scheduler_init(&sched, sm_handle->header); + ASSERT_TRUE(ok); + } + + void TearDown() override { + pto2_scheduler_destroy(&sched); + if (sm_handle) { + pto2_sm_destroy(sm_handle); + } + } + + void init_slot( + PTO2TaskSlotState &slot, PTO2TaskState state, int32_t fanin_count, int32_t fanout_count, uint8_t ring_id = 0 + ) { + memset(&slot, 0, sizeof(slot)); + slot.task_state.store(state); + slot.fanin_count = fanin_count; + slot.fanin_refcount.store(0); + slot.fanout_count = fanout_count; + slot.fanout_refcount.store(0); + slot.fanout_lock.store(0); + slot.fanout_head = nullptr; + slot.ring_id = ring_id; + slot.active_mask = PTO2_SUBTASK_MASK_AIC; + slot.completed_subtasks.store(0); + slot.total_required_subtasks = 1; + slot.logical_block_num = 1; + } +}; + +// ============================================================================= +// check_and_handle_consumed +// ============================================================================= + +TEST_F(SchedulerStateTest, ConsumedNotReady) { + alignas(64) PTO2TaskSlotState slot; + init_slot(slot, PTO2_TASK_COMPLETED, 1, 2); + slot.fanout_refcount.store(1); // 1 != 2 + + sched.check_and_handle_consumed(slot); + EXPECT_EQ(slot.task_state.load(), PTO2_TASK_COMPLETED); +} + +TEST_F(SchedulerStateTest, ConsumedTransition) { + alignas(64) PTO2TaskSlotState slot; + init_slot(slot, PTO2_TASK_COMPLETED, 1, 2); + slot.fanout_refcount.store(2); // matches fanout_count + + sched.check_and_handle_consumed(slot); + EXPECT_EQ(slot.task_state.load(), PTO2_TASK_CONSUMED); +} + +TEST_F(SchedulerStateTest, ConsumedNotCompletedState) { + alignas(64) PTO2TaskSlotState slot; + init_slot(slot, PTO2_TASK_RUNNING, 1, 1); + slot.fanout_refcount.store(1); + + sched.check_and_handle_consumed(slot); + // CAS fails because state is RUNNING, not COMPLETED + EXPECT_EQ(slot.task_state.load(), PTO2_TASK_RUNNING); +} + +TEST_F(SchedulerStateTest, ConsumedIdempotent) { + alignas(64) PTO2TaskSlotState slot; + init_slot(slot, PTO2_TASK_CONSUMED, 1, 1); + slot.fanout_refcount.store(1); + + sched.check_and_handle_consumed(slot); + EXPECT_EQ(slot.task_state.load(), PTO2_TASK_CONSUMED); +} + +// ============================================================================= +// release_producer +// ============================================================================= + +TEST_F(SchedulerStateTest, ReleaseProducerIncrements) { + alignas(64) PTO2TaskSlotState slot; + init_slot(slot, PTO2_TASK_COMPLETED, 1, 3); + + sched.release_producer(slot); + EXPECT_EQ(slot.fanout_refcount.load(), 1); + + sched.release_producer(slot); + EXPECT_EQ(slot.fanout_refcount.load(), 2); +} + +TEST_F(SchedulerStateTest, ReleaseProducerTriggersConsumed) { + alignas(64) PTO2TaskSlotState slot; + init_slot(slot, PTO2_TASK_COMPLETED, 1, 2); + slot.fanout_refcount.store(1); // One away + + sched.release_producer(slot); + EXPECT_EQ(slot.task_state.load(), PTO2_TASK_CONSUMED); +} + +// ============================================================================= +// on_subtask_complete +// ============================================================================= + +TEST_F(SchedulerStateTest, SubtaskCompleteSingle) { + alignas(64) PTO2TaskSlotState slot; + 
init_slot(slot, PTO2_TASK_RUNNING, 1, 1); + slot.total_required_subtasks = 1; + slot.completed_subtasks.store(0); + + EXPECT_TRUE(sched.on_subtask_complete(slot)); +} + +TEST_F(SchedulerStateTest, SubtaskCompleteMultiBlock) { + alignas(64) PTO2TaskSlotState slot; + init_slot(slot, PTO2_TASK_RUNNING, 1, 1); + slot.total_required_subtasks = 6; // 3 cores * 2 blocks + slot.completed_subtasks.store(0); + + for (int i = 0; i < 5; i++) { + EXPECT_FALSE(sched.on_subtask_complete(slot)); + } + EXPECT_TRUE(sched.on_subtask_complete(slot)); +} + +// ============================================================================= +// on_scope_end +// ============================================================================= + +TEST_F(SchedulerStateTest, ScopeEndBatchRelease) { + constexpr int N = 4; + alignas(64) PTO2TaskSlotState slots[N]; + PTO2TaskSlotState *ptrs[N]; + + for (int i = 0; i < N; i++) { + init_slot(slots[i], PTO2_TASK_COMPLETED, 1, 2); + ptrs[i] = &slots[i]; + } + + sched.on_scope_end(ptrs, N); + + for (int i = 0; i < N; i++) { + EXPECT_EQ(slots[i].fanout_refcount.load(), 1); + } +} + +// ============================================================================= +// get_ready_tasks_batch: local buffer first +// ============================================================================= + +TEST_F(SchedulerStateTest, GetReadyTasksBatchLocalFirst) { + alignas(64) PTO2TaskSlotState slot_a, slot_b; + init_slot(slot_a, PTO2_TASK_READY, 0, 1); + init_slot(slot_b, PTO2_TASK_PENDING, 1, 1); + + PTO2TaskSlotState *local_buf_storage[4]; + PTO2LocalReadyBuffer local_buf; + local_buf.reset(local_buf_storage, 4); + local_buf.try_push(&slot_a); + + // Use src API to route slot_b into the global ready queue + sched.release_fanin_and_check_ready(slot_b); + + PTO2TaskSlotState *out[4]; + int count = sched.get_ready_tasks_batch(PTO2ResourceShape::AIC, local_buf, out, 4); + + EXPECT_EQ(count, 2); + // Local buffer drains first (LIFO), so slot_a comes first + EXPECT_EQ(out[0], &slot_a); + EXPECT_EQ(out[1], &slot_b); +} diff --git a/tests/ut/cpp/a2a3/test_shared_memory.cpp b/tests/ut/cpp/a2a3/test_shared_memory.cpp new file mode 100644 index 000000000..ffcbb7821 --- /dev/null +++ b/tests/ut/cpp/a2a3/test_shared_memory.cpp @@ -0,0 +1,191 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Unit tests for PTO2SharedMemory layout from pto_shared_memory.h + * + * Tests creation, validation, per-ring independence, alignment, size + * calculation, and error handling. + * + * Design contracts: + * + * - pto2_sm_validate checks `top > heap_size`. top == heap_size is a + * legitimate "filled exactly to end" state, so strict > is correct. + * + * - Zero window size: if pto2_sm_calculate_size() is called with 0, all ring + * descriptors/payloads alias the same address. 
Current entry path + * (pto2_sm_create) is called only with valid sizes, but there is no + * explicit guard. pto2_sm_create should reject task_window_size==0. + * + * - Flow control heap_top validation: validate() does not verify + * heap_top <= heap_size. After a corruption, heap_top could exceed + * heap_size without detection. validate should check both bounds. + */ + +#include <gtest/gtest.h> +#include <cstdint> +#include "pto_shared_memory.h" + +// ============================================================================= +// Fixture (default-created handle) +// ============================================================================= + +class SharedMemoryTest : public ::testing::Test { +protected: + PTO2SharedMemoryHandle *handle = nullptr; + + void SetUp() override { + handle = pto2_sm_create_default(); + ASSERT_NE(handle, nullptr); + } + + void TearDown() override { + if (handle) { + pto2_sm_destroy(handle); + handle = nullptr; + } + } +}; + +// ============================================================================= +// Normal path +// ============================================================================= + +TEST_F(SharedMemoryTest, CreateDefaultReturnsNonNull) { + EXPECT_NE(handle->sm_base, nullptr); + EXPECT_GT(handle->sm_size, 0u); +} + +TEST_F(SharedMemoryTest, IsOwner) { EXPECT_TRUE(handle->is_owner); } + +TEST_F(SharedMemoryTest, HeaderInitValues) { + auto *hdr = handle->header; + EXPECT_EQ(hdr->orchestrator_done.load(), 0); + EXPECT_EQ(hdr->orch_error_code.load(), 0); + EXPECT_EQ(hdr->sched_error_bitmap.load(), 0); + EXPECT_EQ(hdr->sched_error_code.load(), 0); + + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + auto &fc = hdr->rings[r].fc; + EXPECT_EQ(fc.current_task_index.load(), 0); + EXPECT_EQ(fc.last_task_alive.load(), 0); + } +} + +TEST_F(SharedMemoryTest, Validate) { EXPECT_TRUE(pto2_sm_validate(handle)); } + +TEST_F(SharedMemoryTest, PerRingIndependence) { + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + EXPECT_NE(handle->header->rings[r].task_descriptors, nullptr) << "Ring " << r; + EXPECT_NE(handle->header->rings[r].task_payloads, nullptr) << "Ring " << r; + } + for (int r = 1; r < PTO2_MAX_RING_DEPTH; r++) { + EXPECT_NE(handle->header->rings[r].task_descriptors, handle->header->rings[0].task_descriptors) << "Ring " << r; + } +} + +TEST_F(SharedMemoryTest, PointerAlignment) { + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + auto addr = reinterpret_cast<uintptr_t>(handle->header->rings[r].task_descriptors); + EXPECT_EQ(addr % PTO2_ALIGN_SIZE, 0u) << "Ring " << r << " descriptors not aligned"; + } +} + +TEST_F(SharedMemoryTest, HeaderAlignment) { + uintptr_t header_addr = (uintptr_t)handle->header; + EXPECT_EQ(header_addr % PTO2_ALIGN_SIZE, 0u) << "Header must be cache-line aligned"; +} + +// Descriptor and payload regions don't overlap within or across rings. 
+TEST_F(SharedMemoryTest, RegionsNonOverlapping) { + uint64_t ws = 64; // Use a known window size for byte arithmetic + PTO2SharedMemoryHandle *h = pto2_sm_create(ws, 4096); + ASSERT_NE(h, nullptr); + + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + uintptr_t desc_start = (uintptr_t)h->header->rings[r].task_descriptors; + uintptr_t desc_end = desc_start + ws * sizeof(PTO2TaskDescriptor); + uintptr_t payload_start = (uintptr_t)h->header->rings[r].task_payloads; + + EXPECT_GE(payload_start, desc_end) << "Ring " << r << ": payload region should not overlap descriptors"; + } + + for (int r = 0; r < PTO2_MAX_RING_DEPTH - 1; r++) { + uintptr_t this_payload_end = (uintptr_t)h->header->rings[r].task_payloads + ws * sizeof(PTO2TaskPayload); + uintptr_t next_desc_start = (uintptr_t)h->header->rings[r + 1].task_descriptors; + EXPECT_GE(next_desc_start, this_payload_end) << "Ring " << r << " and " << (r + 1) << " should not overlap"; + } + + pto2_sm_destroy(h); +} + +// ============================================================================= +// Size calculation +// ============================================================================= + +TEST(SharedMemoryCalcSize, NonZero) { + uint64_t size = pto2_sm_calculate_size(PTO2_TASK_WINDOW_SIZE); + EXPECT_GT(size, 0u); +} + +TEST(SharedMemoryCalcSize, LargerWindowGivesLargerSize) { + uint64_t small_size = pto2_sm_calculate_size(64); + uint64_t large_size = pto2_sm_calculate_size(256); + EXPECT_GT(large_size, small_size); +} + +TEST(SharedMemoryCalcSize, HeaderAligned) { EXPECT_EQ(sizeof(PTO2SharedMemoryHeader) % PTO2_ALIGN_SIZE, 0u); } + +TEST(SharedMemoryCalcSize, PerRingDifferentSizes) { + uint64_t ws[PTO2_MAX_RING_DEPTH] = {128, 256, 512, 1024}; + uint64_t size = pto2_sm_calculate_size_per_ring(ws); + + uint64_t uniform_size = pto2_sm_calculate_size(128); + EXPECT_GT(size, uniform_size); +} + +// ============================================================================= +// Boundary conditions +// ============================================================================= + +// Zero window size: all ring descriptors collapse to same address. +TEST(SharedMemoryBoundary, ZeroWindowSize) { + uint64_t size = pto2_sm_calculate_size(0); + uint64_t header_size = PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE); + EXPECT_EQ(size, header_size); + + PTO2SharedMemoryHandle *h = pto2_sm_create(0, 4096); + if (h) { + for (int r = 0; r < PTO2_MAX_RING_DEPTH - 1; r++) { + EXPECT_EQ(h->header->rings[r].task_descriptors, h->header->rings[r + 1].task_descriptors) + << "Zero window: all rings' descriptor pointers collapse to same address"; + } + pto2_sm_destroy(h); + } +} + +TEST(SharedMemoryBoundary, ValidateDetectsCorruption) { + PTO2SharedMemoryHandle *h = pto2_sm_create(256, 4096); + ASSERT_NE(h, nullptr); + EXPECT_TRUE(pto2_sm_validate(h)); + + h->header->rings[0].fc.current_task_index.store(-1); + EXPECT_FALSE(pto2_sm_validate(h)); + + pto2_sm_destroy(h); +} + +TEST(SharedMemoryBoundary, ValidateNullHandle) { EXPECT_FALSE(pto2_sm_validate(nullptr)); } + +TEST(SharedMemoryBoundary, CreateFromUndersizedBuffer) { + char buf[64]{}; + PTO2SharedMemoryHandle *h = pto2_sm_create_from_buffer(buf, 64, 256, 4096); + EXPECT_EQ(h, nullptr) << "Undersized buffer should fail"; +} diff --git a/tests/ut/cpp/a2a3/test_spsc_queue.cpp b/tests/ut/cpp/a2a3/test_spsc_queue.cpp new file mode 100644 index 000000000..a2c80ca05 --- /dev/null +++ b/tests/ut/cpp/a2a3/test_spsc_queue.cpp @@ -0,0 +1,293 @@ +/* + * Copyright (c) PyPTO Contributors. 
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Unit tests for PTO2SpscQueue from pto_scheduler.h + * + * Tests the Rigtorp cached-index SPSC queue used as the orchestrator → + * scheduler wiring channel: + * - Basic push / pop_batch correctness + * - Full / empty detection (including cached-index lazy refresh) + * - Wrap-around via modulo indexing + * - Usable capacity is capacity-1 (one slot reserved as a sentinel) + * - pop_batch partial reads + * - size() accuracy + */ + +#include <gtest/gtest.h> + +#include <cstring> +#include <thread> +#include <vector> + +#include "scheduler/pto_scheduler.h" + +// ============================================================================= +// Fixture +// ============================================================================= + +class SpscQueueTest : public ::testing::Test { +protected: + static constexpr uint64_t CAPACITY = 16; // must be power of 2 + + PTO2SpscQueue queue{}; + // Dummy slot states used as push values + alignas(64) PTO2TaskSlotState slots[64]{}; + + void SetUp() override { + memset(&queue, 0, sizeof(queue)); + ASSERT_TRUE(queue.init(CAPACITY)); + } + + void TearDown() override { queue.destroy(); } +}; + +// ============================================================================= +// Initialization +// ============================================================================= + +TEST_F(SpscQueueTest, InitValidState) { + EXPECT_EQ(queue.size(), 0u); + EXPECT_EQ(queue.mask_, CAPACITY - 1); + EXPECT_NE(queue.buffer_, nullptr); +} + +TEST_F(SpscQueueTest, InitRejectsNonPowerOfTwo) { + PTO2SpscQueue bad{}; + EXPECT_FALSE(bad.init(3)); + EXPECT_FALSE(bad.init(7)); + EXPECT_FALSE(bad.init(0)); +} + +TEST_F(SpscQueueTest, InitAcceptsPowerOfTwo) { + PTO2SpscQueue q{}; + EXPECT_TRUE(q.init(4)); + q.destroy(); + EXPECT_TRUE(q.init(1024)); + q.destroy(); +} + +// ============================================================================= +// Basic push / pop +// ============================================================================= + +TEST_F(SpscQueueTest, PushPopSingle) { + EXPECT_TRUE(queue.push(&slots[0])); + EXPECT_EQ(queue.size(), 1u); + + PTO2TaskSlotState *out[1]; + int count = queue.pop_batch(out, 1); + ASSERT_EQ(count, 1); + EXPECT_EQ(out[0], &slots[0]); + EXPECT_EQ(queue.size(), 0u); +} + +TEST_F(SpscQueueTest, FIFOOrdering) { + for (int i = 0; i < 5; i++) { + ASSERT_TRUE(queue.push(&slots[i])); + } + + PTO2TaskSlotState *out[5]; + int count = queue.pop_batch(out, 5); + ASSERT_EQ(count, 5); + for (int i = 0; i < 5; i++) { + EXPECT_EQ(out[i], &slots[i]) << "FIFO order violated at i=" << i; + } +} + +TEST_F(SpscQueueTest, PopBatchPartial) { + for (int i = 0; i < 3; i++) { + queue.push(&slots[i]); + } + + // Request 5 but only 3 available + PTO2TaskSlotState *out[5]; + int count = queue.pop_batch(out, 5); + EXPECT_EQ(count, 3); +} + +TEST_F(SpscQueueTest, PopBatchEmpty) { + PTO2TaskSlotState *out[5]; + int count = 
queue.pop_batch(out, 5); + EXPECT_EQ(count, 0); +} + +// ============================================================================= +// Full detection +// ============================================================================= + +TEST_F(SpscQueueTest, FullReturnsFalse) { + // Usable capacity = CAPACITY - 1 = 15 + for (uint64_t i = 0; i < CAPACITY - 1; i++) { + ASSERT_TRUE(queue.push(&slots[i])) << "push failed at i=" << i; + } + EXPECT_EQ(queue.size(), CAPACITY - 1); + + // Queue full + EXPECT_FALSE(queue.push(&slots[CAPACITY - 1])) << "Push to full queue must return false"; +} + +TEST_F(SpscQueueTest, UsableCapacityIsCapacityMinusOne) { + int pushed = 0; + while (queue.push(&slots[pushed % 64])) { + pushed++; + if (pushed > 100) break; // safety + } + EXPECT_EQ(pushed, static_cast<int>(CAPACITY - 1)); +} + +// ============================================================================= +// Full then recover +// ============================================================================= + +TEST_F(SpscQueueTest, FullThenPopThenPush) { + for (uint64_t i = 0; i < CAPACITY - 1; i++) { + queue.push(&slots[i]); + } + EXPECT_FALSE(queue.push(&slots[0])); + + // Pop one + PTO2TaskSlotState *out[1]; + int count = queue.pop_batch(out, 1); + ASSERT_EQ(count, 1); + + // Now push should succeed + EXPECT_TRUE(queue.push(&slots[0])); +} + +// ============================================================================= +// Wrap-around +// ============================================================================= + +TEST_F(SpscQueueTest, WrapAroundCorrectness) { + // Push-pop cycles to advance head/tail past capacity boundary + for (int cycle = 0; cycle < 100; cycle++) { + ASSERT_TRUE(queue.push(&slots[cycle % 64])) << "push failed at cycle=" << cycle; + PTO2TaskSlotState *out[1]; + int count = queue.pop_batch(out, 1); + ASSERT_EQ(count, 1) << "pop_batch failed at cycle=" << cycle; + EXPECT_EQ(out[0], &slots[cycle % 64]); + } + EXPECT_EQ(queue.size(), 0u); +} + +TEST_F(SpscQueueTest, WrapAroundBatchCorrectness) { + // Multiple cycles of batch push/pop across wrap boundary + for (int cycle = 0; cycle < 20; cycle++) { + int batch = 5; + for (int i = 0; i < batch; i++) { + ASSERT_TRUE(queue.push(&slots[(cycle * batch + i) % 64])); + } + PTO2TaskSlotState *out[5]; + int count = queue.pop_batch(out, batch); + ASSERT_EQ(count, batch); + for (int i = 0; i < batch; i++) { + EXPECT_EQ(out[i], &slots[(cycle * batch + i) % 64]); + } + } +} + +// ============================================================================= +// size() accuracy +// ============================================================================= + +TEST_F(SpscQueueTest, SizeTracksOperations) { + EXPECT_EQ(queue.size(), 0u); + + queue.push(&slots[0]); + EXPECT_EQ(queue.size(), 1u); + + queue.push(&slots[1]); + queue.push(&slots[2]); + EXPECT_EQ(queue.size(), 3u); + + PTO2TaskSlotState *out[2]; + queue.pop_batch(out, 2); + EXPECT_EQ(queue.size(), 1u); + + queue.pop_batch(out, 1); + EXPECT_EQ(queue.size(), 0u); +} + +// ============================================================================= +// Producer-consumer (two threads) +// ============================================================================= + +TEST_F(SpscQueueTest, TwoThreadProducerConsumer) { + constexpr int TOTAL = 10000; + std::vector<PTO2TaskSlotState *> consumed; + consumed.reserve(TOTAL); + + // Use a large pool of slot states for unique pointers + std::vector<PTO2TaskSlotState> big_pool(TOTAL); + + std::thread producer([&]() { + for (int i = 0; i < TOTAL; i++) { + while 
(!queue.push(&big_pool[i])) { + // spin + } + } + }); + + std::thread consumer([&]() { + int total = 0; + PTO2TaskSlotState *out[32]; + while (total < TOTAL) { + int count = queue.pop_batch(out, 32); + for (int i = 0; i < count; i++) { + consumed.push_back(out[i]); + } + total += count; + } + }); + + producer.join(); + consumer.join(); + + ASSERT_EQ(consumed.size(), static_cast<size_t>(TOTAL)); + // Verify FIFO order + for (int i = 0; i < TOTAL; i++) { + EXPECT_EQ(consumed[i], &big_pool[i]) << "FIFO violated at i=" << i; + } +} + +// ============================================================================= +// Cached index behavior +// ============================================================================= + +TEST_F(SpscQueueTest, CachedIndexLazyRefresh) { + // Fill queue + for (uint64_t i = 0; i < CAPACITY - 1; i++) { + queue.push(&slots[i]); + } + + // Consumer pops all + PTO2TaskSlotState *out[16]; + int count = queue.pop_batch(out, CAPACITY); + EXPECT_EQ(count, static_cast<int>(CAPACITY - 1)); + + // Producer's tail_cached_ is stale (still thinks queue is full) + // Next push should refresh tail_cached_ and succeed + EXPECT_TRUE(queue.push(&slots[0])); +} + +TEST_F(SpscQueueTest, CachedIndexConsumerRefresh) { + // Consumer calls pop_batch on empty queue (head_cached_ is 0) + PTO2TaskSlotState *out[1]; + EXPECT_EQ(queue.pop_batch(out, 1), 0); + + // Producer pushes + queue.push(&slots[0]); + + // Consumer's head_cached_ is stale, pop_batch must refresh + int count = queue.pop_batch(out, 1); + EXPECT_EQ(count, 1); + EXPECT_EQ(out[0], &slots[0]); +} diff --git a/tests/ut/cpp/a2a3/test_task_allocator.cpp b/tests/ut/cpp/a2a3/test_task_allocator.cpp new file mode 100644 index 000000000..383003900 --- /dev/null +++ b/tests/ut/cpp/a2a3/test_task_allocator.cpp @@ -0,0 +1,407 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Unit tests for PTO2TaskAllocator from pto_ring_buffer.h + * + * Tests ring buffer allocation, heap bump logic, wrap-around, alignment, + * task window flow control, and heap_available semantics. + * + * The allocator is single-threaded (orchestrator thread), so no concurrency + * tests are needed. The unified PTO2TaskAllocator replaces the previous + * separate PTO2HeapRing + PTO2TaskRing. + * + * Design contracts (try_bump_heap): + * + * - Wrap-around guard uses `tail > alloc_size` (strict >). When + * tail == alloc_size the wrap branch returns nullptr. Allowing it + * would create top == tail (full/empty ambiguity). Strict > + * sacrifices one quantum of capacity. + * + * - heap_available() returns max(at_end, at_begin), not the sum. + * A single allocation cannot split across the wrap boundary. + * + * - Zero-size allocation is a no-op returning the current top. + * Two consecutive zero-size allocs return the SAME pointer. 
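+ * + * Illustrative restatement of the zero-size contract: + * + * auto a = allocator.alloc(0); // no heap bump + * auto b = allocator.alloc(0); // b.packed_base == a.packed_base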
+
+#include <gtest/gtest.h>
+
+#include <algorithm>
+#include <atomic>
+#include <cstring>
+#include <set>
+#include <vector>
+
+#include "pto_ring_buffer.h"
+
+// =============================================================================
+// Helpers
+//
+// WHITE-BOX: consume_up_to simulates the scheduler consuming tasks by directly
+// writing descriptor.packed_buffer_end and advancing last_alive. This binds
+// to the internal tail-derivation mechanism. If the allocator's reclaim
+// protocol changes (e.g. explicit tail field instead of packed_buffer_end),
+// this helper and all wrap/reclaim tests must be updated.
+// =============================================================================
+
+static void consume_up_to(
+  std::vector<PTO2TaskDescriptor> &descriptors, std::atomic<int32_t> &last_alive, void *heap_base,
+  int32_t window_size, int32_t new_last_alive, uint64_t heap_tail_offset
+) {
+  int32_t last_consumed = new_last_alive - 1;
+  descriptors[last_consumed & (window_size - 1)].packed_buffer_end =
+    static_cast<uint8_t *>(heap_base) + heap_tail_offset;
+  last_alive.store(new_last_alive, std::memory_order_release);
+}
+
+// =============================================================================
+// Fixture
+// =============================================================================
+
+class TaskAllocatorTest : public ::testing::Test {
+protected:
+  static constexpr int32_t WINDOW_SIZE = 16;
+  static constexpr uint64_t HEAP_SIZE = 4096;
+
+  std::vector<PTO2TaskDescriptor> descriptors;
+  alignas(64) uint8_t heap_buf[HEAP_SIZE]{};
+  std::atomic<int32_t> current_index{0};
+  std::atomic<int32_t> last_alive{0};
+  std::atomic<int32_t> error_code{PTO2_ERROR_NONE};
+  PTO2TaskAllocator allocator{};
+
+  void SetUp() override {
+    descriptors.assign(WINDOW_SIZE, PTO2TaskDescriptor{});
+    std::memset(heap_buf, 0, sizeof(heap_buf));
+    current_index.store(0);
+    last_alive.store(0);
+    error_code.store(PTO2_ERROR_NONE);
+    allocator.init(descriptors.data(), WINDOW_SIZE, &current_index, &last_alive, heap_buf, HEAP_SIZE, &error_code);
+  }
+};
+
+// =============================================================================
+// Normal path
+// =============================================================================
+
+TEST_F(TaskAllocatorTest, InitialState) {
+  EXPECT_EQ(allocator.window_size(), WINDOW_SIZE);
+  EXPECT_EQ(allocator.active_count(), 0);
+  EXPECT_EQ(allocator.heap_top(), 0u);
+  EXPECT_EQ(allocator.heap_capacity(), HEAP_SIZE);
+  EXPECT_EQ(allocator.heap_available(), HEAP_SIZE);
+}
+
+TEST_F(TaskAllocatorTest, AllocNonZeroSize) {
+  auto result = allocator.alloc(100);
+  ASSERT_FALSE(result.failed());
+  EXPECT_EQ(result.task_id, 0);
+  EXPECT_EQ(result.slot, 0);
+  EXPECT_NE(result.packed_base, nullptr);
+  // 100 bytes aligned up to PTO2_ALIGN_SIZE (64) = 128
+  uint64_t expected_aligned = PTO2_ALIGN_UP(100u, PTO2_ALIGN_SIZE);
+  EXPECT_EQ(expected_aligned, 128u);
+  EXPECT_EQ(allocator.heap_top(), expected_aligned);
+  EXPECT_EQ(
+    static_cast<uint8_t *>(result.packed_end) - static_cast<uint8_t *>(result.packed_base),
+    static_cast<int64_t>(expected_aligned)
+  );
+}
+
+TEST_F(TaskAllocatorTest, SequentialTaskIds) {
+  int32_t prev_id = -1;
+  for (int i = 0; i < 5; i++) {
+    auto result = allocator.alloc(0);
+    ASSERT_FALSE(result.failed()) << "Alloc failed at i=" << i;
+    EXPECT_EQ(result.task_id, prev_id + 1) << "Task IDs must be monotonically increasing";
+    EXPECT_EQ(result.slot, result.task_id & (WINDOW_SIZE - 1));
+    prev_id = result.task_id;
+  }
+  EXPECT_EQ(allocator.active_count(), 5);
+}
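+
+// Worked alignment arithmetic, assuming the standard power-of-two round-up
+// form PTO2_ALIGN_UP(n, a) == (n + a - 1) & ~(a - 1) (the macro's exact
+// definition lives in the runtime headers):
+//   PTO2_ALIGN_UP(  1, 64) =  64 & ~63 =  64
+//   PTO2_ALIGN_UP( 33, 64) =  96 & ~63 =  64
+//   PTO2_ALIGN_UP(100, 64) = 163 & ~63 = 128
+// AllocNonZeroSize above and OutputSizeAlignment below assert these values
+// against heap_top().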
+
+TEST_F(TaskAllocatorTest, OutputSizeAlignment) {
+  // 1 byte -> aligned to 64
+  auto r1 = allocator.alloc(1);
+  ASSERT_FALSE(r1.failed());
+  EXPECT_EQ(allocator.heap_top(), 64u);
+
+  // Another 33 bytes -> aligned to 64, total 128
+  auto r2 = allocator.alloc(33);
+  ASSERT_FALSE(r2.failed());
+  EXPECT_EQ(allocator.heap_top(), 128u);
+
+  // Exactly 64 bytes -> stays 64, total 192
+  auto r3 = allocator.alloc(64);
+  ASSERT_FALSE(r3.failed());
+  EXPECT_EQ(allocator.heap_top(), 192u);
+}
+
+TEST_F(TaskAllocatorTest, SlotMappingPowerOfTwoWindow) {
+  std::set<int32_t> slots;
+  for (int i = 0; i < WINDOW_SIZE; i++) {
+    consume_up_to(descriptors, last_alive, heap_buf, WINDOW_SIZE, i, 0);
+    auto r = allocator.alloc(0);
+    ASSERT_FALSE(r.failed());
+    EXPECT_EQ(r.slot, r.task_id & (WINDOW_SIZE - 1));
+    slots.insert(r.slot);
+  }
+  EXPECT_EQ(slots.size(), static_cast<size_t>(WINDOW_SIZE))
+    << "Every slot should be visited exactly once over one window cycle";
+}
+
+TEST_F(TaskAllocatorTest, UpdateHeapTailFromConsumedTask) {
+  auto r1 = allocator.alloc(256);
+  ASSERT_FALSE(r1.failed());
+  EXPECT_EQ(allocator.heap_top(), 256u);
+
+  EXPECT_EQ(allocator.heap_available(), HEAP_SIZE - 256u);
+
+  consume_up_to(descriptors, last_alive, heap_buf, WINDOW_SIZE, 1, 256);
+
+  // Force the allocator to observe the new last_alive by doing another alloc
+  auto r2 = allocator.alloc(0);
+  ASSERT_FALSE(r2.failed());
+
+  // top=256, tail=256: at_end = 4096-256=3840, at_begin = 256
+  EXPECT_EQ(allocator.heap_available(), HEAP_SIZE - 256u);
+}
+
+TEST_F(TaskAllocatorTest, UpdateHeapTailAtTask0) {
+  auto r1 = allocator.alloc(64);
+  ASSERT_FALSE(r1.failed());
+  EXPECT_EQ(r1.task_id, 0);
+
+  descriptors[0].packed_buffer_end = static_cast<uint8_t *>(static_cast<void *>(heap_buf)) + 64;
+  last_alive.store(1, std::memory_order_release);
+
+  auto r2 = allocator.alloc(0);
+  ASSERT_FALSE(r2.failed());
+  EXPECT_EQ(r2.task_id, 1);
+}
+
+TEST_F(TaskAllocatorTest, UpdateHeapTailIdempotent) {
+  auto r1 = allocator.alloc(128);
+  ASSERT_FALSE(r1.failed());
+
+  consume_up_to(descriptors, last_alive, heap_buf, WINDOW_SIZE, 1, 128);
+
+  auto r2 = allocator.alloc(0);
+  ASSERT_FALSE(r2.failed());
+  uint64_t avail_after_first = allocator.heap_available();
+
+  auto r3 = allocator.alloc(0);
+  ASSERT_FALSE(r3.failed());
+  EXPECT_EQ(allocator.heap_available(), avail_after_first);
+}
+
+TEST_F(TaskAllocatorTest, HeapAvailableTopGeTail) {
+  EXPECT_EQ(allocator.heap_available(), HEAP_SIZE);
+
+  auto r1 = allocator.alloc(256);
+  ASSERT_FALSE(r1.failed());
+  EXPECT_EQ(allocator.heap_available(), HEAP_SIZE - 256u);
+}
+
+TEST_F(TaskAllocatorTest, HeapAvailableTopLtTail) {
+  auto r1 = allocator.alloc(HEAP_SIZE - 64);
+  ASSERT_FALSE(r1.failed());
+  consume_up_to(descriptors, last_alive, heap_buf, WINDOW_SIZE, 1, HEAP_SIZE - 64);
+
+  auto r2 = allocator.alloc(128);
+  ASSERT_FALSE(r2.failed());
+  // top=128, tail=HEAP_SIZE-64: available = (HEAP_SIZE-64) - 128
+  EXPECT_EQ(allocator.heap_available(), HEAP_SIZE - 64 - 128);
+}
+
+// =============================================================================
+// Boundary conditions
+// =============================================================================
+
+TEST_F(TaskAllocatorTest, HeapExactFitAtEnd) {
+  // Allocate 4032 bytes to leave exactly 64 at end.
+  auto r1 = allocator.alloc(HEAP_SIZE - 64);
+  ASSERT_FALSE(r1.failed());
+  EXPECT_EQ(allocator.heap_top(), HEAP_SIZE - 64u);
+
+  auto r2 = allocator.alloc(64);
+  ASSERT_FALSE(r2.failed());
+  EXPECT_EQ(allocator.heap_top(), HEAP_SIZE);
+  EXPECT_EQ(static_cast<uint8_t *>(r2.packed_base), reinterpret_cast<uint8_t *>(heap_buf) + HEAP_SIZE - 64);
+}
+
+// Wrap guard `tail > alloc_size` uses strict > to prevent full/empty ambiguity.
+// If the allocation were allowed, heap_top would advance to alloc_size == tail,
+// making top == tail. Because top == tail is the canonical "empty" state, the
+// ring could not distinguish "completely full" from "completely empty".
+TEST_F(TaskAllocatorTest, HeapWrapGuardRejectsTailEqualsAllocSize) {
+  auto r1 = allocator.alloc(HEAP_SIZE);
+  ASSERT_FALSE(r1.failed());
+  EXPECT_EQ(allocator.heap_top(), HEAP_SIZE);
+
+  consume_up_to(descriptors, last_alive, heap_buf, WINDOW_SIZE, 1, 64);
+
+  auto r2 = allocator.alloc(64);
+  EXPECT_TRUE(r2.failed()) << "wrap guard must reject when tail == alloc_size (full/empty ambiguity)";
+}
+
+TEST_F(TaskAllocatorTest, HeapWrapAroundSuccess) {
+  auto r1 = allocator.alloc(HEAP_SIZE);
+  ASSERT_FALSE(r1.failed());
+
+  consume_up_to(descriptors, last_alive, heap_buf, WINDOW_SIZE, 1, 128);
+
+  auto r2 = allocator.alloc(64);
+  ASSERT_FALSE(r2.failed());
+  EXPECT_EQ(r2.packed_base, static_cast<void *>(heap_buf));
+  EXPECT_EQ(allocator.heap_top(), 64u);
+}
+
+// Linear-gap guard `tail - top > alloc_size` uses strict > for the same reason.
+TEST_F(TaskAllocatorTest, HeapLinearGapGuardRejectsExactFit) {
+  // Fill most of heap, leaving just 64 at end so next alloc wraps.
+  auto r1 = allocator.alloc(HEAP_SIZE - 64);
+  ASSERT_FALSE(r1.failed());
+  consume_up_to(descriptors, last_alive, heap_buf, WINDOW_SIZE, 1, HEAP_SIZE - 64);
+
+  // Allocate 128 bytes: space_at_end = 64, not enough -> wrap.
+  // tail = HEAP_SIZE-64, which is > 128 -> wraps to beginning.
+  auto r2 = allocator.alloc(128);
+  ASSERT_FALSE(r2.failed());
+  EXPECT_EQ(allocator.heap_top(), 128u);
+
+  // Now top=128, tail=HEAP_SIZE-64 (top < tail)
+  // gap = (HEAP_SIZE-64) - 128 = HEAP_SIZE-192
+  // Allocate exactly gap bytes: gap > alloc_size -> FALSE
+  uint64_t gap = (HEAP_SIZE - 64) - 128;
+  auto r3 = allocator.alloc(gap);
+  EXPECT_TRUE(r3.failed()) << "linear-gap guard must reject exact fit (full/empty ambiguity)";
+}
+
+TEST_F(TaskAllocatorTest, HeapTopLessThanTailInsufficientSpace) {
+  auto r1 = allocator.alloc(HEAP_SIZE - 64);
+  ASSERT_FALSE(r1.failed());
+  consume_up_to(descriptors, last_alive, heap_buf, WINDOW_SIZE, 1, HEAP_SIZE - 64);
+
+  auto r2 = allocator.alloc(128);
+  ASSERT_FALSE(r2.failed());
+
+  // gap = (HEAP_SIZE-64) - 128. Try to allocate more than gap.
+  auto r3 = allocator.alloc(HEAP_SIZE);
+  EXPECT_TRUE(r3.failed());
+  EXPECT_NE(error_code.load(), 0);
+}
+
+// heap_available reports max(at_end, at_begin), not the sum -- a single
+// allocation cannot split across the wrap boundary.
+TEST_F(TaskAllocatorTest, AvailableReportsMaxNotSum) {
+  auto r1 = allocator.alloc(3008);
+  ASSERT_FALSE(r1.failed());
+  uint64_t actual_top = allocator.heap_top();
+
+  consume_up_to(descriptors, last_alive, heap_buf, WINDOW_SIZE, 1, 1024);
+
+  auto r_probe = allocator.alloc(0);
+  ASSERT_FALSE(r_probe.failed());
+
+  uint64_t avail = allocator.heap_available();
+  uint64_t at_end = HEAP_SIZE - actual_top;
+  uint64_t at_begin = 1024;
+  EXPECT_EQ(avail, std::max(at_end, at_begin));
+  EXPECT_LT(avail, at_end + at_begin);
+}
+
+// Zero-size allocs return the same address and don't advance the top.
+TEST_F(TaskAllocatorTest, ZeroSizeAllocationAliased) {
+  auto r1 = allocator.alloc(0);
+  auto r2 = allocator.alloc(0);
+  ASSERT_FALSE(r1.failed());
+  ASSERT_FALSE(r2.failed());
+
+  EXPECT_EQ(r1.packed_base, r2.packed_base) << "Zero-size allocs return same address";
+  EXPECT_EQ(r1.packed_base, r1.packed_end) << "packed_end == packed_base for zero-size";
+  EXPECT_EQ(allocator.heap_top(), 0u) << "top doesn't advance for zero-size allocs";
+}
+
+// Wrap path: wasted space between old top and heap_size is not reclaimed.
+TEST_F(TaskAllocatorTest, WrapPathWastedSpace) {
+  auto r1 = allocator.alloc(4000);
+  ASSERT_FALSE(r1.failed());
+  uint64_t top_after = allocator.heap_top();
+  EXPECT_GE(top_after, 4000u);
+  EXPECT_LT(top_after, HEAP_SIZE);
+
+  consume_up_to(descriptors, last_alive, heap_buf, WINDOW_SIZE, 1, top_after);
+
+  auto r2 = allocator.alloc(128);
+  ASSERT_FALSE(r2.failed());
+  EXPECT_EQ(r2.packed_base, static_cast<void *>(heap_buf)) << "Allocation wrapped to beginning";
+
+  uint64_t avail = allocator.heap_available();
+  EXPECT_LT(avail, HEAP_SIZE) << "Wasted space at end reduces available capacity";
+}
+
+TEST_F(TaskAllocatorTest, AllocExactlyHeapSize) {
+  auto r1 = allocator.alloc(HEAP_SIZE);
+  ASSERT_FALSE(r1.failed());
+  EXPECT_EQ(r1.packed_base, static_cast<void *>(heap_buf));
+  EXPECT_EQ(allocator.heap_top(), HEAP_SIZE);
+
+  auto r2 = allocator.alloc(64);
+  EXPECT_TRUE(r2.failed()) << "No space after full allocation";
+  EXPECT_EQ(error_code.load(), PTO2_ERROR_HEAP_RING_DEADLOCK);
+}
+
+TEST_F(TaskAllocatorTest, AllocLargerThanHeap) {
+  auto r = allocator.alloc(HEAP_SIZE * 2);
+  EXPECT_TRUE(r.failed()) << "Cannot allocate more than heap size";
+  EXPECT_EQ(error_code.load(), PTO2_ERROR_HEAP_RING_DEADLOCK);
+}
+
+TEST_F(TaskAllocatorTest, TaskWindowSaturates) {
+  for (int i = 0; i < WINDOW_SIZE - 1; i++) {
+    auto r = allocator.alloc(0);
+    ASSERT_FALSE(r.failed()) << "Alloc failed at i=" << i;
+    EXPECT_EQ(r.task_id, i);
+  }
+  EXPECT_EQ(allocator.active_count(), WINDOW_SIZE - 1);
+
+  auto overflow = allocator.alloc(0);
+  EXPECT_TRUE(overflow.failed());
+  EXPECT_EQ(error_code.load(), PTO2_ERROR_FLOW_CONTROL_DEADLOCK);
+}
+
+// Task IDs grow monotonically as int32_t. Near INT32_MAX, the usual
+// signed-overflow concern applies but is cosmetic since we use
+// task_id & window_mask for indexing.
+TEST_F(TaskAllocatorTest, TaskIdNearInt32Max) {
+  current_index.store(INT32_MAX - 2);
+  last_alive.store(INT32_MAX - 2);
+  allocator.init(descriptors.data(), WINDOW_SIZE, &current_index, &last_alive, heap_buf, HEAP_SIZE, &error_code);
+
+  auto r1 = allocator.alloc(0);
+  ASSERT_FALSE(r1.failed());
+  EXPECT_EQ(r1.task_id, INT32_MAX - 2);
+  EXPECT_EQ(r1.slot, (INT32_MAX - 2) & (WINDOW_SIZE - 1));
+
+  auto r2 = allocator.alloc(0);
+  ASSERT_FALSE(r2.failed());
+  EXPECT_EQ(r2.task_id, INT32_MAX - 1);
+
+  auto r3 = allocator.alloc(0);
+  ASSERT_FALSE(r3.failed());
+  EXPECT_EQ(r3.task_id, INT32_MAX);
+  EXPECT_GE(r3.slot, 0);
+  EXPECT_LT(r3.slot, WINDOW_SIZE);
+}
diff --git a/tests/ut/cpp/a2a3/test_task_state.cpp b/tests/ut/cpp/a2a3/test_task_state.cpp
new file mode 100644
index 000000000..7c468a9e7
--- /dev/null
+++ b/tests/ut/cpp/a2a3/test_task_state.cpp
@@ -0,0 +1,201 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Unit tests for PTO2TaskSlotState lifecycle through PTO2SchedulerState API.
+ *
+ * These tests drive state transitions via src methods (release_fanin,
+ * on_subtask_complete, check_and_handle_consumed) rather than manually
+ * operating atomic fields. For concurrent exactly-once semantics of
+ * fanin/subtask/fanout, see test_scheduler_state.cpp which already
+ * covers those paths via the same API.
+ *
+ * This file focuses on:
+ * - Full lifecycle through src API
+ * - Non-profiling ready path behavior (task_state stays PENDING)
+ * - Double subtask completion (counter-model weakness)
+ */
+
+#include <gtest/gtest.h>
+#include <atomic>
+#include <cstdint>
+#include <cstring>
+#include <thread>
+#include "scheduler/pto_scheduler.h"
+
+class TaskStateTest : public ::testing::Test {
+protected:
+  PTO2SchedulerState sched;
+  PTO2SharedMemoryHandle *sm_handle = nullptr;
+
+  void SetUp() override {
+    sm_handle = pto2_sm_create_default();
+    ASSERT_NE(sm_handle, nullptr);
+    bool ok = pto2_scheduler_init(&sched, sm_handle->header);
+    ASSERT_TRUE(ok);
+  }
+
+  void TearDown() override {
+    pto2_scheduler_destroy(&sched);
+    if (sm_handle) {
+      pto2_sm_destroy(sm_handle);
+    }
+  }
+
+  void init_slot(PTO2TaskSlotState &slot, PTO2TaskState state, int32_t fanin_count, int32_t fanout_count) {
+    memset(&slot, 0, sizeof(slot));
+    slot.task_state.store(state);
+    slot.fanin_count = fanin_count;
+    slot.fanin_refcount.store(0);
+    slot.fanout_count = fanout_count;
+    slot.fanout_refcount.store(0);
+    slot.fanout_lock.store(0);
+    slot.fanout_head = nullptr;
+    slot.ring_id = 0;
+    slot.active_mask = PTO2_SUBTASK_MASK_AIC;
+    slot.completed_subtasks.store(0);
+    slot.total_required_subtasks = 1;
+    slot.logical_block_num = 1;
+  }
+};
+
+// =============================================================================
+// Full lifecycle through src API: PENDING -> (fanin) -> READY-equivalent
+// -> (subtask) -> COMPLETED -> (fanout) -> CONSUMED
+// =============================================================================
+TEST_F(TaskStateTest, FullLifecycleThroughAPI) {
+  alignas(64) PTO2TaskSlotState slot;
+  init_slot(slot, PTO2_TASK_PENDING, 1, 1);
+  slot.total_required_subtasks = 1;
+  slot.completed_subtasks.store(0);
+
+  // Fanin satisfied -> task becomes ready
+  bool ready = sched.release_fanin_and_check_ready(slot);
+  EXPECT_TRUE(ready);
+
+  // Subtask completes -> task done
+  bool done = sched.on_subtask_complete(slot);
+  EXPECT_TRUE(done);
+
+  // Manually transition to COMPLETED (normally done by scheduler dispatch loop)
+  slot.task_state.store(PTO2_TASK_COMPLETED, std::memory_order_release);
+
+  // Fanout released -> CONSUMED
+  sched.release_producer(slot);
+  EXPECT_EQ(slot.task_state.load(), PTO2_TASK_CONSUMED);
+}
+
+// =============================================================================
+// Non-profiling release_fanin does not CAS task_state to READY.
+//
+// Readiness is determined solely by fanin_refcount reaching fanin_count.
+// task_state stays PENDING after the non-profiling ready path. This is
+// correct by design -- the profiling overload adds the CAS only to count
+// atomic operations.
+// =============================================================================
+TEST_F(TaskStateTest, NonProfilingReadyPathStaysPending) {
+  alignas(64) PTO2TaskSlotState slot;
+  init_slot(slot, PTO2_TASK_PENDING, 1, 1);
+
+  bool ready = sched.release_fanin_and_check_ready(slot);
+  ASSERT_TRUE(ready) << "Task should be detected as ready via refcount";
+
+  // task_state remains PENDING -- this is correct by design.
+  EXPECT_EQ(slot.task_state.load(), PTO2_TASK_PENDING)
+    << "Non-profiling path intentionally does not transition task_state to READY";
+}
+
+// =============================================================================
+// Multi-fanin: partial release does not trigger ready
+// =============================================================================
+TEST_F(TaskStateTest, MultiFaninPartialNotReady) {
+  alignas(64) PTO2TaskSlotState slot;
+  init_slot(slot, PTO2_TASK_PENDING, 3, 1);
+
+  EXPECT_FALSE(sched.release_fanin_and_check_ready(slot));
+  EXPECT_FALSE(sched.release_fanin_and_check_ready(slot));
+  EXPECT_TRUE(sched.release_fanin_and_check_ready(slot));
+}
+
+// =============================================================================
+// Concurrent fanin: exactly one thread detects ready (via src API)
+// =============================================================================
+TEST_F(TaskStateTest, ConcurrentFaninExactlyOneReady) {
+  constexpr int ROUNDS = 500;
+
+  for (int round = 0; round < ROUNDS; round++) {
+    alignas(64) PTO2TaskSlotState slot;
+    init_slot(slot, PTO2_TASK_PENDING, 3, 1);
+    std::atomic<int> ready_count{0};
+
+    auto release = [&]() {
+      if (sched.release_fanin_and_check_ready(slot)) {
+        ready_count.fetch_add(1);
+      }
+    };
+
+    std::thread t1(release), t2(release), t3(release);
+    t1.join();
+    t2.join();
+    t3.join();
+
+    EXPECT_EQ(ready_count.load(), 1) << "Round " << round;
+  }
+}
+
+// =============================================================================
+// Concurrent subtask completion: exactly one thread sees done (via src API)
+// =============================================================================
+TEST_F(TaskStateTest, ConcurrentSubtaskCompletion) {
+  constexpr int ROUNDS = 500;
+
+  for (int round = 0; round < ROUNDS; round++) {
+    alignas(64) PTO2TaskSlotState slot;
+    init_slot(slot, PTO2_TASK_RUNNING, 1, 1);
+    slot.total_required_subtasks = 3;
+    slot.completed_subtasks.store(0);
+    std::atomic<int> done_count{0};
+
+    auto complete = [&]() {
+      if (sched.on_subtask_complete(slot)) {
+        done_count.fetch_add(1);
+      }
+    };
+
+    std::thread t1(complete), t2(complete), t3(complete);
+    t1.join();
+    t2.join();
+    t3.join();
+
+    EXPECT_EQ(done_count.load(), 1) << "Round " << round;
+    EXPECT_EQ(slot.completed_subtasks.load(), 3);
+  }
+}
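+
+// Counter vs. bitmask, worked through (illustrative; the bitmask variant is
+// the older design this suite contrasts against, not current code): with
+// total_required_subtasks = 2, duplicate completions {A, A} drive the counter
+// 0 -> 1 -> 2 == total, and the task reads as done even though subtask B
+// never ran. A bitmask model (mask |= bit(A)) would saturate at bit(A) and
+// keep the task open. The test below pins the counter behavior as a known
+// trade-off.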
+// =============================================================================
+// Double subtask completion (counter-model weakness).
+// With the counter model, double-completing the same subtask increments
+// completed_subtasks twice, potentially reaching total prematurely.
+// Unlike the old bitmask model, the counter cannot detect duplicates.
+// =============================================================================
+TEST_F(TaskStateTest, DoubleSubtaskCompletionCounterWeakness) {
+  alignas(64) PTO2TaskSlotState slot;
+  init_slot(slot, PTO2_TASK_RUNNING, 1, 1);
+  slot.total_required_subtasks = 2;
+  slot.completed_subtasks.store(0);
+
+  // First subtask completion
+  bool done1 = sched.on_subtask_complete(slot);
+  EXPECT_FALSE(done1) << "Single completion doesn't complete the task";
+
+  // Same subtask completes AGAIN (logic error at caller level)
+  bool done2 = sched.on_subtask_complete(slot);
+  EXPECT_TRUE(done2) << "Counter model: double-completion falsely triggers done";
+}
diff --git a/tests/ut/cpp/a2a3/test_tensormap.cpp b/tests/ut/cpp/a2a3/test_tensormap.cpp
new file mode 100644
index 000000000..10eef0317
--- /dev/null
+++ b/tests/ut/cpp/a2a3/test_tensormap.cpp
@@ -0,0 +1,551 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Unit tests for PTO2TensorMap from pto_tensormap.h / pto_tensormap.cpp
+ *
+ * Tests hash-table-based producer lookup with overlap detection:
+ * - Hash function distribution (golden-ratio multiplicative hash)
+ * - Insert / lookup / cleanup lifecycle
+ * - Overlap detection: fast-path (is_all_offset_zero) and slow-path (offsets)
+ * - Lazy invalidation (stale entries skipped, not truncated)
+ * - Multi-ring isolation in the same hash chain
+ * - Lookup returns all matches (no silent 16-result cap post-#669)
+ * - Entry pool allocation and free-list recycling
+ * - cleanup_retired correctness across task windows
+ */
+
+#include <gtest/gtest.h>
+
+#include <cstdint>
+#include <set>
+#include <vector>
+
+#include "pto_orchestration_api.h"
+#include "pto_tensormap.h"
+
+// =============================================================================
+// Helpers
+// =============================================================================
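+
+// Reference model of a golden-ratio multiplicative hash. A sketch under
+// assumptions: the real constant, shift, and bucket masking live in
+// pto_tensormap.cpp; only the "multiply, then keep the high bits" structure
+// is what the distribution tests below rely on.
+namespace {
+inline uint64_t model_golden_hash(uint64_t addr, uint64_t num_buckets) {
+  // 0x9E3779B97F4A7C15 ~= 2^64 / phi. The multiply mixes entropy into the
+  // high bits, so addresses differing only by aligned strides (e.g. 64KB)
+  // still spread across buckets; num_buckets must be a power of two >= 2.
+  return (addr * 0x9E3779B97F4A7C15ULL) >> (64 - __builtin_ctzll(num_buckets));
+}
+}  // namespace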
+
+// Test-local mirror of the old stack-buffered lookup result. PR #669 removed
+// PTO2LookupResult in favor of a callback-based API; these tests collect
+// matches into a vector-like struct so assertions remain readable.
+struct TestLookupResult {
+  struct Entry {
+    PTO2TensorMapEntry *entry;
+    OverlapStatus overlap_status;
+  };
+  std::vector<Entry> entries;
+  int count = 0;
+};
+
+static void run_lookup(PTO2TensorMap &tmap, const Tensor &tensor, TestLookupResult &out) {
+  tmap.lookup(tensor, [&](PTO2TensorMapEntry &e, OverlapStatus s) -> bool {
+    out.entries.push_back({&e, s});
+    out.count++;
+    return true;
+  });
+}
+
+static Tensor make_test_tensor(uint64_t addr, uint32_t shape0, uint32_t ndims = 1, int32_t version = 0) {
+  uint32_t shapes[RUNTIME_MAX_TENSOR_DIMS] = {shape0};
+  return make_tensor_external(reinterpret_cast<void *>(addr), shapes, ndims, DataType::FLOAT32, false, version);
+}
+
+static Tensor make_test_tensor_2d(uint64_t addr, uint32_t s0, uint32_t s1, int32_t version = 0) {
+  uint32_t shapes[RUNTIME_MAX_TENSOR_DIMS] = {s0, s1};
+  return make_tensor_external(reinterpret_cast<void *>(addr), shapes, 2, DataType::FLOAT32, false, version);
+}
+
+// =============================================================================
+// Fixture
+// =============================================================================
+
+class TensorMapTest : public ::testing::Test {
+protected:
+  static constexpr int32_t NUM_BUCKETS = 16;
+  static constexpr int32_t POOL_SIZE = 64;
+  static constexpr int32_t WINDOW_SIZE = 32;
+
+  PTO2TensorMap tmap{};
+
+  void SetUp() override {
+    int32_t window_sizes[PTO2_MAX_RING_DEPTH] = {WINDOW_SIZE, WINDOW_SIZE, WINDOW_SIZE, WINDOW_SIZE};
+    ASSERT_TRUE(tmap.init(NUM_BUCKETS, POOL_SIZE, window_sizes));
+  }
+
+  void TearDown() override { tmap.destroy(); }
+};
+
+// =============================================================================
+// Initialization
+// =============================================================================
+
+TEST_F(TensorMapTest, InitValidState) {
+  EXPECT_EQ(tmap.num_buckets, NUM_BUCKETS);
+  EXPECT_EQ(tmap.pool_size, POOL_SIZE);
+  EXPECT_EQ(tmap.next_entry_idx, 0);
+  EXPECT_EQ(tmap.free_num, 0);
+  EXPECT_EQ(tmap.valid_count(), 0);
+}
+
+TEST_F(TensorMapTest, InitRequiresPowerOfTwoBuckets) {
+  PTO2TensorMap bad{};
+  int32_t ws[PTO2_MAX_RING_DEPTH] = {8, 8, 8, 8};
+  EXPECT_FALSE(bad.init(3, 64, ws)) << "non-power-of-2 bucket count must fail";
+  EXPECT_FALSE(bad.init(7, 64, ws));
+  EXPECT_TRUE(bad.init(8, 64, ws));
+  bad.destroy();
+}
+
+// =============================================================================
+// Hash function
+// =============================================================================
+
+TEST_F(TensorMapTest, HashDeterministic) {
+  uint64_t addr = 0x1000;
+  EXPECT_EQ(tmap.hash(addr), tmap.hash(addr));
+}
+
+TEST_F(TensorMapTest, HashDistributesAlignedAddresses) {
+  std::set<int32_t> hit_buckets;
+  // Aligned addresses (64KB stride) should still distribute across buckets
+  for (uint64_t i = 0; i < 64; i++) {
+    uint64_t addr = i * 65536;
+    hit_buckets.insert(tmap.hash(addr));
+  }
+  // With golden-ratio hash, 64 aligned addresses across 16 buckets
+  // should hit at least 12 distinct buckets
+  EXPECT_GE(hit_buckets.size(), 12u) << "Aligned addresses must distribute well";
+}
+
+TEST_F(TensorMapTest, HashBoundedByBucketCount) {
+  for (uint64_t addr = 0; addr < 1000; addr++) {
+    EXPECT_LT(tmap.hash(addr), static_cast<int32_t>(NUM_BUCKETS));
+  }
+}
+
+// =============================================================================
+// Insert and lookup: basic
+// =============================================================================
+
+TEST_F(TensorMapTest, InsertThenLookupFindsProducer) {
+  Tensor t = make_test_tensor(0x1000, 256);
+  PTO2TaskId tid =
PTO2TaskId::make(0, 0); + tmap.insert(t, tid); + + TestLookupResult result; + run_lookup(tmap, t, result); + ASSERT_EQ(result.count, 1); + EXPECT_EQ(result.entries[0].entry->producer_task_id, tid); +} + +TEST_F(TensorMapTest, LookupEmptyReturnsZero) { + Tensor t = make_test_tensor(0x1000, 256); + TestLookupResult result; + run_lookup(tmap, t, result); + EXPECT_EQ(result.count, 0); +} + +TEST_F(TensorMapTest, InsertMultipleSameBuffer) { + Tensor t1 = make_test_tensor(0x1000, 256); + Tensor t2 = make_test_tensor(0x1000, 128); + PTO2TaskId tid1 = PTO2TaskId::make(0, 0); + PTO2TaskId tid2 = PTO2TaskId::make(0, 1); + + tmap.insert(t1, tid1); + tmap.insert(t2, tid2); + + TestLookupResult result; + run_lookup(tmap, t1, result); + // Both entries share same buffer_addr, so both should be found + EXPECT_EQ(result.count, 2); +} + +TEST_F(TensorMapTest, InsertDifferentBuffersNoCollision) { + Tensor t1 = make_test_tensor(0x1000, 256); + Tensor t2 = make_test_tensor(0x2000, 256); + tmap.insert(t1, PTO2TaskId::make(0, 0)); + tmap.insert(t2, PTO2TaskId::make(0, 1)); + + TestLookupResult r1; + run_lookup(tmap, t1, r1); + EXPECT_EQ(r1.count, 1); + EXPECT_EQ(r1.entries[0].entry->producer_task_id, PTO2TaskId::make(0, 0)); + + TestLookupResult r2; + run_lookup(tmap, t2, r2); + EXPECT_EQ(r2.count, 1); + EXPECT_EQ(r2.entries[0].entry->producer_task_id, PTO2TaskId::make(0, 1)); +} + +// ============================================================================= +// Overlap detection: fast path (is_all_offset_zero) +// ============================================================================= + +TEST_F(TensorMapTest, OverlapFastPathCovered) { + // Producer output: shape [256], consumer input: shape [512] + // Consumer covers producer -> COVERED + Tensor producer = make_test_tensor(0x1000, 256); + Tensor consumer = make_test_tensor(0x1000, 512); + tmap.insert(producer, PTO2TaskId::make(0, 0)); + + TestLookupResult result; + run_lookup(tmap, consumer, result); + ASSERT_EQ(result.count, 1); + EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::COVERED); +} + +TEST_F(TensorMapTest, OverlapFastPathOther) { + // Producer output: shape [512], consumer input: shape [256] + // Consumer does NOT cover producer -> OTHER + Tensor producer = make_test_tensor(0x1000, 512); + Tensor consumer = make_test_tensor(0x1000, 256); + tmap.insert(producer, PTO2TaskId::make(0, 0)); + + TestLookupResult result; + run_lookup(tmap, consumer, result); + ASSERT_EQ(result.count, 1); + EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::OTHER); +} + +TEST_F(TensorMapTest, OverlapFastPathExactMatch) { + Tensor t = make_test_tensor(0x1000, 256); + tmap.insert(t, PTO2TaskId::make(0, 0)); + + TestLookupResult result; + run_lookup(tmap, t, result); + ASSERT_EQ(result.count, 1); + EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::COVERED); +} + +// ============================================================================= +// Overlap detection: slow path (offsets via view) +// ============================================================================= + +TEST_F(TensorMapTest, OverlapSlowPathNoOverlap) { + // Producer writes [0..128), consumer reads [128..256) -> NO_OVERLAP + Tensor base = make_test_tensor_2d(0x1000, 256, 1); + uint32_t prod_shapes[] = {128, 1}; + uint32_t prod_offsets[] = {0, 0}; + Tensor producer = base.view(prod_shapes, prod_offsets); + + uint32_t con_shapes[] = {128, 1}; + uint32_t con_offsets[] = {128, 0}; + Tensor consumer = base.view(con_shapes, con_offsets); + + tmap.insert(producer, 
PTO2TaskId::make(0, 0)); + + TestLookupResult result; + run_lookup(tmap, consumer, result); + EXPECT_EQ(result.count, 0) << "Non-overlapping regions must return no results"; +} + +TEST_F(TensorMapTest, OverlapSlowPathPartialOverlap) { + // Producer writes [0..192), consumer reads [64..256) -> overlapping, OTHER + Tensor base = make_test_tensor_2d(0x1000, 256, 1); + uint32_t prod_shapes[] = {192, 1}; + uint32_t prod_offsets[] = {0, 0}; + Tensor producer = base.view(prod_shapes, prod_offsets); + + uint32_t con_shapes[] = {192, 1}; + uint32_t con_offsets[] = {64, 0}; + Tensor consumer = base.view(con_shapes, con_offsets); + + tmap.insert(producer, PTO2TaskId::make(0, 0)); + + TestLookupResult result; + run_lookup(tmap, consumer, result); + ASSERT_EQ(result.count, 1); + EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::OTHER); +} + +TEST_F(TensorMapTest, OverlapSlowPathCovered) { + // Producer writes [64..192), consumer reads [0..256) -> consumer covers producer + Tensor base = make_test_tensor_2d(0x1000, 256, 1); + uint32_t prod_shapes[] = {128, 1}; + uint32_t prod_offsets[] = {64, 0}; + Tensor producer = base.view(prod_shapes, prod_offsets); + + uint32_t con_shapes[] = {256, 1}; + uint32_t con_offsets[] = {0, 0}; + Tensor consumer = base.view(con_shapes, con_offsets); + + tmap.insert(producer, PTO2TaskId::make(0, 0)); + + TestLookupResult result; + run_lookup(tmap, consumer, result); + ASSERT_EQ(result.count, 1); + EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::COVERED); +} + +// ============================================================================= +// Version-based overlap detection +// ============================================================================= + +TEST_F(TensorMapTest, VersionMismatchReturnsOther) { + // Producer v0, consumer v1 -> always OTHER regardless of shape match + Tensor producer = make_test_tensor(0x1000, 256, 1, 0); + Tensor consumer = make_test_tensor(0x1000, 256, 1, 1); + + tmap.insert(producer, PTO2TaskId::make(0, 0)); + + TestLookupResult result; + run_lookup(tmap, consumer, result); + ASSERT_EQ(result.count, 1); + EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::OTHER); +} + +// ============================================================================= +// Lazy invalidation +// ============================================================================= + +TEST_F(TensorMapTest, StaleEntriesSkippedDuringLookup) { + Tensor t = make_test_tensor(0x1000, 256); + tmap.insert(t, PTO2TaskId::make(0, 0)); + tmap.insert(t, PTO2TaskId::make(0, 1)); + + // Advance validity to skip task 0 + tmap.sync_validity(0, 1); + + TestLookupResult result; + run_lookup(tmap, t, result); + ASSERT_EQ(result.count, 1); + EXPECT_EQ(result.entries[0].entry->producer_task_id, PTO2TaskId::make(0, 1)); +} + +TEST_F(TensorMapTest, StaleEntriesNotTruncatedAcrossRings) { + Tensor t = make_test_tensor(0x1000, 256); + // Ring 0, task 0 and Ring 1, task 0 -> same bucket + tmap.insert(t, PTO2TaskId::make(0, 0)); + tmap.insert(t, PTO2TaskId::make(1, 0)); + + // Invalidate ring 0 only + tmap.sync_validity(0, 1); + + TestLookupResult result; + run_lookup(tmap, t, result); + // Ring 1 task 0 still valid, ring 0 task 0 invalidated + ASSERT_EQ(result.count, 1); + EXPECT_EQ(result.entries[0].entry->producer_task_id, PTO2TaskId::make(1, 0)); +} + +// ============================================================================= +// cleanup_retired +// ============================================================================= + +TEST_F(TensorMapTest, 
CleanupRetiredRemovesEntriesForRetiredTasks) { + Tensor t = make_test_tensor(0x1000, 256); + tmap.insert(t, PTO2TaskId::make(0, 0)); + tmap.insert(t, PTO2TaskId::make(0, 1)); + tmap.insert(t, PTO2TaskId::make(0, 2)); + EXPECT_EQ(tmap.valid_count(), 3); + + // Cleanup tasks [0, 2) on ring 0 + tmap.cleanup_retired(0, 0, 2); + + EXPECT_EQ(tmap.valid_count(), 1); + + TestLookupResult result; + run_lookup(tmap, t, result); + ASSERT_EQ(result.count, 1); + EXPECT_EQ(result.entries[0].entry->producer_task_id, PTO2TaskId::make(0, 2)); +} + +TEST_F(TensorMapTest, CleanupRetiredPreservesOtherRings) { + Tensor t = make_test_tensor(0x1000, 256); + tmap.insert(t, PTO2TaskId::make(0, 0)); + tmap.insert(t, PTO2TaskId::make(1, 0)); + + tmap.cleanup_retired(0, 0, 1); + + EXPECT_EQ(tmap.valid_count(), 1); + + TestLookupResult result; + run_lookup(tmap, t, result); + ASSERT_EQ(result.count, 1); + EXPECT_EQ(result.entries[0].entry->producer_task_id, PTO2TaskId::make(1, 0)); +} + +TEST_F(TensorMapTest, CleanupRetiredFreesEntriesToPool) { + Tensor t = make_test_tensor(0x1000, 256); + tmap.insert(t, PTO2TaskId::make(0, 0)); + EXPECT_EQ(tmap.free_num, 0); + EXPECT_EQ(tmap.next_entry_idx, 1); + + tmap.cleanup_retired(0, 0, 1); + + EXPECT_EQ(tmap.free_num, 1) << "Cleaned entry should be in free list"; + + // New insert should reuse free entry instead of allocating fresh + tmap.insert(t, PTO2TaskId::make(0, 1)); + EXPECT_EQ(tmap.free_num, 0); + EXPECT_EQ(tmap.next_entry_idx, 1) << "Should reuse freed entry, not allocate new"; +} + +// ============================================================================= +// Multi-ring isolation +// ============================================================================= + +TEST_F(TensorMapTest, MultiRingIndependentLookup) { + Tensor t = make_test_tensor(0x1000, 256); + tmap.insert(t, PTO2TaskId::make(0, 5)); + tmap.insert(t, PTO2TaskId::make(1, 3)); + tmap.insert(t, PTO2TaskId::make(2, 7)); + + TestLookupResult result; + run_lookup(tmap, t, result); + EXPECT_EQ(result.count, 3); + + // Invalidate ring 0 up to task 6 and ring 2 up to task 8 + tmap.sync_validity(0, 6); + tmap.sync_validity(2, 8); + + TestLookupResult result2; + run_lookup(tmap, t, result2); + EXPECT_EQ(result2.count, 1); + EXPECT_EQ(result2.entries[0].entry->producer_task_id, PTO2TaskId::make(1, 3)); +} + +// ============================================================================= +// Lookup returns all matches (PR #669 removed the 16-slot cap) +// ============================================================================= + +TEST_F(TensorMapTest, LookupReturnsAllMatches) { + Tensor t = make_test_tensor(0x1000, 256); + // Insert 20 entries for the same buffer (was capped at 16 before #669) + for (int i = 0; i < 20; i++) { + tmap.insert(t, PTO2TaskId::make(0, i)); + } + + TestLookupResult result; + run_lookup(tmap, t, result); + EXPECT_EQ(result.count, 20) << "Lookup must return every overlapping entry, no silent cap"; +} + +// ============================================================================= +// Entry pool lifecycle +// ============================================================================= + +TEST_F(TensorMapTest, PoolExhaustionAsserts) { + // With pool_size=64, inserting 64 entries should work, 65th should fail + for (int i = 0; i < POOL_SIZE; i++) { + Tensor t = make_test_tensor(0x1000 + i * 0x100, 256); + tmap.insert(t, PTO2TaskId::make(0, i)); + } + EXPECT_EQ(tmap.next_entry_idx, POOL_SIZE); + EXPECT_EQ(tmap.free_num, 0); + + // 65th insert should trigger always_assert (pool 
overflow)
+  Tensor overflow = make_test_tensor(0x9000, 256);
+  EXPECT_THROW(tmap.insert(overflow, PTO2TaskId::make(0, POOL_SIZE)), std::runtime_error);
+}
+
+TEST_F(TensorMapTest, FreeListRecycling) {
+  Tensor t = make_test_tensor(0x1000, 256);
+  // Insert and cleanup 10 entries
+  for (int i = 0; i < 10; i++) {
+    tmap.insert(t, PTO2TaskId::make(0, i));
+  }
+  tmap.cleanup_retired(0, 0, 10);
+  EXPECT_EQ(tmap.free_num, 10);
+
+  // Re-insert should use free list
+  for (int i = 10; i < 20; i++) {
+    tmap.insert(t, PTO2TaskId::make(0, i));
+  }
+  EXPECT_EQ(tmap.free_num, 0);
+  EXPECT_EQ(tmap.next_entry_idx, 10) << "No new pool entries consumed when free list available";
+}
+
+// =============================================================================
+// Task chain integrity (per-task entry list)
+// =============================================================================
+
+TEST_F(TensorMapTest, PerTaskEntryListTracksMultipleOutputs) {
+  Tensor t1 = make_test_tensor(0x1000, 256);
+  Tensor t2 = make_test_tensor(0x2000, 128);
+  PTO2TaskId tid = PTO2TaskId::make(0, 5);
+
+  tmap.insert(t1, tid);
+  tmap.insert(t2, tid);
+  EXPECT_EQ(tmap.valid_count(), 2);
+
+  // Cleanup task 5 should remove both entries
+  tmap.cleanup_retired(0, 5, 6);
+  EXPECT_EQ(tmap.valid_count(), 0);
+  EXPECT_EQ(tmap.free_num, 2);
+}
+
+// =============================================================================
+// Bucket chain integrity (doubly-linked list)
+// =============================================================================
+
+TEST_F(TensorMapTest, RemoveMiddleEntryPreservesChain) {
+  Tensor t = make_test_tensor(0x1000, 256);
+  PTO2TaskId tid0 = PTO2TaskId::make(0, 0);
+  PTO2TaskId tid1 = PTO2TaskId::make(0, 1);
+  PTO2TaskId tid2 = PTO2TaskId::make(0, 2);
+
+  tmap.insert(t, tid0);
+  tmap.insert(t, tid1);
+  tmap.insert(t, tid2);
+
+  // Remove middle entry (task 1)
+  tmap.cleanup_retired(0, 1, 2);
+
+  TestLookupResult result;
+  run_lookup(tmap, t, result);
+  EXPECT_EQ(result.count, 2);
+
+  std::set<uint32_t> found_locals;
+  for (int i = 0; i < result.count; i++) {
+    found_locals.insert(result.entries[i].entry->producer_task_id.local());
+  }
+  EXPECT_TRUE(found_locals.count(0));
+  EXPECT_TRUE(found_locals.count(2));
+}
+
+// =============================================================================
+// PTO2TaskId encoding/decoding
+// =============================================================================
+
+TEST(TaskIdTest, MakeAndDecode) {
+  auto tid = PTO2TaskId::make(3, 42);
+  EXPECT_EQ(tid.ring(), 3);
+  EXPECT_EQ(tid.local(), 42u);
+}
+
+TEST(TaskIdTest, InvalidSentinel) {
+  auto inv = PTO2TaskId::invalid();
+  EXPECT_FALSE(inv.is_valid());
+  EXPECT_EQ(inv.raw, UINT64_MAX);
+}
+
+TEST(TaskIdTest, Equality) {
+  auto a = PTO2TaskId::make(1, 100);
+  auto b = PTO2TaskId::make(1, 100);
+  auto c = PTO2TaskId::make(2, 100);
+  EXPECT_EQ(a, b);
+  EXPECT_NE(a, c);
+}
+
+TEST(TaskIdTest, RingIdMaxValue) {
+  auto tid = PTO2TaskId::make(255, 0);
+  EXPECT_EQ(tid.ring(), 255);
+  EXPECT_EQ(tid.local(), 0u);
+}
+
+TEST(TaskIdTest, LocalIdMaxValue) {
+  auto tid = PTO2TaskId::make(0, UINT32_MAX);
+  EXPECT_EQ(tid.ring(), 0);
+  EXPECT_EQ(tid.local(), UINT32_MAX);
+}
diff --git a/tests/ut/cpp/a2a3/test_wiring.cpp b/tests/ut/cpp/a2a3/test_wiring.cpp
new file mode 100644
index 000000000..964e826f8
--- /dev/null
+++ b/tests/ut/cpp/a2a3/test_wiring.cpp
@@ -0,0 +1,448 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Unit tests for scheduler wiring and completion paths:
+ *
+ * 1. wire_task() — fanout wiring, early-finished detection,
+ *    fanin_count initialization, ready push
+ * 2. on_mixed_task_complete() — COMPLETED transition, fanout traversal,
+ *    consumer fanin release
+ * 3. on_task_release() — fanin traversal, producer release,
+ *    self-CONSUMED check
+ * 4. advance_ring_pointers() — CONSUMED slot scan, reset_for_reuse
+ *
+ * These tests exercise the core scheduling hot-paths that had zero coverage.
+ */
+
+#include <gtest/gtest.h>
+
+#include <atomic>
+#include <cstdint>
+#include <cstring>
+#include <thread>
+
+#include "scheduler/pto_scheduler.h"
+
+// =============================================================================
+// Fixture: sets up a scheduler with shared memory and provides helpers
+// =============================================================================
+
+class WiringTest : public ::testing::Test {
+protected:
+  PTO2SchedulerState sched{};
+  PTO2SharedMemoryHandle *sm_handle = nullptr;
+
+  void SetUp() override {
+    sm_handle = pto2_sm_create_default();
+    ASSERT_NE(sm_handle, nullptr);
+    bool ok = pto2_scheduler_init(&sched, sm_handle->header);
+    ASSERT_TRUE(ok);
+  }
+
+  void TearDown() override {
+    pto2_scheduler_destroy(&sched);
+    if (sm_handle) {
+      pto2_sm_destroy(sm_handle);
+    }
+  }
+
+  // Initialize a slot for testing wiring/completion
+  void init_slot(
+    PTO2TaskSlotState &slot, PTO2TaskState state, int32_t fanin_count, int32_t fanout_count, uint8_t ring_id = 0
+  ) {
+    memset(&slot, 0, sizeof(slot));
+    slot.task_state.store(state);
+    slot.fanin_count = fanin_count;
+    slot.fanin_refcount.store(0);
+    slot.fanout_count = fanout_count;
+    slot.fanout_refcount.store(0);
+    slot.fanout_lock.store(0);
+    slot.fanout_head = nullptr;
+    slot.ring_id = ring_id;
+    slot.active_mask = PTO2_SUBTASK_MASK_AIC;
+    slot.completed_subtasks.store(0);
+    slot.total_required_subtasks = 1;
+    slot.logical_block_num = 1;
+    slot.dep_pool_mark = 0;
+  }
+};
+
+// =============================================================================
+// wire_task: no fanin (independent task)
+// =============================================================================
+
+TEST_F(WiringTest, WireTaskNoFaninBecomesReady) {
+  // A task with 0 actual fanins should immediately be pushed to ready queue
+  alignas(64) PTO2TaskSlotState task_slot;
+  alignas(64) PTO2TaskPayload payload;
+  memset(&payload, 0, sizeof(payload));
+  PTO2TaskDescriptor desc{};
+
+  init_slot(task_slot, PTO2_TASK_PENDING, 0, 1);
+  payload.fanin_actual_count = 0;
+  task_slot.payload = &payload;
+  task_slot.task = &desc;
+
+  auto &rss = sched.ring_sched_states[0];
+  sched.wire_task(rss, &task_slot, 0);
+
+  // fanin_count set to 0 + 1 = 1 (the wiring "+1" sentinel)
+  EXPECT_EQ(task_slot.fanin_count, 1);
+  // fanin_refcount should be 1 (the +1 from no-fanin path)
+  EXPECT_EQ(task_slot.fanin_refcount.load(), 1);
+
+  // Task should be in ready queue
+  PTO2ResourceShape shape = pto2_active_mask_to_shape(task_slot.active_mask);
+  auto *popped = sched.ready_queues[static_cast<size_t>(shape)].pop();
+  EXPECT_EQ(popped, &task_slot);
+}
+
+// =============================================================================
+// wire_task: with fanin, all producers already completed (early-finished)
+// =============================================================================
+
+TEST_F(WiringTest, WireTaskAllProducersEarlyFinished) {
+  alignas(64) PTO2TaskSlotState task_slot;
+  alignas(64) PTO2TaskSlotState producer_slots[2];
+  alignas(64) PTO2TaskPayload payload;
+  memset(&payload, 0, sizeof(payload));
+  PTO2TaskDescriptor desc{};
+
+  // Set up 2 producers that are already COMPLETED
+  for (int i = 0; i < 2; i++) {
+    init_slot(producer_slots[i], PTO2_TASK_COMPLETED, 1, 2);
+  }
+
+  // Consumer task with 2 fanins
+  init_slot(task_slot, PTO2_TASK_PENDING, 0, 1);
+  payload.fanin_actual_count = 2;
+  payload.fanin_inline_slot_states[0] = &producer_slots[0];
+  payload.fanin_inline_slot_states[1] = &producer_slots[1];
+
+  task_slot.payload = &payload;
+  task_slot.task = &desc;
+
+  auto &rss = sched.ring_sched_states[0];
+  sched.wire_task(rss, &task_slot, 2);
+
+  // fanin_count = 2 + 1 = 3
+  EXPECT_EQ(task_slot.fanin_count, 3);
+  // early_finished = 2, init_rc = 2 + 1 = 3, so refcount should hit fanin_count
+  EXPECT_GE(task_slot.fanin_refcount.load(), task_slot.fanin_count);
+
+  // Task should be in ready queue
+  PTO2ResourceShape shape = pto2_active_mask_to_shape(task_slot.active_mask);
+  auto *popped = sched.ready_queues[static_cast<size_t>(shape)].pop();
+  EXPECT_EQ(popped, &task_slot);
+}
+
+// =============================================================================
+// wire_task: with fanin, producers still pending (task NOT ready)
+// =============================================================================
+
+TEST_F(WiringTest, WireTaskProducersPendingTaskNotReady) {
+  alignas(64) PTO2TaskSlotState task_slot;
+  alignas(64) PTO2TaskSlotState producer_slots[2];
+  alignas(64) PTO2TaskPayload payload;
+  memset(&payload, 0, sizeof(payload));
+  PTO2TaskDescriptor desc{};
+
+  // Producers are RUNNING (not yet completed)
+  for (int i = 0; i < 2; i++) {
+    init_slot(producer_slots[i], PTO2_TASK_RUNNING, 1, 2);
+  }
+
+  init_slot(task_slot, PTO2_TASK_PENDING, 0, 1);
+  payload.fanin_actual_count = 2;
+  payload.fanin_inline_slot_states[0] = &producer_slots[0];
+  payload.fanin_inline_slot_states[1] = &producer_slots[1];
+  task_slot.payload = &payload;
+  task_slot.task = &desc;
+
+  auto &rss = sched.ring_sched_states[0];
+  sched.wire_task(rss, &task_slot, 2);
+
+  // fanin_count = 3 (2 + 1)
+  EXPECT_EQ(task_slot.fanin_count, 3);
+  // early_finished = 0, init_rc = 1 -> not ready
+  EXPECT_EQ(task_slot.fanin_refcount.load(), 1);
+  EXPECT_LT(task_slot.fanin_refcount.load(), task_slot.fanin_count);
+
+  // Ready queue should be empty
+  PTO2ResourceShape shape = pto2_active_mask_to_shape(task_slot.active_mask);
+  auto *popped = sched.ready_queues[static_cast<size_t>(shape)].pop();
+  EXPECT_EQ(popped, nullptr);
+
+  // Producers should have fanout_head pointing to task_slot
+  EXPECT_NE(producer_slots[0].fanout_head, nullptr);
+  EXPECT_EQ(producer_slots[0].fanout_head->slot_state, &task_slot);
+  EXPECT_NE(producer_slots[1].fanout_head, nullptr);
+  EXPECT_EQ(producer_slots[1].fanout_head->slot_state, &task_slot);
+}
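+
+// Worked accounting for the "+1 sentinel" (our reading of the three tests
+// above, not a spec): with N declared fanins, wire_task sets
+// fanin_count = N + 1 and seeds fanin_refcount with early_finished + 1,
+// where the +1 is wire_task's own edge. Readiness fires only when
+// refcount == N + 1, so it can trigger inside wire_task (every producer
+// already >= COMPLETED) or later via on_mixed_task_complete, but never
+// while the fanout chains are still half-built.
+//   N = 0:                count 1, rc 1 -> ready at wiring
+//   N = 2, both done:     count 3, rc 3 -> ready at wiring
+//   N = 2, both running:  count 3, rc 1 -> waits for two releases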
+
+// =============================================================================
+// wire_task: mixed early-finished and pending producers
+// =============================================================================
+
+TEST_F(WiringTest, WireTaskMixedProducerStates) {
+  alignas(64) PTO2TaskSlotState task_slot;
+  alignas(64) PTO2TaskSlotState producers[3];
+  alignas(64) PTO2TaskPayload payload;
+  memset(&payload, 0, sizeof(payload));
+  PTO2TaskDescriptor desc{};
+
+  init_slot(producers[0], PTO2_TASK_COMPLETED, 1, 2);  // early finished
+  init_slot(producers[1], PTO2_TASK_RUNNING, 1, 2);    // still running
+  init_slot(producers[2], PTO2_TASK_CONSUMED, 1, 2);   // early finished (>= COMPLETED)
+
+  init_slot(task_slot, PTO2_TASK_PENDING, 0, 1);
+  payload.fanin_actual_count = 3;
+  for (int i = 0; i < 3; i++) {
+    payload.fanin_inline_slot_states[i] = &producers[i];
+  }
+  task_slot.payload = &payload;
+  task_slot.task = &desc;
+
+  auto &rss = sched.ring_sched_states[0];
+  sched.wire_task(rss, &task_slot, 3);
+
+  // fanin_count = 4 (3 + 1)
+  EXPECT_EQ(task_slot.fanin_count, 4);
+  // early_finished = 2 (COMPLETED + CONSUMED), init_rc = 3
+  // Not yet 4 -> not ready (one producer still running)
+  EXPECT_EQ(task_slot.fanin_refcount.load(), 3);
+
+  // Only the running producer should have the consumer in its fanout chain
+  EXPECT_EQ(producers[0].fanout_head, nullptr);  // early finished, no dep entry added
+  EXPECT_NE(producers[1].fanout_head, nullptr);  // running, dep entry added
+  EXPECT_EQ(producers[2].fanout_head, nullptr);  // early finished
+}
+
+// =============================================================================
+// on_mixed_task_complete: notifies consumers via fanout chain
+// =============================================================================
+
+TEST_F(WiringTest, OnMixedTaskCompleteNotifiesConsumers) {
+  alignas(64) PTO2TaskSlotState producer;
+  alignas(64) PTO2TaskSlotState consumer1, consumer2;
+  alignas(64) PTO2TaskPayload prod_payload;
+  memset(&prod_payload, 0, sizeof(prod_payload));
+  PTO2TaskDescriptor desc{};
+
+  // Set up producer in RUNNING state with 2 consumers in fanout chain
+  init_slot(producer, PTO2_TASK_RUNNING, 1, 1);
+  producer.payload = &prod_payload;
+  producer.task = &desc;
+
+  // Consumer1: needs 1 more fanin to become ready
+  init_slot(consumer1, PTO2_TASK_PENDING, 2, 1);
+  consumer1.fanin_refcount.store(1);  // 1 of 2 satisfied
+  consumer1.active_mask = PTO2_SUBTASK_MASK_AIC;
+
+  // Consumer2: this release will make it ready
+  init_slot(consumer2, PTO2_TASK_PENDING, 2, 1);
+  consumer2.fanin_refcount.store(1);  // 1 of 2 satisfied
+  consumer2.active_mask = PTO2_SUBTASK_MASK_AIC;
+
+  // Build fanout chain: producer -> consumer2 -> consumer1
+  PTO2DepListEntry dep_entries[2];
+  dep_entries[0].slot_state = &consumer1;
+  dep_entries[0].next = nullptr;
+  dep_entries[1].slot_state = &consumer2;
+  dep_entries[1].next = &dep_entries[0];
+  producer.fanout_head = &dep_entries[1];
+
+  sched.on_mixed_task_complete(producer);
+
+  // Producer should be COMPLETED
+  EXPECT_EQ(producer.task_state.load(), PTO2_TASK_COMPLETED);
+
+  // Both consumers should have fanin_refcount incremented
+  EXPECT_EQ(consumer1.fanin_refcount.load(), 2);
+  EXPECT_EQ(consumer2.fanin_refcount.load(), 2);
+
+  // Both consumers should be ready (fanin_refcount == fanin_count)
+  PTO2ResourceShape shape = pto2_active_mask_to_shape(consumer1.active_mask);
+  auto *r1 = sched.ready_queues[static_cast<size_t>(shape)].pop();
+  auto *r2 = sched.ready_queues[static_cast<size_t>(shape)].pop();
+  EXPECT_TRUE((r1 == &consumer1 && r2 == &consumer2) || (r1 == &consumer2 && r2 == &consumer1));
+}
+
+// =============================================================================
+// on_task_release: releases producers via fanin traversal
+// =============================================================================
+
+TEST_F(WiringTest, OnTaskReleaseReleasesProducers) {
+  alignas(64) PTO2TaskSlotState task_slot;
+  alignas(64) PTO2TaskSlotState producers[2];
+  alignas(64) PTO2TaskPayload payload;
+  memset(&payload, 0, sizeof(payload));
+  PTO2TaskDescriptor desc{};
+
+  // 2 producers, each COMPLETED with fanout_count=1
+  for (int i = 0; i < 2; i++) {
+    init_slot(producers[i], PTO2_TASK_COMPLETED, 1, 1);
+  }
+
+  init_slot(task_slot, PTO2_TASK_COMPLETED, 3, 1);
+  payload.fanin_actual_count = 2;
+  payload.fanin_inline_slot_states[0] = &producers[0];
+  payload.fanin_inline_slot_states[1] = &producers[1];
+  // Need a valid fanin_spill_pool even though we don't spill
+  PTO2FaninPool dummy_pool{};
+  PTO2FaninSpillEntry dummy_entries[4];
+  std::atomic<int32_t> dummy_error{PTO2_ERROR_NONE};
+  dummy_pool.init(dummy_entries, 4, &dummy_error);
+  payload.fanin_spill_pool = &dummy_pool;
+  task_slot.payload = &payload;
+  task_slot.task = &desc;
+
+  int32_t fanin_count = sched.on_task_release(task_slot);
+  EXPECT_EQ(fanin_count, 2);
+
+  // Each producer should have fanout_refcount incremented
+  EXPECT_EQ(producers[0].fanout_refcount.load(), 1);
+  EXPECT_EQ(producers[1].fanout_refcount.load(), 1);
+
+  // Producers with fanout_refcount == fanout_count AND COMPLETED -> CONSUMED
+  EXPECT_EQ(producers[0].task_state.load(), PTO2_TASK_CONSUMED);
+  EXPECT_EQ(producers[1].task_state.load(), PTO2_TASK_CONSUMED);
+}
+
+// =============================================================================
+// advance_ring_pointers: scans CONSUMED slots, resets, advances last_alive
+// =============================================================================
+
+TEST_F(WiringTest, AdvanceRingPointersScansConsumed) {
+  auto &rss = sched.ring_sched_states[0];
+  auto *ring = rss.ring;
+
+  // Submit 3 tasks via flow control
+  ring->fc.current_task_index.store(3, std::memory_order_release);
+
+  // Mark all 3 as CONSUMED
+  for (int i = 0; i < 3; i++) {
+    auto &slot = ring->get_slot_state_by_task_id(i);
+    slot.task_state.store(PTO2_TASK_CONSUMED, std::memory_order_release);
+  }
+
+  EXPECT_EQ(rss.last_task_alive, 0);
+  rss.advance_ring_pointers();
+  EXPECT_EQ(rss.last_task_alive, 3);
+
+  // Verify SM was synced
+  EXPECT_EQ(ring->fc.last_task_alive.load(), 3);
+}
+
+TEST_F(WiringTest, AdvanceRingPointersStopsAtNonConsumed) {
+  auto &rss = sched.ring_sched_states[0];
+  auto *ring = rss.ring;
+
+  ring->fc.current_task_index.store(5, std::memory_order_release);
+
+  // Tasks 0,1 CONSUMED; task 2 COMPLETED (not consumed)
+  ring->get_slot_state_by_task_id(0).task_state.store(PTO2_TASK_CONSUMED);
+  ring->get_slot_state_by_task_id(1).task_state.store(PTO2_TASK_CONSUMED);
+  ring->get_slot_state_by_task_id(2).task_state.store(PTO2_TASK_COMPLETED);
+
+  rss.advance_ring_pointers();
+  EXPECT_EQ(rss.last_task_alive, 2) << "Should stop at first non-CONSUMED slot";
+}
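+
+// Reclaim contract, as this suite reads it from the advance tests above and
+// below: advance_ring_pointers() walks forward from last_task_alive toward
+// current_task_index, stops at the first slot that is not CONSUMED, calls
+// reset_for_reuse() on every slot it passes, and publishes the new
+// last_task_alive to shared memory so the orchestrator's flow-control
+// window reopens.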
+
+TEST_F(WiringTest, AdvanceRingPointersResetsSlots) {
+  auto &rss = sched.ring_sched_states[0];
+  auto *ring = rss.ring;
+
+  ring->fc.current_task_index.store(1, std::memory_order_release);
+
+  auto &slot = ring->get_slot_state_by_task_id(0);
+  slot.task_state.store(PTO2_TASK_CONSUMED);
+  slot.fanout_count = 5;
+  slot.fanin_refcount.store(3);
+  slot.fanout_refcount.store(2);
+  slot.completed_subtasks.store(1);
+
+  rss.advance_ring_pointers();
+
+  // After reset_for_reuse: fanout_count=1, fanin_refcount=0, etc.
+  EXPECT_EQ(slot.fanout_count, 1);
+  EXPECT_EQ(slot.fanin_refcount.load(), 0);
+  EXPECT_EQ(slot.fanout_refcount.load(), 0);
+  EXPECT_EQ(slot.completed_subtasks.load(), 0);
+  EXPECT_EQ(slot.fanout_head, nullptr);
+}
+
+// =============================================================================
+// drain_wiring_queue: pushes tasks through SPSC queue
+// =============================================================================
+
+TEST_F(WiringTest, DrainWiringQueueProcessesTasks) {
+  alignas(64) PTO2TaskSlotState task_slot;
+  alignas(64) PTO2TaskPayload payload;
+  memset(&payload, 0, sizeof(payload));
+  PTO2TaskDescriptor desc{};
+
+  init_slot(task_slot, PTO2_TASK_PENDING, 0, 1);
+  payload.fanin_actual_count = 0;
+  task_slot.payload = &payload;
+  task_slot.task = &desc;
+
+  // Push into wiring SPSC queue (orchestrator side)
+  ASSERT_TRUE(sched.wiring.queue.push(&task_slot));
+
+  // Drain (scheduler thread 0 side)
+  int wired = sched.drain_wiring_queue(true /* force_drain */);
+  EXPECT_EQ(wired, 1);
+
+  // Task should be ready
+  PTO2ResourceShape shape = pto2_active_mask_to_shape(task_slot.active_mask);
+  auto *popped = sched.ready_queues[static_cast<size_t>(shape)].pop();
+  EXPECT_EQ(popped, &task_slot);
+}
+
+TEST_F(WiringTest, DrainWiringQueueBackoffDefers) {
+  alignas(64) PTO2TaskSlotState task_slot;
+  alignas(64) PTO2TaskPayload payload;
+  memset(&payload, 0, sizeof(payload));
+  PTO2TaskDescriptor desc{};
+
+  init_slot(task_slot, PTO2_TASK_PENDING, 0, 1);
+  payload.fanin_actual_count = 0;
+  task_slot.payload = &payload;
+  task_slot.task = &desc;
+
+  sched.wiring.queue.push(&task_slot);
+
+  // Without force_drain, single item < BATCH_SIZE → backoff
+  sched.wiring.backoff_counter = 0;
+  int wired = sched.drain_wiring_queue(false);
+  EXPECT_EQ(wired, 0) << "Backoff should defer when queue < BATCH_SIZE";
+  EXPECT_EQ(sched.wiring.backoff_counter, 1);
+}
+
+TEST_F(WiringTest, DrainWiringQueueBackoffLimitForcesProcess) {
+  alignas(64) PTO2TaskSlotState task_slot;
+  alignas(64) PTO2TaskPayload payload;
+  memset(&payload, 0, sizeof(payload));
+  PTO2TaskDescriptor desc{};
+
+  init_slot(task_slot, PTO2_TASK_PENDING, 0, 1);
+  payload.fanin_actual_count = 0;
+  task_slot.payload = &payload;
+  task_slot.task = &desc;
+
+  sched.wiring.queue.push(&task_slot);
+
+  // Set backoff at limit → should process
+  sched.wiring.backoff_counter = PTO2SchedulerState::WiringState::BACKOFF_LIMIT;
+  int wired = sched.drain_wiring_queue(false);
+  EXPECT_EQ(wired, 1) << "Backoff limit reached should force processing";
+}
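+
+// Backoff policy pinned by the two tests above (BATCH_SIZE and BACKOFF_LIMIT
+// are the scheduler's own constants): drain_wiring_queue(false) defers while
+// the pending count is below BATCH_SIZE and backoff_counter is below
+// BACKOFF_LIMIT, bumping the counter each time; force_drain, a full batch,
+// or a saturated counter flushes immediately. The effect is to amortize
+// wiring work across polls without letting a lone task starve.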
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Unit tests for PTO2DepListPool from pto_ring_buffer.h
+ *
+ * Tests dependency list pool allocation, prepend chaining, overflow detection,
+ * tail advancement, and high-water mark tracking.
+ *
+ * Design contracts:
+ *
+ * - advance_tail(new_tail) only advances if new_tail > tail; it does
+ *   not validate new_tail <= top. This is a caller contract: new_tail
+ *   must be monotonically non-decreasing and bounded by top.
+ *
+ * - The list terminator is literal nullptr. base[0] is a normal pool entry;
+ *   init clearing it is incidental, not an invariant.
+ */
+
+#include <gtest/gtest.h>
+
+#include <atomic>
+#include <cstdint>
+#include <cstring>
+
+#include "pto_ring_buffer.h"
+
+// =============================================================================
+// Fixture
+// =============================================================================
+
+class DepListPoolTest : public ::testing::Test {
+protected:
+    static constexpr int32_t POOL_CAP = 8;
+    PTO2DepListEntry entries[POOL_CAP]{};
+    std::atomic<int32_t> error_code{PTO2_ERROR_NONE};
+    PTO2DepListPool pool{};
+
+    void SetUp() override {
+        std::memset(entries, 0, sizeof(entries));
+        error_code.store(PTO2_ERROR_NONE);
+        pool.init(entries, POOL_CAP, &error_code);
+    }
+};
+
+// =============================================================================
+// Normal path
+// =============================================================================
+
+TEST_F(DepListPoolTest, InitialState) {
+    EXPECT_EQ(pool.used(), 0);
+    EXPECT_EQ(pool.available(), POOL_CAP);
+}
+
+TEST_F(DepListPoolTest, SingleAlloc) {
+    PTO2DepListEntry *entry = pool.alloc();
+    ASSERT_NE(entry, nullptr);
+    EXPECT_EQ(pool.used(), 1);
+    EXPECT_EQ(pool.available(), POOL_CAP - 1);
+}
+
+TEST_F(DepListPoolTest, OverflowDetection) {
+    for (int i = 0; i < POOL_CAP; i++) {
+        PTO2DepListEntry *e = pool.alloc();
+        ASSERT_NE(e, nullptr) << "Unexpected failure at alloc " << i;
+    }
+    EXPECT_EQ(pool.used(), POOL_CAP);
+    EXPECT_EQ(pool.available(), 0);
+
+    PTO2DepListEntry *overflow = pool.alloc();
+    EXPECT_EQ(overflow, nullptr);
+    EXPECT_EQ(error_code.load(), PTO2_ERROR_DEP_POOL_OVERFLOW);
+}
+
+// Prepend builds LIFO linked list: verify each slot_state pointer.
+TEST_F(DepListPoolTest, PrependChainCorrectness) {
+    PTO2TaskSlotState slots[5]{};
+    PTO2DepListEntry *head = nullptr;
+
+    for (int i = 0; i < 5; i++) {
+        head = pool.prepend(head, &slots[i]);
+        ASSERT_NE(head, nullptr);
+    }
+
+    // LIFO order: head -> slots[4] -> slots[3] -> ... -> slots[0] -> nullptr.
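+    // Illustrative trace (entry names e0..e4 are hypothetical; only the
+    // links are asserted below):
+    //   prepend(nullptr, &slots[0]) -> e0 { slot_state=&slots[0], next=nullptr }
+    //   prepend(e0,      &slots[1]) -> e1 { slot_state=&slots[1], next=e0 }
+    //   ...
+    //   head == e4, so following `next` visits slots[4], slots[3], ..., slots[0].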
+ PTO2DepListEntry *cur = head; + for (int i = 4; i >= 0; i--) { + ASSERT_NE(cur, nullptr); + EXPECT_EQ(cur->slot_state, &slots[i]) << "Entry " << (4 - i) << " should point to slots[" << i << "]"; + cur = cur->next; + } + EXPECT_EQ(cur, nullptr) << "Chain should terminate with nullptr"; +} + +TEST_F(DepListPoolTest, AdvanceTail) { + for (int i = 0; i < 4; i++) { + pool.alloc(); + } + EXPECT_EQ(pool.used(), 4); + EXPECT_EQ(pool.available(), POOL_CAP - 4); + + pool.advance_tail(4); + EXPECT_EQ(pool.used(), 1); + EXPECT_EQ(pool.available(), POOL_CAP - 1); +} + +TEST_F(DepListPoolTest, AdvanceTailBackwardsNoop) { + pool.alloc(); + pool.alloc(); + pool.advance_tail(3); + int32_t used_after = pool.used(); + + pool.advance_tail(2); + EXPECT_EQ(pool.used(), used_after); + + pool.advance_tail(3); + EXPECT_EQ(pool.used(), used_after); +} + +TEST_F(DepListPoolTest, HighWaterAccuracy) { + for (int i = 0; i < 5; i++) + pool.alloc(); + EXPECT_EQ(pool.high_water, 5); + + pool.advance_tail(4); + EXPECT_EQ(pool.high_water, 5) << "High water never decreases"; + + for (int i = 0; i < 3; i++) + pool.alloc(); + EXPECT_GE(pool.high_water, 5); +} + +// ============================================================================= +// Boundary conditions +// ============================================================================= + +// Prepend chain integrity under pool exhaustion: chain must be walkable. +TEST_F(DepListPoolTest, PrependUnderExhaustion) { + PTO2TaskSlotState slots[POOL_CAP]{}; + PTO2DepListEntry *head = nullptr; + + int count = 0; + while (count < POOL_CAP + 5) { + PTO2DepListEntry *new_head = pool.prepend(head, &slots[count % POOL_CAP]); + if (!new_head) break; + head = new_head; + count++; + } + + int walk = 0; + PTO2DepListEntry *cur = head; + while (cur) { + walk++; + cur = cur->next; + if (walk > count + 1) { + FAIL() << "Chain has cycle -- walked more entries than allocated"; + break; + } + } + EXPECT_EQ(walk, count); +} diff --git a/tests/ut/cpp/a5/test_fanin_pool.cpp b/tests/ut/cpp/a5/test_fanin_pool.cpp new file mode 100644 index 000000000..29199ae2e --- /dev/null +++ b/tests/ut/cpp/a5/test_fanin_pool.cpp @@ -0,0 +1,311 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Unit tests for PTO2FaninPool and pto2_for_each_fanin_storage/slot_state + * from pto_ring_buffer.h / pto_ring_buffer.cpp + * + * Tests: + * 1. PTO2FaninPool — ring buffer allocation, overflow, tail advance, + * high-water tracking + * 2. 
pto2_for_each_fanin_storage — inline-only, spill without wrap, + * spill with wrap, callback early return + */ + +#include + +#include +#include +#include + +#include "pto_ring_buffer.h" +#include "pto_shared_memory.h" + +// ============================================================================= +// FaninPool fixture +// ============================================================================= + +class FaninPoolTest : public ::testing::Test { +protected: + static constexpr int32_t POOL_CAP = 32; + + std::vector entries; + std::atomic error_code{PTO2_ERROR_NONE}; + PTO2FaninPool pool{}; + + void SetUp() override { + entries.assign(POOL_CAP, PTO2FaninSpillEntry{nullptr}); + error_code.store(PTO2_ERROR_NONE); + pool.init(entries.data(), POOL_CAP, &error_code); + } +}; + +// ============================================================================= +// FaninPool: basic operations +// ============================================================================= + +TEST_F(FaninPoolTest, InitialState) { + EXPECT_EQ(pool.used(), 0); + EXPECT_EQ(pool.available(), POOL_CAP); + EXPECT_EQ(pool.top, 1); + EXPECT_EQ(pool.tail, 1); + EXPECT_EQ(pool.high_water, 0); +} + +TEST_F(FaninPoolTest, AllocReturnsCorrectModuloIndex) { + // First alloc at index top%cap = 1%32 = 1 + auto *e1 = pool.alloc(); + EXPECT_EQ(e1, &entries[1]); + + auto *e2 = pool.alloc(); + EXPECT_EQ(e2, &entries[2]); +} + +TEST_F(FaninPoolTest, AllocFillsPool) { + for (int i = 0; i < POOL_CAP; i++) { + auto *e = pool.alloc(); + ASSERT_NE(e, nullptr) << "Alloc failed at i=" << i; + } + EXPECT_EQ(pool.used(), POOL_CAP); + EXPECT_EQ(pool.available(), 0); +} + +TEST_F(FaninPoolTest, OverflowReturnsNullptr) { + for (int i = 0; i < POOL_CAP; i++) { + pool.alloc(); + } + auto *overflow = pool.alloc(); + EXPECT_EQ(overflow, nullptr); + EXPECT_EQ(error_code.load(), PTO2_ERROR_DEP_POOL_OVERFLOW); +} + +TEST_F(FaninPoolTest, AdvanceTailFreesSpace) { + for (int i = 0; i < 10; i++) { + pool.alloc(); + } + EXPECT_EQ(pool.used(), 10); + + pool.advance_tail(pool.tail + 5); + EXPECT_EQ(pool.used(), 5); + EXPECT_EQ(pool.available(), POOL_CAP - 5); +} + +TEST_F(FaninPoolTest, AdvanceTailBackwardsIsNoop) { + for (int i = 0; i < 10; i++) { + pool.alloc(); + } + int32_t old_tail = pool.tail; + pool.advance_tail(old_tail - 1); + EXPECT_EQ(pool.tail, old_tail); + EXPECT_EQ(pool.used(), 10); +} + +TEST_F(FaninPoolTest, HighWaterNeverDecreases) { + for (int i = 0; i < 10; i++) { + pool.alloc(); + } + EXPECT_EQ(pool.high_water, 10); + + pool.advance_tail(pool.tail + 5); + EXPECT_EQ(pool.high_water, 10) << "high_water must never decrease"; +} + +TEST_F(FaninPoolTest, WrapAroundAllocation) { + // Fill and drain, then fill again to wrap + for (int i = 0; i < POOL_CAP; i++) { + pool.alloc(); + } + pool.advance_tail(pool.top); + EXPECT_EQ(pool.used(), 0); + + // New allocations wrap around + for (int i = 0; i < 5; i++) { + auto *e = pool.alloc(); + ASSERT_NE(e, nullptr); + // Verify modulo indexing + int32_t expected_idx = (pool.top - 1) % POOL_CAP; + EXPECT_EQ(e, &entries[expected_idx]); + } + EXPECT_EQ(pool.used(), 5); +} + +// ============================================================================= +// pto2_for_each_fanin_storage: inline only +// ============================================================================= + +class ForEachFaninTest : public ::testing::Test { +protected: + static constexpr int32_t POOL_CAP = 32; + + std::vector spill_entries; + std::atomic error_code{PTO2_ERROR_NONE}; + PTO2FaninPool spill_pool{}; + + alignas(64) 
PTO2TaskSlotState slots[64]; + + void SetUp() override { + spill_entries.assign(POOL_CAP, PTO2FaninSpillEntry{nullptr}); + error_code.store(PTO2_ERROR_NONE); + spill_pool.init(spill_entries.data(), POOL_CAP, &error_code); + memset(slots, 0, sizeof(slots)); + } +}; + +TEST_F(ForEachFaninTest, InlineOnlyVoid) { + PTO2TaskSlotState *inline_slots[PTO2_FANIN_INLINE_CAP] = {}; + for (int i = 0; i < 5; i++) { + inline_slots[i] = &slots[i]; + } + + std::vector visited; + pto2_for_each_fanin_storage(inline_slots, 5, 0, spill_pool, [&](PTO2TaskSlotState *s) { + visited.push_back(s); + }); + + ASSERT_EQ(visited.size(), 5u); + for (int i = 0; i < 5; i++) { + EXPECT_EQ(visited[i], &slots[i]); + } +} + +TEST_F(ForEachFaninTest, InlineOnlyBoolEarlyReturn) { + PTO2TaskSlotState *inline_slots[PTO2_FANIN_INLINE_CAP] = {}; + for (int i = 0; i < 5; i++) { + inline_slots[i] = &slots[i]; + } + + int count = 0; + bool result = pto2_for_each_fanin_storage(inline_slots, 5, 0, spill_pool, [&](PTO2TaskSlotState *) -> bool { + count++; + return count < 3; // stop after 3rd + }); + + EXPECT_FALSE(result) << "Should return false when callback returns false"; + EXPECT_EQ(count, 3); +} + +TEST_F(ForEachFaninTest, InlineOnlyBoolAllTrue) { + PTO2TaskSlotState *inline_slots[PTO2_FANIN_INLINE_CAP] = {}; + for (int i = 0; i < 3; i++) { + inline_slots[i] = &slots[i]; + } + + bool result = pto2_for_each_fanin_storage(inline_slots, 3, 0, spill_pool, [](PTO2TaskSlotState *) -> bool { + return true; + }); + + EXPECT_TRUE(result); +} + +TEST_F(ForEachFaninTest, ZeroFanin) { + PTO2TaskSlotState *inline_slots[PTO2_FANIN_INLINE_CAP] = {}; + int count = 0; + pto2_for_each_fanin_storage(inline_slots, 0, 0, spill_pool, [&](PTO2TaskSlotState *) { + count++; + }); + EXPECT_EQ(count, 0); +} + +// ============================================================================= +// pto2_for_each_fanin_storage: spill without wrap +// ============================================================================= + +TEST_F(ForEachFaninTest, SpillNoWrap) { + // 18 fanins = 16 inline + 2 spill + PTO2TaskSlotState *inline_slots[PTO2_FANIN_INLINE_CAP] = {}; + for (int i = 0; i < PTO2_FANIN_INLINE_CAP; i++) { + inline_slots[i] = &slots[i]; + } + + // Allocate 2 spill entries + auto *s0 = spill_pool.alloc(); + int32_t spill_start = spill_pool.top - 1; + s0->slot_state = &slots[16]; + auto *s1 = spill_pool.alloc(); + s1->slot_state = &slots[17]; + + std::vector visited; + pto2_for_each_fanin_storage(inline_slots, 18, spill_start, spill_pool, [&](PTO2TaskSlotState *s) { + visited.push_back(s); + }); + + ASSERT_EQ(visited.size(), 18u); + for (int i = 0; i < 16; i++) { + EXPECT_EQ(visited[i], &slots[i]) << "Inline slot " << i; + } + EXPECT_EQ(visited[16], &slots[16]); + EXPECT_EQ(visited[17], &slots[17]); +} + +// ============================================================================= +// pto2_for_each_fanin_storage: spill with wrap +// ============================================================================= + +TEST_F(ForEachFaninTest, SpillWithWrap) { + // Push pool near end so spill wraps around + // Pool cap = 32, advance top to 30 so next alloc is at index 30 + spill_pool.top = POOL_CAP - 2; + spill_pool.tail = POOL_CAP - 2; + + PTO2TaskSlotState *inline_slots[PTO2_FANIN_INLINE_CAP] = {}; + for (int i = 0; i < PTO2_FANIN_INLINE_CAP; i++) { + inline_slots[i] = &slots[i]; + } + + // 4 spill entries: indices 30, 31, 0, 1 (wraps around) + int32_t spill_start = spill_pool.top; + for (int i = 0; i < 4; i++) { + auto *e = spill_pool.alloc(); + 
ASSERT_NE(e, nullptr); + e->slot_state = &slots[16 + i]; + } + + std::vector visited; + pto2_for_each_fanin_storage(inline_slots, 20, spill_start, spill_pool, [&](PTO2TaskSlotState *s) { + visited.push_back(s); + }); + + ASSERT_EQ(visited.size(), 20u); + // Inline + for (int i = 0; i < 16; i++) { + EXPECT_EQ(visited[i], &slots[i]); + } + // Spill (wrapped) + for (int i = 0; i < 4; i++) { + EXPECT_EQ(visited[16 + i], &slots[16 + i]); + } +} + +// ============================================================================= +// pto2_for_each_fanin_storage: spill with bool callback early return +// ============================================================================= + +TEST_F(ForEachFaninTest, SpillBoolEarlyReturnInSpillRegion) { + PTO2TaskSlotState *inline_slots[PTO2_FANIN_INLINE_CAP] = {}; + for (int i = 0; i < PTO2_FANIN_INLINE_CAP; i++) { + inline_slots[i] = &slots[i]; + } + + int32_t spill_start = spill_pool.top; + for (int i = 0; i < 4; i++) { + auto *e = spill_pool.alloc(); + e->slot_state = &slots[16 + i]; + } + + int count = 0; + bool result = + pto2_for_each_fanin_storage(inline_slots, 20, spill_start, spill_pool, [&](PTO2TaskSlotState *) -> bool { + count++; + return count < 17; // stop on 17th (first spill entry) + }); + + EXPECT_FALSE(result); + EXPECT_EQ(count, 17); +} diff --git a/tests/ut/cpp/a5/test_ready_queue.cpp b/tests/ut/cpp/a5/test_ready_queue.cpp new file mode 100644 index 000000000..1a139a8f1 --- /dev/null +++ b/tests/ut/cpp/a5/test_ready_queue.cpp @@ -0,0 +1,446 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Unit tests for PTO2ReadyQueue and PTO2LocalReadyBuffer from pto_scheduler.h + * + * Tests the lock-free bounded MPMC queue (Vyukov design) and the thread-local + * ready buffer used for local-first dispatch optimization. + * + * Design contracts: + * + * - Sequence wrap: The sequence counter is int64_t. Practically unreachable + * wrap at 2^63; two's-complement comparisons still work. + * + * - Pop fast-path: pop() checks enqueue_pos == dequeue_pos as an early-empty + * hint. A push between the hint and the CAS can race; standard TOCTOU of + * Vyukov MPMC, acceptable. + * + * - Push near full: All producers that see a full slot return false + * simultaneously even if a pop happens right after. Acceptable + * back-pressure. + * + * - size() relaxed ordering: size() reads both positions with + * memory_order_relaxed and is a hint, not a snapshot. If a stale read + * produces d > e the guard returns 0. + * + * - LocalReadyBuffer LIFO dispatch: try_push appends at count++, pop returns + * slot_states[--count]. LIFO reversal is intentional for cache-locality + * when a producer immediately dispatches its fanout. 
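+ *
+ * Local-first dispatch sketch (hand-written illustration of the contract;
+ * the real dispatch loop lives in pto_scheduler.cpp, and `global_queue` is
+ * a stand-in name, not an actual field):
+ *
+ *   PTO2TaskSlotState *backing[8];
+ *   PTO2LocalReadyBuffer local;
+ *   local.reset(backing, 8);
+ *   if (!local.try_push(successor)) {  // prefer the thread-local LIFO buffer
+ *     global_queue.push(successor);    // overflow to the shared MPMC queue
+ *   }
+ *   PTO2TaskSlotState *next = local.pop();  // hottest successor first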
+ */ + +#include + +#include +#include +#include +#include +#include + +#include "scheduler/pto_scheduler.h" + +// ============================================================================= +// ReadyQueue: Single-threaded fixture (malloc-backed) +// ============================================================================= + +class ReadyQueueTest : public ::testing::Test { +protected: + static constexpr uint64_t CAPACITY = 16; // Power of 2 + + PTO2ReadyQueue queue; + + void SetUp() override { ASSERT_TRUE(pto2_ready_queue_init(&queue, CAPACITY)); } + + void TearDown() override { pto2_ready_queue_destroy(&queue); } +}; + +// ============================================================================= +// Normal path +// ============================================================================= + +TEST_F(ReadyQueueTest, EmptyPopReturnsNullptr) { EXPECT_EQ(queue.pop(), nullptr); } + +TEST_F(ReadyQueueTest, SinglePushPop) { + PTO2TaskSlotState item; + ASSERT_TRUE(queue.push(&item)); + + PTO2TaskSlotState *result = queue.pop(); + EXPECT_EQ(result, &item); +} + +TEST_F(ReadyQueueTest, FIFOOrdering) { + PTO2TaskSlotState a, b, c; + + ASSERT_TRUE(queue.push(&a)); + ASSERT_TRUE(queue.push(&b)); + ASSERT_TRUE(queue.push(&c)); + + EXPECT_EQ(queue.pop(), &a); + EXPECT_EQ(queue.pop(), &b); + EXPECT_EQ(queue.pop(), &c); + EXPECT_EQ(queue.pop(), nullptr); +} + +TEST_F(ReadyQueueTest, QueueFullReturnsFalse) { + std::vector items(CAPACITY); + + for (uint64_t i = 0; i < CAPACITY; i++) { + ASSERT_TRUE(queue.push(&items[i])); + } + + PTO2TaskSlotState extra; + EXPECT_FALSE(queue.push(&extra)); +} + +TEST_F(ReadyQueueTest, SlotReuseAfterFullDrain) { + std::vector items(CAPACITY); + + for (uint64_t i = 0; i < CAPACITY; i++) { + ASSERT_TRUE(queue.push(&items[i])); + } + for (uint64_t i = 0; i < CAPACITY; i++) { + EXPECT_EQ(queue.pop(), &items[i]); + } + EXPECT_EQ(queue.pop(), nullptr); + + for (uint64_t i = 0; i < CAPACITY; i++) { + ASSERT_TRUE(queue.push(&items[i])); + } + for (uint64_t i = 0; i < CAPACITY; i++) { + EXPECT_EQ(queue.pop(), &items[i]); + } + EXPECT_EQ(queue.pop(), nullptr); +} + +TEST_F(ReadyQueueTest, PushBatchThenIndividualPop) { + constexpr int BATCH_SIZE = 5; + PTO2TaskSlotState items[BATCH_SIZE]; + PTO2TaskSlotState *ptrs[BATCH_SIZE]; + for (int i = 0; i < BATCH_SIZE; i++) { + ptrs[i] = &items[i]; + } + + queue.push_batch(ptrs, BATCH_SIZE); + + for (int i = 0; i < BATCH_SIZE; i++) { + EXPECT_EQ(queue.pop(), &items[i]); + } + EXPECT_EQ(queue.pop(), nullptr); +} + +TEST_F(ReadyQueueTest, PushBatchZeroIsNoop) { + queue.push_batch(nullptr, 0); + + EXPECT_EQ(queue.size(), 0u); + EXPECT_EQ(queue.pop(), nullptr); +} + +TEST_F(ReadyQueueTest, PopBatchReturnsFive) { + constexpr int PUSH_COUNT = 10; + PTO2TaskSlotState items[PUSH_COUNT]; + + for (int i = 0; i < PUSH_COUNT; i++) { + ASSERT_TRUE(queue.push(&items[i])); + } + + PTO2TaskSlotState *out[5]; + int popped = queue.pop_batch(out, 5); + EXPECT_EQ(popped, 5); + + for (int i = 0; i < 5; i++) { + EXPECT_EQ(out[i], &items[i]); + } +} + +TEST_F(ReadyQueueTest, PopBatchPartial) { + constexpr int PUSH_COUNT = 3; + PTO2TaskSlotState items[PUSH_COUNT]; + + for (int i = 0; i < PUSH_COUNT; i++) { + ASSERT_TRUE(queue.push(&items[i])); + } + + PTO2TaskSlotState *out[5]; + int popped = queue.pop_batch(out, 5); + EXPECT_EQ(popped, PUSH_COUNT); + + for (int i = 0; i < PUSH_COUNT; i++) { + EXPECT_EQ(out[i], &items[i]); + } +} + +TEST_F(ReadyQueueTest, PopBatchEmpty) { + PTO2TaskSlotState *out[5]; + int popped = queue.pop_batch(out, 5); + 
EXPECT_EQ(popped, 0); +} + +TEST_F(ReadyQueueTest, SizeAccuracy) { + EXPECT_EQ(queue.size(), 0u); + + PTO2TaskSlotState items[8]; + + queue.push(&items[0]); + EXPECT_EQ(queue.size(), 1u); + + queue.push(&items[1]); + queue.push(&items[2]); + EXPECT_EQ(queue.size(), 3u); + + queue.pop(); + EXPECT_EQ(queue.size(), 2u); + + queue.pop(); + queue.pop(); + EXPECT_EQ(queue.size(), 0u); + + for (int i = 0; i < 5; i++) { + queue.push(&items[i]); + } + EXPECT_EQ(queue.size(), 5u); +} + +// ============================================================================= +// Boundary conditions (small capacity for precise boundary testing) +// ============================================================================= + +class ReadyQueueBoundaryTest : public ::testing::Test { +protected: + static constexpr uint64_t QUEUE_CAP = 8; // Small for boundary testing + PTO2ReadyQueue queue{}; + PTO2TaskSlotState dummy[8]{}; + + void SetUp() override { ASSERT_TRUE(pto2_ready_queue_init(&queue, QUEUE_CAP)); } + void TearDown() override { pto2_ready_queue_destroy(&queue); } +}; + +TEST_F(ReadyQueueBoundaryTest, ExactCapacityFillDrain) { + int pushed = 0; + for (uint64_t i = 0; i < QUEUE_CAP; i++) { + if (queue.push(&dummy[i % 8])) pushed++; + else break; + } + EXPECT_GE(pushed, (int)(QUEUE_CAP - 1)); + + for (int i = 0; i < pushed; i++) { + EXPECT_NE(queue.pop(), nullptr); + } + EXPECT_EQ(queue.pop(), nullptr); +} + +TEST_F(ReadyQueueBoundaryTest, PushToFullThenRecover) { + int pushed = 0; + while (queue.push(&dummy[0])) + pushed++; + + EXPECT_FALSE(queue.push(&dummy[1])) << "Push to full queue returns false"; + + EXPECT_NE(queue.pop(), nullptr); + EXPECT_TRUE(queue.push(&dummy[1])) << "Push succeeds after pop from full queue"; +} + +// size() with relaxed ordering: exact in single-threaded context. +TEST_F(ReadyQueueBoundaryTest, SizeRelaxedOrdering) { + queue.push(&dummy[0]); + queue.push(&dummy[1]); + queue.push(&dummy[2]); + EXPECT_EQ(queue.size(), 3u); + + queue.pop(); + EXPECT_EQ(queue.size(), 2u); + + queue.pop(); + queue.pop(); + EXPECT_EQ(queue.size(), 0u); +} + +// size() guard: after many push/pop cycles, never goes negative. +TEST_F(ReadyQueueBoundaryTest, SizeNeverNegative) { + for (int i = 0; i < 100; i++) { + ASSERT_TRUE(queue.push(&dummy[0])); + queue.pop(); + } + EXPECT_EQ(queue.size(), 0u) << "size() returns 0 after balanced push/pop cycles"; +} + +TEST_F(ReadyQueueBoundaryTest, RepeatedEmptyPop) { + for (int i = 0; i < 100; i++) { + EXPECT_EQ(queue.pop(), nullptr); + } + EXPECT_EQ(queue.size(), 0u); +} + +// Sequence numbers grow large after many cycles but remain correct. 
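+// (With QUEUE_CAP = 8, the 10000 cycles below keep reusing the same cells; in
+// the Vyukov scheme a cell's sequence grows by the capacity per reuse, so
+// sequences only reach ~10000 here -- nowhere near the int64_t limit, whose
+// wrap would take on the order of 2^63 operations.)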
+TEST_F(ReadyQueueBoundaryTest, ManyPushPopCycles) {
+    for (int i = 0; i < 10000; i++) {
+        ASSERT_TRUE(queue.push(&dummy[0]));
+        PTO2TaskSlotState *s = queue.pop();
+        ASSERT_NE(s, nullptr);
+        EXPECT_EQ(s, &dummy[0]);
+    }
+
+    EXPECT_EQ(queue.size(), 0u);
+    EXPECT_TRUE(queue.push(&dummy[1]));
+    EXPECT_EQ(queue.pop(), &dummy[1]);
+}
+
+// =============================================================================
+// Concurrency
+// =============================================================================
+
+// Parameterized MPMC stress test: {producers, consumers, items_per_producer}
+struct MPMCConfig {
+    int producers;
+    int consumers;
+    int items_per_producer;
+};
+
+class ReadyQueueMPMCTest : public ::testing::TestWithParam<MPMCConfig> {
+protected:
+    static constexpr uint64_t CAPACITY = 1024;
+    PTO2ReadyQueue queue;
+
+    void SetUp() override { ASSERT_TRUE(pto2_ready_queue_init(&queue, CAPACITY)); }
+    void TearDown() override { pto2_ready_queue_destroy(&queue); }
+};
+
+TEST_P(ReadyQueueMPMCTest, NoDuplicateNoLoss) {
+    auto cfg = GetParam();
+    int total = cfg.producers * cfg.items_per_producer;
+
+    std::vector<PTO2TaskSlotState> items(total);
+    std::vector<std::atomic<int>> consumed_count(total);
+    for (int i = 0; i < total; i++) {
+        consumed_count[i].store(0, std::memory_order_relaxed);
+    }
+
+    auto item_index = [&](PTO2TaskSlotState *s) -> int {
+        return static_cast<int>(s - items.data());
+    };
+
+    std::atomic<int> producers_done{0};
+
+    auto producer = [&](int id) {
+        for (int i = id; i < total; i += cfg.producers) {
+            while (!queue.push(&items[i])) {}
+        }
+        producers_done.fetch_add(1, std::memory_order_release);
+    };
+
+    std::atomic<int> total_consumed{0};
+
+    auto consumer = [&]() {
+        while (true) {
+            PTO2TaskSlotState *item = queue.pop();
+            if (item != nullptr) {
+                consumed_count[item_index(item)].fetch_add(1, std::memory_order_relaxed);
+                total_consumed.fetch_add(1, std::memory_order_relaxed);
+            } else if (producers_done.load(std::memory_order_acquire) == cfg.producers) {
+                // Drain remaining
+                while ((item = queue.pop()) != nullptr) {
+                    consumed_count[item_index(item)].fetch_add(1, std::memory_order_relaxed);
+                    total_consumed.fetch_add(1, std::memory_order_relaxed);
+                }
+                break;
+            }
+        }
+    };
+
+    std::vector<std::thread> threads;
+    for (int i = 0; i < cfg.producers; i++)
+        threads.emplace_back(producer, i);
+    for (int i = 0; i < cfg.consumers; i++)
+        threads.emplace_back(consumer);
+    for (auto &t : threads)
+        t.join();
+
+    EXPECT_EQ(total_consumed.load(), total);
+    for (int i = 0; i < total; i++) {
+        EXPECT_EQ(consumed_count[i].load(), 1)
+            << "Item " << i << " consumed " << consumed_count[i].load() << " times (expected 1)";
+    }
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    MPMCVariants, ReadyQueueMPMCTest,
+    ::testing::Values(
+        MPMCConfig{2, 2, 200},   // TwoProducersTwoConsumers
+        MPMCConfig{1, 4, 500},   // OneProducerNConsumers
+        MPMCConfig{4, 4, 1250}   // HighContentionStress
+    )
+);
+
+// =============================================================================
+// LocalReadyBuffer
+// =============================================================================
+
+class LocalReadyBufferTest : public ::testing::Test {
+protected:
+    static constexpr int CAPACITY = 8;
+
+    PTO2LocalReadyBuffer buffer;
+    PTO2TaskSlotState *backing[CAPACITY];
+
+    void SetUp() override { buffer.reset(backing, CAPACITY); }
+};
+
+// --- Normal path ---
+
+TEST_F(LocalReadyBufferTest, PopEmptyReturnsNullptr) { EXPECT_EQ(buffer.pop(), nullptr); }
+
+// LIFO dispatch: try_push appends at count++, pop returns slot_states[--count].
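+// Example: try_push(&a) then try_push(&b) stores [a, b]; pop() returns b,
+// then a -- the most recently produced (cache-hot) task dispatches first.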
+TEST_F(LocalReadyBufferTest, LIFOOrdering) { + PTO2TaskSlotState a, b; + + ASSERT_TRUE(buffer.try_push(&a)); + ASSERT_TRUE(buffer.try_push(&b)); + + EXPECT_EQ(buffer.pop(), &b); + EXPECT_EQ(buffer.pop(), &a); + EXPECT_EQ(buffer.pop(), nullptr); +} + +TEST_F(LocalReadyBufferTest, TryPushFullReturnsFalse) { + PTO2TaskSlotState items[CAPACITY + 1]; + + for (int i = 0; i < CAPACITY; i++) { + ASSERT_TRUE(buffer.try_push(&items[i])); + } + + EXPECT_FALSE(buffer.try_push(&items[CAPACITY])); +} + +TEST_F(LocalReadyBufferTest, ResetSetsCleanState) { + EXPECT_EQ(buffer.pop(), nullptr) << "Fresh buffer is empty"; + + PTO2TaskSlotState a, b; + ASSERT_TRUE(buffer.try_push(&a)); + ASSERT_TRUE(buffer.try_push(&b)); + + buffer.reset(backing, CAPACITY); + EXPECT_EQ(buffer.pop(), nullptr) << "Buffer is empty after reset"; + + PTO2TaskSlotState items[CAPACITY]; + for (int i = 0; i < CAPACITY; i++) { + EXPECT_TRUE(buffer.try_push(&items[i])); + } + EXPECT_FALSE(buffer.try_push(&a)) << "Full after pushing capacity items post-reset"; +} + +// --- Boundary conditions --- + +TEST_F(LocalReadyBufferTest, NullBackingBuffer) { + PTO2LocalReadyBuffer buf; + buf.reset(nullptr, 0); + + PTO2TaskSlotState item{}; + EXPECT_FALSE(buf.try_push(&item)) << "Push fails with null backing"; + EXPECT_EQ(buf.pop(), nullptr) << "Pop returns null with null backing"; +} diff --git a/tests/ut/cpp/a5/test_scheduler_state.cpp b/tests/ut/cpp/a5/test_scheduler_state.cpp new file mode 100644 index 000000000..13647c320 --- /dev/null +++ b/tests/ut/cpp/a5/test_scheduler_state.cpp @@ -0,0 +1,197 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Unit tests for PTO2SchedulerState from pto_scheduler.h + * + * Tests task state transitions, fanin/fanout logic, subtask completion. 
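+ *
+ * Lifecycle exercised by these tests (ordering as implied by the assertions;
+ * the authoritative state enum lives in the runtime headers):
+ *
+ *   PENDING -> READY -> RUNNING -> COMPLETED -> CONSUMED
+ *
+ * where COMPLETED -> CONSUMED fires only once fanout_refcount reaches
+ * fanout_count, via a CAS on task_state in check_and_handle_consumed.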
+ */ + +#include + +#include +#include + +#include "scheduler/pto_scheduler.h" + +class SchedulerStateTest : public ::testing::Test { +protected: + PTO2SchedulerState sched; + PTO2SharedMemoryHandle *sm_handle = nullptr; + + void SetUp() override { + sm_handle = pto2_sm_create_default(); + ASSERT_NE(sm_handle, nullptr); + bool ok = pto2_scheduler_init(&sched, sm_handle->header); + ASSERT_TRUE(ok); + } + + void TearDown() override { + pto2_scheduler_destroy(&sched); + if (sm_handle) { + pto2_sm_destroy(sm_handle); + } + } + + void init_slot( + PTO2TaskSlotState &slot, PTO2TaskState state, int32_t fanin_count, int32_t fanout_count, uint8_t ring_id = 0 + ) { + memset(&slot, 0, sizeof(slot)); + slot.task_state.store(state); + slot.fanin_count = fanin_count; + slot.fanin_refcount.store(0); + slot.fanout_count = fanout_count; + slot.fanout_refcount.store(0); + slot.fanout_lock.store(0); + slot.fanout_head = nullptr; + slot.ring_id = ring_id; + slot.active_mask = PTO2_SUBTASK_MASK_AIC; + slot.completed_subtasks.store(0); + slot.total_required_subtasks = 1; + slot.logical_block_num = 1; + } +}; + +// ============================================================================= +// check_and_handle_consumed +// ============================================================================= + +TEST_F(SchedulerStateTest, ConsumedNotReady) { + alignas(64) PTO2TaskSlotState slot; + init_slot(slot, PTO2_TASK_COMPLETED, 1, 2); + slot.fanout_refcount.store(1); // 1 != 2 + + sched.check_and_handle_consumed(slot); + EXPECT_EQ(slot.task_state.load(), PTO2_TASK_COMPLETED); +} + +TEST_F(SchedulerStateTest, ConsumedTransition) { + alignas(64) PTO2TaskSlotState slot; + init_slot(slot, PTO2_TASK_COMPLETED, 1, 2); + slot.fanout_refcount.store(2); // matches fanout_count + + sched.check_and_handle_consumed(slot); + EXPECT_EQ(slot.task_state.load(), PTO2_TASK_CONSUMED); +} + +TEST_F(SchedulerStateTest, ConsumedNotCompletedState) { + alignas(64) PTO2TaskSlotState slot; + init_slot(slot, PTO2_TASK_RUNNING, 1, 1); + slot.fanout_refcount.store(1); + + sched.check_and_handle_consumed(slot); + // CAS fails because state is RUNNING, not COMPLETED + EXPECT_EQ(slot.task_state.load(), PTO2_TASK_RUNNING); +} + +TEST_F(SchedulerStateTest, ConsumedIdempotent) { + alignas(64) PTO2TaskSlotState slot; + init_slot(slot, PTO2_TASK_CONSUMED, 1, 1); + slot.fanout_refcount.store(1); + + sched.check_and_handle_consumed(slot); + EXPECT_EQ(slot.task_state.load(), PTO2_TASK_CONSUMED); +} + +// ============================================================================= +// release_producer +// ============================================================================= + +TEST_F(SchedulerStateTest, ReleaseProducerIncrements) { + alignas(64) PTO2TaskSlotState slot; + init_slot(slot, PTO2_TASK_COMPLETED, 1, 3); + + sched.release_producer(slot); + EXPECT_EQ(slot.fanout_refcount.load(), 1); + + sched.release_producer(slot); + EXPECT_EQ(slot.fanout_refcount.load(), 2); +} + +TEST_F(SchedulerStateTest, ReleaseProducerTriggersConsumed) { + alignas(64) PTO2TaskSlotState slot; + init_slot(slot, PTO2_TASK_COMPLETED, 1, 2); + slot.fanout_refcount.store(1); // One away + + sched.release_producer(slot); + EXPECT_EQ(slot.task_state.load(), PTO2_TASK_CONSUMED); +} + +// ============================================================================= +// on_subtask_complete +// ============================================================================= + +TEST_F(SchedulerStateTest, SubtaskCompleteSingle) { + alignas(64) PTO2TaskSlotState slot; + 
init_slot(slot, PTO2_TASK_RUNNING, 1, 1); + slot.total_required_subtasks = 1; + slot.completed_subtasks.store(0); + + EXPECT_TRUE(sched.on_subtask_complete(slot)); +} + +TEST_F(SchedulerStateTest, SubtaskCompleteMultiBlock) { + alignas(64) PTO2TaskSlotState slot; + init_slot(slot, PTO2_TASK_RUNNING, 1, 1); + slot.total_required_subtasks = 6; // 3 cores * 2 blocks + slot.completed_subtasks.store(0); + + for (int i = 0; i < 5; i++) { + EXPECT_FALSE(sched.on_subtask_complete(slot)); + } + EXPECT_TRUE(sched.on_subtask_complete(slot)); +} + +// ============================================================================= +// on_scope_end +// ============================================================================= + +TEST_F(SchedulerStateTest, ScopeEndBatchRelease) { + constexpr int N = 4; + alignas(64) PTO2TaskSlotState slots[N]; + PTO2TaskSlotState *ptrs[N]; + + for (int i = 0; i < N; i++) { + init_slot(slots[i], PTO2_TASK_COMPLETED, 1, 2); + ptrs[i] = &slots[i]; + } + + sched.on_scope_end(ptrs, N); + + for (int i = 0; i < N; i++) { + EXPECT_EQ(slots[i].fanout_refcount.load(), 1); + } +} + +// ============================================================================= +// get_ready_tasks_batch: local buffer first +// ============================================================================= + +TEST_F(SchedulerStateTest, GetReadyTasksBatchLocalFirst) { + alignas(64) PTO2TaskSlotState slot_a, slot_b; + init_slot(slot_a, PTO2_TASK_READY, 0, 1); + init_slot(slot_b, PTO2_TASK_PENDING, 1, 1); + + PTO2TaskSlotState *local_buf_storage[4]; + PTO2LocalReadyBuffer local_buf; + local_buf.reset(local_buf_storage, 4); + local_buf.try_push(&slot_a); + + // Use src API to route slot_b into the global ready queue + sched.release_fanin_and_check_ready(slot_b); + + PTO2TaskSlotState *out[4]; + int count = sched.get_ready_tasks_batch(PTO2ResourceShape::AIC, local_buf, out, 4); + + EXPECT_EQ(count, 2); + // Local buffer drains first (LIFO), so slot_a comes first + EXPECT_EQ(out[0], &slot_a); + EXPECT_EQ(out[1], &slot_b); +} diff --git a/tests/ut/cpp/a5/test_shared_memory.cpp b/tests/ut/cpp/a5/test_shared_memory.cpp new file mode 100644 index 000000000..ffcbb7821 --- /dev/null +++ b/tests/ut/cpp/a5/test_shared_memory.cpp @@ -0,0 +1,191 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Unit tests for PTO2SharedMemory layout from pto_shared_memory.h + * + * Tests creation, validation, per-ring independence, alignment, size + * calculation, and error handling. + * + * Design contracts: + * + * - pto2_sm_validate checks `top > heap_size`. top == heap_size is a + * legitimate "filled exactly to end" state, so strict > is correct. + * + * - Zero window size: if pto2_sm_calculate_size() is called with 0, all ring + * descriptors/payloads alias the same address. 
Current entry path + * (pto2_sm_create) is called only with valid sizes, but there is no + * explicit guard. pto2_sm_create should reject task_window_size==0. + * + * - Flow control heap_top validation: validate() does not verify + * heap_top <= heap_size. After a corruption, heap_top could exceed + * heap_size without detection. validate should check both bounds. + */ + +#include +#include +#include "pto_shared_memory.h" + +// ============================================================================= +// Fixture (default-created handle) +// ============================================================================= + +class SharedMemoryTest : public ::testing::Test { +protected: + PTO2SharedMemoryHandle *handle = nullptr; + + void SetUp() override { + handle = pto2_sm_create_default(); + ASSERT_NE(handle, nullptr); + } + + void TearDown() override { + if (handle) { + pto2_sm_destroy(handle); + handle = nullptr; + } + } +}; + +// ============================================================================= +// Normal path +// ============================================================================= + +TEST_F(SharedMemoryTest, CreateDefaultReturnsNonNull) { + EXPECT_NE(handle->sm_base, nullptr); + EXPECT_GT(handle->sm_size, 0u); +} + +TEST_F(SharedMemoryTest, IsOwner) { EXPECT_TRUE(handle->is_owner); } + +TEST_F(SharedMemoryTest, HeaderInitValues) { + auto *hdr = handle->header; + EXPECT_EQ(hdr->orchestrator_done.load(), 0); + EXPECT_EQ(hdr->orch_error_code.load(), 0); + EXPECT_EQ(hdr->sched_error_bitmap.load(), 0); + EXPECT_EQ(hdr->sched_error_code.load(), 0); + + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + auto &fc = hdr->rings[r].fc; + EXPECT_EQ(fc.current_task_index.load(), 0); + EXPECT_EQ(fc.last_task_alive.load(), 0); + } +} + +TEST_F(SharedMemoryTest, Validate) { EXPECT_TRUE(pto2_sm_validate(handle)); } + +TEST_F(SharedMemoryTest, PerRingIndependence) { + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + EXPECT_NE(handle->header->rings[r].task_descriptors, nullptr) << "Ring " << r; + EXPECT_NE(handle->header->rings[r].task_payloads, nullptr) << "Ring " << r; + } + for (int r = 1; r < PTO2_MAX_RING_DEPTH; r++) { + EXPECT_NE(handle->header->rings[r].task_descriptors, handle->header->rings[0].task_descriptors) << "Ring " << r; + } +} + +TEST_F(SharedMemoryTest, PointerAlignment) { + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + auto addr = reinterpret_cast(handle->header->rings[r].task_descriptors); + EXPECT_EQ(addr % PTO2_ALIGN_SIZE, 0u) << "Ring " << r << " descriptors not aligned"; + } +} + +TEST_F(SharedMemoryTest, HeaderAlignment) { + uintptr_t header_addr = (uintptr_t)handle->header; + EXPECT_EQ(header_addr % PTO2_ALIGN_SIZE, 0u) << "Header must be cache-line aligned"; +} + +// Descriptor and payload regions don't overlap within or across rings. 
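+// Layout assumed by the byte arithmetic below (inferred from these pointer
+// checks, not quoted from the allocator source):
+//
+//   | header | ring0 descriptors | ring0 payloads | ring1 descriptors | ... |
+//
+// with each region aligned up to PTO2_ALIGN_SIZE.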
+TEST_F(SharedMemoryTest, RegionsNonOverlapping) { + uint64_t ws = 64; // Use a known window size for byte arithmetic + PTO2SharedMemoryHandle *h = pto2_sm_create(ws, 4096); + ASSERT_NE(h, nullptr); + + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + uintptr_t desc_start = (uintptr_t)h->header->rings[r].task_descriptors; + uintptr_t desc_end = desc_start + ws * sizeof(PTO2TaskDescriptor); + uintptr_t payload_start = (uintptr_t)h->header->rings[r].task_payloads; + + EXPECT_GE(payload_start, desc_end) << "Ring " << r << ": payload region should not overlap descriptors"; + } + + for (int r = 0; r < PTO2_MAX_RING_DEPTH - 1; r++) { + uintptr_t this_payload_end = (uintptr_t)h->header->rings[r].task_payloads + ws * sizeof(PTO2TaskPayload); + uintptr_t next_desc_start = (uintptr_t)h->header->rings[r + 1].task_descriptors; + EXPECT_GE(next_desc_start, this_payload_end) << "Ring " << r << " and " << (r + 1) << " should not overlap"; + } + + pto2_sm_destroy(h); +} + +// ============================================================================= +// Size calculation +// ============================================================================= + +TEST(SharedMemoryCalcSize, NonZero) { + uint64_t size = pto2_sm_calculate_size(PTO2_TASK_WINDOW_SIZE); + EXPECT_GT(size, 0u); +} + +TEST(SharedMemoryCalcSize, LargerWindowGivesLargerSize) { + uint64_t small_size = pto2_sm_calculate_size(64); + uint64_t large_size = pto2_sm_calculate_size(256); + EXPECT_GT(large_size, small_size); +} + +TEST(SharedMemoryCalcSize, HeaderAligned) { EXPECT_EQ(sizeof(PTO2SharedMemoryHeader) % PTO2_ALIGN_SIZE, 0u); } + +TEST(SharedMemoryCalcSize, PerRingDifferentSizes) { + uint64_t ws[PTO2_MAX_RING_DEPTH] = {128, 256, 512, 1024}; + uint64_t size = pto2_sm_calculate_size_per_ring(ws); + + uint64_t uniform_size = pto2_sm_calculate_size(128); + EXPECT_GT(size, uniform_size); +} + +// ============================================================================= +// Boundary conditions +// ============================================================================= + +// Zero window size: all ring descriptors collapse to same address. +TEST(SharedMemoryBoundary, ZeroWindowSize) { + uint64_t size = pto2_sm_calculate_size(0); + uint64_t header_size = PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE); + EXPECT_EQ(size, header_size); + + PTO2SharedMemoryHandle *h = pto2_sm_create(0, 4096); + if (h) { + for (int r = 0; r < PTO2_MAX_RING_DEPTH - 1; r++) { + EXPECT_EQ(h->header->rings[r].task_descriptors, h->header->rings[r + 1].task_descriptors) + << "Zero window: all rings' descriptor pointers collapse to same address"; + } + pto2_sm_destroy(h); + } +} + +TEST(SharedMemoryBoundary, ValidateDetectsCorruption) { + PTO2SharedMemoryHandle *h = pto2_sm_create(256, 4096); + ASSERT_NE(h, nullptr); + EXPECT_TRUE(pto2_sm_validate(h)); + + h->header->rings[0].fc.current_task_index.store(-1); + EXPECT_FALSE(pto2_sm_validate(h)); + + pto2_sm_destroy(h); +} + +TEST(SharedMemoryBoundary, ValidateNullHandle) { EXPECT_FALSE(pto2_sm_validate(nullptr)); } + +TEST(SharedMemoryBoundary, CreateFromUndersizedBuffer) { + char buf[64]{}; + PTO2SharedMemoryHandle *h = pto2_sm_create_from_buffer(buf, 64, 256, 4096); + EXPECT_EQ(h, nullptr) << "Undersized buffer should fail"; +} diff --git a/tests/ut/cpp/a5/test_spsc_queue.cpp b/tests/ut/cpp/a5/test_spsc_queue.cpp new file mode 100644 index 000000000..a2c80ca05 --- /dev/null +++ b/tests/ut/cpp/a5/test_spsc_queue.cpp @@ -0,0 +1,293 @@ +/* + * Copyright (c) PyPTO Contributors. 
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Unit tests for PTO2SpscQueue from pto_scheduler.h + * + * Tests the Rigtorp cached-index SPSC queue used as the orchestrator → + * scheduler wiring channel: + * - Basic push / pop_batch correctness + * - Full / empty detection (including cached-index lazy refresh) + * - Wrap-around via modulo indexing + * - Capacity is capacity-1 (one sentinel slot) + * - pop_batch partial reads + * - size() accuracy + */ + +#include + +#include +#include +#include + +#include "scheduler/pto_scheduler.h" + +// ============================================================================= +// Fixture +// ============================================================================= + +class SpscQueueTest : public ::testing::Test { +protected: + static constexpr uint64_t CAPACITY = 16; // must be power of 2 + + PTO2SpscQueue queue{}; + // Dummy slot states used as push values + alignas(64) PTO2TaskSlotState slots[64]{}; + + void SetUp() override { + memset(&queue, 0, sizeof(queue)); + ASSERT_TRUE(queue.init(CAPACITY)); + } + + void TearDown() override { queue.destroy(); } +}; + +// ============================================================================= +// Initialization +// ============================================================================= + +TEST_F(SpscQueueTest, InitValidState) { + EXPECT_EQ(queue.size(), 0u); + EXPECT_EQ(queue.mask_, CAPACITY - 1); + EXPECT_NE(queue.buffer_, nullptr); +} + +TEST_F(SpscQueueTest, InitRejectsNonPowerOfTwo) { + PTO2SpscQueue bad{}; + EXPECT_FALSE(bad.init(3)); + EXPECT_FALSE(bad.init(7)); + EXPECT_FALSE(bad.init(0)); +} + +TEST_F(SpscQueueTest, InitAcceptsPowerOfTwo) { + PTO2SpscQueue q{}; + EXPECT_TRUE(q.init(4)); + q.destroy(); + EXPECT_TRUE(q.init(1024)); + q.destroy(); +} + +// ============================================================================= +// Basic push / pop +// ============================================================================= + +TEST_F(SpscQueueTest, PushPopSingle) { + EXPECT_TRUE(queue.push(&slots[0])); + EXPECT_EQ(queue.size(), 1u); + + PTO2TaskSlotState *out[1]; + int count = queue.pop_batch(out, 1); + ASSERT_EQ(count, 1); + EXPECT_EQ(out[0], &slots[0]); + EXPECT_EQ(queue.size(), 0u); +} + +TEST_F(SpscQueueTest, FIFOOrdering) { + for (int i = 0; i < 5; i++) { + ASSERT_TRUE(queue.push(&slots[i])); + } + + PTO2TaskSlotState *out[5]; + int count = queue.pop_batch(out, 5); + ASSERT_EQ(count, 5); + for (int i = 0; i < 5; i++) { + EXPECT_EQ(out[i], &slots[i]) << "FIFO order violated at i=" << i; + } +} + +TEST_F(SpscQueueTest, PopBatchPartial) { + for (int i = 0; i < 3; i++) { + queue.push(&slots[i]); + } + + // Request 5 but only 3 available + PTO2TaskSlotState *out[5]; + int count = queue.pop_batch(out, 5); + EXPECT_EQ(count, 3); +} + +TEST_F(SpscQueueTest, PopBatchEmpty) { + PTO2TaskSlotState *out[5]; + int count = 
queue.pop_batch(out, 5); + EXPECT_EQ(count, 0); +} + +// ============================================================================= +// Full detection +// ============================================================================= + +TEST_F(SpscQueueTest, FullReturnsFalse) { + // Usable capacity = CAPACITY - 1 = 15 + for (uint64_t i = 0; i < CAPACITY - 1; i++) { + ASSERT_TRUE(queue.push(&slots[i])) << "push failed at i=" << i; + } + EXPECT_EQ(queue.size(), CAPACITY - 1); + + // Queue full + EXPECT_FALSE(queue.push(&slots[CAPACITY - 1])) << "Push to full queue must return false"; +} + +TEST_F(SpscQueueTest, UsableCapacityIsCapacityMinusOne) { + int pushed = 0; + while (queue.push(&slots[pushed % 64])) { + pushed++; + if (pushed > 100) break; // safety + } + EXPECT_EQ(pushed, static_cast(CAPACITY - 1)); +} + +// ============================================================================= +// Full then recover +// ============================================================================= + +TEST_F(SpscQueueTest, FullThenPopThenPush) { + for (uint64_t i = 0; i < CAPACITY - 1; i++) { + queue.push(&slots[i]); + } + EXPECT_FALSE(queue.push(&slots[0])); + + // Pop one + PTO2TaskSlotState *out[1]; + int count = queue.pop_batch(out, 1); + ASSERT_EQ(count, 1); + + // Now push should succeed + EXPECT_TRUE(queue.push(&slots[0])); +} + +// ============================================================================= +// Wrap-around +// ============================================================================= + +TEST_F(SpscQueueTest, WrapAroundCorrectness) { + // Push-pop cycles to advance head/tail past capacity boundary + for (int cycle = 0; cycle < 100; cycle++) { + ASSERT_TRUE(queue.push(&slots[cycle % 64])) << "push failed at cycle=" << cycle; + PTO2TaskSlotState *out[1]; + int count = queue.pop_batch(out, 1); + ASSERT_EQ(count, 1) << "pop_batch failed at cycle=" << cycle; + EXPECT_EQ(out[0], &slots[cycle % 64]); + } + EXPECT_EQ(queue.size(), 0u); +} + +TEST_F(SpscQueueTest, WrapAroundBatchCorrectness) { + // Multiple cycles of batch push/pop across wrap boundary + for (int cycle = 0; cycle < 20; cycle++) { + int batch = 5; + for (int i = 0; i < batch; i++) { + ASSERT_TRUE(queue.push(&slots[(cycle * batch + i) % 64])); + } + PTO2TaskSlotState *out[5]; + int count = queue.pop_batch(out, batch); + ASSERT_EQ(count, batch); + for (int i = 0; i < batch; i++) { + EXPECT_EQ(out[i], &slots[(cycle * batch + i) % 64]); + } + } +} + +// ============================================================================= +// size() accuracy +// ============================================================================= + +TEST_F(SpscQueueTest, SizeTracksOperations) { + EXPECT_EQ(queue.size(), 0u); + + queue.push(&slots[0]); + EXPECT_EQ(queue.size(), 1u); + + queue.push(&slots[1]); + queue.push(&slots[2]); + EXPECT_EQ(queue.size(), 3u); + + PTO2TaskSlotState *out[2]; + queue.pop_batch(out, 2); + EXPECT_EQ(queue.size(), 1u); + + queue.pop_batch(out, 1); + EXPECT_EQ(queue.size(), 0u); +} + +// ============================================================================= +// Producer-consumer (two threads) +// ============================================================================= + +TEST_F(SpscQueueTest, TwoThreadProducerConsumer) { + constexpr int TOTAL = 10000; + std::vector consumed; + consumed.reserve(TOTAL); + + // Use a large pool of slot states for unique pointers + std::vector big_pool(TOTAL); + + std::thread producer([&]() { + for (int i = 0; i < TOTAL; i++) { + while 
(!queue.push(&big_pool[i])) { + // spin + } + } + }); + + std::thread consumer([&]() { + int total = 0; + PTO2TaskSlotState *out[32]; + while (total < TOTAL) { + int count = queue.pop_batch(out, 32); + for (int i = 0; i < count; i++) { + consumed.push_back(out[i]); + } + total += count; + } + }); + + producer.join(); + consumer.join(); + + ASSERT_EQ(consumed.size(), static_cast(TOTAL)); + // Verify FIFO order + for (int i = 0; i < TOTAL; i++) { + EXPECT_EQ(consumed[i], &big_pool[i]) << "FIFO violated at i=" << i; + } +} + +// ============================================================================= +// Cached index behavior +// ============================================================================= + +TEST_F(SpscQueueTest, CachedIndexLazyRefresh) { + // Fill queue + for (uint64_t i = 0; i < CAPACITY - 1; i++) { + queue.push(&slots[i]); + } + + // Consumer pops all + PTO2TaskSlotState *out[16]; + int count = queue.pop_batch(out, CAPACITY); + EXPECT_EQ(count, static_cast(CAPACITY - 1)); + + // Producer's tail_cached_ is stale (still thinks queue is full) + // Next push should refresh tail_cached_ and succeed + EXPECT_TRUE(queue.push(&slots[0])); +} + +TEST_F(SpscQueueTest, CachedIndexConsumerRefresh) { + // Consumer calls pop_batch on empty queue (head_cached_ is 0) + PTO2TaskSlotState *out[1]; + EXPECT_EQ(queue.pop_batch(out, 1), 0); + + // Producer pushes + queue.push(&slots[0]); + + // Consumer's head_cached_ is stale, pop_batch must refresh + int count = queue.pop_batch(out, 1); + EXPECT_EQ(count, 1); + EXPECT_EQ(out[0], &slots[0]); +} diff --git a/tests/ut/cpp/a5/test_task_allocator.cpp b/tests/ut/cpp/a5/test_task_allocator.cpp new file mode 100644 index 000000000..383003900 --- /dev/null +++ b/tests/ut/cpp/a5/test_task_allocator.cpp @@ -0,0 +1,407 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Unit tests for PTO2TaskAllocator from pto_ring_buffer.h + * + * Tests ring buffer allocation, heap bump logic, wrap-around, alignment, + * task window flow control, and heap_available semantics. + * + * The allocator is single-threaded (orchestrator thread), so no concurrency + * tests are needed. The unified PTO2TaskAllocator replaces the previous + * separate PTO2HeapRing + PTO2TaskRing. + * + * Design contracts (try_bump_heap): + * + * - Wrap-around guard uses `tail > alloc_size` (strict >). When + * tail == alloc_size the wrap branch returns nullptr. Allowing it + * would create top == tail (full/empty ambiguity). Strict > + * sacrifices one quantum of capacity. + * + * - heap_available() returns max(at_end, at_begin), not the sum. + * A single allocation cannot split across the wrap boundary. + * + * - Zero-size allocation is a no-op returning the current top. + * Two consecutive zero-size allocs return the SAME pointer. 
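+ *   A minimal sketch of that contract (result field names as used by the
+ *   tests below; purely illustrative):
+ *
+ *     auto a = allocator.alloc(0);
+ *     auto b = allocator.alloc(0);  // a.packed_base == b.packed_base and
+ *                                   // a.packed_base == a.packed_end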
+ * + * - Wrap path wasted space: space between old top and heap_size is not + * reclaimed. Inherent ring-buffer fragmentation cost. + */ + +#include + +#include +#include +#include +#include +#include + +#include "pto_ring_buffer.h" + +// ============================================================================= +// Helpers +// +// WHITE-BOX: consume_up_to simulates the scheduler consuming tasks by directly +// writing descriptor.packed_buffer_end and advancing last_alive. This binds +// to the internal tail-derivation mechanism. If the allocator's reclaim +// protocol changes (e.g. explicit tail field instead of packed_buffer_end), +// this helper and all wrap/reclaim tests must be updated. +// ============================================================================= + +static void consume_up_to( + std::vector &descriptors, std::atomic &last_alive, void *heap_base, + int32_t window_size, int32_t new_last_alive, uint64_t heap_tail_offset +) { + int32_t last_consumed = new_last_alive - 1; + descriptors[last_consumed & (window_size - 1)].packed_buffer_end = + static_cast(heap_base) + heap_tail_offset; + last_alive.store(new_last_alive, std::memory_order_release); +} + +// ============================================================================= +// Fixture +// ============================================================================= + +class TaskAllocatorTest : public ::testing::Test { +protected: + static constexpr int32_t WINDOW_SIZE = 16; + static constexpr uint64_t HEAP_SIZE = 4096; + + std::vector descriptors; + alignas(64) uint8_t heap_buf[HEAP_SIZE]{}; + std::atomic current_index{0}; + std::atomic last_alive{0}; + std::atomic error_code{PTO2_ERROR_NONE}; + PTO2TaskAllocator allocator{}; + + void SetUp() override { + descriptors.assign(WINDOW_SIZE, PTO2TaskDescriptor{}); + std::memset(heap_buf, 0, sizeof(heap_buf)); + current_index.store(0); + last_alive.store(0); + error_code.store(PTO2_ERROR_NONE); + allocator.init(descriptors.data(), WINDOW_SIZE, ¤t_index, &last_alive, heap_buf, HEAP_SIZE, &error_code); + } +}; + +// ============================================================================= +// Normal path +// ============================================================================= + +TEST_F(TaskAllocatorTest, InitialState) { + EXPECT_EQ(allocator.window_size(), WINDOW_SIZE); + EXPECT_EQ(allocator.active_count(), 0); + EXPECT_EQ(allocator.heap_top(), 0u); + EXPECT_EQ(allocator.heap_capacity(), HEAP_SIZE); + EXPECT_EQ(allocator.heap_available(), HEAP_SIZE); +} + +TEST_F(TaskAllocatorTest, AllocNonZeroSize) { + auto result = allocator.alloc(100); + ASSERT_FALSE(result.failed()); + EXPECT_EQ(result.task_id, 0); + EXPECT_EQ(result.slot, 0); + EXPECT_NE(result.packed_base, nullptr); + // 100 bytes aligned up to PTO2_ALIGN_SIZE (64) = 128 + uint64_t expected_aligned = PTO2_ALIGN_UP(100u, PTO2_ALIGN_SIZE); + EXPECT_EQ(expected_aligned, 128u); + EXPECT_EQ(allocator.heap_top(), expected_aligned); + EXPECT_EQ( + static_cast(result.packed_end) - static_cast(result.packed_base), + static_cast(expected_aligned) + ); +} + +TEST_F(TaskAllocatorTest, SequentialTaskIds) { + int32_t prev_id = -1; + for (int i = 0; i < 5; i++) { + auto result = allocator.alloc(0); + ASSERT_FALSE(result.failed()) << "Alloc failed at i=" << i; + EXPECT_EQ(result.task_id, prev_id + 1) << "Task IDs must be monotonically increasing"; + EXPECT_EQ(result.slot, result.task_id & (WINDOW_SIZE - 1)); + prev_id = result.task_id; + } + EXPECT_EQ(allocator.active_count(), 5); +} + +TEST_F(TaskAllocatorTest, 
OutputSizeAlignment) { + // 1 byte -> aligned to 64 + auto r1 = allocator.alloc(1); + ASSERT_FALSE(r1.failed()); + EXPECT_EQ(allocator.heap_top(), 64u); + + // Another 33 bytes -> aligned to 64, total 128 + auto r2 = allocator.alloc(33); + ASSERT_FALSE(r2.failed()); + EXPECT_EQ(allocator.heap_top(), 128u); + + // Exactly 64 bytes -> stays 64, total 192 + auto r3 = allocator.alloc(64); + ASSERT_FALSE(r3.failed()); + EXPECT_EQ(allocator.heap_top(), 192u); +} + +TEST_F(TaskAllocatorTest, SlotMappingPowerOfTwoWindow) { + std::set slots; + for (int i = 0; i < WINDOW_SIZE; i++) { + consume_up_to(descriptors, last_alive, heap_buf, WINDOW_SIZE, i, 0); + auto r = allocator.alloc(0); + ASSERT_FALSE(r.failed()); + EXPECT_EQ(r.slot, r.task_id & (WINDOW_SIZE - 1)); + slots.insert(r.slot); + } + EXPECT_EQ(slots.size(), static_cast(WINDOW_SIZE)) + << "Every slot should be visited exactly once over one window cycle"; +} + +TEST_F(TaskAllocatorTest, UpdateHeapTailFromConsumedTask) { + auto r1 = allocator.alloc(256); + ASSERT_FALSE(r1.failed()); + EXPECT_EQ(allocator.heap_top(), 256u); + + EXPECT_EQ(allocator.heap_available(), HEAP_SIZE - 256u); + + consume_up_to(descriptors, last_alive, heap_buf, WINDOW_SIZE, 1, 256); + + // Force the allocator to observe the new last_alive by doing another alloc + auto r2 = allocator.alloc(0); + ASSERT_FALSE(r2.failed()); + + // top=256, tail=256: at_end = 4096-256=3840, at_begin = 256 + EXPECT_EQ(allocator.heap_available(), HEAP_SIZE - 256u); +} + +TEST_F(TaskAllocatorTest, UpdateHeapTailAtTask0) { + auto r1 = allocator.alloc(64); + ASSERT_FALSE(r1.failed()); + EXPECT_EQ(r1.task_id, 0); + + descriptors[0].packed_buffer_end = static_cast(static_cast(heap_buf)) + 64; + last_alive.store(1, std::memory_order_release); + + auto r2 = allocator.alloc(0); + ASSERT_FALSE(r2.failed()); + EXPECT_EQ(r2.task_id, 1); +} + +TEST_F(TaskAllocatorTest, UpdateHeapTailIdempotent) { + auto r1 = allocator.alloc(128); + ASSERT_FALSE(r1.failed()); + + consume_up_to(descriptors, last_alive, heap_buf, WINDOW_SIZE, 1, 128); + + auto r2 = allocator.alloc(0); + ASSERT_FALSE(r2.failed()); + uint64_t avail_after_first = allocator.heap_available(); + + auto r3 = allocator.alloc(0); + ASSERT_FALSE(r3.failed()); + EXPECT_EQ(allocator.heap_available(), avail_after_first); +} + +TEST_F(TaskAllocatorTest, HeapAvailableTopGeTail) { + EXPECT_EQ(allocator.heap_available(), HEAP_SIZE); + + auto r1 = allocator.alloc(256); + ASSERT_FALSE(r1.failed()); + EXPECT_EQ(allocator.heap_available(), HEAP_SIZE - 256u); +} + +TEST_F(TaskAllocatorTest, HeapAvailableTopLtTail) { + auto r1 = allocator.alloc(HEAP_SIZE - 64); + ASSERT_FALSE(r1.failed()); + consume_up_to(descriptors, last_alive, heap_buf, WINDOW_SIZE, 1, HEAP_SIZE - 64); + + auto r2 = allocator.alloc(128); + ASSERT_FALSE(r2.failed()); + // top=128, tail=HEAP_SIZE-64: available = (HEAP_SIZE-64) - 128 + EXPECT_EQ(allocator.heap_available(), HEAP_SIZE - 64 - 128); +} + +// ============================================================================= +// Boundary conditions +// ============================================================================= + +TEST_F(TaskAllocatorTest, HeapExactFitAtEnd) { + // Allocate 4032 bytes to leave exactly 64 at end. 
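+    // HEAP_SIZE = 4096 and HEAP_SIZE - 64 = 4032 is already a multiple of
+    // PTO2_ALIGN_SIZE (64), so no alignment padding is added and exactly one
+    // 64-byte quantum remains free at the end of the heap.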
+// =============================================================================
+// Boundary conditions
+// =============================================================================
+
+TEST_F(TaskAllocatorTest, HeapExactFitAtEnd) {
+    // Allocate 4032 bytes to leave exactly 64 at the end.
+    auto r1 = allocator.alloc(HEAP_SIZE - 64);
+    ASSERT_FALSE(r1.failed());
+    EXPECT_EQ(allocator.heap_top(), HEAP_SIZE - 64u);
+
+    auto r2 = allocator.alloc(64);
+    ASSERT_FALSE(r2.failed());
+    EXPECT_EQ(allocator.heap_top(), HEAP_SIZE);
+    EXPECT_EQ(static_cast<uint8_t *>(r2.packed_base), reinterpret_cast<uint8_t *>(heap_buf) + HEAP_SIZE - 64);
+}
+
+// Wrap guard `tail > alloc_size` uses strict > to prevent full/empty ambiguity.
+// If the allocation were allowed, heap_top would advance to alloc_size == tail,
+// making top == tail. Because top == tail is the canonical "empty" state, the
+// ring could not distinguish "completely full" from "completely empty".
+TEST_F(TaskAllocatorTest, HeapWrapGuardRejectsTailEqualsAllocSize) {
+    auto r1 = allocator.alloc(HEAP_SIZE);
+    ASSERT_FALSE(r1.failed());
+    EXPECT_EQ(allocator.heap_top(), HEAP_SIZE);
+
+    consume_up_to(descriptors, last_alive, heap_buf, WINDOW_SIZE, 1, 64);
+
+    auto r2 = allocator.alloc(64);
+    EXPECT_TRUE(r2.failed()) << "wrap guard must reject when tail == alloc_size (full/empty ambiguity)";
+}
+
+TEST_F(TaskAllocatorTest, HeapWrapAroundSuccess) {
+    auto r1 = allocator.alloc(HEAP_SIZE);
+    ASSERT_FALSE(r1.failed());
+
+    consume_up_to(descriptors, last_alive, heap_buf, WINDOW_SIZE, 1, 128);
+
+    auto r2 = allocator.alloc(64);
+    ASSERT_FALSE(r2.failed());
+    EXPECT_EQ(r2.packed_base, static_cast<void *>(heap_buf));
+    EXPECT_EQ(allocator.heap_top(), 64u);
+}
+
+// Linear-gap guard `tail - top > alloc_size` uses strict > for the same reason.
+TEST_F(TaskAllocatorTest, HeapLinearGapGuardRejectsExactFit) {
+    // Fill most of the heap, leaving just 64 at the end so the next alloc wraps.
+    auto r1 = allocator.alloc(HEAP_SIZE - 64);
+    ASSERT_FALSE(r1.failed());
+    consume_up_to(descriptors, last_alive, heap_buf, WINDOW_SIZE, 1, HEAP_SIZE - 64);
+
+    // Allocate 128 bytes: space_at_end = 64, not enough -> wrap.
+    // tail = HEAP_SIZE-64, which is > 128 -> wraps to beginning.
+    auto r2 = allocator.alloc(128);
+    ASSERT_FALSE(r2.failed());
+    EXPECT_EQ(allocator.heap_top(), 128u);
+
+    // Now top=128, tail=HEAP_SIZE-64 (top < tail)
+    // gap = (HEAP_SIZE-64) - 128 = HEAP_SIZE-192
+    // Allocate exactly gap bytes: the guard requires gap > alloc_size, which
+    // fails for an exact fit -> reject.
+    uint64_t gap = (HEAP_SIZE - 64) - 128;
+    auto r3 = allocator.alloc(gap);
+    EXPECT_TRUE(r3.failed()) << "linear-gap guard must reject exact fit (full/empty ambiguity)";
+}
+
+TEST_F(TaskAllocatorTest, HeapTopLessThanTailInsufficientSpace) {
+    auto r1 = allocator.alloc(HEAP_SIZE - 64);
+    ASSERT_FALSE(r1.failed());
+    consume_up_to(descriptors, last_alive, heap_buf, WINDOW_SIZE, 1, HEAP_SIZE - 64);
+
+    auto r2 = allocator.alloc(128);
+    ASSERT_FALSE(r2.failed());
+
+    // gap = (HEAP_SIZE-64) - 128. Try to allocate more than gap.
+    auto r3 = allocator.alloc(HEAP_SIZE);
+    EXPECT_TRUE(r3.failed());
+    EXPECT_NE(error_code.load(), 0);
+}
+
+// heap_available reports max(at_end, at_begin), not the sum -- a single
+// allocation cannot split across the wrap boundary.
+TEST_F(TaskAllocatorTest, AvailableReportsMaxNotSum) {
+    auto r1 = allocator.alloc(3008);
+    ASSERT_FALSE(r1.failed());
+    uint64_t actual_top = allocator.heap_top();
+
+    consume_up_to(descriptors, last_alive, heap_buf, WINDOW_SIZE, 1, 1024);
+
+    auto r_probe = allocator.alloc(0);
+    ASSERT_FALSE(r_probe.failed());
+
+    uint64_t avail = allocator.heap_available();
+    uint64_t at_end = HEAP_SIZE - actual_top;
+    uint64_t at_begin = 1024;
+    EXPECT_EQ(avail, std::max(at_end, at_begin));
+    EXPECT_LT(avail, at_end + at_begin);
+}
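+// -----------------------------------------------------------------------------
+// Reviewer sketch (illustrative only, hypothetical names; the real logic lives
+// in pto_ring_buffer): the strict-inequality guards the tests above document.
+// Letting top land exactly on tail would make a full ring indistinguishable
+// from an empty one, since top == tail is the canonical "empty" encoding.
+static bool example_can_alloc(uint64_t top, uint64_t tail, uint64_t size, uint64_t n) {
+    if (top >= tail) {
+        if (size - top >= n) return true;  // fits linearly at the end
+        return tail > n;                   // wrap: strict >, never land on tail
+    }
+    return tail - top > n;                 // linear gap: strict > for same reason
+}
+// -----------------------------------------------------------------------------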
+// Zero-size allocs return the same address and don't advance the top.
+TEST_F(TaskAllocatorTest, ZeroSizeAllocationAliased) {
+    auto r1 = allocator.alloc(0);
+    auto r2 = allocator.alloc(0);
+    ASSERT_FALSE(r1.failed());
+    ASSERT_FALSE(r2.failed());
+
+    EXPECT_EQ(r1.packed_base, r2.packed_base) << "Zero-size allocs return same address";
+    EXPECT_EQ(r1.packed_base, r1.packed_end) << "packed_end == packed_base for zero-size";
+    EXPECT_EQ(allocator.heap_top(), 0u) << "top doesn't advance for zero-size allocs";
+}
+
+// Wrap path: wasted space between old top and heap_size is not reclaimed.
+TEST_F(TaskAllocatorTest, WrapPathWastedSpace) {
+    auto r1 = allocator.alloc(4000);
+    ASSERT_FALSE(r1.failed());
+    uint64_t top_after = allocator.heap_top();
+    EXPECT_GE(top_after, 4000u);
+    EXPECT_LT(top_after, HEAP_SIZE);
+
+    consume_up_to(descriptors, last_alive, heap_buf, WINDOW_SIZE, 1, top_after);
+
+    auto r2 = allocator.alloc(128);
+    ASSERT_FALSE(r2.failed());
+    EXPECT_EQ(r2.packed_base, static_cast<void *>(heap_buf)) << "Allocation wrapped to beginning";
+
+    uint64_t avail = allocator.heap_available();
+    EXPECT_LT(avail, HEAP_SIZE) << "Wasted space at end reduces available capacity";
+}
+
+TEST_F(TaskAllocatorTest, AllocExactlyHeapSize) {
+    auto r1 = allocator.alloc(HEAP_SIZE);
+    ASSERT_FALSE(r1.failed());
+    EXPECT_EQ(r1.packed_base, static_cast<void *>(heap_buf));
+    EXPECT_EQ(allocator.heap_top(), HEAP_SIZE);
+
+    auto r2 = allocator.alloc(64);
+    EXPECT_TRUE(r2.failed()) << "No space after full allocation";
+    EXPECT_EQ(error_code.load(), PTO2_ERROR_HEAP_RING_DEADLOCK);
+}
+
+TEST_F(TaskAllocatorTest, AllocLargerThanHeap) {
+    auto r = allocator.alloc(HEAP_SIZE * 2);
+    EXPECT_TRUE(r.failed()) << "Cannot allocate more than heap size";
+    EXPECT_EQ(error_code.load(), PTO2_ERROR_HEAP_RING_DEADLOCK);
+}
+
+TEST_F(TaskAllocatorTest, TaskWindowSaturates) {
+    for (int i = 0; i < WINDOW_SIZE - 1; i++) {
+        auto r = allocator.alloc(0);
+        ASSERT_FALSE(r.failed()) << "Alloc failed at i=" << i;
+        EXPECT_EQ(r.task_id, i);
+    }
+    EXPECT_EQ(allocator.active_count(), WINDOW_SIZE - 1);
+
+    auto overflow = allocator.alloc(0);
+    EXPECT_TRUE(overflow.failed());
+    EXPECT_EQ(error_code.load(), PTO2_ERROR_FLOW_CONTROL_DEADLOCK);
+}
+
+// Task IDs grow monotonically as int32_t. Near INT32_MAX the counter would
+// eventually overflow, but this is cosmetic for slot mapping, since indexing
+// uses task_id & window_mask and so always stays in range.
+TEST_F(TaskAllocatorTest, TaskIdNearInt32Max) {
+    current_index.store(INT32_MAX - 2);
+    last_alive.store(INT32_MAX - 2);
+    allocator.init(descriptors.data(), WINDOW_SIZE, &current_index, &last_alive, heap_buf, HEAP_SIZE, &error_code);
+
+    auto r1 = allocator.alloc(0);
+    ASSERT_FALSE(r1.failed());
+    EXPECT_EQ(r1.task_id, INT32_MAX - 2);
+    EXPECT_EQ(r1.slot, (INT32_MAX - 2) & (WINDOW_SIZE - 1));
+
+    auto r2 = allocator.alloc(0);
+    ASSERT_FALSE(r2.failed());
+    EXPECT_EQ(r2.task_id, INT32_MAX - 1);
+
+    auto r3 = allocator.alloc(0);
+    ASSERT_FALSE(r3.failed());
+    EXPECT_EQ(r3.task_id, INT32_MAX);
+    EXPECT_GE(r3.slot, 0);
+    EXPECT_LT(r3.slot, WINDOW_SIZE);
+}
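+// -----------------------------------------------------------------------------
+// Reviewer note (illustrative only): with a power-of-two window, slot mapping
+// needs only the low bits of the id, so it stays in [0, window) however large
+// the id grows -- the property TaskIdNearInt32Max relies on above.
+static_assert((INT32_MAX & (16 - 1)) == 15, "masking keeps slots in [0, window)");
+// -----------------------------------------------------------------------------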
diff --git a/tests/ut/cpp/a5/test_task_state.cpp b/tests/ut/cpp/a5/test_task_state.cpp
new file mode 100644
index 000000000..7c468a9e7
--- /dev/null
+++ b/tests/ut/cpp/a5/test_task_state.cpp
@@ -0,0 +1,201 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Unit tests for PTO2TaskSlotState lifecycle through the PTO2SchedulerState API.
+ *
+ * These tests drive state transitions via src methods (release_fanin,
+ * on_subtask_complete, check_and_handle_consumed) rather than manipulating
+ * atomic fields by hand. For concurrent exactly-once semantics of
+ * fanin/subtask/fanout, see test_scheduler_state.cpp, which already
+ * covers those paths via the same API.
+ *
+ * This file focuses on:
+ * - Full lifecycle through the src API
+ * - Non-profiling ready path behavior (task_state stays PENDING)
+ * - Double subtask completion (counter-model weakness)
+ */
+
+#include <gtest/gtest.h>
+
+#include <atomic>
+#include <cstdint>
+#include <cstring>
+#include <thread>
+
+#include "scheduler/pto_scheduler.h"
+
+class TaskStateTest : public ::testing::Test {
+protected:
+    PTO2SchedulerState sched;
+    PTO2SharedMemoryHandle *sm_handle = nullptr;
+
+    void SetUp() override {
+        sm_handle = pto2_sm_create_default();
+        ASSERT_NE(sm_handle, nullptr);
+        bool ok = pto2_scheduler_init(&sched, sm_handle->header);
+        ASSERT_TRUE(ok);
+    }
+
+    void TearDown() override {
+        pto2_scheduler_destroy(&sched);
+        if (sm_handle) {
+            pto2_sm_destroy(sm_handle);
+        }
+    }
+
+    void init_slot(PTO2TaskSlotState &slot, PTO2TaskState state, int32_t fanin_count, int32_t fanout_count) {
+        memset(&slot, 0, sizeof(slot));
+        slot.task_state.store(state);
+        slot.fanin_count = fanin_count;
+        slot.fanin_refcount.store(0);
+        slot.fanout_count = fanout_count;
+        slot.fanout_refcount.store(0);
+        slot.fanout_lock.store(0);
+        slot.fanout_head = nullptr;
+        slot.ring_id = 0;
+        slot.active_mask = PTO2_SUBTASK_MASK_AIC;
+        slot.completed_subtasks.store(0);
+        slot.total_required_subtasks = 1;
+        slot.logical_block_num = 1;
+    }
+};
+
+// =============================================================================
+// Full lifecycle through src API: PENDING -> (fanin) -> READY-equivalent
+// -> (subtask) -> COMPLETED -> (fanout) -> CONSUMED
+// =============================================================================
+TEST_F(TaskStateTest, FullLifecycleThroughAPI) {
+    alignas(64) PTO2TaskSlotState slot;
+    init_slot(slot, PTO2_TASK_PENDING, 1, 1);
+    slot.total_required_subtasks = 1;
+    slot.completed_subtasks.store(0);
+
+    // Fanin satisfied -> task becomes ready
+    bool ready = sched.release_fanin_and_check_ready(slot);
+    EXPECT_TRUE(ready);
+
+    // Subtask completes -> task done
+    bool done = sched.on_subtask_complete(slot);
+    EXPECT_TRUE(done);
+
+    // Manually transition to COMPLETED (normally done by the scheduler dispatch loop)
+    slot.task_state.store(PTO2_TASK_COMPLETED, std::memory_order_release);
+
+    // Fanout released -> CONSUMED
+    sched.release_producer(slot);
+    EXPECT_EQ(slot.task_state.load(), PTO2_TASK_CONSUMED);
+}
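+// -----------------------------------------------------------------------------
+// Reviewer sketch (illustrative only, hypothetical name -- not the scheduler's
+// code): the readiness contract the lifecycle above depends on. Readiness
+// fires for exactly one caller: the one whose increment makes the refcount
+// reach fanin_count.
+static bool example_release_and_check(std::atomic<int32_t> &rc, int32_t fanin_count) {
+    return rc.fetch_add(1, std::memory_order_acq_rel) + 1 == fanin_count;
+}
+// -----------------------------------------------------------------------------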
+// =============================================================================
+// Non-profiling release_fanin does not CAS task_state to READY.
+//
+// Readiness is determined solely by fanin_refcount reaching fanin_count.
+// task_state stays PENDING after the non-profiling ready path. This is
+// correct by design -- the profiling overload adds the CAS only to count
+// atomic operations.
+// =============================================================================
+TEST_F(TaskStateTest, NonProfilingReadyPathStaysPending) {
+    alignas(64) PTO2TaskSlotState slot;
+    init_slot(slot, PTO2_TASK_PENDING, 1, 1);
+
+    bool ready = sched.release_fanin_and_check_ready(slot);
+    ASSERT_TRUE(ready) << "Task should be detected as ready via refcount";
+
+    // task_state remains PENDING -- this is correct by design.
+    EXPECT_EQ(slot.task_state.load(), PTO2_TASK_PENDING)
+        << "Non-profiling path intentionally does not transition task_state to READY";
+}
+
+// =============================================================================
+// Multi-fanin: partial release does not trigger ready
+// =============================================================================
+TEST_F(TaskStateTest, MultiFaninPartialNotReady) {
+    alignas(64) PTO2TaskSlotState slot;
+    init_slot(slot, PTO2_TASK_PENDING, 3, 1);
+
+    EXPECT_FALSE(sched.release_fanin_and_check_ready(slot));
+    EXPECT_FALSE(sched.release_fanin_and_check_ready(slot));
+    EXPECT_TRUE(sched.release_fanin_and_check_ready(slot));
+}
+
+// =============================================================================
+// Concurrent fanin: exactly one thread detects ready (via src API)
+// =============================================================================
+TEST_F(TaskStateTest, ConcurrentFaninExactlyOneReady) {
+    constexpr int ROUNDS = 500;
+
+    for (int round = 0; round < ROUNDS; round++) {
+        alignas(64) PTO2TaskSlotState slot;
+        init_slot(slot, PTO2_TASK_PENDING, 3, 1);
+        std::atomic<int> ready_count{0};
+
+        auto release = [&]() {
+            if (sched.release_fanin_and_check_ready(slot)) {
+                ready_count.fetch_add(1);
+            }
+        };
+
+        std::thread t1(release), t2(release), t3(release);
+        t1.join();
+        t2.join();
+        t3.join();
+
+        EXPECT_EQ(ready_count.load(), 1) << "Round " << round;
+    }
+}
+
+// =============================================================================
+// Concurrent subtask completion: exactly one thread sees done (via src API)
+// =============================================================================
+TEST_F(TaskStateTest, ConcurrentSubtaskCompletion) {
+    constexpr int ROUNDS = 500;
+
+    for (int round = 0; round < ROUNDS; round++) {
+        alignas(64) PTO2TaskSlotState slot;
+        init_slot(slot, PTO2_TASK_RUNNING, 1, 1);
+        slot.total_required_subtasks = 3;
+        slot.completed_subtasks.store(0);
+        std::atomic<int> done_count{0};
+
+        auto complete = [&]() {
+            if (sched.on_subtask_complete(slot)) {
+                done_count.fetch_add(1);
+            }
+        };
+
+        std::thread t1(complete), t2(complete), t3(complete);
+        t1.join();
+        t2.join();
+        t3.join();
+
+        EXPECT_EQ(done_count.load(), 1) << "Round " << round;
+        EXPECT_EQ(slot.completed_subtasks.load(), 3);
+    }
+}
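+// -----------------------------------------------------------------------------
+// Reviewer sketch (illustrative only, hypothetical name): the subtask counter
+// contract exercised above. Like fanin release, completion fires for exactly
+// one thread, but the counter has no memory of WHICH subtask completed --
+// which is what the next test demonstrates.
+static bool example_on_subtask_complete(std::atomic<int32_t> &done, int32_t total) {
+    return done.fetch_add(1, std::memory_order_acq_rel) + 1 == total;
+}
+// -----------------------------------------------------------------------------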
+// =============================================================================
+// Double subtask completion (counter-model weakness).
+// With the counter model, double-completing the same subtask increments
+// completed_subtasks twice, potentially reaching total prematurely.
+// Unlike the old bitmask model, the counter cannot detect duplicates.
+// =============================================================================
+TEST_F(TaskStateTest, DoubleSubtaskCompletionCounterWeakness) {
+    alignas(64) PTO2TaskSlotState slot;
+    init_slot(slot, PTO2_TASK_RUNNING, 1, 1);
+    slot.total_required_subtasks = 2;
+    slot.completed_subtasks.store(0);
+
+    // First subtask completion
+    bool done1 = sched.on_subtask_complete(slot);
+    EXPECT_FALSE(done1) << "Single completion doesn't complete the task";
+
+    // Same subtask completes AGAIN (logic error at caller level)
+    bool done2 = sched.on_subtask_complete(slot);
+    EXPECT_TRUE(done2) << "Counter model: double-completion falsely triggers done";
+}
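+// -----------------------------------------------------------------------------
+// Reviewer sketch (illustrative only, hypothetical names): contrast with a
+// bitmask model of the kind the comment above references, which does detect
+// the duplicate, because fetch_or reports whether the bit was already set.
+static bool example_bitmask_complete(std::atomic<uint32_t> &mask, uint32_t bit, uint32_t all) {
+    uint32_t prev = mask.fetch_or(bit, std::memory_order_acq_rel);
+    if (prev & bit) return false;  // duplicate completion: detected and ignored
+    return (prev | bit) == all;
+}
+// -----------------------------------------------------------------------------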
diff --git a/tests/ut/cpp/a5/test_tensormap.cpp b/tests/ut/cpp/a5/test_tensormap.cpp
new file mode 100644
index 000000000..10eef0317
--- /dev/null
+++ b/tests/ut/cpp/a5/test_tensormap.cpp
@@ -0,0 +1,551 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Unit tests for PTO2TensorMap from pto_tensormap.h / pto_tensormap.cpp
+ *
+ * Tests hash-table-based producer lookup with overlap detection:
+ * - Hash function distribution (golden-ratio multiplicative hash)
+ * - Insert / lookup / cleanup lifecycle
+ * - Overlap detection: fast-path (is_all_offset_zero) and slow-path (offsets)
+ * - Lazy invalidation (stale entries skipped, not truncated)
+ * - Multi-ring isolation in the same hash chain
+ * - Lookup returns all matches (no silent 16-result cap post-#669)
+ * - Entry pool allocation and free-list recycling
+ * - cleanup_retired correctness across task windows
+ */
+
+#include <gtest/gtest.h>
+
+#include <cstdint>
+#include <set>
+#include <vector>
+
+#include "pto_orchestration_api.h"
+#include "pto_tensormap.h"
+
+// =============================================================================
+// Helpers
+// =============================================================================
+
+// Test-local mirror of the old stack-buffered lookup result. PR #669 removed
+// PTO2LookupResult in favor of a callback-based API; these tests collect
+// matches into a vector-like struct so assertions remain readable.
+struct TestLookupResult {
+    struct Entry {
+        PTO2TensorMapEntry *entry;
+        OverlapStatus overlap_status;
+    };
+    std::vector<Entry> entries;
+    int count = 0;
+};
+
+static void run_lookup(PTO2TensorMap &tmap, const Tensor &tensor, TestLookupResult &out) {
+    tmap.lookup(tensor, [&](PTO2TensorMapEntry &e, OverlapStatus s) -> bool {
+        out.entries.push_back({&e, s});
+        out.count++;
+        return true;
+    });
+}
+
+static Tensor make_test_tensor(uint64_t addr, uint32_t shape0, uint32_t ndims = 1, int32_t version = 0) {
+    uint32_t shapes[RUNTIME_MAX_TENSOR_DIMS] = {shape0};
+    return make_tensor_external(reinterpret_cast<void *>(addr), shapes, ndims, DataType::FLOAT32, false, version);
+}
+
+static Tensor make_test_tensor_2d(uint64_t addr, uint32_t s0, uint32_t s1, int32_t version = 0) {
+    uint32_t shapes[RUNTIME_MAX_TENSOR_DIMS] = {s0, s1};
+    return make_tensor_external(reinterpret_cast<void *>(addr), shapes, 2, DataType::FLOAT32, false, version);
+}
+
+// =============================================================================
+// Fixture
+// =============================================================================
+
+class TensorMapTest : public ::testing::Test {
+protected:
+    static constexpr int32_t NUM_BUCKETS = 16;
+    static constexpr int32_t POOL_SIZE = 64;
+    static constexpr int32_t WINDOW_SIZE = 32;
+
+    PTO2TensorMap tmap{};
+
+    void SetUp() override {
+        int32_t window_sizes[PTO2_MAX_RING_DEPTH] = {WINDOW_SIZE, WINDOW_SIZE, WINDOW_SIZE, WINDOW_SIZE};
+        ASSERT_TRUE(tmap.init(NUM_BUCKETS, POOL_SIZE, window_sizes));
+    }
+
+    void TearDown() override { tmap.destroy(); }
+};
+
+// =============================================================================
+// Initialization
+// =============================================================================
+
+TEST_F(TensorMapTest, InitValidState) {
+    EXPECT_EQ(tmap.num_buckets, NUM_BUCKETS);
+    EXPECT_EQ(tmap.pool_size, POOL_SIZE);
+    EXPECT_EQ(tmap.next_entry_idx, 0);
+    EXPECT_EQ(tmap.free_num, 0);
+    EXPECT_EQ(tmap.valid_count(), 0);
+}
+
+TEST_F(TensorMapTest, InitRequiresPowerOfTwoBuckets) {
+    PTO2TensorMap bad{};
+    int32_t ws[PTO2_MAX_RING_DEPTH] = {8, 8, 8, 8};
+    EXPECT_FALSE(bad.init(3, 64, ws)) << "non-power-of-2 bucket count must fail";
+    EXPECT_FALSE(bad.init(7, 64, ws));
+    EXPECT_TRUE(bad.init(8, 64, ws));
+    bad.destroy();
+}
+
+// =============================================================================
+// Hash function
+// =============================================================================
+
+TEST_F(TensorMapTest, HashDeterministic) {
+    uint64_t addr = 0x1000;
+    EXPECT_EQ(tmap.hash(addr), tmap.hash(addr));
+}
+
+TEST_F(TensorMapTest, HashDistributesAlignedAddresses) {
+    std::set<uint32_t> hit_buckets;
+    // Aligned addresses (64KB stride) should still distribute across buckets
+    for (uint64_t i = 0; i < 64; i++) {
+        uint64_t addr = i * 65536;
+        hit_buckets.insert(tmap.hash(addr));
+    }
+    // With the golden-ratio hash, 64 aligned addresses across 16 buckets
+    // should hit at least 12 distinct buckets
+    EXPECT_GE(hit_buckets.size(), 12u) << "Aligned addresses must distribute well";
+}
+
+TEST_F(TensorMapTest, HashBoundedByBucketCount) {
+    for (uint64_t addr = 0; addr < 1000; addr++) {
+        EXPECT_LT(tmap.hash(addr), static_cast<uint32_t>(NUM_BUCKETS));
+    }
+}
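+// -----------------------------------------------------------------------------
+// Reviewer sketch (illustrative only): a golden-ratio multiplicative hash of
+// the kind the file doc comment describes. The constant is the textbook
+// 64-bit Fibonacci-hashing multiplier, not necessarily the map's exact code;
+// high multiplier bits mix even when the low address bits are all zero, which
+// is why 64 KiB-aligned addresses still spread across buckets above.
+static uint32_t example_golden_hash(uint64_t addr, uint32_t num_buckets /* power of two */) {
+    return static_cast<uint32_t>((addr * 0x9E3779B97F4A7C15ull) >> 32) & (num_buckets - 1);
+}
+// -----------------------------------------------------------------------------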
+// =============================================================================
+// Insert and lookup: basic
+// =============================================================================
+
+TEST_F(TensorMapTest, InsertThenLookupFindsProducer) {
+    Tensor t = make_test_tensor(0x1000, 256);
+    PTO2TaskId tid = PTO2TaskId::make(0, 0);
+    tmap.insert(t, tid);
+
+    TestLookupResult result;
+    run_lookup(tmap, t, result);
+    ASSERT_EQ(result.count, 1);
+    EXPECT_EQ(result.entries[0].entry->producer_task_id, tid);
+}
+
+TEST_F(TensorMapTest, LookupEmptyReturnsZero) {
+    Tensor t = make_test_tensor(0x1000, 256);
+    TestLookupResult result;
+    run_lookup(tmap, t, result);
+    EXPECT_EQ(result.count, 0);
+}
+
+TEST_F(TensorMapTest, InsertMultipleSameBuffer) {
+    Tensor t1 = make_test_tensor(0x1000, 256);
+    Tensor t2 = make_test_tensor(0x1000, 128);
+    PTO2TaskId tid1 = PTO2TaskId::make(0, 0);
+    PTO2TaskId tid2 = PTO2TaskId::make(0, 1);
+
+    tmap.insert(t1, tid1);
+    tmap.insert(t2, tid2);
+
+    TestLookupResult result;
+    run_lookup(tmap, t1, result);
+    // Both entries share the same buffer_addr, so both should be found
+    EXPECT_EQ(result.count, 2);
+}
+
+TEST_F(TensorMapTest, InsertDifferentBuffersNoCollision) {
+    Tensor t1 = make_test_tensor(0x1000, 256);
+    Tensor t2 = make_test_tensor(0x2000, 256);
+    tmap.insert(t1, PTO2TaskId::make(0, 0));
+    tmap.insert(t2, PTO2TaskId::make(0, 1));
+
+    TestLookupResult r1;
+    run_lookup(tmap, t1, r1);
+    EXPECT_EQ(r1.count, 1);
+    EXPECT_EQ(r1.entries[0].entry->producer_task_id, PTO2TaskId::make(0, 0));
+
+    TestLookupResult r2;
+    run_lookup(tmap, t2, r2);
+    EXPECT_EQ(r2.count, 1);
+    EXPECT_EQ(r2.entries[0].entry->producer_task_id, PTO2TaskId::make(0, 1));
+}
+
+// =============================================================================
+// Overlap detection: fast path (is_all_offset_zero)
+// =============================================================================
+
+TEST_F(TensorMapTest, OverlapFastPathCovered) {
+    // Producer output: shape [256], consumer input: shape [512]
+    // Consumer covers producer -> COVERED
+    Tensor producer = make_test_tensor(0x1000, 256);
+    Tensor consumer = make_test_tensor(0x1000, 512);
+    tmap.insert(producer, PTO2TaskId::make(0, 0));
+
+    TestLookupResult result;
+    run_lookup(tmap, consumer, result);
+    ASSERT_EQ(result.count, 1);
+    EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::COVERED);
+}
+
+TEST_F(TensorMapTest, OverlapFastPathOther) {
+    // Producer output: shape [512], consumer input: shape [256]
+    // Consumer does NOT cover producer -> OTHER
+    Tensor producer = make_test_tensor(0x1000, 512);
+    Tensor consumer = make_test_tensor(0x1000, 256);
+    tmap.insert(producer, PTO2TaskId::make(0, 0));
+
+    TestLookupResult result;
+    run_lookup(tmap, consumer, result);
+    ASSERT_EQ(result.count, 1);
+    EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::OTHER);
+}
+
+TEST_F(TensorMapTest, OverlapFastPathExactMatch) {
+    Tensor t = make_test_tensor(0x1000, 256);
+    tmap.insert(t, PTO2TaskId::make(0, 0));
+
+    TestLookupResult result;
+    run_lookup(tmap, t, result);
+    ASSERT_EQ(result.count, 1);
+    EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::COVERED);
+}
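+// -----------------------------------------------------------------------------
+// Reviewer sketch (illustrative only, hypothetical name): the zero-offset fast
+// path exercised above. With both tensors starting at the same base address,
+// the consumer covers the producer iff its extent is at least as large;
+// otherwise the regions overlap but the status is OTHER.
+static bool example_fast_path_covered(uint64_t producer_len, uint64_t consumer_len) {
+    return consumer_len >= producer_len;
+}
+// -----------------------------------------------------------------------------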
+// =============================================================================
+// Overlap detection: slow path (offsets via view)
+// =============================================================================
+
+TEST_F(TensorMapTest, OverlapSlowPathNoOverlap) {
+    // Producer writes [0..128), consumer reads [128..256) -> NO_OVERLAP
+    Tensor base = make_test_tensor_2d(0x1000, 256, 1);
+    uint32_t prod_shapes[] = {128, 1};
+    uint32_t prod_offsets[] = {0, 0};
+    Tensor producer = base.view(prod_shapes, prod_offsets);
+
+    uint32_t con_shapes[] = {128, 1};
+    uint32_t con_offsets[] = {128, 0};
+    Tensor consumer = base.view(con_shapes, con_offsets);
+
+    tmap.insert(producer, PTO2TaskId::make(0, 0));
+
+    TestLookupResult result;
+    run_lookup(tmap, consumer, result);
+    EXPECT_EQ(result.count, 0) << "Non-overlapping regions must return no results";
+}
+
+TEST_F(TensorMapTest, OverlapSlowPathPartialOverlap) {
+    // Producer writes [0..192), consumer reads [64..256) -> overlapping, OTHER
+    Tensor base = make_test_tensor_2d(0x1000, 256, 1);
+    uint32_t prod_shapes[] = {192, 1};
+    uint32_t prod_offsets[] = {0, 0};
+    Tensor producer = base.view(prod_shapes, prod_offsets);
+
+    uint32_t con_shapes[] = {192, 1};
+    uint32_t con_offsets[] = {64, 0};
+    Tensor consumer = base.view(con_shapes, con_offsets);
+
+    tmap.insert(producer, PTO2TaskId::make(0, 0));
+
+    TestLookupResult result;
+    run_lookup(tmap, consumer, result);
+    ASSERT_EQ(result.count, 1);
+    EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::OTHER);
+}
+
+TEST_F(TensorMapTest, OverlapSlowPathCovered) {
+    // Producer writes [64..192), consumer reads [0..256) -> consumer covers producer
+    Tensor base = make_test_tensor_2d(0x1000, 256, 1);
+    uint32_t prod_shapes[] = {128, 1};
+    uint32_t prod_offsets[] = {64, 0};
+    Tensor producer = base.view(prod_shapes, prod_offsets);
+
+    uint32_t con_shapes[] = {256, 1};
+    uint32_t con_offsets[] = {0, 0};
+    Tensor consumer = base.view(con_shapes, con_offsets);
+
+    tmap.insert(producer, PTO2TaskId::make(0, 0));
+
+    TestLookupResult result;
+    run_lookup(tmap, consumer, result);
+    ASSERT_EQ(result.count, 1);
+    EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::COVERED);
+}
+
+// =============================================================================
+// Version-based overlap detection
+// =============================================================================
+
+TEST_F(TensorMapTest, VersionMismatchReturnsOther) {
+    // Producer v0, consumer v1 -> always OTHER regardless of shape match
+    Tensor producer = make_test_tensor(0x1000, 256, 1, 0);
+    Tensor consumer = make_test_tensor(0x1000, 256, 1, 1);
+
+    tmap.insert(producer, PTO2TaskId::make(0, 0));
+
+    TestLookupResult result;
+    run_lookup(tmap, consumer, result);
+    ASSERT_EQ(result.count, 1);
+    EXPECT_EQ(result.entries[0].overlap_status, OverlapStatus::OTHER);
+}
+
+// =============================================================================
+// Lazy invalidation
+// =============================================================================
+
+TEST_F(TensorMapTest, StaleEntriesSkippedDuringLookup) {
+    Tensor t = make_test_tensor(0x1000, 256);
+    tmap.insert(t, PTO2TaskId::make(0, 0));
+    tmap.insert(t, PTO2TaskId::make(0, 1));
+
+    // Advance validity to skip task 0
+    tmap.sync_validity(0, 1);
+
+    TestLookupResult result;
+    run_lookup(tmap, t, result);
+    ASSERT_EQ(result.count, 1);
+    EXPECT_EQ(result.entries[0].entry->producer_task_id, PTO2TaskId::make(0, 1));
+}
+
+TEST_F(TensorMapTest, StaleEntriesNotTruncatedAcrossRings) {
+    Tensor t = make_test_tensor(0x1000, 256);
+    // Ring 0, task 0 and Ring 1, task 0 -> same bucket
+    tmap.insert(t, PTO2TaskId::make(0, 0));
+    tmap.insert(t, PTO2TaskId::make(1, 0));
+
+    // Invalidate ring 0 only
+    tmap.sync_validity(0, 1);
+
+    TestLookupResult result;
+    run_lookup(tmap, t, result);
+    // Ring 1 task 0 still valid, ring 0 task 0 invalidated
+    ASSERT_EQ(result.count, 1);
+    EXPECT_EQ(result.entries[0].entry->producer_task_id, PTO2TaskId::make(1, 0));
+}
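+// -----------------------------------------------------------------------------
+// Reviewer sketch (illustrative only, hypothetical names; the real check works
+// on per-dimension offsets and shapes): a 1-D model of the slow-path
+// classification above, with producer writes [p0, p1) and consumer reads
+// [c0, c1).
+enum class ExampleOverlap { NONE, COVERED, OTHER };
+static ExampleOverlap example_classify(uint64_t p0, uint64_t p1, uint64_t c0, uint64_t c1) {
+    if (c1 <= p0 || p1 <= c0) return ExampleOverlap::NONE;     // disjoint intervals
+    if (c0 <= p0 && p1 <= c1) return ExampleOverlap::COVERED;  // consumer contains producer
+    return ExampleOverlap::OTHER;                              // partial overlap
+}
+// -----------------------------------------------------------------------------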
+// =============================================================================
+// cleanup_retired
+// =============================================================================
+
+TEST_F(TensorMapTest, CleanupRetiredRemovesEntriesForRetiredTasks) {
+    Tensor t = make_test_tensor(0x1000, 256);
+    tmap.insert(t, PTO2TaskId::make(0, 0));
+    tmap.insert(t, PTO2TaskId::make(0, 1));
+    tmap.insert(t, PTO2TaskId::make(0, 2));
+    EXPECT_EQ(tmap.valid_count(), 3);
+
+    // Cleanup tasks [0, 2) on ring 0
+    tmap.cleanup_retired(0, 0, 2);
+
+    EXPECT_EQ(tmap.valid_count(), 1);
+
+    TestLookupResult result;
+    run_lookup(tmap, t, result);
+    ASSERT_EQ(result.count, 1);
+    EXPECT_EQ(result.entries[0].entry->producer_task_id, PTO2TaskId::make(0, 2));
+}
+
+TEST_F(TensorMapTest, CleanupRetiredPreservesOtherRings) {
+    Tensor t = make_test_tensor(0x1000, 256);
+    tmap.insert(t, PTO2TaskId::make(0, 0));
+    tmap.insert(t, PTO2TaskId::make(1, 0));
+
+    tmap.cleanup_retired(0, 0, 1);
+
+    EXPECT_EQ(tmap.valid_count(), 1);
+
+    TestLookupResult result;
+    run_lookup(tmap, t, result);
+    ASSERT_EQ(result.count, 1);
+    EXPECT_EQ(result.entries[0].entry->producer_task_id, PTO2TaskId::make(1, 0));
+}
+
+TEST_F(TensorMapTest, CleanupRetiredFreesEntriesToPool) {
+    Tensor t = make_test_tensor(0x1000, 256);
+    tmap.insert(t, PTO2TaskId::make(0, 0));
+    EXPECT_EQ(tmap.free_num, 0);
+    EXPECT_EQ(tmap.next_entry_idx, 1);
+
+    tmap.cleanup_retired(0, 0, 1);
+
+    EXPECT_EQ(tmap.free_num, 1) << "Cleaned entry should be in free list";
+
+    // New insert should reuse the free entry instead of allocating fresh
+    tmap.insert(t, PTO2TaskId::make(0, 1));
+    EXPECT_EQ(tmap.free_num, 0);
+    EXPECT_EQ(tmap.next_entry_idx, 1) << "Should reuse freed entry, not allocate new";
+}
+
+// =============================================================================
+// Multi-ring isolation
+// =============================================================================
+
+TEST_F(TensorMapTest, MultiRingIndependentLookup) {
+    Tensor t = make_test_tensor(0x1000, 256);
+    tmap.insert(t, PTO2TaskId::make(0, 5));
+    tmap.insert(t, PTO2TaskId::make(1, 3));
+    tmap.insert(t, PTO2TaskId::make(2, 7));
+
+    TestLookupResult result;
+    run_lookup(tmap, t, result);
+    EXPECT_EQ(result.count, 3);
+
+    // Invalidate ring 0 up to task 6 and ring 2 up to task 8
+    tmap.sync_validity(0, 6);
+    tmap.sync_validity(2, 8);
+
+    TestLookupResult result2;
+    run_lookup(tmap, t, result2);
+    EXPECT_EQ(result2.count, 1);
+    EXPECT_EQ(result2.entries[0].entry->producer_task_id, PTO2TaskId::make(1, 3));
+}
+
+// =============================================================================
+// Lookup returns all matches (PR #669 removed the 16-slot cap)
+// =============================================================================
+
+TEST_F(TensorMapTest, LookupReturnsAllMatches) {
+    Tensor t = make_test_tensor(0x1000, 256);
+    // Insert 20 entries for the same buffer (was capped at 16 before #669)
+    for (int i = 0; i < 20; i++) {
+        tmap.insert(t, PTO2TaskId::make(0, i));
+    }
+
+    TestLookupResult result;
+    run_lookup(tmap, t, result);
+    EXPECT_EQ(result.count, 20) << "Lookup must return every overlapping entry, no silent cap";
+}
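+// -----------------------------------------------------------------------------
+// Reviewer sketch (illustrative only, hypothetical names): the pool-plus-
+// free-list discipline asserted above. Freed entries are recycled before
+// next_entry_idx bumps, which is exactly what CleanupRetiredFreesEntriesToPool
+// and FreeListRecycling check.
+static int32_t example_pool_alloc(int32_t &next_idx, std::vector<int32_t> &free_list) {
+    if (!free_list.empty()) {
+        int32_t idx = free_list.back();  // reuse the most recently freed entry
+        free_list.pop_back();
+        return idx;
+    }
+    return next_idx++;  // otherwise bump a fresh entry from the pool
+}
+// -----------------------------------------------------------------------------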
+// =============================================================================
+// Entry pool lifecycle
+// =============================================================================
+
+TEST_F(TensorMapTest, PoolExhaustionAsserts) {
+    // With pool_size=64, inserting 64 entries should work; the 65th should fail
+    for (int i = 0; i < POOL_SIZE; i++) {
+        Tensor t = make_test_tensor(0x1000 + i * 0x100, 256);
+        tmap.insert(t, PTO2TaskId::make(0, i));
+    }
+    EXPECT_EQ(tmap.next_entry_idx, POOL_SIZE);
+    EXPECT_EQ(tmap.free_num, 0);
+
+    // The 65th insert should trigger always_assert (pool overflow)
+    Tensor overflow = make_test_tensor(0x9000, 256);
+    EXPECT_THROW(tmap.insert(overflow, PTO2TaskId::make(0, POOL_SIZE)), std::runtime_error);
+}
+
+TEST_F(TensorMapTest, FreeListRecycling) {
+    Tensor t = make_test_tensor(0x1000, 256);
+    // Insert and cleanup 10 entries
+    for (int i = 0; i < 10; i++) {
+        tmap.insert(t, PTO2TaskId::make(0, i));
+    }
+    tmap.cleanup_retired(0, 0, 10);
+    EXPECT_EQ(tmap.free_num, 10);
+
+    // Re-insert should use the free list
+    for (int i = 10; i < 20; i++) {
+        tmap.insert(t, PTO2TaskId::make(0, i));
+    }
+    EXPECT_EQ(tmap.free_num, 0);
+    EXPECT_EQ(tmap.next_entry_idx, 10) << "No new pool entries consumed when free list available";
+}
+
+// =============================================================================
+// Task chain integrity (per-task entry list)
+// =============================================================================
+
+TEST_F(TensorMapTest, PerTaskEntryListTracksMultipleOutputs) {
+    Tensor t1 = make_test_tensor(0x1000, 256);
+    Tensor t2 = make_test_tensor(0x2000, 128);
+    PTO2TaskId tid = PTO2TaskId::make(0, 5);
+
+    tmap.insert(t1, tid);
+    tmap.insert(t2, tid);
+    EXPECT_EQ(tmap.valid_count(), 2);
+
+    // Cleanup of task 5 should remove both entries
+    tmap.cleanup_retired(0, 5, 6);
+    EXPECT_EQ(tmap.valid_count(), 0);
+    EXPECT_EQ(tmap.free_num, 2);
+}
+
+// =============================================================================
+// Bucket chain integrity (doubly-linked list)
+// =============================================================================
+
+TEST_F(TensorMapTest, RemoveMiddleEntryPreservesChain) {
+    Tensor t = make_test_tensor(0x1000, 256);
+    PTO2TaskId tid0 = PTO2TaskId::make(0, 0);
+    PTO2TaskId tid1 = PTO2TaskId::make(0, 1);
+    PTO2TaskId tid2 = PTO2TaskId::make(0, 2);
+
+    tmap.insert(t, tid0);
+    tmap.insert(t, tid1);
+    tmap.insert(t, tid2);
+
+    // Remove the middle entry (task 1)
+    tmap.cleanup_retired(0, 1, 2);
+
+    TestLookupResult result;
+    run_lookup(tmap, t, result);
+    EXPECT_EQ(result.count, 2);
+
+    std::set<uint32_t> found_locals;
+    for (int i = 0; i < result.count; i++) {
+        found_locals.insert(result.entries[i].entry->producer_task_id.local());
+    }
+    EXPECT_TRUE(found_locals.count(0));
+    EXPECT_TRUE(found_locals.count(2));
+}
+
+// =============================================================================
+// PTO2TaskId encoding/decoding
+// =============================================================================
+
+TEST(TaskIdTest, MakeAndDecode) {
+    auto tid = PTO2TaskId::make(3, 42);
+    EXPECT_EQ(tid.ring(), 3);
+    EXPECT_EQ(tid.local(), 42u);
+}
+
+TEST(TaskIdTest, InvalidSentinel) {
+    auto inv = PTO2TaskId::invalid();
+    EXPECT_FALSE(inv.is_valid());
+    EXPECT_EQ(inv.raw, UINT64_MAX);
+}
+
+TEST(TaskIdTest, Equality) {
+    auto a = PTO2TaskId::make(1, 100);
+    auto b = PTO2TaskId::make(1, 100);
+    auto c = PTO2TaskId::make(2, 100);
+    EXPECT_EQ(a, b);
+    EXPECT_NE(a, c);
+}
+
+TEST(TaskIdTest, RingIdMaxValue) {
+    auto tid = PTO2TaskId::make(255, 0);
+    EXPECT_EQ(tid.ring(), 255);
+    EXPECT_EQ(tid.local(), 0u);
+}
+
+TEST(TaskIdTest, LocalIdMaxValue) {
+    auto tid = PTO2TaskId::make(0, UINT32_MAX);
+    EXPECT_EQ(tid.ring(), 0);
+    EXPECT_EQ(tid.local(), UINT32_MAX);
+}
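+// -----------------------------------------------------------------------------
+// Reviewer sketch (illustrative only; the real PTO2TaskId layout may differ):
+// one encoding consistent with the tests above -- ring id in the byte at bits
+// [32, 40), local id in the low 32 bits, UINT64_MAX reserved as the invalid
+// sentinel (valid ids always have their top 24 bits clear, so no collision).
+static uint64_t example_make_task_id(uint8_t ring, uint32_t local) {
+    return (static_cast<uint64_t>(ring) << 32) | local;
+}
+// e.g. example_make_task_id(3, 42) decodes back to ring 3, local 42
+// -----------------------------------------------------------------------------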
diff --git a/tests/ut/cpp/a5/test_wiring.cpp b/tests/ut/cpp/a5/test_wiring.cpp
new file mode 100644
index 000000000..964e826f8
--- /dev/null
+++ b/tests/ut/cpp/a5/test_wiring.cpp
@@ -0,0 +1,448 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Unit tests for scheduler wiring and completion paths:
+ *
+ * 1. wire_task() — fanout wiring, early-finished detection,
+ *    fanin_count initialization, ready push
+ * 2. on_mixed_task_complete() — COMPLETED transition, fanout traversal,
+ *    consumer fanin release
+ * 3. on_task_release() — fanin traversal, producer release,
+ *    self-CONSUMED check
+ * 4. advance_ring_pointers() — CONSUMED slot scan, reset_for_reuse
+ *
+ * These tests exercise the core scheduling hot-paths that previously had
+ * no unit test coverage.
+ */
+
+#include <gtest/gtest.h>
+
+#include <atomic>
+#include <cstdint>
+#include <cstring>
+#include <vector>
+
+#include "scheduler/pto_scheduler.h"
+
+// =============================================================================
+// Fixture: sets up a scheduler with shared memory and provides helpers
+// =============================================================================
+
+class WiringTest : public ::testing::Test {
+protected:
+    PTO2SchedulerState sched{};
+    PTO2SharedMemoryHandle *sm_handle = nullptr;
+
+    void SetUp() override {
+        sm_handle = pto2_sm_create_default();
+        ASSERT_NE(sm_handle, nullptr);
+        bool ok = pto2_scheduler_init(&sched, sm_handle->header);
+        ASSERT_TRUE(ok);
+    }
+
+    void TearDown() override {
+        pto2_scheduler_destroy(&sched);
+        if (sm_handle) {
+            pto2_sm_destroy(sm_handle);
+        }
+    }
+
+    // Initialize a slot for testing wiring/completion
+    void init_slot(
+        PTO2TaskSlotState &slot, PTO2TaskState state, int32_t fanin_count, int32_t fanout_count, uint8_t ring_id = 0
+    ) {
+        memset(&slot, 0, sizeof(slot));
+        slot.task_state.store(state);
+        slot.fanin_count = fanin_count;
+        slot.fanin_refcount.store(0);
+        slot.fanout_count = fanout_count;
+        slot.fanout_refcount.store(0);
+        slot.fanout_lock.store(0);
+        slot.fanout_head = nullptr;
+        slot.ring_id = ring_id;
+        slot.active_mask = PTO2_SUBTASK_MASK_AIC;
+        slot.completed_subtasks.store(0);
+        slot.total_required_subtasks = 1;
+        slot.logical_block_num = 1;
+        slot.dep_pool_mark = 0;
+    }
+};
+
+// =============================================================================
+// wire_task: no fanin (independent task)
+// =============================================================================
+TEST_F(WiringTest, WireTaskNoFaninBecomesReady) {
+    // A task with 0 actual fanins should immediately be pushed to the ready queue
+    alignas(64) PTO2TaskSlotState task_slot;
+    alignas(64) PTO2TaskPayload payload;
+    memset(&payload, 0, sizeof(payload));
+    PTO2TaskDescriptor desc{};
+
+    init_slot(task_slot, PTO2_TASK_PENDING, 0, 1);
+    payload.fanin_actual_count = 0;
+    task_slot.payload = &payload;
+    task_slot.task = &desc;
+
+    auto &rss = sched.ring_sched_states[0];
+    sched.wire_task(rss, &task_slot, 0);
+
+    // fanin_count set to 0 + 1 = 1 (the wiring "+1" sentinel)
+    EXPECT_EQ(task_slot.fanin_count, 1);
+    // fanin_refcount should be 1 (the +1 from the no-fanin path)
+    EXPECT_EQ(task_slot.fanin_refcount.load(), 1);
+
+    // Task should be in the ready queue
+    PTO2ResourceShape shape = pto2_active_mask_to_shape(task_slot.active_mask);
+    auto *popped = sched.ready_queues[static_cast<size_t>(shape)].pop();
+    EXPECT_EQ(popped, &task_slot);
+}
+
+// =============================================================================
+// wire_task: with fanin, all producers already completed (early-finished)
+// =============================================================================
+
+TEST_F(WiringTest, WireTaskAllProducersEarlyFinished) {
+    alignas(64) PTO2TaskSlotState task_slot;
+    alignas(64) PTO2TaskSlotState producer_slots[2];
+    alignas(64) PTO2TaskPayload payload;
+    memset(&payload, 0, sizeof(payload));
+    PTO2TaskDescriptor desc{};
+
+    // Set up 2 producers that are already COMPLETED
+    for (int i = 0; i < 2; i++) {
+        init_slot(producer_slots[i], PTO2_TASK_COMPLETED, 1, 2);
+    }
+
+    // Consumer task with 2 fanins
+    init_slot(task_slot, PTO2_TASK_PENDING, 0, 1);
+    payload.fanin_actual_count = 2;
+    payload.fanin_inline_slot_states[0] = &producer_slots[0];
+    payload.fanin_inline_slot_states[1] = &producer_slots[1];
+
+    task_slot.payload = &payload;
+    task_slot.task = &desc;
+
+    auto &rss = sched.ring_sched_states[0];
+    sched.wire_task(rss, &task_slot, 2);
+
+    // fanin_count = 2 + 1 = 3
+    EXPECT_EQ(task_slot.fanin_count, 3);
+    // early_finished = 2, init_rc = 2 + 1 = 3, so refcount should hit fanin_count
+    EXPECT_GE(task_slot.fanin_refcount.load(), task_slot.fanin_count);
+
+    // Task should be in the ready queue
+    PTO2ResourceShape shape = pto2_active_mask_to_shape(task_slot.active_mask);
+    auto *popped = sched.ready_queues[static_cast<size_t>(shape)].pop();
+    EXPECT_EQ(popped, &task_slot);
+}
+
+// =============================================================================
+// wire_task: with fanin, producers still pending (task NOT ready)
+// =============================================================================
+
+TEST_F(WiringTest, WireTaskProducersPendingTaskNotReady) {
+    alignas(64) PTO2TaskSlotState task_slot;
+    alignas(64) PTO2TaskSlotState producer_slots[2];
+    alignas(64) PTO2TaskPayload payload;
+    memset(&payload, 0, sizeof(payload));
+    PTO2TaskDescriptor desc{};
+
+    // Producers are RUNNING (not yet completed)
+    for (int i = 0; i < 2; i++) {
+        init_slot(producer_slots[i], PTO2_TASK_RUNNING, 1, 2);
+    }
+
+    init_slot(task_slot, PTO2_TASK_PENDING, 0, 1);
+    payload.fanin_actual_count = 2;
+    payload.fanin_inline_slot_states[0] = &producer_slots[0];
+    payload.fanin_inline_slot_states[1] = &producer_slots[1];
+    task_slot.payload = &payload;
+    task_slot.task = &desc;
+
+    auto &rss = sched.ring_sched_states[0];
+    sched.wire_task(rss, &task_slot, 2);
+
+    // fanin_count = 3 (2 + 1)
+    EXPECT_EQ(task_slot.fanin_count, 3);
+    // early_finished = 0, init_rc = 1 -> not ready
+    EXPECT_EQ(task_slot.fanin_refcount.load(), 1);
+    EXPECT_LT(task_slot.fanin_refcount.load(), task_slot.fanin_count);
+
+    // Ready queue should be empty
+    PTO2ResourceShape shape = pto2_active_mask_to_shape(task_slot.active_mask);
+    auto *popped = sched.ready_queues[static_cast<size_t>(shape)].pop();
+    EXPECT_EQ(popped, nullptr);
+
+    // Producers should have fanout_head pointing to task_slot
+    EXPECT_NE(producer_slots[0].fanout_head, nullptr);
+    EXPECT_EQ(producer_slots[0].fanout_head->slot_state, &task_slot);
+    EXPECT_NE(producer_slots[1].fanout_head, nullptr);
+    EXPECT_EQ(producer_slots[1].fanout_head->slot_state, &task_slot);
+}
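+// -----------------------------------------------------------------------------
+// Reviewer sketch (illustrative only, hypothetical names): the "+1 sentinel"
+// arithmetic the three wire_task tests above assert. wire_task counts
+// fanin_count = actual + 1 and seeds the refcount with early_finished + 1, so
+// a task is ready exactly when every still-running producer has released its
+// edge.
+static bool example_ready_after_wiring(int32_t actual, int32_t early_finished) {
+    int32_t fanin_count = actual + 1;      // +1 guards against premature ready
+    int32_t init_rc = early_finished + 1;  // +1 released by the wiring step itself
+    return init_rc == fanin_count;         // ready iff no producer is still pending
+}
+// -----------------------------------------------------------------------------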
+// =============================================================================
+// wire_task: mixed early-finished and pending producers
+// =============================================================================
+
+TEST_F(WiringTest, WireTaskMixedProducerStates) {
+    alignas(64) PTO2TaskSlotState task_slot;
+    alignas(64) PTO2TaskSlotState producers[3];
+    alignas(64) PTO2TaskPayload payload;
+    memset(&payload, 0, sizeof(payload));
+    PTO2TaskDescriptor desc{};
+
+    init_slot(producers[0], PTO2_TASK_COMPLETED, 1, 2);  // early finished
+    init_slot(producers[1], PTO2_TASK_RUNNING, 1, 2);    // still running
+    init_slot(producers[2], PTO2_TASK_CONSUMED, 1, 2);   // early finished (>= COMPLETED)
+
+    init_slot(task_slot, PTO2_TASK_PENDING, 0, 1);
+    payload.fanin_actual_count = 3;
+    for (int i = 0; i < 3; i++) {
+        payload.fanin_inline_slot_states[i] = &producers[i];
+    }
+    task_slot.payload = &payload;
+    task_slot.task = &desc;
+
+    auto &rss = sched.ring_sched_states[0];
+    sched.wire_task(rss, &task_slot, 3);
+
+    // fanin_count = 4 (3 + 1)
+    EXPECT_EQ(task_slot.fanin_count, 4);
+    // early_finished = 2 (COMPLETED + CONSUMED), init_rc = 3
+    // Not yet 4 -> not ready (one producer still running)
+    EXPECT_EQ(task_slot.fanin_refcount.load(), 3);
+
+    // Only the running producer should have the consumer in its fanout chain
+    EXPECT_EQ(producers[0].fanout_head, nullptr);  // early finished, no dep entry added
+    EXPECT_NE(producers[1].fanout_head, nullptr);  // running, dep entry added
+    EXPECT_EQ(producers[2].fanout_head, nullptr);  // early finished
+}
+
+// =============================================================================
+// on_mixed_task_complete: notifies consumers via fanout chain
+// =============================================================================
+
+TEST_F(WiringTest, OnMixedTaskCompleteNotifiesConsumers) {
+    alignas(64) PTO2TaskSlotState producer;
+    alignas(64) PTO2TaskSlotState consumer1, consumer2;
+    alignas(64) PTO2TaskPayload prod_payload;
+    memset(&prod_payload, 0, sizeof(prod_payload));
+    PTO2TaskDescriptor desc{};
+
+    // Set up a producer in RUNNING state with 2 consumers in its fanout chain
+    init_slot(producer, PTO2_TASK_RUNNING, 1, 1);
+    producer.payload = &prod_payload;
+    producer.task = &desc;
+
+    // Consumer1: needs 1 more fanin to become ready
+    init_slot(consumer1, PTO2_TASK_PENDING, 2, 1);
+    consumer1.fanin_refcount.store(1);  // 1 of 2 satisfied
+    consumer1.active_mask = PTO2_SUBTASK_MASK_AIC;
+
+    // Consumer2: this release will make it ready
+    init_slot(consumer2, PTO2_TASK_PENDING, 2, 1);
+    consumer2.fanin_refcount.store(1);  // 1 of 2 satisfied
+    consumer2.active_mask = PTO2_SUBTASK_MASK_AIC;
+
+    // Build fanout chain: producer -> consumer2 -> consumer1
+    PTO2DepListEntry dep_entries[2];
+    dep_entries[0].slot_state = &consumer1;
+    dep_entries[0].next = nullptr;
+    dep_entries[1].slot_state = &consumer2;
+    dep_entries[1].next = &dep_entries[0];
+    producer.fanout_head = &dep_entries[1];
+
+    sched.on_mixed_task_complete(producer);
+
+    // Producer should be COMPLETED
+    EXPECT_EQ(producer.task_state.load(), PTO2_TASK_COMPLETED);
+
+    // Both consumers should have fanin_refcount incremented
+    EXPECT_EQ(consumer1.fanin_refcount.load(), 2);
+    EXPECT_EQ(consumer2.fanin_refcount.load(), 2);
+
+    // Both consumers should be ready (fanin_refcount == fanin_count)
+    PTO2ResourceShape shape = pto2_active_mask_to_shape(consumer1.active_mask);
+    auto *r1 = sched.ready_queues[static_cast<size_t>(shape)].pop();
+    auto *r2 = sched.ready_queues[static_cast<size_t>(shape)].pop();
+    EXPECT_TRUE((r1 == &consumer1 && r2 == &consumer2) || (r1 == &consumer2 && r2 == &consumer1));
+}
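+// -----------------------------------------------------------------------------
+// Reviewer sketch (illustrative only, hypothetical names, single-threaded):
+// the fanout traversal shape the test above builds by hand. Completion walks
+// the producer's singly-linked dep-entry chain and releases one fanin edge per
+// consumer, enqueueing those that become ready.
+struct ExampleDep { void *consumer; ExampleDep *next; };
+static int example_walk_fanout(ExampleDep *head) {
+    int released = 0;
+    for (ExampleDep *e = head; e != nullptr; e = e->next) {
+        released++;  // the real path calls release_fanin_and_check_ready here
+    }
+    return released;
+}
+// -----------------------------------------------------------------------------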
+// =============================================================================
+// on_task_release: releases producers via fanin traversal
+// =============================================================================
+
+TEST_F(WiringTest, OnTaskReleaseReleasesProducers) {
+    alignas(64) PTO2TaskSlotState task_slot;
+    alignas(64) PTO2TaskSlotState producers[2];
+    alignas(64) PTO2TaskPayload payload;
+    memset(&payload, 0, sizeof(payload));
+    PTO2TaskDescriptor desc{};
+
+    // 2 producers, each COMPLETED with fanout_count=1
+    for (int i = 0; i < 2; i++) {
+        init_slot(producers[i], PTO2_TASK_COMPLETED, 1, 1);
+    }
+
+    init_slot(task_slot, PTO2_TASK_COMPLETED, 3, 1);
+    payload.fanin_actual_count = 2;
+    payload.fanin_inline_slot_states[0] = &producers[0];
+    payload.fanin_inline_slot_states[1] = &producers[1];
+    // Need a valid fanin_spill_pool even though we don't spill
+    PTO2FaninPool dummy_pool{};
+    PTO2FaninSpillEntry dummy_entries[4];
+    std::atomic<int32_t> dummy_error{PTO2_ERROR_NONE};
+    dummy_pool.init(dummy_entries, 4, &dummy_error);
+    payload.fanin_spill_pool = &dummy_pool;
+    task_slot.payload = &payload;
+    task_slot.task = &desc;
+
+    int32_t fanin_count = sched.on_task_release(task_slot);
+    EXPECT_EQ(fanin_count, 2);
+
+    // Each producer should have fanout_refcount incremented
+    EXPECT_EQ(producers[0].fanout_refcount.load(), 1);
+    EXPECT_EQ(producers[1].fanout_refcount.load(), 1);
+
+    // Producers with fanout_refcount == fanout_count AND COMPLETED -> CONSUMED
+    EXPECT_EQ(producers[0].task_state.load(), PTO2_TASK_CONSUMED);
+    EXPECT_EQ(producers[1].task_state.load(), PTO2_TASK_CONSUMED);
+}
+
+// =============================================================================
+// advance_ring_pointers: scans CONSUMED slots, resets, advances last_alive
+// =============================================================================
+
+TEST_F(WiringTest, AdvanceRingPointersScansConsumed) {
+    auto &rss = sched.ring_sched_states[0];
+    auto *ring = rss.ring;
+
+    // Submit 3 tasks via flow control
+    ring->fc.current_task_index.store(3, std::memory_order_release);
+
+    // Mark all 3 as CONSUMED
+    for (int i = 0; i < 3; i++) {
+        auto &slot = ring->get_slot_state_by_task_id(i);
+        slot.task_state.store(PTO2_TASK_CONSUMED, std::memory_order_release);
+    }
+
+    EXPECT_EQ(rss.last_task_alive, 0);
+    rss.advance_ring_pointers();
+    EXPECT_EQ(rss.last_task_alive, 3);
+
+    // Verify the SM was synced
+    EXPECT_EQ(ring->fc.last_task_alive.load(), 3);
+}
+
+TEST_F(WiringTest, AdvanceRingPointersStopsAtNonConsumed) {
+    auto &rss = sched.ring_sched_states[0];
+    auto *ring = rss.ring;
+
+    ring->fc.current_task_index.store(5, std::memory_order_release);
+
+    // Tasks 0,1 CONSUMED; task 2 COMPLETED (not consumed)
+    ring->get_slot_state_by_task_id(0).task_state.store(PTO2_TASK_CONSUMED);
+    ring->get_slot_state_by_task_id(1).task_state.store(PTO2_TASK_CONSUMED);
+    ring->get_slot_state_by_task_id(2).task_state.store(PTO2_TASK_COMPLETED);
+
+    rss.advance_ring_pointers();
+    EXPECT_EQ(rss.last_task_alive, 2) << "Should stop at first non-CONSUMED slot";
+}
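+// -----------------------------------------------------------------------------
+// Reviewer sketch (illustrative only, hypothetical names): the pointer-advance
+// scan the two tests above pin down. last_alive advances over the contiguous
+// prefix of CONSUMED slots and stops at the first slot in any other state.
+static int32_t example_advance(const std::vector<bool> &consumed, int32_t last_alive, int32_t current) {
+    while (last_alive < current && consumed[last_alive]) {
+        last_alive++;  // the real scan also calls reset_for_reuse() on each slot
+    }
+    return last_alive;
+}
+// -----------------------------------------------------------------------------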
+TEST_F(WiringTest, AdvanceRingPointersResetsSlots) {
+    auto &rss = sched.ring_sched_states[0];
+    auto *ring = rss.ring;
+
+    ring->fc.current_task_index.store(1, std::memory_order_release);
+
+    auto &slot = ring->get_slot_state_by_task_id(0);
+    slot.task_state.store(PTO2_TASK_CONSUMED);
+    slot.fanout_count = 5;
+    slot.fanin_refcount.store(3);
+    slot.fanout_refcount.store(2);
+    slot.completed_subtasks.store(1);
+
+    rss.advance_ring_pointers();
+
+    // After reset_for_reuse: fanout_count=1, fanin_refcount=0, etc.
+    EXPECT_EQ(slot.fanout_count, 1);
+    EXPECT_EQ(slot.fanin_refcount.load(), 0);
+    EXPECT_EQ(slot.fanout_refcount.load(), 0);
+    EXPECT_EQ(slot.completed_subtasks.load(), 0);
+    EXPECT_EQ(slot.fanout_head, nullptr);
+}
+
+// =============================================================================
+// drain_wiring_queue: pushes tasks through the SPSC queue
+// =============================================================================
+
+TEST_F(WiringTest, DrainWiringQueueProcessesTasks) {
+    alignas(64) PTO2TaskSlotState task_slot;
+    alignas(64) PTO2TaskPayload payload;
+    memset(&payload, 0, sizeof(payload));
+    PTO2TaskDescriptor desc{};
+
+    init_slot(task_slot, PTO2_TASK_PENDING, 0, 1);
+    payload.fanin_actual_count = 0;
+    task_slot.payload = &payload;
+    task_slot.task = &desc;
+
+    // Push into the wiring SPSC queue (orchestrator side)
+    ASSERT_TRUE(sched.wiring.queue.push(&task_slot));
+
+    // Drain (scheduler thread 0 side)
+    int wired = sched.drain_wiring_queue(true /* force_drain */);
+    EXPECT_EQ(wired, 1);
+
+    // Task should be ready
+    PTO2ResourceShape shape = pto2_active_mask_to_shape(task_slot.active_mask);
+    auto *popped = sched.ready_queues[static_cast<size_t>(shape)].pop();
+    EXPECT_EQ(popped, &task_slot);
+}
+
+TEST_F(WiringTest, DrainWiringQueueBackoffDefers) {
+    alignas(64) PTO2TaskSlotState task_slot;
+    alignas(64) PTO2TaskPayload payload;
+    memset(&payload, 0, sizeof(payload));
+    PTO2TaskDescriptor desc{};
+
+    init_slot(task_slot, PTO2_TASK_PENDING, 0, 1);
+    payload.fanin_actual_count = 0;
+    task_slot.payload = &payload;
+    task_slot.task = &desc;
+
+    sched.wiring.queue.push(&task_slot);
+
+    // Without force_drain, a single item < BATCH_SIZE -> backoff
+    sched.wiring.backoff_counter = 0;
+    int wired = sched.drain_wiring_queue(false);
+    EXPECT_EQ(wired, 0) << "Backoff should defer when queue < BATCH_SIZE";
+    EXPECT_EQ(sched.wiring.backoff_counter, 1);
+}
+
+TEST_F(WiringTest, DrainWiringQueueBackoffLimitForcesProcess) {
+    alignas(64) PTO2TaskSlotState task_slot;
+    alignas(64) PTO2TaskPayload payload;
+    memset(&payload, 0, sizeof(payload));
+    PTO2TaskDescriptor desc{};
+
+    init_slot(task_slot, PTO2_TASK_PENDING, 0, 1);
+    payload.fanin_actual_count = 0;
+    task_slot.payload = &payload;
+    task_slot.task = &desc;
+
+    sched.wiring.queue.push(&task_slot);
+
+    // Set backoff at the limit -> should process
+    sched.wiring.backoff_counter = PTO2SchedulerState::WiringState::BACKOFF_LIMIT;
+    int wired = sched.drain_wiring_queue(false);
+    EXPECT_EQ(wired, 1) << "Backoff limit reached should force processing";
+}
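+// -----------------------------------------------------------------------------
+// Reviewer sketch (illustrative only; names and constants are hypothetical):
+// the drain-backoff policy the three tests above describe. Small batches are
+// deferred a bounded number of times, so drains are amortized into full
+// batches without ever starving a trickle of tasks.
+static bool example_should_drain(int queued, int batch_size, int &backoff, int backoff_limit, bool force) {
+    if (force || queued >= batch_size || backoff >= backoff_limit) {
+        backoff = 0;
+        return true;   // drain now
+    }
+    backoff++;         // defer: give the queue a chance to fill a full batch
+    return false;
+}
+// -----------------------------------------------------------------------------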