From 1ca86599f18a1f3a343e58992acd4b810570dd35 Mon Sep 17 00:00:00 2001 From: Salvage <29021710+Saalvage@users.noreply.github.com> Date: Wed, 22 Jan 2025 01:02:32 +0100 Subject: [PATCH 01/15] Fixed size atomic binary tree --- relaxed_concurrent_fifo/atomic_binary_tree.h | 107 +++++++++++++++++++ relaxed_concurrent_fifo/main.cpp | 28 +++++ 2 files changed, 135 insertions(+) create mode 100644 relaxed_concurrent_fifo/atomic_binary_tree.h diff --git a/relaxed_concurrent_fifo/atomic_binary_tree.h b/relaxed_concurrent_fifo/atomic_binary_tree.h new file mode 100644 index 0000000..ae295e6 --- /dev/null +++ b/relaxed_concurrent_fifo/atomic_binary_tree.h @@ -0,0 +1,107 @@ +#ifndef ATOMIC_BINARY_TREE_H_INCLUDED +#define ATOMIC_BINARY_TREE_H_INCLUDED + +#include + +struct atomic_binary_tree { + std::atomic data = 0; + + int claim_bit() { + static thread_local std::random_device dev; + static thread_local std::minstd_rand rng{ dev() }; + static thread_local std::uniform_int_distribution dist{ 0, static_cast(3) }; + + std::uint8_t loaded = data.load(std::memory_order_relaxed); + while (true) { + int initial = 3 + dist(rng); + int prev = initial; + while ((loaded & (1 << initial)) && initial > 0) { + prev = initial; + initial = parent(initial); + } + + if (initial < 0) { + return -1; + } + + int mask = 0; + bool all_set = true; + if (prev != initial) { + prev = sibling(prev); + // Go down + while (prev < 3) { + mask |= 1 << prev; + if (!(loaded & (1 << left_child(prev)))) { + if (!(loaded & (1 << right_child(prev)))) { + all_set = false; + mask = 0; + } + // TODO: By default, always left child. + prev = left_child(prev); + } else { + prev = right_child(prev); + } + } + } + + if ((loaded & (1 << prev))) { + return -1; + } + + mask |= 1 << prev; + + if (all_set) { + mask |= 1 << initial; + auto p = initial; + while ((loaded & (1 << sibling(p)))) { + p = parent(p); + mask |= 1 << p; + } + } + + if (data.compare_exchange_strong(loaded, loaded | mask, std::memory_order_relaxed)) { + return prev - 3; + } + } + } + + bool check_invariants() { + for (int i = 1; i < 7; i++) { + if ((data & (1 << i))) { + if (data & (1 << sibling(i))) { + if (!(data & (1 << parent(i)))) { + return false; + } + } else { + if (data & (1 << parent(i))) { + return false; + } + } + } else { + if (data & (1 << parent(i))) { + return false; + } + } + } + return true; + } + +private: + int parent(int index) { + return (index - 1) / 2; + } + + int left_child(int index) { + return 2 * index + 1; + } + + int right_child(int index) { + return 2 * index + 2; + } + + int sibling(int index) { + return index % 2 == 0 ? index - 1 : index + 1; + } +}; + +#endif // ATOMIC_BINARY_TREE_H_INCLUDED diff --git a/relaxed_concurrent_fifo/main.cpp b/relaxed_concurrent_fifo/main.cpp index 8bdb0d5..fcfedbb 100644 --- a/relaxed_concurrent_fifo/main.cpp +++ b/relaxed_concurrent_fifo/main.cpp @@ -171,11 +171,39 @@ std::size_t get_total_system_memory_bytes() { #endif } +#include "atomic_binary_tree.h" + int main(int argc, const char** argv) { #ifndef NDEBUG std::cout << "Running in debug mode!" << std::endl; #endif // NDEBUG + while (true) { + atomic_binary_tree a; + bool claimed[4] = { false, false, false, false }; + for (int i = 0; i < 4; i++) { + auto c = a.claim_bit(); + if (claimed[c]) { + throw std::exception("ALREADY CLAIMED"); + } + claimed[c] = true; + for (int i = 0; i < 7; i++) { + // std::cout << !!(a.data & (1 << i)); + } + //std::cout << std::endl; + if (!a.check_invariants()) { + throw std::exception("AAA"); + } + } + if (a.claim_bit() != -1) { + throw std::exception("AAA"); + } + if (!a.check_invariants()) { + throw std::exception("AAA"); + } + } + + //test_consistency<8, 16>(20000, 200000, 0); constexpr int TEST_ITERATIONS_DEFAULT = 2; From cec05e8dcfae0bd783b9544c210d5d0ba0c97242 Mon Sep 17 00:00:00 2001 From: Salvage <29021710+Saalvage@users.noreply.github.com> Date: Wed, 5 Feb 2025 16:14:03 +0100 Subject: [PATCH 02/15] Organize tree into fragments --- relaxed_concurrent_fifo/atomic_binary_tree.h | 119 +++++++------------ 1 file changed, 45 insertions(+), 74 deletions(-) diff --git a/relaxed_concurrent_fifo/atomic_binary_tree.h b/relaxed_concurrent_fifo/atomic_binary_tree.h index ae295e6..38b43de 100644 --- a/relaxed_concurrent_fifo/atomic_binary_tree.h +++ b/relaxed_concurrent_fifo/atomic_binary_tree.h @@ -3,103 +3,74 @@ #include +template struct atomic_binary_tree { - std::atomic data = 0; + struct alignas(std::hardware_destructive_interference_size) tree_fragment { + std::atomic data = 0; + }; + + static constexpr size_t FRAGMENT_COUNT = SIZE / 4 + SIZE / 4 / 4; + static constexpr size_t LEAF_COUNT = (FRAGMENT_COUNT + 1) / 2; + static constexpr size_t LEAF_START = LEAF_COUNT - 1; + + tree_fragment fragments[FRAGMENT_COUNT]; int claim_bit() { static thread_local std::random_device dev; static thread_local std::minstd_rand rng{ dev() }; - static thread_local std::uniform_int_distribution dist{ 0, static_cast(3) }; - - std::uint8_t loaded = data.load(std::memory_order_relaxed); - while (true) { - int initial = 3 + dist(rng); - int prev = initial; - while ((loaded & (1 << initial)) && initial > 0) { - prev = initial; - initial = parent(initial); + static thread_local std::uniform_int_distribution dist_outer{ 0, static_cast(LEAF_COUNT - 1) }; + static thread_local std::uniform_int_distribution dist_inner{ 0, 3 }; + + // Select random starting leaf. + int idx = LEAF_START + dist_outer(rng); + int inner_idx = 3 + dist_inner(rng); + + // Ascend to find highest 0 node. + bool succ = false; + while (!succ && idx > 0) { + auto loaded = fragments[idx].data.load(); + while (loaded & (1 << inner_idx) && inner_idx > 0) { + inner_idx = parent(inner_idx); } - - if (initial < 0) { - return -1; - } - - int mask = 0; - bool all_set = true; - if (prev != initial) { - prev = sibling(prev); - // Go down - while (prev < 3) { - mask |= 1 << prev; - if (!(loaded & (1 << left_child(prev)))) { - if (!(loaded & (1 << right_child(prev)))) { - all_set = false; - mask = 0; - } - // TODO: By default, always left child. - prev = left_child(prev); - } else { - prev = right_child(prev); - } - } - } - - if ((loaded & (1 << prev))) { - return -1; + if (!(loaded & (1 << inner_idx))) { + succ = true; + } else { + // Position in parent. + // We could immediately take the parent, because we know that the entire child fragment is filled. + inner_idx = 3 + (idx - 1) % 4; + idx = parent<4>(idx); } + } - mask |= 1 << prev; + auto remember_idx = idx; + auto remember_inner_idx = inner_idx; - if (all_set) { - mask |= 1 << initial; - auto p = initial; - while ((loaded & (1 << sibling(p)))) { - p = parent(p); - mask |= 1 << p; - } - } + // Descend to find bit. - if (data.compare_exchange_strong(loaded, loaded | mask, std::memory_order_relaxed)) { - return prev - 3; - } - } - } + // Ascend further to fulfill invariants. - bool check_invariants() { - for (int i = 1; i < 7; i++) { - if ((data & (1 << i))) { - if (data & (1 << sibling(i))) { - if (!(data & (1 << parent(i)))) { - return false; - } - } else { - if (data & (1 << parent(i))) { - return false; - } - } - } else { - if (data & (1 << parent(i))) { - return false; - } - } - } - return true; + return 0; } private: - int parent(int index) { + static int parent(int index) { return (index - 1) / 2; } - int left_child(int index) { + template + static int parent(int index) { + return (index - 1) / N; + } + + static int left_child(int index) { return 2 * index + 1; } - int right_child(int index) { + static int right_child(int index) { return 2 * index + 2; } - int sibling(int index) { + static int sibling(int index) { return index % 2 == 0 ? index - 1 : index + 1; } }; From 2d0df13785e3f1e2542be052e328ce09572277ac Mon Sep 17 00:00:00 2001 From: Salvage <29021710+Saalvage@users.noreply.github.com> Date: Mon, 17 Nov 2025 07:01:33 +0100 Subject: [PATCH 03/15] Working bit tree --- relaxed_concurrent_fifo/atomic_binary_tree.h | 78 ------- relaxed_concurrent_fifo/atomic_bit_tree.h | 220 +++++++++++++++++++ relaxed_concurrent_fifo/atomic_bitset.h | 2 +- relaxed_concurrent_fifo/block_based_queue.h | 10 +- relaxed_concurrent_fifo/config.hpp | 6 +- relaxed_concurrent_fifo/main.cpp | 33 --- 6 files changed, 229 insertions(+), 120 deletions(-) delete mode 100644 relaxed_concurrent_fifo/atomic_binary_tree.h create mode 100644 relaxed_concurrent_fifo/atomic_bit_tree.h diff --git a/relaxed_concurrent_fifo/atomic_binary_tree.h b/relaxed_concurrent_fifo/atomic_binary_tree.h deleted file mode 100644 index 38b43de..0000000 --- a/relaxed_concurrent_fifo/atomic_binary_tree.h +++ /dev/null @@ -1,78 +0,0 @@ -#ifndef ATOMIC_BINARY_TREE_H_INCLUDED -#define ATOMIC_BINARY_TREE_H_INCLUDED - -#include - -template -struct atomic_binary_tree { - struct alignas(std::hardware_destructive_interference_size) tree_fragment { - std::atomic data = 0; - }; - - static constexpr size_t FRAGMENT_COUNT = SIZE / 4 + SIZE / 4 / 4; - static constexpr size_t LEAF_COUNT = (FRAGMENT_COUNT + 1) / 2; - static constexpr size_t LEAF_START = LEAF_COUNT - 1; - - tree_fragment fragments[FRAGMENT_COUNT]; - - int claim_bit() { - static thread_local std::random_device dev; - static thread_local std::minstd_rand rng{ dev() }; - static thread_local std::uniform_int_distribution dist_outer{ 0, static_cast(LEAF_COUNT - 1) }; - static thread_local std::uniform_int_distribution dist_inner{ 0, 3 }; - - // Select random starting leaf. - int idx = LEAF_START + dist_outer(rng); - int inner_idx = 3 + dist_inner(rng); - - // Ascend to find highest 0 node. - bool succ = false; - while (!succ && idx > 0) { - auto loaded = fragments[idx].data.load(); - while (loaded & (1 << inner_idx) && inner_idx > 0) { - inner_idx = parent(inner_idx); - } - if (!(loaded & (1 << inner_idx))) { - succ = true; - } else { - // Position in parent. - // We could immediately take the parent, because we know that the entire child fragment is filled. - inner_idx = 3 + (idx - 1) % 4; - idx = parent<4>(idx); - } - } - - auto remember_idx = idx; - auto remember_inner_idx = inner_idx; - - // Descend to find bit. - - // Ascend further to fulfill invariants. - - return 0; - } - -private: - static int parent(int index) { - return (index - 1) / 2; - } - - template - static int parent(int index) { - return (index - 1) / N; - } - - static int left_child(int index) { - return 2 * index + 1; - } - - static int right_child(int index) { - return 2 * index + 2; - } - - static int sibling(int index) { - return index % 2 == 0 ? index - 1 : index + 1; - } -}; - -#endif // ATOMIC_BINARY_TREE_H_INCLUDED diff --git a/relaxed_concurrent_fifo/atomic_bit_tree.h b/relaxed_concurrent_fifo/atomic_bit_tree.h new file mode 100644 index 0000000..146e444 --- /dev/null +++ b/relaxed_concurrent_fifo/atomic_bit_tree.h @@ -0,0 +1,220 @@ +#ifndef ATOMIC_BINARY_TREE_H_INCLUDED +#define ATOMIC_BINARY_TREE_H_INCLUDED + +#include + +template +struct atomic_bit_tree { +private: + static_assert(sizeof(ARR_TYPE) <= 4, "Inner bitset type must be 4 bytes or smaller to allow for storing epoch."); + + std::size_t leaves_per_window; + std::size_t fragments_per_window; + // TODO: int or std::size_t? + int leaves_start_index; + + static constexpr std::size_t bit_count = sizeof(ARR_TYPE) * 8; + // TODO Don't need this, bit_count is constexpr. + static constexpr std::size_t bit_count_log_2 = std::bit_width(bit_count) - 1; + std::unique_ptr>[]> data; + + // TODO: The state of the tree (whether it's being filled/emptied) must be encoded, as this varies the semantics. + static constexpr std::uint64_t get_epoch(std::uint64_t epoch_and_bits) { return epoch_and_bits >> 32; } + static constexpr std::uint64_t get_bits(std::uint64_t epoch_and_bits) { return epoch_and_bits & 0xffff'ffff; } + static constexpr std::uint64_t make_unit(std::uint64_t epoch) { return epoch << 32; } + + static constexpr std::size_t calculate_fragment_count(std::size_t leaves) { + auto height = (std::bit_width(leaves) - 1) / bit_count_log_2; + return ((1ull << ((height + 1) * bit_count_log_2)) - 1) / (bit_count - 1); + } + + template + ARR_TYPE modify(std::uint64_t value, int bit_idx) { + ARR_TYPE raw = static_cast(value); + if constexpr (VALUE == claim_value::ONE) { + return raw & ~(1ull << bit_idx); + } else { + return raw | (1ull << bit_idx); + } + } + + template + std::pair try_change_bit(std::uint64_t epoch, std::atomic_uint64_t& leaf, std::uint64_t& leaf_val, int bit_idx, std::memory_order order) { + ARR_TYPE modified = modify(leaf_val, bit_idx); + // TODO: These conditions are not always needed. + while (modified != get_bits(leaf_val) && epoch == get_epoch(leaf_val)) { + bool advanced_epoch = modified == static_cast(VALUE == claim_value::ONE ? 0 : ~0); + if (leaf.compare_exchange_strong(leaf_val, advanced_epoch + ? (make_unit(epoch + 1) | (VALUE == claim_value::ZERO ? modified : 0)) + : (make_unit(epoch) | modified), order)) { + return {true, advanced_epoch}; + } + modified = modify(leaf_val, bit_idx); + } + return {false, false}; + } + + template + bool has_valid_bit(std::uint64_t value) { + return VALUE == claim_value::ONE ? get_bits(value) : ~get_bits(value); + } + + + template + static int select_random_bit_index(ARR_TYPE value) { + //unsigned value32 = value; + //return VALUE == claim_value::ZERO ? std::countr_one(value32) : std::countr_zero(value32); + + // TODO: Don't randomize? (FIFO semantic on fragment level??) + if constexpr (VALUE == claim_value::ZERO) { + value = ~value; + } + + static thread_local std::minstd_rand rng{std::random_device()()}; + auto valid_bits = std::popcount(value); + auto nth_bit = valid_bits <= 1 ? 0 : std::uniform_int_distribution<>{0, valid_bits - 1}(rng); + return std::countr_zero(_pdep_u32(1 << nth_bit, value)); + } + + template + std::size_t claim_bit_singular(cache_aligned_t>* root, int starting_bit, std::uint64_t epoch, std::memory_order order = BITSET_DEFAULT_MEMORY_ORDER) { + int off = starting_bit / bit_count; + // TODO: Rotate. + int initial_rot = starting_bit % bit_count; + auto idx = leaves_start_index + off; + auto* leaf = &root[idx]; + auto leaf_val = leaf->value.load(order); + + bool success = false; + std::size_t ret = 0; + do { + // TODO: Potentially directly use countl_xxx here to avoid it later? + while (idx > 0 && (get_epoch(leaf_val) != epoch || !has_valid_bit(leaf_val))) { + idx = get_parent(idx); + leaf = &root[idx]; + leaf_val = leaf->value.load(order); + // TODO: Automatically fix parent here if child is erroneously marked? + } + + if (get_epoch(leaf_val) != epoch || !has_valid_bit(leaf_val)) { + // Root is invalid as well. + return std::numeric_limits::max(); + } + + bool advanced_epoch = false; + while (idx < leaves_start_index) { + idx = get_child(static_cast(leaf_val), idx); + if (idx == -1) { + advanced_epoch = true; + break; + } + leaf = &root[idx]; + leaf_val = leaf->value.load(order); + if (get_epoch(leaf_val) != epoch) { + advanced_epoch = true; + break; + } + } + + // Skip if we didn't find a leaf but stepped into an invalid + if (!advanced_epoch) { + do { + auto bit_idx = select_random_bit_index(static_cast(leaf_val)); + if (bit_idx == 32 || get_epoch(leaf_val) != epoch) { + // Leaf empty, need to move up again. + advanced_epoch = true; + break; + } + + ret = (idx - leaves_start_index) * bit_count + bit_idx; + if constexpr (MODE == claim_mode::READ_ONLY) { + return ret; + } + auto bit_change_ret = try_change_bit(epoch, *leaf, leaf_val, bit_idx, order); + success = bit_change_ret.first; + advanced_epoch = bit_change_ret.second; + } while (!success); + } + + while (advanced_epoch && idx > 0) { + // idx = bit_count * parent + child_idx + 1 + int child_idx = idx - 1 - get_parent(idx) * bit_count; + idx = get_parent(idx); + leaf = &root[idx]; + leaf_val = leaf->value.load(order); + auto bit_change_ret = try_change_bit(epoch, *leaf, leaf_val, child_idx, order); + advanced_epoch = bit_change_ret.second; + } + } while (!success); + return ret; + } + + int get_parent(int index) { + return (index - 1) / bit_count; + } + + template + int get_child(ARR_TYPE node, int index) { + auto offset = select_random_bit_index(node); + if (offset == 32) { + return -1; + } + return index * bit_count + offset + 1; + } + +public: + atomic_bit_tree(std::size_t window_count, std::size_t blocks_per_window) : + leaves_per_window(blocks_per_window / bit_count), + fragments_per_window(calculate_fragment_count(leaves_per_window)), + leaves_start_index(static_cast(fragments_per_window - leaves_per_window)), + data(std::make_unique>[]>(fragments_per_window * window_count)) { + // Must be a perfect k-ary tree. + assert(blocks_per_window == 1ull << ((std::bit_width(blocks_per_window) - 1) / bit_count_log_2 * bit_count_log_2)); + } + + template + std::size_t claim_bit(std::size_t window_index, int starting_bit, std::uint64_t epoch, std::memory_order order = BITSET_DEFAULT_MEMORY_ORDER) { + // We use modified epochs. + epoch = epoch * 2 + (VALUE == claim_value::ONE ? 1 : 0); + auto ret = claim_bit_singular(&data[window_index * fragments_per_window], starting_bit, epoch, order); + + /*std::cout << window_index << " " << (int)VALUE << " " << (int)MODE << " "; + for (auto i = 0; i < fragments_per_window; i++) { + auto val = data[window_index * fragments_per_window + i]->load(); + std::cout << get_epoch(val) << " " << std::bitset(get_bits(val)) << " | "; + } + std::cout << std::endl;*/ + return ret; + } + + void set_epoch_if_empty(std::size_t window_index, std::uint64_t epoch, std::memory_order order = BITSET_DEFAULT_MEMORY_ORDER) { + epoch *= 2; + std::uint64_t next_eb = make_unit(epoch + 2); + for (std::size_t i = 0; i < fragments_per_window; i++) { + std::uint64_t eb = make_unit(epoch); + data[window_index * fragments_per_window + i]->compare_exchange_strong(eb, next_eb, order); + } + } + + void reset(std::size_t window_index, std::size_t index, std::uint64_t epoch, std::memory_order order = BITSET_DEFAULT_MEMORY_ORDER) { + epoch = epoch * 2 + 1; + //assert(window_index < window_count); + //assert(index < blocks_per_window); + int idx = leaves_start_index + static_cast(index / bit_count); + auto root = &data[window_index * fragments_per_window]; + auto* leaf = &root[idx]; + auto leaf_val = leaf->value.load(order); + auto [success, advanced_epoch] = try_change_bit(epoch, *leaf, leaf_val, index % bit_count, order); + while (advanced_epoch && idx > 0) { + // idx = bit_count * parent + child_idx + 1 + int child_idx = idx - 1 - get_parent(idx) * bit_count; + idx = get_parent(idx); + leaf = &root[idx]; + leaf_val = leaf->value.load(order); + auto bit_change_ret = try_change_bit(epoch, *leaf, leaf_val, child_idx, order); + advanced_epoch = bit_change_ret.second; + } + } +}; + +#endif // ATOMIC_BINARY_TREE_H_INCLUDED diff --git a/relaxed_concurrent_fifo/atomic_bitset.h b/relaxed_concurrent_fifo/atomic_bitset.h index 99d623c..526fea5 100644 --- a/relaxed_concurrent_fifo/atomic_bitset.h +++ b/relaxed_concurrent_fifo/atomic_bitset.h @@ -67,7 +67,7 @@ class atomic_bitset { template static constexpr std::size_t claim_bit_singular(std::atomic& epoch_and_bits, int initial_rot, std::uint64_t epoch, std::memory_order order) { std::uint64_t eb = epoch_and_bits.load(order); - if (get_epoch(eb) != epoch) { + if (get_epoch(eb) != epoch) { // TODO Do we properly mask the epoch we pass here??? return std::numeric_limits::max(); } while (true) { diff --git a/relaxed_concurrent_fifo/block_based_queue.h b/relaxed_concurrent_fifo/block_based_queue.h index 3c72fc5..ca72db7 100644 --- a/relaxed_concurrent_fifo/block_based_queue.h +++ b/relaxed_concurrent_fifo/block_based_queue.h @@ -11,6 +11,7 @@ #include "fifo.h" #include "atomic_bitset.h" #include "atomic_bitset_no_epoch.h" +#include "atomic_bit_tree.h" #ifndef BBQ_LOG_WINDOW_MOVE #define BBQ_LOG_WINDOW_MOVE 0 @@ -88,7 +89,7 @@ class block_based_queue { static inline block_t dummy_block{ reinterpret_cast(&dummy_block_value) }; atomic_bitset_no_epoch touched_set; - atomic_bitset filled_set; + atomic_bit_tree filled_set; std::unique_ptr buffer; std::uint64_t window_to_epoch(std::uint64_t window) const { @@ -150,8 +151,7 @@ class block_based_queue { public: block_based_queue(int thread_count, std::size_t min_size, double blocks_per_window_per_thread, std::size_t cells_per_block) : - blocks_per_window(std::bit_ceil(std::max(sizeof(BITSET_T) * 8, - std::lround(thread_count * blocks_per_window_per_thread)))), + blocks_per_window(512), window_block_distribution(0, static_cast(blocks_per_window - 1)), window_count(std::max(4, std::bit_ceil(min_size / blocks_per_window / cells_per_block))), window_count_mod_mask(window_count - 1), @@ -316,9 +316,9 @@ class block_based_queue { if (write_window == window_index + 1) { std::uint64_t write_epoch = fifo.window_to_epoch(write_window); std::uint64_t write_window_index = fifo.window_to_index(write_window); - if (!fifo.filled_set.any(write_window_index, write_epoch, std::memory_order_relaxed)) { + //if (!fifo.filled_set.any(write_window_index, write_epoch, std::memory_order_relaxed)) { return false; - } + //} // Before we force-move the write window, there might be unclaimed blocks in the current one. // We need to make sure we clean those up BEFORE we move the write window in order to prevent diff --git a/relaxed_concurrent_fifo/config.hpp b/relaxed_concurrent_fifo/config.hpp index fd5db38..e0ca7ce 100644 --- a/relaxed_concurrent_fifo/config.hpp +++ b/relaxed_concurrent_fifo/config.hpp @@ -41,11 +41,11 @@ template static void add_instances(std::vector>>& instances, bool parameter_tuning, std::unordered_set& filter_set, bool are_exclude_filters) { #if defined(INCLUDE_BBQ) || defined(INCLUDE_ALL) if (parameter_tuning) { - for (double b = 0.5; b <= 16; b *= 2) { + //for (double b = 0.5; b <= 16; b *= 2) { for (int c = 2; c <= 2048; c *= 2) { - instances.push_back(std::make_unique>("{},{},blockfifo", b, c - 1)); + instances.push_back(std::make_unique>("{},{},blockfifo", 1, c - 1)); } - } + //} } else { instances.push_back(std::make_unique>("blockfifo-{}-{}", 1, 7)); instances.push_back(std::make_unique>("blockfifo-{}-{}", 1, 63)); diff --git a/relaxed_concurrent_fifo/main.cpp b/relaxed_concurrent_fifo/main.cpp index fcfedbb..4b5440b 100644 --- a/relaxed_concurrent_fifo/main.cpp +++ b/relaxed_concurrent_fifo/main.cpp @@ -2,7 +2,6 @@ #include "block_based_queue.h" - #include #include #include @@ -171,39 +170,7 @@ std::size_t get_total_system_memory_bytes() { #endif } -#include "atomic_binary_tree.h" - int main(int argc, const char** argv) { -#ifndef NDEBUG - std::cout << "Running in debug mode!" << std::endl; -#endif // NDEBUG - - while (true) { - atomic_binary_tree a; - bool claimed[4] = { false, false, false, false }; - for (int i = 0; i < 4; i++) { - auto c = a.claim_bit(); - if (claimed[c]) { - throw std::exception("ALREADY CLAIMED"); - } - claimed[c] = true; - for (int i = 0; i < 7; i++) { - // std::cout << !!(a.data & (1 << i)); - } - //std::cout << std::endl; - if (!a.check_invariants()) { - throw std::exception("AAA"); - } - } - if (a.claim_bit() != -1) { - throw std::exception("AAA"); - } - if (!a.check_invariants()) { - throw std::exception("AAA"); - } - } - - //test_consistency<8, 16>(20000, 200000, 0); constexpr int TEST_ITERATIONS_DEFAULT = 2; From aee67d5cd36612c94d4102632584962ebd8954a4 Mon Sep 17 00:00:00 2001 From: Salvage <29021710+Saalvage@users.noreply.github.com> Date: Sun, 30 Nov 2025 00:23:30 +0100 Subject: [PATCH 04/15] Add more TODOs, missing includes and explicitly ignore unused parameters --- relaxed_concurrent_fifo/atomic_bit_tree.h | 19 ++++++++++++++----- relaxed_concurrent_fifo/block_based_queue.h | 3 +++ 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/relaxed_concurrent_fifo/atomic_bit_tree.h b/relaxed_concurrent_fifo/atomic_bit_tree.h index 146e444..e306bb8 100644 --- a/relaxed_concurrent_fifo/atomic_bit_tree.h +++ b/relaxed_concurrent_fifo/atomic_bit_tree.h @@ -3,6 +3,8 @@ #include +#include + template struct atomic_bit_tree { private: @@ -59,6 +61,7 @@ struct atomic_bit_tree { return VALUE == claim_value::ONE ? get_bits(value) : ~get_bits(value); } + static inline thread_local std::minstd_rand rng{std::random_device()()}; template static int select_random_bit_index(ARR_TYPE value) { @@ -70,9 +73,12 @@ struct atomic_bit_tree { value = ~value; } - static thread_local std::minstd_rand rng{std::random_device()()}; + if (value == 0) { + return 32; + } + auto valid_bits = std::popcount(value); - auto nth_bit = valid_bits <= 1 ? 0 : std::uniform_int_distribution<>{0, valid_bits - 1}(rng); + auto nth_bit = std::uniform_int_distribution<>{0, valid_bits - 1}(rng); return std::countr_zero(_pdep_u32(1 << nth_bit, value)); } @@ -80,7 +86,7 @@ struct atomic_bit_tree { std::size_t claim_bit_singular(cache_aligned_t>* root, int starting_bit, std::uint64_t epoch, std::memory_order order = BITSET_DEFAULT_MEMORY_ORDER) { int off = starting_bit / bit_count; // TODO: Rotate. - int initial_rot = starting_bit % bit_count; + //int initial_rot = starting_bit % bit_count; auto idx = leaves_start_index + off; auto* leaf = &root[idx]; auto leaf_val = leaf->value.load(order); @@ -89,6 +95,7 @@ struct atomic_bit_tree { std::size_t ret = 0; do { // TODO: Potentially directly use countl_xxx here to avoid it later? + // TODO: Epoch check more explicit (+1). while (idx > 0 && (get_epoch(leaf_val) != epoch || !has_valid_bit(leaf_val))) { idx = get_parent(idx); leaf = &root[idx]; @@ -103,8 +110,9 @@ struct atomic_bit_tree { bool advanced_epoch = false; while (idx < leaves_start_index) { - idx = get_child(static_cast(leaf_val), idx); + idx = get_random_child(static_cast(leaf_val), idx); if (idx == -1) { + // TODO advanced_epoch = true; break; } @@ -144,6 +152,7 @@ struct atomic_bit_tree { leaf_val = leaf->value.load(order); auto bit_change_ret = try_change_bit(epoch, *leaf, leaf_val, child_idx, order); advanced_epoch = bit_change_ret.second; + // TODO: Set idx to restart? } } while (!success); return ret; @@ -154,7 +163,7 @@ struct atomic_bit_tree { } template - int get_child(ARR_TYPE node, int index) { + int get_random_child(ARR_TYPE node, int index) { auto offset = select_random_bit_index(node); if (offset == 32) { return -1; diff --git a/relaxed_concurrent_fifo/block_based_queue.h b/relaxed_concurrent_fifo/block_based_queue.h index ca72db7..d77ae59 100644 --- a/relaxed_concurrent_fifo/block_based_queue.h +++ b/relaxed_concurrent_fifo/block_based_queue.h @@ -166,6 +166,9 @@ class block_based_queue { std::cout << "Block count: " << blocks_per_window << std::endl; #endif // BBQ_LOG_CREATION_SIZE + (void)thread_count; + (void)blocks_per_window_per_thread; + // At least as big as the bitset's type. assert(blocks_per_window >= sizeof(BITSET_T) * 8); assert(std::bit_ceil(blocks_per_window) == blocks_per_window); From 9b7eb8da45cabcf443b03820d9f803716dd5c541 Mon Sep 17 00:00:00 2001 From: Salvage <29021710+Saalvage@users.noreply.github.com> Date: Sun, 30 Nov 2025 02:25:26 +0100 Subject: [PATCH 05/15] Merge branch 'main' into atomic-binary-tree --- relaxed_concurrent_fifo/atomic_bit_tree.h | 72 ++++++----- relaxed_concurrent_fifo/atomic_bitset.h | 61 +++++---- .../atomic_bitset_no_epoch.h | 116 ------------------ relaxed_concurrent_fifo/block_based_queue.h | 8 +- relaxed_concurrent_fifo/epoch_handling.hpp | 42 +++++++ 5 files changed, 118 insertions(+), 181 deletions(-) delete mode 100644 relaxed_concurrent_fifo/atomic_bitset_no_epoch.h create mode 100644 relaxed_concurrent_fifo/epoch_handling.hpp diff --git a/relaxed_concurrent_fifo/atomic_bit_tree.h b/relaxed_concurrent_fifo/atomic_bit_tree.h index e306bb8..b782d46 100644 --- a/relaxed_concurrent_fifo/atomic_bit_tree.h +++ b/relaxed_concurrent_fifo/atomic_bit_tree.h @@ -5,7 +5,10 @@ #include -template +#include "atomic_bitset.h" +#include "epoch_handling.hpp" + +template struct atomic_bit_tree { private: static_assert(sizeof(ARR_TYPE) <= 4, "Inner bitset type must be 4 bytes or smaller to allow for storing epoch."); @@ -20,11 +23,6 @@ struct atomic_bit_tree { static constexpr std::size_t bit_count_log_2 = std::bit_width(bit_count) - 1; std::unique_ptr>[]> data; - // TODO: The state of the tree (whether it's being filled/emptied) must be encoded, as this varies the semantics. - static constexpr std::uint64_t get_epoch(std::uint64_t epoch_and_bits) { return epoch_and_bits >> 32; } - static constexpr std::uint64_t get_bits(std::uint64_t epoch_and_bits) { return epoch_and_bits & 0xffff'ffff; } - static constexpr std::uint64_t make_unit(std::uint64_t epoch) { return epoch << 32; } - static constexpr std::size_t calculate_fragment_count(std::size_t leaves) { auto height = (std::bit_width(leaves) - 1) / bit_count_log_2; return ((1ull << ((height + 1) * bit_count_log_2)) - 1) / (bit_count - 1); @@ -44,11 +42,11 @@ struct atomic_bit_tree { std::pair try_change_bit(std::uint64_t epoch, std::atomic_uint64_t& leaf, std::uint64_t& leaf_val, int bit_idx, std::memory_order order) { ARR_TYPE modified = modify(leaf_val, bit_idx); // TODO: These conditions are not always needed. - while (modified != get_bits(leaf_val) && epoch == get_epoch(leaf_val)) { + while (modified != EPOCH::get_bits(leaf_val) && EPOCH::compare_epochs(leaf_val, epoch)) { bool advanced_epoch = modified == static_cast(VALUE == claim_value::ONE ? 0 : ~0); if (leaf.compare_exchange_strong(leaf_val, advanced_epoch - ? (make_unit(epoch + 1) | (VALUE == claim_value::ZERO ? modified : 0)) - : (make_unit(epoch) | modified), order)) { + ? (EPOCH::make_unit(epoch + 1) | (VALUE == claim_value::ZERO ? modified : 0)) + : (EPOCH::make_unit(epoch) | modified), order)) { return {true, advanced_epoch}; } modified = modify(leaf_val, bit_idx); @@ -58,7 +56,7 @@ struct atomic_bit_tree { template bool has_valid_bit(std::uint64_t value) { - return VALUE == claim_value::ONE ? get_bits(value) : ~get_bits(value); + return VALUE == claim_value::ONE ? EPOCH::get_bits(value) : ~EPOCH::get_bits(value); } static inline thread_local std::minstd_rand rng{std::random_device()()}; @@ -96,14 +94,14 @@ struct atomic_bit_tree { do { // TODO: Potentially directly use countl_xxx here to avoid it later? // TODO: Epoch check more explicit (+1). - while (idx > 0 && (get_epoch(leaf_val) != epoch || !has_valid_bit(leaf_val))) { + while (idx > 0 && (!EPOCH::compare_epochs(leaf_val, epoch) || !has_valid_bit(leaf_val))) { idx = get_parent(idx); leaf = &root[idx]; leaf_val = leaf->value.load(order); // TODO: Automatically fix parent here if child is erroneously marked? } - if (get_epoch(leaf_val) != epoch || !has_valid_bit(leaf_val)) { + if (!EPOCH::compare_epochs(leaf_val, epoch) || !has_valid_bit(leaf_val)) { // Root is invalid as well. return std::numeric_limits::max(); } @@ -118,7 +116,7 @@ struct atomic_bit_tree { } leaf = &root[idx]; leaf_val = leaf->value.load(order); - if (get_epoch(leaf_val) != epoch) { + if (!EPOCH::compare_epochs(leaf_val, epoch)) { advanced_epoch = true; break; } @@ -128,7 +126,7 @@ struct atomic_bit_tree { if (!advanced_epoch) { do { auto bit_idx = select_random_bit_index(static_cast(leaf_val)); - if (bit_idx == 32 || get_epoch(leaf_val) != epoch) { + if (bit_idx == 32 || !EPOCH::compare_epochs(leaf_val, epoch)) { // Leaf empty, need to move up again. advanced_epoch = true; break; @@ -171,6 +169,26 @@ struct atomic_bit_tree { return index * bit_count + offset + 1; } + template + void change_bit(std::size_t window_index, std::size_t index, std::uint64_t epoch, std::memory_order order = BITSET_DEFAULT_MEMORY_ORDER) { + //assert(window_index < window_count); + //assert(index < blocks_per_window); + int idx = leaves_start_index + static_cast(index / bit_count); + auto root = &data[window_index * fragments_per_window]; + auto* leaf = &root[idx]; + auto leaf_val = leaf->value.load(order); + auto [success, advanced_epoch] = try_change_bit(epoch, *leaf, leaf_val, index % bit_count, order); + while (advanced_epoch && idx > 0) { + // idx = bit_count * parent + child_idx + 1 + int child_idx = idx - 1 - get_parent(idx) * bit_count; + idx = get_parent(idx); + leaf = &root[idx]; + leaf_val = leaf->value.load(order); + auto bit_change_ret = try_change_bit(epoch, *leaf, leaf_val, child_idx, order); + advanced_epoch = bit_change_ret.second; + } + } + public: atomic_bit_tree(std::size_t window_count, std::size_t blocks_per_window) : leaves_per_window(blocks_per_window / bit_count), @@ -198,31 +216,19 @@ struct atomic_bit_tree { void set_epoch_if_empty(std::size_t window_index, std::uint64_t epoch, std::memory_order order = BITSET_DEFAULT_MEMORY_ORDER) { epoch *= 2; - std::uint64_t next_eb = make_unit(epoch + 2); + std::uint64_t next_eb = EPOCH::make_unit(epoch + 2); for (std::size_t i = 0; i < fragments_per_window; i++) { - std::uint64_t eb = make_unit(epoch); + std::uint64_t eb = EPOCH::make_unit(epoch); data[window_index * fragments_per_window + i]->compare_exchange_strong(eb, next_eb, order); } } + void set(std::size_t window_index, std::size_t index, std::uint64_t epoch, std::memory_order order = BITSET_DEFAULT_MEMORY_ORDER) { + return change_bit(window_index, index, epoch * 2, order); + } + void reset(std::size_t window_index, std::size_t index, std::uint64_t epoch, std::memory_order order = BITSET_DEFAULT_MEMORY_ORDER) { - epoch = epoch * 2 + 1; - //assert(window_index < window_count); - //assert(index < blocks_per_window); - int idx = leaves_start_index + static_cast(index / bit_count); - auto root = &data[window_index * fragments_per_window]; - auto* leaf = &root[idx]; - auto leaf_val = leaf->value.load(order); - auto [success, advanced_epoch] = try_change_bit(epoch, *leaf, leaf_val, index % bit_count, order); - while (advanced_epoch && idx > 0) { - // idx = bit_count * parent + child_idx + 1 - int child_idx = idx - 1 - get_parent(idx) * bit_count; - idx = get_parent(idx); - leaf = &root[idx]; - leaf_val = leaf->value.load(order); - auto bit_change_ret = try_change_bit(epoch, *leaf, leaf_val, child_idx, order); - advanced_epoch = bit_change_ret.second; - } + return change_bit(window_index, index, epoch * 2 + 1, order); } }; diff --git a/relaxed_concurrent_fifo/atomic_bitset.h b/relaxed_concurrent_fifo/atomic_bitset.h index 526fea5..456cbda 100644 --- a/relaxed_concurrent_fifo/atomic_bitset.h +++ b/relaxed_concurrent_fifo/atomic_bitset.h @@ -7,6 +7,7 @@ #include #include +#include "epoch_handling.hpp" #include "utility.h" #ifndef BITSET_DEFAULT_MEMORY_ORDER @@ -23,7 +24,7 @@ enum class claim_mode { READ_ONLY, }; -template +template class atomic_bitset { private: static_assert(sizeof(ARR_TYPE) <= 4, "Inner bitset type must be 4 bytes or smaller to allow for storing epoch."); @@ -38,36 +39,42 @@ class atomic_bitset { static constexpr std::size_t bit_count = sizeof(ARR_TYPE) * 8; std::unique_ptr>[]> data; - static constexpr std::uint64_t get_epoch(std::uint64_t epoch_and_bits) { return epoch_and_bits >> 32; } - static constexpr std::uint64_t get_bits(std::uint64_t epoch_and_bits) { return epoch_and_bits & 0xffff'ffff; } - static constexpr std::uint64_t make_unit(std::uint64_t epoch) { return epoch << 32; } - template static constexpr void set_bit_atomic(std::atomic& epoch_and_bits, std::size_t index, std::uint64_t epoch, std::memory_order order) { - std::uint64_t eb = epoch_and_bits.load(order); - std::uint64_t test; - std::uint64_t stencil = 1ull << index; - do { - if (get_epoch(eb) != epoch) { - return; - } + if constexpr (EPOCH::uses_epochs) { + std::uint64_t eb = epoch_and_bits.load(order); + std::uint64_t test; + std::uint64_t stencil = 1ull << index; + do { + if (!EPOCH::compare_epochs(eb, epoch)) { + return; + } + if constexpr (SET) { + test = eb | stencil; + } else { + // TODO: Special case handling like this is probably bad. + // We basically want to increment the epoch when the last filled bit has been reset. + test = eb & ~stencil; + if (EPOCH::get_bits(test) == 0) { + test = EPOCH::make_unit(epoch + 1); + } + } + } while (!epoch_and_bits.compare_exchange_weak(eb, test, order)); + } else { + ARR_TYPE mask = static_cast(1) << index; if constexpr (SET) { - test = eb | stencil; + epoch_and_bits.fetch_or(mask, order); } else { - // TODO: Special case handling like this is probably bad. - // We basically want to increment the epoch when the last filled bit has been reset. - test = eb & ~stencil; - if (get_bits(test) == 0) { - test = make_unit(epoch + 1); - } + epoch_and_bits.fetch_and(~mask, order); } - } while (!epoch_and_bits.compare_exchange_weak(eb, test, order)); + } + } template static constexpr std::size_t claim_bit_singular(std::atomic& epoch_and_bits, int initial_rot, std::uint64_t epoch, std::memory_order order) { std::uint64_t eb = epoch_and_bits.load(order); - if (get_epoch(eb) != epoch) { // TODO Do we properly mask the epoch we pass here??? + if (!EPOCH::compare_epochs(eb, epoch)) { // TODO Do we properly mask the epoch we pass here (we only have 32 bits)??? return std::numeric_limits::max(); } while (true) { @@ -89,11 +96,11 @@ class atomic_bitset { while (true) { if (epoch_and_bits.compare_exchange_weak(eb, VALUE == claim_value::ONE && test == 0 - ? make_unit(epoch + 1) - : (make_unit(epoch) | test), order)) { + ? EPOCH::make_unit(epoch + 1) + : (EPOCH::make_unit(epoch) | test), order)) { return original_index; } - if (get_epoch(eb) != epoch) [[unlikely]] { + if (!EPOCH::compare_epochs(eb, epoch)) [[unlikely]] { return std::numeric_limits::max(); } raw = static_cast(eb); @@ -149,7 +156,7 @@ class atomic_bitset { [[nodiscard]] constexpr bool any(std::size_t window_index, std::uint64_t epoch, std::memory_order order = BITSET_DEFAULT_MEMORY_ORDER) const { for (std::size_t i = 0; i < units_per_window; i++) { std::uint64_t eb = data[window_index * units_per_window + i]->load(order); - if (get_epoch(eb) == epoch && get_bits(eb)) { + if (EPOCH::compare_epochs(eb, epoch) && EPOCH::get_bits(eb)) { return true; } } @@ -157,9 +164,9 @@ class atomic_bitset { } void set_epoch_if_empty(std::size_t window_index, std::uint64_t epoch, std::memory_order order = BITSET_DEFAULT_MEMORY_ORDER) { - std::uint64_t next_eb = make_unit(epoch + 1); + std::uint64_t next_eb = EPOCH::make_unit(epoch + 1); for (std::size_t i = 0; i < units_per_window; i++) { - std::uint64_t eb = make_unit(epoch); + std::uint64_t eb = EPOCH::make_unit(epoch); data[window_index * units_per_window + i]->compare_exchange_strong(eb, next_eb, order); } } diff --git a/relaxed_concurrent_fifo/atomic_bitset_no_epoch.h b/relaxed_concurrent_fifo/atomic_bitset_no_epoch.h deleted file mode 100644 index a7cd406..0000000 --- a/relaxed_concurrent_fifo/atomic_bitset_no_epoch.h +++ /dev/null @@ -1,116 +0,0 @@ -#ifndef ATOMIC_BITSET_NO_EPOCH_H_INCLUDED -#define ATOMIC_BITSET_NO_EPOCH_H_INCLUDED - -#include "atomic_bitset.h" - -#include -#include -#include -#include -#include -#include - -#include "utility.h" - -template -class atomic_bitset_no_epoch { -private: -#ifndef NDEBUG - std::size_t window_count; - std::size_t blocks_per_window; -#endif - std::size_t units_per_window; - std::size_t units_per_window_mod_mask; - - static constexpr std::size_t bit_count = sizeof(ARR_TYPE) * 8; - std::unique_ptr>[]> data; - - template - static constexpr void set_bit_atomic(std::atomic& bits, std::size_t index, std::memory_order order) { - ARR_TYPE mask = static_cast(1) << index; - if constexpr (SET) { - bits.fetch_or(mask, order); - } else { - bits.fetch_and(~mask, order); - } - } - - template - static constexpr std::size_t claim_bit_singular(std::atomic& bits, int initial_rot, std::memory_order order) { - ARR_TYPE raw = bits.load(order); - while (true) { - ARR_TYPE rotated = std::rotr(raw, initial_rot); - int counted = VALUE == claim_value::ONE ? std::countr_zero(rotated) : std::countr_one(rotated); - if (counted == bit_count) { - return std::numeric_limits::max(); - } - std::size_t original_index = (initial_rot + counted) % bit_count; - if constexpr (MODE == claim_mode::READ_WRITE) { - ARR_TYPE test; - if constexpr (VALUE == claim_value::ONE) { - test = raw & ~(1ull << original_index); - } else { - test = raw | (1ull << original_index); - } - // Keep retrying until the bit we are trying to claim has changed. - while (true) { - if (bits.compare_exchange_weak(raw, test, order)) { - return original_index; - } - if constexpr (VALUE == claim_value::ONE) { - test = raw & ~(1ull << original_index); - } else { - test = raw | (1ull << original_index); - } - if (test == raw) [[unlikely]] { - break; - } - } - } else { - return original_index; - } - } - } - -public: - atomic_bitset_no_epoch(std::size_t window_count, std::size_t blocks_per_window) : -#ifndef NDEBUG - window_count(window_count), - blocks_per_window(blocks_per_window), -#endif - units_per_window(blocks_per_window / bit_count), - units_per_window_mod_mask((blocks_per_window / bit_count) - 1), - data(std::make_unique>[]>(window_count * units_per_window)) { - assert(blocks_per_window % bit_count == 0); - } - - constexpr void set(std::size_t window_index, std::size_t index, std::memory_order order = BITSET_DEFAULT_MEMORY_ORDER) { - assert(window_index < window_count); - assert(index < blocks_per_window); - set_bit_atomic(data[window_index * units_per_window + index / bit_count], index % bit_count, order); - } - - constexpr void reset(std::size_t window_index, std::size_t index, std::memory_order order = BITSET_DEFAULT_MEMORY_ORDER) { - assert(window_index < window_count); - assert(index < blocks_per_window); - set_bit_atomic(data[window_index * units_per_window + index / bit_count], index % bit_count, order); - } - - template - std::size_t claim_bit(std::size_t window_index, int starting_bit, std::memory_order order = BITSET_DEFAULT_MEMORY_ORDER) { - assert(window_index < window_count); - assert(static_cast(starting_bit) < blocks_per_window); - int off = starting_bit / bit_count; - int initial_rot = starting_bit % bit_count; - for (std::size_t i = 0; i < units_per_window; i++) { - auto index = (i + off) & units_per_window_mod_mask; - if (auto ret = claim_bit_singular(data[window_index * units_per_window + index], initial_rot, order); - ret != std::numeric_limits::max()) { - return ret + index * bit_count; - } - } - return std::numeric_limits::max(); - } -}; - -#endif // ATOMIC_BITSET_NO_EPOCH_H_INCLUDED diff --git a/relaxed_concurrent_fifo/block_based_queue.h b/relaxed_concurrent_fifo/block_based_queue.h index d77ae59..0f13e7f 100644 --- a/relaxed_concurrent_fifo/block_based_queue.h +++ b/relaxed_concurrent_fifo/block_based_queue.h @@ -9,8 +9,6 @@ #include #include "fifo.h" -#include "atomic_bitset.h" -#include "atomic_bitset_no_epoch.h" #include "atomic_bit_tree.h" #ifndef BBQ_LOG_WINDOW_MOVE @@ -88,7 +86,7 @@ class block_based_queue { static inline std::atomic_uint64_t dummy_block_value{ epoch_to_header(0x1000'0000ull) }; static inline block_t dummy_block{ reinterpret_cast(&dummy_block_value) }; - atomic_bitset_no_epoch touched_set; + atomic_bit_tree touched_set; atomic_bit_tree filled_set; std::unique_ptr buffer; @@ -116,13 +114,13 @@ class block_based_queue { } // The touched set update can be missed, which might trigger a reader to attempt to move, // but the filled set will prevent the move from occuring. - touched_set.set(index, free_bit, std::memory_order_relaxed); + touched_set.set(index, free_bit, 0, std::memory_order_relaxed); return get_block(index, free_bit); } block_t try_get_free_read_block(std::uint64_t window_index, int starting_bit) { auto index = window_to_index(window_index); - std::size_t free_bit = touched_set.template claim_bit(index, starting_bit, std::memory_order_relaxed); + std::size_t free_bit = touched_set.template claim_bit(index, starting_bit, 0, std::memory_order_relaxed); if (free_bit == std::numeric_limits::max()) { return nullptr; } diff --git a/relaxed_concurrent_fifo/epoch_handling.hpp b/relaxed_concurrent_fifo/epoch_handling.hpp new file mode 100644 index 0000000..c792e9a --- /dev/null +++ b/relaxed_concurrent_fifo/epoch_handling.hpp @@ -0,0 +1,42 @@ +#ifndef EPOCH_HANDLING +#define EPOCH_HANDLING + +#include + +template +concept epoch_handling = requires(std::uint64_t u64) { + { T::compare_epochs(u64, u64) } -> std::same_as; + { T::uses_epochs } -> std::convertible_to; + { T::make_unit(u64) } -> std::same_as; + { T::get_bits(u64) } -> std::same_as; +}; + +struct default_epoch_handling { + static constexpr bool uses_epochs = true; + static constexpr bool compare_epochs(std::uint64_t epoch_and_bits, std::uint64_t epoch) { + return (epoch_and_bits >> 32) == epoch; + } + static constexpr std::uint64_t make_unit(std::uint64_t epoch) { + return epoch << 32; + } + static constexpr std::uint64_t get_bits(std::uint64_t bits) { + return bits & 0xffff'ffff; + } +}; +static_assert(epoch_handling); + +struct no_epoch_handling { + static constexpr bool uses_epochs = false; + static constexpr bool compare_epochs(std::uint64_t, std::uint64_t) { + return true; + } + static constexpr std::uint64_t make_unit(std::uint64_t) { + return 0; + } + static constexpr std::uint64_t get_bits(std::uint64_t bits) { + return bits; + } +}; +static_assert(epoch_handling); + +#endif // EPOCH_HANDLING From f0ce80425c21c6a25b1396ce4fd1cf287fee5a38 Mon Sep 17 00:00:00 2001 From: Salvage <29021710+Saalvage@users.noreply.github.com> Date: Sun, 30 Nov 2025 03:32:04 +0100 Subject: [PATCH 06/15] Fix has_valid_bit check --- relaxed_concurrent_fifo/atomic_bit_tree.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/relaxed_concurrent_fifo/atomic_bit_tree.h b/relaxed_concurrent_fifo/atomic_bit_tree.h index b782d46..8f9febd 100644 --- a/relaxed_concurrent_fifo/atomic_bit_tree.h +++ b/relaxed_concurrent_fifo/atomic_bit_tree.h @@ -56,7 +56,7 @@ struct atomic_bit_tree { template bool has_valid_bit(std::uint64_t value) { - return VALUE == claim_value::ONE ? EPOCH::get_bits(value) : ~EPOCH::get_bits(value); + return VALUE == claim_value::ONE ? EPOCH::get_bits(value) : static_cast(~value); } static inline thread_local std::minstd_rand rng{std::random_device()()}; From 9ebe65284be2c1c50f295a7c41e1d834b4bacbf6 Mon Sep 17 00:00:00 2001 From: Salvage <29021710+Saalvage@users.noreply.github.com> Date: Sun, 30 Nov 2025 03:43:37 +0100 Subject: [PATCH 07/15] Fix up propagation when stepping into empty/full node --- relaxed_concurrent_fifo/atomic_bit_tree.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/relaxed_concurrent_fifo/atomic_bit_tree.h b/relaxed_concurrent_fifo/atomic_bit_tree.h index 8f9febd..a9fd196 100644 --- a/relaxed_concurrent_fifo/atomic_bit_tree.h +++ b/relaxed_concurrent_fifo/atomic_bit_tree.h @@ -108,12 +108,13 @@ struct atomic_bit_tree { bool advanced_epoch = false; while (idx < leaves_start_index) { - idx = get_random_child(static_cast(leaf_val), idx); - if (idx == -1) { - // TODO + auto new_idx = get_random_child(static_cast(leaf_val), idx); + if (new_idx == -1) { + // We walked into an out-of-date node. Let's propagate this information up. advanced_epoch = true; break; } + idx = new_idx; leaf = &root[idx]; leaf_val = leaf->value.load(order); if (!EPOCH::compare_epochs(leaf_val, epoch)) { @@ -122,7 +123,7 @@ struct atomic_bit_tree { } } - // Skip if we didn't find a leaf but stepped into an invalid + // Skip if we didn't find a leaf but stepped into an invalid node. if (!advanced_epoch) { do { auto bit_idx = select_random_bit_index(static_cast(leaf_val)); From 634190ff0cd34dd6df8820b70661d6a9a3e244ae Mon Sep 17 00:00:00 2001 From: Salvage <29021710+Saalvage@users.noreply.github.com> Date: Sun, 30 Nov 2025 04:03:30 +0100 Subject: [PATCH 08/15] Avoid extra epoch for read phase --- relaxed_concurrent_fifo/atomic_bit_tree.h | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/relaxed_concurrent_fifo/atomic_bit_tree.h b/relaxed_concurrent_fifo/atomic_bit_tree.h index a9fd196..7966205 100644 --- a/relaxed_concurrent_fifo/atomic_bit_tree.h +++ b/relaxed_concurrent_fifo/atomic_bit_tree.h @@ -44,8 +44,8 @@ struct atomic_bit_tree { // TODO: These conditions are not always needed. while (modified != EPOCH::get_bits(leaf_val) && EPOCH::compare_epochs(leaf_val, epoch)) { bool advanced_epoch = modified == static_cast(VALUE == claim_value::ONE ? 0 : ~0); - if (leaf.compare_exchange_strong(leaf_val, advanced_epoch - ? (EPOCH::make_unit(epoch + 1) | (VALUE == claim_value::ZERO ? modified : 0)) + if (leaf.compare_exchange_strong(leaf_val, advanced_epoch && VALUE == claim_value::ONE + ? (EPOCH::make_unit(epoch + 1)) : (EPOCH::make_unit(epoch) | modified), order)) { return {true, advanced_epoch}; } @@ -117,6 +117,7 @@ struct atomic_bit_tree { idx = new_idx; leaf = &root[idx]; leaf_val = leaf->value.load(order); + // TODO: Check for has_valid_bit here (avoids random call)? if (!EPOCH::compare_epochs(leaf_val, epoch)) { advanced_epoch = true; break; @@ -203,7 +204,6 @@ struct atomic_bit_tree { template std::size_t claim_bit(std::size_t window_index, int starting_bit, std::uint64_t epoch, std::memory_order order = BITSET_DEFAULT_MEMORY_ORDER) { // We use modified epochs. - epoch = epoch * 2 + (VALUE == claim_value::ONE ? 1 : 0); auto ret = claim_bit_singular(&data[window_index * fragments_per_window], starting_bit, epoch, order); /*std::cout << window_index << " " << (int)VALUE << " " << (int)MODE << " "; @@ -216,8 +216,7 @@ struct atomic_bit_tree { } void set_epoch_if_empty(std::size_t window_index, std::uint64_t epoch, std::memory_order order = BITSET_DEFAULT_MEMORY_ORDER) { - epoch *= 2; - std::uint64_t next_eb = EPOCH::make_unit(epoch + 2); + std::uint64_t next_eb = EPOCH::make_unit(epoch + 1); for (std::size_t i = 0; i < fragments_per_window; i++) { std::uint64_t eb = EPOCH::make_unit(epoch); data[window_index * fragments_per_window + i]->compare_exchange_strong(eb, next_eb, order); @@ -225,11 +224,11 @@ struct atomic_bit_tree { } void set(std::size_t window_index, std::size_t index, std::uint64_t epoch, std::memory_order order = BITSET_DEFAULT_MEMORY_ORDER) { - return change_bit(window_index, index, epoch * 2, order); + return change_bit(window_index, index, epoch, order); } void reset(std::size_t window_index, std::size_t index, std::uint64_t epoch, std::memory_order order = BITSET_DEFAULT_MEMORY_ORDER) { - return change_bit(window_index, index, epoch * 2 + 1, order); + return change_bit(window_index, index, epoch, order); } }; From 3543f774b6ed458639f0af4f84808904a55df9c9 Mon Sep 17 00:00:00 2001 From: Salvage <29021710+Saalvage@users.noreply.github.com> Date: Sun, 30 Nov 2025 04:10:53 +0100 Subject: [PATCH 09/15] Revert "Avoid extra epoch for read phase" This reverts commit 634190ff0cd34dd6df8820b70661d6a9a3e244ae. --- relaxed_concurrent_fifo/atomic_bit_tree.h | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/relaxed_concurrent_fifo/atomic_bit_tree.h b/relaxed_concurrent_fifo/atomic_bit_tree.h index 7966205..a9fd196 100644 --- a/relaxed_concurrent_fifo/atomic_bit_tree.h +++ b/relaxed_concurrent_fifo/atomic_bit_tree.h @@ -44,8 +44,8 @@ struct atomic_bit_tree { // TODO: These conditions are not always needed. while (modified != EPOCH::get_bits(leaf_val) && EPOCH::compare_epochs(leaf_val, epoch)) { bool advanced_epoch = modified == static_cast(VALUE == claim_value::ONE ? 0 : ~0); - if (leaf.compare_exchange_strong(leaf_val, advanced_epoch && VALUE == claim_value::ONE - ? (EPOCH::make_unit(epoch + 1)) + if (leaf.compare_exchange_strong(leaf_val, advanced_epoch + ? (EPOCH::make_unit(epoch + 1) | (VALUE == claim_value::ZERO ? modified : 0)) : (EPOCH::make_unit(epoch) | modified), order)) { return {true, advanced_epoch}; } @@ -117,7 +117,6 @@ struct atomic_bit_tree { idx = new_idx; leaf = &root[idx]; leaf_val = leaf->value.load(order); - // TODO: Check for has_valid_bit here (avoids random call)? if (!EPOCH::compare_epochs(leaf_val, epoch)) { advanced_epoch = true; break; @@ -204,6 +203,7 @@ struct atomic_bit_tree { template std::size_t claim_bit(std::size_t window_index, int starting_bit, std::uint64_t epoch, std::memory_order order = BITSET_DEFAULT_MEMORY_ORDER) { // We use modified epochs. + epoch = epoch * 2 + (VALUE == claim_value::ONE ? 1 : 0); auto ret = claim_bit_singular(&data[window_index * fragments_per_window], starting_bit, epoch, order); /*std::cout << window_index << " " << (int)VALUE << " " << (int)MODE << " "; @@ -216,7 +216,8 @@ struct atomic_bit_tree { } void set_epoch_if_empty(std::size_t window_index, std::uint64_t epoch, std::memory_order order = BITSET_DEFAULT_MEMORY_ORDER) { - std::uint64_t next_eb = EPOCH::make_unit(epoch + 1); + epoch *= 2; + std::uint64_t next_eb = EPOCH::make_unit(epoch + 2); for (std::size_t i = 0; i < fragments_per_window; i++) { std::uint64_t eb = EPOCH::make_unit(epoch); data[window_index * fragments_per_window + i]->compare_exchange_strong(eb, next_eb, order); @@ -224,11 +225,11 @@ struct atomic_bit_tree { } void set(std::size_t window_index, std::size_t index, std::uint64_t epoch, std::memory_order order = BITSET_DEFAULT_MEMORY_ORDER) { - return change_bit(window_index, index, epoch, order); + return change_bit(window_index, index, epoch * 2, order); } void reset(std::size_t window_index, std::size_t index, std::uint64_t epoch, std::memory_order order = BITSET_DEFAULT_MEMORY_ORDER) { - return change_bit(window_index, index, epoch, order); + return change_bit(window_index, index, epoch * 2 + 1, order); } }; From fa20449ecdca147b2a2246f90e4d38f998e2cec7 Mon Sep 17 00:00:00 2001 From: Salvage <29021710+Saalvage@users.noreply.github.com> Date: Mon, 1 Dec 2025 01:18:59 +0100 Subject: [PATCH 10/15] Avoid redundant checks for invalid bits --- relaxed_concurrent_fifo/atomic_bit_tree.h | 24 +++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/relaxed_concurrent_fifo/atomic_bit_tree.h b/relaxed_concurrent_fifo/atomic_bit_tree.h index a9fd196..c209a27 100644 --- a/relaxed_concurrent_fifo/atomic_bit_tree.h +++ b/relaxed_concurrent_fifo/atomic_bit_tree.h @@ -42,7 +42,7 @@ struct atomic_bit_tree { std::pair try_change_bit(std::uint64_t epoch, std::atomic_uint64_t& leaf, std::uint64_t& leaf_val, int bit_idx, std::memory_order order) { ARR_TYPE modified = modify(leaf_val, bit_idx); // TODO: These conditions are not always needed. - while (modified != EPOCH::get_bits(leaf_val) && EPOCH::compare_epochs(leaf_val, epoch)) { + while (modified != EPOCH::get_bits(leaf_val) && compare_epoch(leaf_val, epoch)) { bool advanced_epoch = modified == static_cast(VALUE == claim_value::ONE ? 0 : ~0); if (leaf.compare_exchange_strong(leaf_val, advanced_epoch ? (EPOCH::make_unit(epoch + 1) | (VALUE == claim_value::ZERO ? modified : 0)) @@ -54,11 +54,6 @@ struct atomic_bit_tree { return {false, false}; } - template - bool has_valid_bit(std::uint64_t value) { - return VALUE == claim_value::ONE ? EPOCH::get_bits(value) : static_cast(~value); - } - static inline thread_local std::minstd_rand rng{std::random_device()()}; template @@ -94,14 +89,14 @@ struct atomic_bit_tree { do { // TODO: Potentially directly use countl_xxx here to avoid it later? // TODO: Epoch check more explicit (+1). - while (idx > 0 && (!EPOCH::compare_epochs(leaf_val, epoch) || !has_valid_bit(leaf_val))) { + while (idx > 0 && !compare_epoch(leaf_val, epoch)) { idx = get_parent(idx); leaf = &root[idx]; leaf_val = leaf->value.load(order); // TODO: Automatically fix parent here if child is erroneously marked? } - if (!EPOCH::compare_epochs(leaf_val, epoch) || !has_valid_bit(leaf_val)) { + if (!compare_epoch(leaf_val, epoch)) { // Root is invalid as well. return std::numeric_limits::max(); } @@ -117,7 +112,7 @@ struct atomic_bit_tree { idx = new_idx; leaf = &root[idx]; leaf_val = leaf->value.load(order); - if (!EPOCH::compare_epochs(leaf_val, epoch)) { + if (!compare_epoch(leaf_val, epoch)) { advanced_epoch = true; break; } @@ -127,7 +122,7 @@ struct atomic_bit_tree { if (!advanced_epoch) { do { auto bit_idx = select_random_bit_index(static_cast(leaf_val)); - if (bit_idx == 32 || !EPOCH::compare_epochs(leaf_val, epoch)) { + if (bit_idx == 32 || !compare_epoch(leaf_val, epoch)) { // Leaf empty, need to move up again. advanced_epoch = true; break; @@ -170,6 +165,15 @@ struct atomic_bit_tree { return index * bit_count + offset + 1; } + template + bool compare_epoch(std::uint64_t eb, std::uint64_t epoch) { + if constexpr (EPOCH::uses_epochs) { + return EPOCH::compare_epochs(eb, epoch); + } else { + return VALUE == claim_value::ONE ? EPOCH::get_bits(eb) : static_cast(~eb); + } + } + template void change_bit(std::size_t window_index, std::size_t index, std::uint64_t epoch, std::memory_order order = BITSET_DEFAULT_MEMORY_ORDER) { //assert(window_index < window_count); From 09e90a6b615eb621e5f91f8ceb57ad96acabbeb8 Mon Sep 17 00:00:00 2001 From: Salvage <29021710+Saalvage@users.noreply.github.com> Date: Mon, 1 Dec 2025 02:06:33 +0100 Subject: [PATCH 11/15] Eradicate more checks made redundant by double epochs --- relaxed_concurrent_fifo/atomic_bit_tree.h | 27 +++++++---------------- 1 file changed, 8 insertions(+), 19 deletions(-) diff --git a/relaxed_concurrent_fifo/atomic_bit_tree.h b/relaxed_concurrent_fifo/atomic_bit_tree.h index c209a27..903fc9f 100644 --- a/relaxed_concurrent_fifo/atomic_bit_tree.h +++ b/relaxed_concurrent_fifo/atomic_bit_tree.h @@ -66,9 +66,7 @@ struct atomic_bit_tree { value = ~value; } - if (value == 0) { - return 32; - } + assert(value); auto valid_bits = std::popcount(value); auto nth_bit = std::uniform_int_distribution<>{0, valid_bits - 1}(rng); @@ -103,13 +101,7 @@ struct atomic_bit_tree { bool advanced_epoch = false; while (idx < leaves_start_index) { - auto new_idx = get_random_child(static_cast(leaf_val), idx); - if (new_idx == -1) { - // We walked into an out-of-date node. Let's propagate this information up. - advanced_epoch = true; - break; - } - idx = new_idx; + idx = get_random_child(static_cast(leaf_val), idx); leaf = &root[idx]; leaf_val = leaf->value.load(order); if (!compare_epoch(leaf_val, epoch)) { @@ -122,12 +114,6 @@ struct atomic_bit_tree { if (!advanced_epoch) { do { auto bit_idx = select_random_bit_index(static_cast(leaf_val)); - if (bit_idx == 32 || !compare_epoch(leaf_val, epoch)) { - // Leaf empty, need to move up again. - advanced_epoch = true; - break; - } - ret = (idx - leaves_start_index) * bit_count + bit_idx; if constexpr (MODE == claim_mode::READ_ONLY) { return ret; @@ -135,6 +121,12 @@ struct atomic_bit_tree { auto bit_change_ret = try_change_bit(epoch, *leaf, leaf_val, bit_idx, order); success = bit_change_ret.first; advanced_epoch = bit_change_ret.second; + // TODO: This check is already done in try_change_bit, try merging it. + if (!compare_epoch(leaf_val, epoch)) { + // Leaf empty, need to move up again. + advanced_epoch = true; + break; + } } while (!success); } @@ -159,9 +151,6 @@ struct atomic_bit_tree { template int get_random_child(ARR_TYPE node, int index) { auto offset = select_random_bit_index(node); - if (offset == 32) { - return -1; - } return index * bit_count + offset + 1; } From f162e75f5efd15b650626f6bee580dd039cf05e7 Mon Sep 17 00:00:00 2001 From: Salvage <29021710+Saalvage@users.noreply.github.com> Date: Mon, 1 Dec 2025 02:06:45 +0100 Subject: [PATCH 12/15] Configure blocks per window statically --- .../benchmarks/providers/benchmark_provider_other.hpp | 4 ++-- relaxed_concurrent_fifo/block_based_queue.h | 4 ++-- relaxed_concurrent_fifo/config.hpp | 9 +++++---- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/relaxed_concurrent_fifo/benchmarks/providers/benchmark_provider_other.hpp b/relaxed_concurrent_fifo/benchmarks/providers/benchmark_provider_other.hpp index 9d89c01..2d9bd56 100644 --- a/relaxed_concurrent_fifo/benchmarks/providers/benchmark_provider_other.hpp +++ b/relaxed_concurrent_fifo/benchmarks/providers/benchmark_provider_other.hpp @@ -24,8 +24,8 @@ #pragma GCC diagnostic pop #endif -template -using benchmark_provider_bbq = benchmark_provider_generic, BENCHMARK, double, std::size_t>; +template +using benchmark_provider_bbq = benchmark_provider_generic, BENCHMARK, double, std::size_t>; template using benchmark_provider_kfifo = benchmark_provider_generic, BENCHMARK, double>; diff --git a/relaxed_concurrent_fifo/block_based_queue.h b/relaxed_concurrent_fifo/block_based_queue.h index 0f13e7f..9b6bf27 100644 --- a/relaxed_concurrent_fifo/block_based_queue.h +++ b/relaxed_concurrent_fifo/block_based_queue.h @@ -58,7 +58,7 @@ struct block { static_assert(std::is_trivially_destructible_v>); }; -template +template class block_based_queue { private: std::size_t blocks_per_window; @@ -149,7 +149,7 @@ class block_based_queue { public: block_based_queue(int thread_count, std::size_t min_size, double blocks_per_window_per_thread, std::size_t cells_per_block) : - blocks_per_window(512), + blocks_per_window(BLOCKS_PER_WINDOW), window_block_distribution(0, static_cast(blocks_per_window - 1)), window_count(std::max(4, std::bit_ceil(min_size / blocks_per_window / cells_per_block))), window_count_mod_mask(window_count - 1), diff --git a/relaxed_concurrent_fifo/config.hpp b/relaxed_concurrent_fifo/config.hpp index e0ca7ce..48d0ae6 100644 --- a/relaxed_concurrent_fifo/config.hpp +++ b/relaxed_concurrent_fifo/config.hpp @@ -43,13 +43,14 @@ static void add_instances(std::vector>("{},{},blockfifo", 1, c - 1)); + instances.push_back(std::make_unique>("{},{},blockfifo-64", 1, c - 1)); + instances.push_back(std::make_unique>("{},{},blockfifo-512", 1, c - 1)); } //} } else { - instances.push_back(std::make_unique>("blockfifo-{}-{}", 1, 7)); - instances.push_back(std::make_unique>("blockfifo-{}-{}", 1, 63)); - instances.push_back(std::make_unique>("blockfifo-{}-{}", 1, 511)); + instances.push_back(std::make_unique>("blockfifo-{}-{}", 1, 7)); + instances.push_back(std::make_unique>("blockfifo-{}-{}", 1, 63)); + instances.push_back(std::make_unique>("blockfifo-{}-{}", 1, 511)); } #endif From 8176775377aa8c78135f087b29831d5438667c7b Mon Sep 17 00:00:00 2001 From: Salvage <29021710+Saalvage@users.noreply.github.com> Date: Fri, 19 Dec 2025 10:13:31 +0100 Subject: [PATCH 13/15] Allow imperfect bit trees --- relaxed_concurrent_fifo/atomic_bit_tree.h | 59 +++++++++++++--------- relaxed_concurrent_fifo/atomic_bitset.h | 8 ++- relaxed_concurrent_fifo/epoch_handling.hpp | 7 --- 3 files changed, 42 insertions(+), 32 deletions(-) diff --git a/relaxed_concurrent_fifo/atomic_bit_tree.h b/relaxed_concurrent_fifo/atomic_bit_tree.h index 903fc9f..d42c582 100644 --- a/relaxed_concurrent_fifo/atomic_bit_tree.h +++ b/relaxed_concurrent_fifo/atomic_bit_tree.h @@ -19,13 +19,10 @@ struct atomic_bit_tree { int leaves_start_index; static constexpr std::size_t bit_count = sizeof(ARR_TYPE) * 8; - // TODO Don't need this, bit_count is constexpr. - static constexpr std::size_t bit_count_log_2 = std::bit_width(bit_count) - 1; std::unique_ptr>[]> data; - static constexpr std::size_t calculate_fragment_count(std::size_t leaves) { - auto height = (std::bit_width(leaves) - 1) / bit_count_log_2; - return ((1ull << ((height + 1) * bit_count_log_2)) - 1) / (bit_count - 1); + std::uint64_t get_bits(std::uint64_t eb) { + return eb & ((1 << bit_count) - 1); } template @@ -40,13 +37,14 @@ struct atomic_bit_tree { template std::pair try_change_bit(std::uint64_t epoch, std::atomic_uint64_t& leaf, std::uint64_t& leaf_val, int bit_idx, std::memory_order order) { + std::uint64_t valid_mask = static_cast(leaf_val >> bit_count) << bit_count; ARR_TYPE modified = modify(leaf_val, bit_idx); // TODO: These conditions are not always needed. - while (modified != EPOCH::get_bits(leaf_val) && compare_epoch(leaf_val, epoch)) { + while (modified != get_bits(leaf_val) && compare_epoch(leaf_val, epoch)) { bool advanced_epoch = modified == static_cast(VALUE == claim_value::ONE ? 0 : ~0); if (leaf.compare_exchange_strong(leaf_val, advanced_epoch - ? (EPOCH::make_unit(epoch + 1) | (VALUE == claim_value::ZERO ? modified : 0)) - : (EPOCH::make_unit(epoch) | modified), order)) { + ? (EPOCH::make_unit(epoch + 1) | valid_mask | (VALUE == claim_value::ZERO ? modified : 0)) + : (EPOCH::make_unit(epoch) | valid_mask | modified), order)) { return {true, advanced_epoch}; } modified = modify(leaf_val, bit_idx); @@ -57,20 +55,24 @@ struct atomic_bit_tree { static inline thread_local std::minstd_rand rng{std::random_device()()}; template - static int select_random_bit_index(ARR_TYPE value) { + static int select_random_bit_index(std::uint64_t value) { //unsigned value32 = value; //return VALUE == claim_value::ZERO ? std::countr_one(value32) : std::countr_zero(value32); + ARR_TYPE bits = static_cast(value); + // TODO: Don't randomize? (FIFO semantic on fragment level??) if constexpr (VALUE == claim_value::ZERO) { - value = ~value; + bits = ~bits; } - assert(value); + bits = (value >> bit_count) & bits; + + assert(bits); - auto valid_bits = std::popcount(value); + auto valid_bits = std::popcount(bits); auto nth_bit = std::uniform_int_distribution<>{0, valid_bits - 1}(rng); - return std::countr_zero(_pdep_u32(1 << nth_bit, value)); + return std::countr_zero(_pdep_u32(1 << nth_bit, bits)); } template @@ -101,7 +103,7 @@ struct atomic_bit_tree { bool advanced_epoch = false; while (idx < leaves_start_index) { - idx = get_random_child(static_cast(leaf_val), idx); + idx = get_random_child(leaf_val, idx); leaf = &root[idx]; leaf_val = leaf->value.load(order); if (!compare_epoch(leaf_val, epoch)) { @@ -113,7 +115,7 @@ struct atomic_bit_tree { // Skip if we didn't find a leaf but stepped into an invalid node. if (!advanced_epoch) { do { - auto bit_idx = select_random_bit_index(static_cast(leaf_val)); + auto bit_idx = select_random_bit_index(leaf_val); ret = (idx - leaves_start_index) * bit_count + bit_idx; if constexpr (MODE == claim_mode::READ_ONLY) { return ret; @@ -149,7 +151,7 @@ struct atomic_bit_tree { } template - int get_random_child(ARR_TYPE node, int index) { + int get_random_child(std::uint64_t node, int index) { auto offset = select_random_bit_index(node); return index * bit_count + offset + 1; } @@ -159,7 +161,7 @@ struct atomic_bit_tree { if constexpr (EPOCH::uses_epochs) { return EPOCH::compare_epochs(eb, epoch); } else { - return VALUE == claim_value::ONE ? EPOCH::get_bits(eb) : static_cast(~eb); + return VALUE == claim_value::ONE ? get_bits(eb) : static_cast(~eb); } } @@ -185,12 +187,23 @@ struct atomic_bit_tree { public: atomic_bit_tree(std::size_t window_count, std::size_t blocks_per_window) : - leaves_per_window(blocks_per_window / bit_count), - fragments_per_window(calculate_fragment_count(leaves_per_window)), - leaves_start_index(static_cast(fragments_per_window - leaves_per_window)), - data(std::make_unique>[]>(fragments_per_window * window_count)) { - // Must be a perfect k-ary tree. - assert(blocks_per_window == 1ull << ((std::bit_width(blocks_per_window) - 1) / bit_count_log_2 * bit_count_log_2)); + leaves_per_window(blocks_per_window / bit_count) { + // TODO: This restriction can be ever so slighty weakened (6 top level bits also work). + assert(std::has_single_bit(blocks_per_window)); + auto bits_per_level = std::bit_width(bit_count) - 1; + auto bits = std::bit_width(leaves_per_window) - 1; + auto rounded_up_bits = bits + bits_per_level - 1; + auto bits_required_in_top_level = 2 << (rounded_up_bits % bits_per_level); + auto rounded_up_height = rounded_up_bits / bits_per_level; + auto lower_level_fragments = ((1ull << (rounded_up_height * bits_per_level)) - 1) / (bit_count - 1); + fragments_per_window = 1 + lower_level_fragments * bits_required_in_top_level; + leaves_start_index = static_cast(fragments_per_window - leaves_per_window); + // TODO: Don't allocate memory for the "dead" top level bits. + data = std::make_unique>[]>(fragments_per_window * window_count); + for (std::size_t i = 0; i < fragments_per_window * window_count; i++) { + auto bits_in_node = (i % fragments_per_window) == 0 ? bits_required_in_top_level : bit_count; + data[i]->fetch_or(((1 << bits_in_node) - 1) << bit_count); + } } template diff --git a/relaxed_concurrent_fifo/atomic_bitset.h b/relaxed_concurrent_fifo/atomic_bitset.h index 456cbda..8a9e4ca 100644 --- a/relaxed_concurrent_fifo/atomic_bitset.h +++ b/relaxed_concurrent_fifo/atomic_bitset.h @@ -39,6 +39,10 @@ class atomic_bitset { static constexpr std::size_t bit_count = sizeof(ARR_TYPE) * 8; std::unique_ptr>[]> data; + std::uint64_t get_bits(std::uint64_t eb) { + return eb & ((1 << bit_count) - 1); + } + template static constexpr void set_bit_atomic(std::atomic& epoch_and_bits, std::size_t index, std::uint64_t epoch, std::memory_order order) { if constexpr (EPOCH::uses_epochs) { @@ -55,7 +59,7 @@ class atomic_bitset { // TODO: Special case handling like this is probably bad. // We basically want to increment the epoch when the last filled bit has been reset. test = eb & ~stencil; - if (EPOCH::get_bits(test) == 0) { + if (get_bits(test) == 0) { test = EPOCH::make_unit(epoch + 1); } } @@ -156,7 +160,7 @@ class atomic_bitset { [[nodiscard]] constexpr bool any(std::size_t window_index, std::uint64_t epoch, std::memory_order order = BITSET_DEFAULT_MEMORY_ORDER) const { for (std::size_t i = 0; i < units_per_window; i++) { std::uint64_t eb = data[window_index * units_per_window + i]->load(order); - if (EPOCH::compare_epochs(eb, epoch) && EPOCH::get_bits(eb)) { + if (EPOCH::compare_epochs(eb, epoch) && get_bits(eb)) { return true; } } diff --git a/relaxed_concurrent_fifo/epoch_handling.hpp b/relaxed_concurrent_fifo/epoch_handling.hpp index c792e9a..5889b60 100644 --- a/relaxed_concurrent_fifo/epoch_handling.hpp +++ b/relaxed_concurrent_fifo/epoch_handling.hpp @@ -8,7 +8,6 @@ concept epoch_handling = requires(std::uint64_t u64) { { T::compare_epochs(u64, u64) } -> std::same_as; { T::uses_epochs } -> std::convertible_to; { T::make_unit(u64) } -> std::same_as; - { T::get_bits(u64) } -> std::same_as; }; struct default_epoch_handling { @@ -19,9 +18,6 @@ struct default_epoch_handling { static constexpr std::uint64_t make_unit(std::uint64_t epoch) { return epoch << 32; } - static constexpr std::uint64_t get_bits(std::uint64_t bits) { - return bits & 0xffff'ffff; - } }; static_assert(epoch_handling); @@ -33,9 +29,6 @@ struct no_epoch_handling { static constexpr std::uint64_t make_unit(std::uint64_t) { return 0; } - static constexpr std::uint64_t get_bits(std::uint64_t bits) { - return bits; - } }; static_assert(epoch_handling); From 7324eede9c66f8c06ae623370f170f98f885ec53 Mon Sep 17 00:00:00 2001 From: Salvage <29021710+Saalvage@users.noreply.github.com> Date: Fri, 19 Dec 2025 11:17:58 +0100 Subject: [PATCH 14/15] Fix imperfect bit trees --- relaxed_concurrent_fifo/atomic_bit_tree.h | 31 +++++++++++++++-------- relaxed_concurrent_fifo/config.hpp | 4 +-- 2 files changed, 23 insertions(+), 12 deletions(-) diff --git a/relaxed_concurrent_fifo/atomic_bit_tree.h b/relaxed_concurrent_fifo/atomic_bit_tree.h index d42c582..a7ca8c9 100644 --- a/relaxed_concurrent_fifo/atomic_bit_tree.h +++ b/relaxed_concurrent_fifo/atomic_bit_tree.h @@ -21,7 +21,18 @@ struct atomic_bit_tree { static constexpr std::size_t bit_count = sizeof(ARR_TYPE) * 8; std::unique_ptr>[]> data; - std::uint64_t get_bits(std::uint64_t eb) { + template + constexpr bool has_valid_bit(std::uint64_t eb) { + // TODO: Using the double-epochs this can likely be avoided by always assuming 1 = desired and flipping the semantic accordingly when incrementing the epoch. + // This is true except for when there is no epoch handling and as such no decider for the semantic. + auto bits = get_bits(eb); + if constexpr (VALUE == claim_value::ZERO) { + bits = ~bits; + } + return static_cast(bits & (eb >> bit_count)); + } + + constexpr std::uint64_t get_bits(std::uint64_t eb) { return eb & ((1 << bit_count) - 1); } @@ -37,13 +48,14 @@ struct atomic_bit_tree { template std::pair try_change_bit(std::uint64_t epoch, std::atomic_uint64_t& leaf, std::uint64_t& leaf_val, int bit_idx, std::memory_order order) { - std::uint64_t valid_mask = static_cast(leaf_val >> bit_count) << bit_count; + ARR_TYPE target = static_cast(leaf_val >> bit_count); + std::uint64_t valid_mask = target << bit_count; ARR_TYPE modified = modify(leaf_val, bit_idx); // TODO: These conditions are not always needed. while (modified != get_bits(leaf_val) && compare_epoch(leaf_val, epoch)) { - bool advanced_epoch = modified == static_cast(VALUE == claim_value::ONE ? 0 : ~0); + bool advanced_epoch = modified == static_cast(VALUE == claim_value::ONE ? 0 : target); if (leaf.compare_exchange_strong(leaf_val, advanced_epoch - ? (EPOCH::make_unit(epoch + 1) | valid_mask | (VALUE == claim_value::ZERO ? modified : 0)) + ? (EPOCH::make_unit(epoch + 1) | valid_mask | (VALUE == claim_value::ONE ? 0 : target)) : (EPOCH::make_unit(epoch) | valid_mask | modified), order)) { return {true, advanced_epoch}; } @@ -161,7 +173,7 @@ struct atomic_bit_tree { if constexpr (EPOCH::uses_epochs) { return EPOCH::compare_epochs(eb, epoch); } else { - return VALUE == claim_value::ONE ? get_bits(eb) : static_cast(~eb); + return has_valid_bit(eb); } } @@ -195,14 +207,13 @@ struct atomic_bit_tree { auto rounded_up_bits = bits + bits_per_level - 1; auto bits_required_in_top_level = 2 << (rounded_up_bits % bits_per_level); auto rounded_up_height = rounded_up_bits / bits_per_level; - auto lower_level_fragments = ((1ull << (rounded_up_height * bits_per_level)) - 1) / (bit_count - 1); - fragments_per_window = 1 + lower_level_fragments * bits_required_in_top_level; + // TODO: We could save memory by not allocating the leaves for "dead" top level bits (but ONLY leaves). + fragments_per_window = ((1ull << ((rounded_up_height + 1) * bits_per_level)) - 1) / (bit_count - 1); leaves_start_index = static_cast(fragments_per_window - leaves_per_window); - // TODO: Don't allocate memory for the "dead" top level bits. data = std::make_unique>[]>(fragments_per_window * window_count); for (std::size_t i = 0; i < fragments_per_window * window_count; i++) { auto bits_in_node = (i % fragments_per_window) == 0 ? bits_required_in_top_level : bit_count; - data[i]->fetch_or(((1 << bits_in_node) - 1) << bit_count); + data[i]->fetch_or(((1 << bits_in_node) - 1) << (bit_count + bit_count - bits_in_node)); } } @@ -215,7 +226,7 @@ struct atomic_bit_tree { /*std::cout << window_index << " " << (int)VALUE << " " << (int)MODE << " "; for (auto i = 0; i < fragments_per_window; i++) { auto val = data[window_index * fragments_per_window + i]->load(); - std::cout << get_epoch(val) << " " << std::bitset(get_bits(val)) << " | "; + std::cout << std::bitset(get_bits(val)) << " | "; } std::cout << std::endl;*/ return ret; diff --git a/relaxed_concurrent_fifo/config.hpp b/relaxed_concurrent_fifo/config.hpp index 48d0ae6..656bc3a 100644 --- a/relaxed_concurrent_fifo/config.hpp +++ b/relaxed_concurrent_fifo/config.hpp @@ -43,8 +43,8 @@ static void add_instances(std::vector>("{},{},blockfifo-64", 1, c - 1)); - instances.push_back(std::make_unique>("{},{},blockfifo-512", 1, c - 1)); + instances.push_back(std::make_unique>("{},{},blockfifo-128", 1, c - 1)); + instances.push_back(std::make_unique>("{},{},blockfifo-256", 1, c - 1)); } //} } else { From f710d06fc433515f57db080508d8383cde037bb6 Mon Sep 17 00:00:00 2001 From: Salvage <29021710+Saalvage@users.noreply.github.com> Date: Fri, 19 Dec 2025 12:25:57 +0100 Subject: [PATCH 15/15] Restore configurability --- relaxed_concurrent_fifo/atomic_bit_tree.h | 2 +- .../providers/benchmark_provider_other.hpp | 4 ++-- relaxed_concurrent_fifo/block_based_queue.h | 7 ++++--- relaxed_concurrent_fifo/config.hpp | 15 +++++++-------- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/relaxed_concurrent_fifo/atomic_bit_tree.h b/relaxed_concurrent_fifo/atomic_bit_tree.h index a7ca8c9..6cb244b 100644 --- a/relaxed_concurrent_fifo/atomic_bit_tree.h +++ b/relaxed_concurrent_fifo/atomic_bit_tree.h @@ -200,7 +200,7 @@ struct atomic_bit_tree { public: atomic_bit_tree(std::size_t window_count, std::size_t blocks_per_window) : leaves_per_window(blocks_per_window / bit_count) { - // TODO: This restriction can be ever so slighty weakened (6 top level bits also work). + // TODO: This restriction can be ever so slightly weakened (6 top level bits also work). assert(std::has_single_bit(blocks_per_window)); auto bits_per_level = std::bit_width(bit_count) - 1; auto bits = std::bit_width(leaves_per_window) - 1; diff --git a/relaxed_concurrent_fifo/benchmarks/providers/benchmark_provider_other.hpp b/relaxed_concurrent_fifo/benchmarks/providers/benchmark_provider_other.hpp index 2d9bd56..9d89c01 100644 --- a/relaxed_concurrent_fifo/benchmarks/providers/benchmark_provider_other.hpp +++ b/relaxed_concurrent_fifo/benchmarks/providers/benchmark_provider_other.hpp @@ -24,8 +24,8 @@ #pragma GCC diagnostic pop #endif -template -using benchmark_provider_bbq = benchmark_provider_generic, BENCHMARK, double, std::size_t>; +template +using benchmark_provider_bbq = benchmark_provider_generic, BENCHMARK, double, std::size_t>; template using benchmark_provider_kfifo = benchmark_provider_generic, BENCHMARK, double>; diff --git a/relaxed_concurrent_fifo/block_based_queue.h b/relaxed_concurrent_fifo/block_based_queue.h index 9b6bf27..79ceb2f 100644 --- a/relaxed_concurrent_fifo/block_based_queue.h +++ b/relaxed_concurrent_fifo/block_based_queue.h @@ -58,7 +58,7 @@ struct block { static_assert(std::is_trivially_destructible_v>); }; -template +template class block_based_queue { private: std::size_t blocks_per_window; @@ -149,7 +149,8 @@ class block_based_queue { public: block_based_queue(int thread_count, std::size_t min_size, double blocks_per_window_per_thread, std::size_t cells_per_block) : - blocks_per_window(BLOCKS_PER_WINDOW), + blocks_per_window(std::bit_ceil(std::max(sizeof(BITSET_T) * 8, + std::lround(thread_count* blocks_per_window_per_thread)))), window_block_distribution(0, static_cast(blocks_per_window - 1)), window_count(std::max(4, std::bit_ceil(min_size / blocks_per_window / cells_per_block))), window_count_mod_mask(window_count - 1), @@ -169,7 +170,7 @@ class block_based_queue { // At least as big as the bitset's type. assert(blocks_per_window >= sizeof(BITSET_T) * 8); - assert(std::bit_ceil(blocks_per_window) == blocks_per_window); + assert(std::has_single_bit(blocks_per_window)); for (std::size_t i = 0; i < window_count * blocks_per_window; i++) { auto ptr = buffer.get() + i * block_size; diff --git a/relaxed_concurrent_fifo/config.hpp b/relaxed_concurrent_fifo/config.hpp index 656bc3a..229587c 100644 --- a/relaxed_concurrent_fifo/config.hpp +++ b/relaxed_concurrent_fifo/config.hpp @@ -41,16 +41,15 @@ template static void add_instances(std::vector>>& instances, bool parameter_tuning, std::unordered_set& filter_set, bool are_exclude_filters) { #if defined(INCLUDE_BBQ) || defined(INCLUDE_ALL) if (parameter_tuning) { - //for (double b = 0.5; b <= 16; b *= 2) { - for (int c = 2; c <= 2048; c *= 2) { - instances.push_back(std::make_unique>("{},{},blockfifo-128", 1, c - 1)); - instances.push_back(std::make_unique>("{},{},blockfifo-256", 1, c - 1)); + for (double b = 0.5; b <= 16; b *= 2) { + for (int c = 8; c <= 8; c *= 2) { + instances.push_back(std::make_unique>("{},{},blockfifo", b, c - 1)); } - //} + } } else { - instances.push_back(std::make_unique>("blockfifo-{}-{}", 1, 7)); - instances.push_back(std::make_unique>("blockfifo-{}-{}", 1, 63)); - instances.push_back(std::make_unique>("blockfifo-{}-{}", 1, 511)); + instances.push_back(std::make_unique>("blockfifo-{}-{}", 1, 7)); + instances.push_back(std::make_unique>("blockfifo-{}-{}", 1, 63)); + instances.push_back(std::make_unique>("blockfifo-{}-{}", 1, 511)); } #endif