-
Notifications
You must be signed in to change notification settings - Fork 481
Warp determinism #1355
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Warp determinism #1355
Changes from all commits
968c7bb
d5ef316
5ec9b25
72e8e3f
48e7207
3170255
50b7ebc
cc6720d
5b5038f
e34a93c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -13,7 +13,7 @@ | |
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
|
|
||
| """Benchmarks for atomic operations under high thread contention. | ||
| """Benchmarks for atomic operations and deterministic mode overhead. | ||
|
|
||
| All threads write to a single output location (index 0) to maximize contention | ||
| and measure worst-case atomic operation performance. | ||
|
|
@@ -25,13 +25,18 @@ | |
|
|
||
| import warp as wp | ||
|
|
||
# Turn off the ``enable_backward`` module option for every kernel in this file.
wp.set_module_options({"enable_backward": False})

# Lookup from benchmark parameter strings to the matching Warp scalar dtypes.
DTYPE_MAP = dict(float32=wp.float32, int32=wp.int32)

# Problem sizes, expressed in number of elements.
NUM_ELEMENTS = 32 * 1024**2
DETERMINISTIC_NUM_ELEMENTS = 1024**2
COUNTER_NUM_ELEMENTS = 4 * 1024**2
# Size sweep shared by the deterministic-overhead benchmarks (64Ki, 256Ki, 1Mi).
DETERMINISTIC_BENCHMARK_SIZES = [kib * 1024 for kib in (64, 256, 1024)]
|
|
||
|
|
||
| @wp.kernel | ||
|
|
@@ -54,6 +59,60 @@ def min_kernel( | |
| wp.atomic_min(out, 0, val) # All threads contend on out[0] | ||
|
|
||
|
|
||
@wp.kernel
def scatter_add_kernel(
    vals: wp.array(dtype=wp.float32),
    indices: wp.array(dtype=wp.int32),
    out: wp.array(dtype=wp.float32),
):
    # One thread per input element: accumulate vals[i] into the output slot
    # selected by indices[i] using an atomic add.
    i = wp.tid()
    dst = indices[i]
    wp.atomic_add(out, dst, vals[i])
|
|
||
|
|
||
@wp.kernel(deterministic=True, deterministic_max_records=1)
def scatter_add_kernel_deterministic(
    vals: wp.array(dtype=wp.float32),
    indices: wp.array(dtype=wp.int32),
    out: wp.array(dtype=wp.float32),
):
    # Same body as the non-deterministic scatter-add kernel; only the kernel
    # decorator options differ, so the two can be compared directly.
    i = wp.tid()
    dst = indices[i]
    wp.atomic_add(out, dst, vals[i])
|
|
||
|
|
||
@wp.kernel
def counter_kernel(
    vals: wp.array(dtype=wp.float32),
    counter: wp.array(dtype=wp.int32),
    out: wp.array(dtype=wp.float32),
):
    # Each thread bumps counter[0] by one and stores its value at the index
    # returned by the atomic add.
    i = wp.tid()
    value = vals[i]
    slot = wp.atomic_add(counter, 0, 1)
    out[slot] = value
|
|
||
|
|
||
@wp.kernel(deterministic=True, deterministic_max_records=1)
def counter_kernel_deterministic(
    vals: wp.array(dtype=wp.float32),
    counter: wp.array(dtype=wp.int32),
    out: wp.array(dtype=wp.float32),
):
    # Same body as the non-deterministic counter kernel; only the kernel
    # decorator options differ, so the two can be compared directly.
    i = wp.tid()
    value = vals[i]
    slot = wp.atomic_add(counter, 0, 1)
    out[slot] = value
|
|
||
|
|
||
@wp.kernel
def zero_float_array_kernel(out: wp.array(dtype=wp.float32)):
    # Each thread clears the single float32 element at its own thread index.
    i = wp.tid()
    out[i] = 0.0
|
|
||
|
|
||
@wp.kernel
def zero_int_array_kernel(out: wp.array(dtype=wp.int32)):
    # Each thread clears the single int32 element at its own thread index.
    i = wp.tid()
    out[i] = 0
|
|
||
|
|
||
| class AtomicMax: | ||
| """Benchmark wp.atomic_max() with high thread contention. | ||
|
|
||
|
|
@@ -166,3 +225,162 @@ def time_cuda(self, vals_np_dict, dtype_str): | |
| self.out.zero_() | ||
| self.cmd.launch() | ||
| wp.synchronize_device(self.device) | ||
|
|
||
|
|
||
class AtomicAddDeterminismOverhead:
    """Benchmark the overhead of deterministic accumulation atomics.

    The benchmark compares the normal atomic-add path against deterministic
    scatter-sort-reduce for the same kernel using CUDA graph replay. A small
    size sweep exposes where deterministic execution crosses over. Two
    destination counts are used:

    - ``1``: worst-case contention, where every thread targets the same output.
    - ``65536``: lower contention, closer to a scatter workload.
    """

    # Review feedback (Ruff RUF012): keep the ASV parameter grid immutable.
    # Tuples are used instead of lists, and the shared module-level size list
    # is copied into a tuple rather than aliased.
    params = (("normal", "deterministic"), (1, 65536), tuple(DETERMINISTIC_BENCHMARK_SIZES))
    param_names = ["mode", "num_outputs", "num_elements"]

    repeat = 10
    number = 5

    def setup_cache(self):
        """Build the host-side inputs for every benchmark size.

        Returns:
            A ``(vals_np, indices_np)`` tuple. ``vals_np[n]`` is an array of
            ``n`` random float32 values. ``indices_np[n][num_outputs]`` is an
            array of ``n`` int32 destination indices: all zeros for ``1`` and
            random integers in ``[0, 65536)`` for ``65536``.
        """
        rng = np.random.default_rng(123)
        vals_np = {n: rng.random(n, dtype=np.float32) for n in DETERMINISTIC_BENCHMARK_SIZES}
        # The comprehension draws from ``rng`` in the same order as an explicit
        # loop over the sizes, so the generated indices are unchanged.
        indices_np = {
            n: {
                1: np.zeros(n, dtype=np.int32),
                65536: rng.integers(0, 65536, size=n, dtype=np.int32),
            }
            for n in DETERMINISTIC_BENCHMARK_SIZES
        }
        return vals_np, indices_np

    def _launch_reset_and_scatter(self, num_outputs, num_elements):
        """Enqueue one output reset followed by one scatter-add launch.

        Shared by the warm-up and the graph capture so both run the same work.
        """
        wp.launch(
            zero_float_array_kernel,
            dim=num_outputs,
            inputs=[self.out],
            device=self.device,
        )
        wp.launch(
            self.kernel,
            (num_elements,),
            inputs=[self.vals, self.indices],
            outputs=[self.out],
            device=self.device,
        )

    def setup(self, cache, mode, num_outputs, num_elements):
        """Upload inputs, warm up the selected kernel and capture a CUDA graph.

        Args:
            cache: The ``(vals_np, indices_np)`` tuple returned by ``setup_cache``.
            mode: ``"deterministic"`` selects the deterministic kernel; any
                other value selects the regular atomic kernel.
            num_outputs: Number of destination slots in the output array.
            num_elements: Number of input elements (one thread per element).
        """
        wp.init()
        self.device = wp.get_device("cuda:0")

        vals_np, indices_np = cache
        self.vals = wp.array(vals_np[num_elements], dtype=wp.float32, device=self.device)
        self.indices = wp.array(indices_np[num_elements][num_outputs], dtype=wp.int32, device=self.device)
        self.out = wp.zeros(shape=(num_outputs,), dtype=wp.float32, device=self.device)

        self.kernel = scatter_add_kernel_deterministic if mode == "deterministic" else scatter_add_kernel

        # Launch once outside of capture; the capture below is opened with
        # force_module_load=False.
        self._launch_reset_and_scatter(num_outputs, num_elements)
        wp.synchronize_device(self.device)

        with wp.ScopedCapture(device=self.device, force_module_load=False) as capture:
            self._launch_reset_and_scatter(num_outputs, num_elements)

        self.graph = capture.graph

        # Replay the graph a few times before any timed run.
        for _ in range(5):
            wp.capture_launch(self.graph)
        wp.synchronize_device(self.device)

    def time_cuda(self, cache, mode, num_outputs, num_elements):
        """Time one replay of the captured graph, including the final sync."""
        wp.capture_launch(self.graph)
        wp.synchronize_device(self.device)
|
|
||
|
|
||
class AtomicCounterDeterminismOverhead:
    """Benchmark the overhead of deterministic counter/allocator atomics.

    The timed path uses CUDA graph replay and includes resetting the output
    state inside the captured graph so the benchmark isolates device work.
    """

    # Keep the ASV parameter grid immutable (Ruff RUF012): tuples are used
    # instead of lists, and the shared module-level size list is copied into a
    # tuple rather than aliased.
    params = (("normal", "deterministic"), tuple(DETERMINISTIC_BENCHMARK_SIZES))
    param_names = ["mode", "num_elements"]

    repeat = 10
    number = 5

    def setup_cache(self):
        """Build one array of random float32 values per benchmark size.

        Returns:
            A dict mapping each size ``n`` to an array of ``n`` float32 values.
        """
        rng = np.random.default_rng(321)
        return {n: rng.random(n, dtype=np.float32) for n in DETERMINISTIC_BENCHMARK_SIZES}

    def _launch_reset_and_count(self, num_elements):
        """Enqueue the counter reset, the output reset and one kernel launch.

        Shared by the warm-up and the graph capture so both run the same work.
        """
        wp.launch(
            zero_int_array_kernel,
            dim=1,
            inputs=[self.counter],
            device=self.device,
        )
        wp.launch(
            zero_float_array_kernel,
            dim=num_elements,
            inputs=[self.out],
            device=self.device,
        )
        wp.launch(
            self.kernel,
            (num_elements,),
            inputs=[self.vals, self.counter],
            outputs=[self.out],
            device=self.device,
        )

    def setup(self, vals_np, mode, num_elements):
        """Upload inputs, warm up the selected kernel and capture a CUDA graph.

        Args:
            vals_np: The per-size dict of input arrays returned by ``setup_cache``.
            mode: ``"deterministic"`` selects the deterministic kernel; any
                other value selects the regular atomic kernel.
            num_elements: Number of input elements (one thread per element).
        """
        wp.init()
        self.device = wp.get_device("cuda:0")

        self.vals = wp.array(vals_np[num_elements], dtype=wp.float32, device=self.device)
        self.counter = wp.zeros(shape=(1,), dtype=wp.int32, device=self.device)
        self.out = wp.zeros(shape=(num_elements,), dtype=wp.float32, device=self.device)

        self.kernel = counter_kernel_deterministic if mode == "deterministic" else counter_kernel

        # Launch once outside of capture; the capture below is opened with
        # force_module_load=False.
        self._launch_reset_and_count(num_elements)
        wp.synchronize_device(self.device)

        with wp.ScopedCapture(device=self.device, force_module_load=False) as capture:
            self._launch_reset_and_count(num_elements)

        self.graph = capture.graph

        # Replay the graph a few times before any timed run.
        for _ in range(5):
            wp.capture_launch(self.graph)
        wp.synchronize_device(self.device)

    def time_cuda(self, vals_np, mode, num_elements):
        """Time one replay of the captured graph, including the final sync."""
        wp.capture_launch(self.graph)
        wp.synchronize_device(self.device)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Changelog entry should include a GH reference and stay API-level.
Lines 7-11 describe internal mechanics but do not include an issue/PR reference.
📝 Suggested rewrite
As per coding guidelines: "If a change modifies user-facing behavior, append an entry ... include issue refs ... and avoid internal implementation details."
📝 Committable suggestion
🤖 Prompt for AI Agents