-
Notifications
You must be signed in to change notification settings - Fork 47
[Sync] add syncFinder escape evidence tests #603
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,57 @@ | ||
| // RUN: ptoas --pto-arch=a3 --pto-level=level3 --enable-insert-sync --pto-insert-sync-debug=2 %s -o - 2>&1 | FileCheck %s | ||
| // | ||
| // Nested-loop reproducer for current main-branch loop syncFinder escape: | ||
| // - A top-level MTE2->V chain feeds a top-level V producer. | ||
| // - A nested may-path consumes that V result with MTE3. | ||
| // - A post-loop MTE3 consumer still directly depends on the original MTE2 result. | ||
| // - Correct zero-trip handling would keep a direct MTE2->MTE3 sync for the | ||
| // post-loop tstore, because outer-loop may execute zero times. | ||
| // - Current main drops that direct sync during After Analysis after loop | ||
| // syncFinder escape rebuilds a hidden MTE2->V->MTE3 chain at top level. | ||
| // | ||
| // CHECK: // === [PTOInsertSync Debug] After Analysis === // | ||
| // CHECK: [ 0] COMPOUND pto.tload [PIPE_MTE2] | ||
| // CHECK: POST: set_flag <PIPE_MTE2 -> PIPE_V> idx=0 | ||
| // CHECK: [ 1] COMPOUND pto.tadd [PIPE_V] | ||
| // CHECK: PRE : wait_flag <PIPE_MTE2 -> PIPE_V> idx=0 | ||
| // CHECK: POST: set_flag <PIPE_V -> PIPE_MTE3> idx=1 | ||
| // CHECK: [ 4] COMPOUND pto.tstore [PIPE_MTE3] | ||
| // CHECK: PRE : wait_flag <PIPE_V -> PIPE_MTE3> idx=1 | ||
| // CHECK: [ 7] COMPOUND pto.tstore [PIPE_MTE3] | ||
| // The direct MTE2->MTE3 wait is asserted absent on both sides of the barrier | ||
| // line: a negative pattern only guards the gap between its neighbouring | ||
| // positive matches, and the dump may print the barrier before any wait_flag. | ||
| // CHECK-NOT: wait_flag <PIPE_MTE2 -> PIPE_MTE3> | ||
| // CHECK: PRE : pipe_barrier <PIPE_MTE3 -> PIPE_MTE3> | ||
| // CHECK-NOT: wait_flag <PIPE_MTE2 -> PIPE_MTE3> | ||
|
|
||
| // Reproducer module. NOTE(review): the node indices in the expected debug dump | ||
| // appear to follow the op order below, so keep the op sequence unchanged. | ||
| module attributes {pto.target_arch = "a2a3"} { | ||
| // %arg0 is the load source and %arg1 the store destination; %outer and %inner | ||
| // are the upper bounds of the two nested scf.for loops further down. | ||
| func.func @syncfinder_zero_trip_nested_loop(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<f32>, %outer: index, %inner: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} { | ||
| // Constants: the two tile addresses (i64) and the index values reused for | ||
| // shapes, strides, offsets, loop lower bounds and loop steps. | ||
| %c0_i64 = arith.constant 0 : i64 | ||
| %c4096_i64 = arith.constant 4096 : i64 | ||
| %c16 = arith.constant 16 : index | ||
| %c1 = arith.constant 1 : index | ||
| %c0 = arith.constant 0 : index | ||
|
|
||
| // 16x16 views over the source and destination pointers (strides [16, 1]); | ||
| // each partition starts at offset [0, 0] and spans the full 16x16 view. | ||
| %src_view = pto.make_tensor_view %arg0, shape = [%c16, %c16], strides = [%c16, %c1] {layout = #pto.layout<nd>} : !pto.tensor_view<?x?xf32> | ||
| %dst_view = pto.make_tensor_view %arg1, shape = [%c16, %c16], strides = [%c16, %c1] {layout = #pto.layout<nd>} : !pto.tensor_view<?x?xf32> | ||
| %src_pview = pto.partition_view %src_view, offsets = [%c0, %c0], sizes = [%c16, %c16] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x16xf32> | ||
| %dst_pview = pto.partition_view %dst_view, offsets = [%c0, %c0], sizes = [%c16, %c16] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x16xf32> | ||
|
|
||
| // Two 16x16 f32 tiles with loc=vec: ub0 at addr 0 and ub1 at addr 4096. | ||
| %ub0 = pto.alloc_tile addr = %c0_i64 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0> | ||
| %ub1 = pto.alloc_tile addr = %c4096_i64 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0> | ||
|
|
||
| // i0: top-level producer on PIPE_MTE2 (loads the source partition into ub0) | ||
| pto.tload ins(%src_pview : !pto.partition_tensor_view<16x16xf32>) outs(%ub0 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) | ||
| // i1: top-level bridge producer on PIPE_V (reads ub0 for both inputs, writes ub1) | ||
| pto.tadd ins(%ub0, %ub0 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%ub1 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) | ||
|
|
||
| // nested may-path: old V->MTE3 sync only lives inside the loops | ||
| // Both upper bounds are function arguments, so either loop may run zero times. | ||
| scf.for %o = %c0 to %outer step %c1 { | ||
| scf.for %i = %c0 to %inner step %c1 { | ||
| pto.tstore ins(%ub1 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%dst_pview : !pto.partition_tensor_view<16x16xf32>) | ||
| } | ||
| } | ||
|
|
||
| // post-loop consumer: directly depends on ub0, so zero-trip-safe handling | ||
| // should keep a direct MTE2->MTE3 wait here. | ||
| pto.tstore ins(%ub0 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%dst_pview : !pto.partition_tensor_view<16x16xf32>) | ||
| return | ||
| } | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,53 @@ | ||
| // RUN: ptoas --pto-arch=a3 --pto-level=level3 --enable-insert-sync --pto-insert-sync-debug=2 %s -o - 2>&1 | FileCheck %s | ||
| // | ||
| // Debug reproducer for the current main-branch loop behavior: | ||
| // - A zero-trip loop contains an old V->MTE3 sync chain. | ||
| // - The post-loop MTE3 consumer still directly depends on the pre-loop MTE2 producer. | ||
| // - Correct zero-trip handling would keep a direct MTE2->MTE3 sync for the post-loop tstore. | ||
| // - Current main drops that direct sync during After Analysis because loop syncFinder escapes. | ||
| // | ||
| // CHECK: // === [PTOInsertSync Debug] After Analysis === // | ||
| // CHECK: [ 0] COMPOUND pto.tload [PIPE_MTE2] | ||
| // CHECK: POST: set_flag <PIPE_MTE2 -> PIPE_V> idx=0 | ||
| // CHECK: [ 1] COMPOUND pto.tadd [PIPE_V] | ||
| // CHECK: PRE : wait_flag <PIPE_MTE2 -> PIPE_V> idx=0 | ||
| // CHECK: POST: set_flag <PIPE_V -> PIPE_MTE3> idx=1 | ||
| // CHECK: [ 2] LOOP LOOP_BEGIN | ||
| // CHECK: [ 3] COMPOUND pto.tstore [PIPE_MTE3] | ||
| // CHECK: PRE : wait_flag <PIPE_V -> PIPE_MTE3> idx=1 | ||
| // CHECK: [ 5] COMPOUND pto.tstore [PIPE_MTE3] | ||
| // The direct MTE2->MTE3 wait is asserted absent on both sides of the barrier | ||
| // line: a negative pattern only guards the gap between its neighbouring | ||
| // positive matches, and the dump may print the barrier before any wait_flag. | ||
| // CHECK-NOT: wait_flag <PIPE_MTE2 -> PIPE_MTE3> | ||
| // CHECK: PRE : pipe_barrier <PIPE_MTE3 -> PIPE_MTE3> | ||
| // CHECK-NOT: wait_flag <PIPE_MTE2 -> PIPE_MTE3> | ||
|
Comment on lines
+18
to
+20
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The |
||
|
|
||
| // Reproducer module. NOTE(review): the node indices in the expected debug dump | ||
| // appear to follow the op order below, so keep the op sequence unchanged. | ||
| module attributes {pto.target_arch = "a2a3"} { | ||
| // %arg0 is the load source and %arg1 the store destination; %trip is the | ||
| // upper bound of the single scf.for loop further down. | ||
| func.func @syncfinder_zero_trip_single_loop(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<f32>, %trip: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} { | ||
| // Constants: the two tile addresses (i64) and the index values reused for | ||
| // shapes, strides, offsets, the loop lower bound and the loop step. | ||
| %c0_i64 = arith.constant 0 : i64 | ||
| %c4096_i64 = arith.constant 4096 : i64 | ||
| %c16 = arith.constant 16 : index | ||
| %c1 = arith.constant 1 : index | ||
| %c0 = arith.constant 0 : index | ||
|
|
||
| // 16x16 views over the source and destination pointers (strides [16, 1]); | ||
| // each partition starts at offset [0, 0] and spans the full 16x16 view. | ||
| %src_view = pto.make_tensor_view %arg0, shape = [%c16, %c16], strides = [%c16, %c1] {layout = #pto.layout<nd>} : !pto.tensor_view<?x?xf32> | ||
| %dst_view = pto.make_tensor_view %arg1, shape = [%c16, %c16], strides = [%c16, %c1] {layout = #pto.layout<nd>} : !pto.tensor_view<?x?xf32> | ||
| %src_pview = pto.partition_view %src_view, offsets = [%c0, %c0], sizes = [%c16, %c16] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x16xf32> | ||
| %dst_pview = pto.partition_view %dst_view, offsets = [%c0, %c0], sizes = [%c16, %c16] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x16xf32> | ||
|
|
||
| // Two 16x16 f32 tiles with loc=vec: ub0 at addr 0 and ub1 at addr 4096. | ||
| %ub0 = pto.alloc_tile addr = %c0_i64 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0> | ||
| %ub1 = pto.alloc_tile addr = %c4096_i64 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0> | ||
|
|
||
| // pre-loop producer: PIPE_MTE2 writes ub0 | ||
| pto.tload ins(%src_pview : !pto.partition_tensor_view<16x16xf32>) outs(%ub0 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) | ||
| // bridge producer: PIPE_V reads ub0 and writes ub1 | ||
| pto.tadd ins(%ub0, %ub0 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%ub1 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) | ||
|
|
||
| // zero-trip loop candidate: old V->MTE3 sync lives only on this may-path | ||
| // The upper bound %trip is a function argument, so the body may never run. | ||
| scf.for %i = %c0 to %trip step %c1 { | ||
| pto.tstore ins(%ub1 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%dst_pview : !pto.partition_tensor_view<16x16xf32>) | ||
| } | ||
|
|
||
| // post-loop consumer: directly depends on ub0, so zero-trip-safe insertion | ||
| // should keep a direct MTE2->MTE3 wait here. | ||
| pto.tstore ins(%ub0 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%dst_pview : !pto.partition_tensor_view<16x16xf32>) | ||
| return | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,46 @@ | ||
| // RUN: ptoas --pto-arch=a3 --pto-level=level3 --enable-insert-sync --pto-insert-sync-debug=2 %s -o - 2>&1 | FileCheck %s | ||
| // | ||
| // Branch contrast case for syncFinder escape analysis: | ||
| // - Frontend lowering always materializes an ELSE_BEGIN plus a virtual else | ||
| // placeholder for scf.if without an explicit else. | ||
| // - InsertBranchSync therefore takes the "merge both branches" path instead of | ||
| // the "no else: propagate syncFinder only" path. | ||
| // - The post-if direct MTE2->MTE3 sync must remain present. | ||
| // | ||
| // CHECK: // === [PTOInsertSync Debug] After Analysis === // | ||
| // CHECK: [ 0] COMPOUND pto.tload [PIPE_MTE2] | ||
| // CHECK: POST: set_flag <PIPE_MTE2 -> PIPE_V> idx=0 | ||
| // CHECK: POST: set_flag <PIPE_MTE2 -> PIPE_MTE3> idx=[[DIRECT:[0-9]+]] | ||
| // CHECK: [ 2] BRANCH IF_BEGIN | ||
| // CHECK: [ 5] BRANCH ELSE_BEGIN | ||
| // CHECK: virtualElse | ||
| // CHECK: [ 8] COMPOUND pto.tstore [PIPE_MTE3] | ||
| // CHECK: PRE : wait_flag <PIPE_MTE2 -> PIPE_MTE3> idx=[[DIRECT]] | ||
|
|
||
| // Branch contrast module. NOTE(review): the node indices in the expected debug | ||
| // dump appear to follow the op order below, so keep the op sequence unchanged. | ||
| module attributes {pto.target_arch = "a2a3"} { | ||
| // %arg0 is the load source and %arg1 the store destination; %cond guards the | ||
| // scf.if further down. | ||
| func.func @syncfinder_if_virtual_else_safe(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<f32>, %cond: i1) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} { | ||
| // Constants: the two tile addresses (i64) and the index values reused for | ||
| // shapes, strides and offsets. | ||
| %c0_i64 = arith.constant 0 : i64 | ||
| %c4096_i64 = arith.constant 4096 : i64 | ||
| %c16 = arith.constant 16 : index | ||
| %c1 = arith.constant 1 : index | ||
| %c0 = arith.constant 0 : index | ||
|
|
||
| // 16x16 views over the source and destination pointers (strides [16, 1]); | ||
| // each partition starts at offset [0, 0] and spans the full 16x16 view. | ||
| %src_view = pto.make_tensor_view %arg0, shape = [%c16, %c16], strides = [%c16, %c1] {layout = #pto.layout<nd>} : !pto.tensor_view<?x?xf32> | ||
| %dst_view = pto.make_tensor_view %arg1, shape = [%c16, %c16], strides = [%c16, %c1] {layout = #pto.layout<nd>} : !pto.tensor_view<?x?xf32> | ||
| %src_pview = pto.partition_view %src_view, offsets = [%c0, %c0], sizes = [%c16, %c16] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x16xf32> | ||
| %dst_pview = pto.partition_view %dst_view, offsets = [%c0, %c0], sizes = [%c16, %c16] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x16xf32> | ||
|
|
||
| // Two 16x16 f32 tiles with loc=vec: ub0 at addr 0 and ub1 at addr 4096. | ||
| %ub0 = pto.alloc_tile addr = %c0_i64 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0> | ||
| %ub1 = pto.alloc_tile addr = %c4096_i64 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0> | ||
|
|
||
| // Producer: loads the source partition into ub0 (PIPE_MTE2 in the expected dump). | ||
| pto.tload ins(%src_pview : !pto.partition_tensor_view<16x16xf32>) outs(%ub0 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) | ||
| // Bridge producer: reads ub0 for both inputs and writes ub1. | ||
| pto.tadd ins(%ub0, %ub0 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%ub1 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) | ||
|
|
||
| // Conditional consumer of ub1. No explicit else region is written here; the | ||
| // file header describes the virtual else the frontend materializes for it. | ||
| scf.if %cond { | ||
| pto.tstore ins(%ub1 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%dst_pview : !pto.partition_tensor_view<16x16xf32>) | ||
| } | ||
|
|
||
| // Post-if consumer of ub0: the expected dump requires a direct MTE2->MTE3 | ||
| // wait on this store. | ||
| pto.tstore ins(%ub0 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%dst_pview : !pto.partition_tensor_view<16x16xf32>) | ||
| return | ||
| } | ||
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The `CHECK-NOT` directive is misplaced and will not correctly validate the absence of the synchronization flag if the bug is fixed.

In the `InsertSyncAnalysis` implementation, synchronization operations are added to the `pipeBefore` list in the order they are discovered during the reverse scan. The loop-carried dependency (which generates the `pipe_barrier`) is processed before the top-level dependencies (which would generate the `wait_flag`). Consequently, the `pipe_barrier` appears before any `wait_flag` in the debug output.

FileCheck's `CHECK-NOT` only ensures the pattern does not occur between the previous `CHECK` and the next `CHECK`. Currently, it only checks the gap between the `COMPOUND` line and the `PRE : pipe_barrier` line. If the bug were fixed and the `wait_flag` were correctly inserted, it would appear after the barrier, and this test would still pass (failing to catch the regression). Additionally, note that `wait_flag <PIPE_V -> PIPE_MTE3>` is also missing for this node due to the same escape issue.