Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 57 additions & 0 deletions test/lit/pto/issue_syncfinder_zero_trip_nested_loop_debug.pto
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
// RUN: ptoas --pto-arch=a3 --pto-level=level3 --enable-insert-sync --pto-insert-sync-debug=2 %s -o - 2>&1 | FileCheck %s
//
// Nested-loop reproducer for current main-branch loop syncFinder escape:
// - A top-level MTE2->V chain feeds a top-level V producer.
// - A nested may-path consumes that V result with MTE3.
// - A post-loop MTE3 consumer still directly depends on the original MTE2 result.
// - Correct zero-trip handling would keep a direct MTE2->MTE3 sync for the
// post-loop tstore, because outer-loop may execute zero times.
// - Current main drops that direct sync during After Analysis after loop
// syncFinder escape rebuilds a hidden MTE2->V->MTE3 chain at top level.
//
// This test pins the current (incorrect) behavior: it must start failing once
// the direct MTE2->MTE3 wait is emitted for the post-loop tstore, at which
// point the negative checks below should be turned into a positive check.
//
// The MTE2->MTE3 wait is excluded on BOTH sides of the pipe_barrier match.
// A negative FileCheck directive only covers the text between its neighboring
// positive matches (or up to the end of the input when it comes last), so a
// single exclusion placed before the barrier would not notice a wait_flag that
// is printed after the barrier line.
//
// CHECK: // === [PTOInsertSync Debug] After Analysis === //
// CHECK: [ 0] COMPOUND pto.tload [PIPE_MTE2]
// CHECK: POST: set_flag <PIPE_MTE2 -> PIPE_V> idx=0
// CHECK: [ 1] COMPOUND pto.tadd [PIPE_V]
// CHECK: PRE : wait_flag <PIPE_MTE2 -> PIPE_V> idx=0
// CHECK: POST: set_flag <PIPE_V -> PIPE_MTE3> idx=1
// CHECK: [ 4] COMPOUND pto.tstore [PIPE_MTE3]
// CHECK: PRE : wait_flag <PIPE_V -> PIPE_MTE3> idx=1
// CHECK: [ 7] COMPOUND pto.tstore [PIPE_MTE3]
// CHECK-NOT: wait_flag <PIPE_MTE2 -> PIPE_MTE3>
// CHECK: PRE : pipe_barrier <PIPE_MTE3 -> PIPE_MTE3>
// CHECK-NOT: wait_flag <PIPE_MTE2 -> PIPE_MTE3>
Comment on lines +20 to +22
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The CHECK-NOT directive is misplaced and will not correctly validate the absence of the synchronization flag if the bug is fixed.

In the InsertSyncAnalysis implementation, synchronization operations are added to the pipeBefore list in the order they are discovered during the reverse scan. The loop-carried dependency (which generates the pipe_barrier) is processed before the top-level dependencies (which would generate the wait_flag). Consequently, the pipe_barrier appears before any wait_flag in the debug output.

FileCheck's CHECK-NOT only ensures the pattern does not occur between the previous CHECK and the next CHECK. Currently, it only checks the gap between the COMPOUND line and the PRE : pipe_barrier line. If the bug were fixed and the wait_flag were correctly inserted, it would appear after the barrier, and this test would still pass (failing to catch the regression). Additionally, note that wait_flag <PIPE_V -> PIPE_MTE3> is also missing for this node due to the same escape issue.

// CHECK: [   7] COMPOUND pto.tstore [PIPE_MTE3]
// CHECK: PRE : pipe_barrier <PIPE_MTE3 -> PIPE_MTE3>
// CHECK-NOT: wait_flag <PIPE_MTE2 -> PIPE_MTE3>


// Test input: a single vector kernel. A tload fills %ub0, a tadd derives %ub1
// from %ub0, a doubly nested scf.for stores %ub1, and a final tstore after the
// loops stores %ub0 again. Both loop upper bounds are function arguments, so
// either loop may run zero times.
module attributes {pto.target_arch = "a2a3"} {
func.func @syncfinder_zero_trip_nested_loop(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<f32>, %outer: index, %inner: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
// i64 constants are the addr operands of the two alloc_tile ops below.
%c0_i64 = arith.constant 0 : i64
%c4096_i64 = arith.constant 4096 : i64
// index constants: %c16 is used for shapes/sizes/strides, %c1 for the unit
// stride and loop step, %c0 for offsets and loop lower bounds.
%c16 = arith.constant 16 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index

// 16x16 f32 views over the source (%arg0) and destination (%arg1) pointers,
// each partitioned at offset [0, 0] with size [16, 16].
%src_view = pto.make_tensor_view %arg0, shape = [%c16, %c16], strides = [%c16, %c1] {layout = #pto.layout<nd>} : !pto.tensor_view<?x?xf32>
%dst_view = pto.make_tensor_view %arg1, shape = [%c16, %c16], strides = [%c16, %c1] {layout = #pto.layout<nd>} : !pto.tensor_view<?x?xf32>
%src_pview = pto.partition_view %src_view, offsets = [%c0, %c0], sizes = [%c16, %c16] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x16xf32>
%dst_pview = pto.partition_view %dst_view, offsets = [%c0, %c0], sizes = [%c16, %c16] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x16xf32>

// Two distinct tile buffers (loc=vec): %ub0 at addr 0 and %ub1 at addr 4096.
%ub0 = pto.alloc_tile addr = %c0_i64 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
%ub1 = pto.alloc_tile addr = %c4096_i64 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>

// i0: top-level producer on PIPE_MTE2 (writes %ub0; dump node 0 in the header checks)
pto.tload ins(%src_pview : !pto.partition_tensor_view<16x16xf32>) outs(%ub0 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
// i1: top-level bridge producer on PIPE_V (reads %ub0 twice, writes %ub1; dump node 1)
pto.tadd ins(%ub0, %ub0 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%ub1 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)

// nested may-path: old V->MTE3 sync only lives inside the loops.
// The inner tstore reads %ub1 only; the header checks expect it at dump
// node 4 with the V->MTE3 wait_flag.
scf.for %o = %c0 to %outer step %c1 {
scf.for %i = %c0 to %inner step %c1 {
pto.tstore ins(%ub1 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%dst_pview : !pto.partition_tensor_view<16x16xf32>)
}
}

// post-loop consumer: directly depends on ub0, so zero-trip-safe handling
// should keep a direct MTE2->MTE3 wait here.
// The header checks expect this op at dump node 7 and currently pin the
// absence of that wait.
pto.tstore ins(%ub0 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%dst_pview : !pto.partition_tensor_view<16x16xf32>)
return
}
}
53 changes: 53 additions & 0 deletions test/lit/pto/issue_syncfinder_zero_trip_single_loop_debug.pto
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
// RUN: ptoas --pto-arch=a3 --pto-level=level3 --enable-insert-sync --pto-insert-sync-debug=2 %s -o - 2>&1 | FileCheck %s
//
// Debug reproducer for the current main-branch loop behavior:
// - A zero-trip loop contains an old V->MTE3 sync chain.
// - The post-loop MTE3 consumer still directly depends on the pre-loop MTE2 producer.
// - Correct zero-trip handling would keep a direct MTE2->MTE3 sync for the post-loop tstore.
// - Current main drops that direct sync during After Analysis because loop syncFinder escapes.
//
// This test pins the current (incorrect) behavior: it must start failing once
// the direct MTE2->MTE3 wait is emitted for the post-loop tstore, at which
// point the negative checks below should be turned into a positive check.
//
// The MTE2->MTE3 wait is excluded on BOTH sides of the pipe_barrier match.
// A negative FileCheck directive only covers the text between its neighboring
// positive matches (or up to the end of the input when it comes last), so a
// single exclusion placed before the barrier would not notice a wait_flag that
// is printed after the barrier line.
//
// CHECK: // === [PTOInsertSync Debug] After Analysis === //
// CHECK: [ 0] COMPOUND pto.tload [PIPE_MTE2]
// CHECK: POST: set_flag <PIPE_MTE2 -> PIPE_V> idx=0
// CHECK: [ 1] COMPOUND pto.tadd [PIPE_V]
// CHECK: PRE : wait_flag <PIPE_MTE2 -> PIPE_V> idx=0
// CHECK: POST: set_flag <PIPE_V -> PIPE_MTE3> idx=1
// CHECK: [ 2] LOOP LOOP_BEGIN
// CHECK: [ 3] COMPOUND pto.tstore [PIPE_MTE3]
// CHECK: PRE : wait_flag <PIPE_V -> PIPE_MTE3> idx=1
// CHECK: [ 5] COMPOUND pto.tstore [PIPE_MTE3]
// CHECK-NOT: wait_flag <PIPE_MTE2 -> PIPE_MTE3>
// CHECK: PRE : pipe_barrier <PIPE_MTE3 -> PIPE_MTE3>
// CHECK-NOT: wait_flag <PIPE_MTE2 -> PIPE_MTE3>
Comment on lines +18 to +20
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The CHECK-NOT directive is misplaced here as well. Since the pipe_barrier is discovered and inserted into the pipeBefore list before the wait_flag during the reverse analysis, it will appear first in the debug output. Moving the CHECK-NOT after the barrier check ensures that the test correctly fails if the wait_flag is present anywhere in the PRE block.

// CHECK: [   5] COMPOUND pto.tstore [PIPE_MTE3]
// CHECK: PRE : pipe_barrier <PIPE_MTE3 -> PIPE_MTE3>
// CHECK-NOT: wait_flag <PIPE_MTE2 -> PIPE_MTE3>


// Test input: a single vector kernel. A tload fills %ub0, a tadd derives %ub1
// from %ub0, one scf.for stores %ub1, and a final tstore after the loop stores
// %ub0 again. The loop upper bound %trip is a function argument, so the loop
// may run zero times.
module attributes {pto.target_arch = "a2a3"} {
func.func @syncfinder_zero_trip_single_loop(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<f32>, %trip: index) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
// i64 constants are the addr operands of the two alloc_tile ops below.
%c0_i64 = arith.constant 0 : i64
%c4096_i64 = arith.constant 4096 : i64
// index constants: %c16 is used for shapes/sizes/strides, %c1 for the unit
// stride and loop step, %c0 for offsets and the loop lower bound.
%c16 = arith.constant 16 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index

// 16x16 f32 views over the source (%arg0) and destination (%arg1) pointers,
// each partitioned at offset [0, 0] with size [16, 16].
%src_view = pto.make_tensor_view %arg0, shape = [%c16, %c16], strides = [%c16, %c1] {layout = #pto.layout<nd>} : !pto.tensor_view<?x?xf32>
%dst_view = pto.make_tensor_view %arg1, shape = [%c16, %c16], strides = [%c16, %c1] {layout = #pto.layout<nd>} : !pto.tensor_view<?x?xf32>
%src_pview = pto.partition_view %src_view, offsets = [%c0, %c0], sizes = [%c16, %c16] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x16xf32>
%dst_pview = pto.partition_view %dst_view, offsets = [%c0, %c0], sizes = [%c16, %c16] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x16xf32>

// Two distinct tile buffers (loc=vec): %ub0 at addr 0 and %ub1 at addr 4096.
%ub0 = pto.alloc_tile addr = %c0_i64 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
%ub1 = pto.alloc_tile addr = %c4096_i64 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>

// pre-loop producer: PIPE_MTE2 writes ub0 (dump node 0 in the header checks)
pto.tload ins(%src_pview : !pto.partition_tensor_view<16x16xf32>) outs(%ub0 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
// bridge producer: PIPE_V reads ub0 and writes ub1 (dump node 1)
pto.tadd ins(%ub0, %ub0 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%ub1 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)

// zero-trip loop candidate: old V->MTE3 sync lives only on this may-path.
// The header checks expect LOOP_BEGIN at dump node 2 and this tstore
// (reads %ub1 only) at dump node 3 with the V->MTE3 wait_flag.
scf.for %i = %c0 to %trip step %c1 {
pto.tstore ins(%ub1 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%dst_pview : !pto.partition_tensor_view<16x16xf32>)
}

// post-loop consumer: directly depends on ub0, so zero-trip-safe insertion
// should keep a direct MTE2->MTE3 wait here.
// The header checks expect this op at dump node 5 and currently pin the
// absence of that wait.
pto.tstore ins(%ub0 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%dst_pview : !pto.partition_tensor_view<16x16xf32>)
return
}
}
46 changes: 46 additions & 0 deletions test/lit/pto/syncfinder_if_virtual_else_safe_debug.pto
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
// RUN: ptoas --pto-arch=a3 --pto-level=level3 --enable-insert-sync --pto-insert-sync-debug=2 %s -o - 2>&1 | FileCheck %s
//
// Branch contrast case for syncFinder escape analysis:
// - Frontend lowering always materializes an ELSE_BEGIN plus a virtual else
// placeholder for scf.if without an explicit else.
// - InsertBranchSync therefore takes the "merge both branches" path instead of
// the "no else: propagate syncFinder only" path.
// - The post-if direct MTE2->MTE3 sync must remain present.
//
// CHECK: // === [PTOInsertSync Debug] After Analysis === //
// CHECK: [ 0] COMPOUND pto.tload [PIPE_MTE2]
// CHECK: POST: set_flag <PIPE_MTE2 -> PIPE_V> idx=0
// CHECK: POST: set_flag <PIPE_MTE2 -> PIPE_MTE3> idx=[[DIRECT:[0-9]+]]
// CHECK: [ 2] BRANCH IF_BEGIN
// CHECK: [ 5] BRANCH ELSE_BEGIN
// CHECK: virtualElse
// CHECK: [ 8] COMPOUND pto.tstore [PIPE_MTE3]
// CHECK: PRE : wait_flag <PIPE_MTE2 -> PIPE_MTE3> idx=[[DIRECT]]

// Test input: a single vector kernel. A tload fills %ub0, a tadd derives %ub1
// from %ub0, an scf.if without an else region stores %ub1, and a final tstore
// after the if stores %ub0. The condition %cond is a function argument.
module attributes {pto.target_arch = "a2a3"} {
func.func @syncfinder_if_virtual_else_safe(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<f32>, %cond: i1) attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
// i64 constants are the addr operands of the two alloc_tile ops below.
%c0_i64 = arith.constant 0 : i64
%c4096_i64 = arith.constant 4096 : i64
// index constants: %c16 is used for shapes/sizes/strides, %c1 for the unit
// stride, %c0 for offsets.
%c16 = arith.constant 16 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index

// 16x16 f32 views over the source (%arg0) and destination (%arg1) pointers,
// each partitioned at offset [0, 0] with size [16, 16].
%src_view = pto.make_tensor_view %arg0, shape = [%c16, %c16], strides = [%c16, %c1] {layout = #pto.layout<nd>} : !pto.tensor_view<?x?xf32>
%dst_view = pto.make_tensor_view %arg1, shape = [%c16, %c16], strides = [%c16, %c1] {layout = #pto.layout<nd>} : !pto.tensor_view<?x?xf32>
%src_pview = pto.partition_view %src_view, offsets = [%c0, %c0], sizes = [%c16, %c16] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x16xf32>
%dst_pview = pto.partition_view %dst_view, offsets = [%c0, %c0], sizes = [%c16, %c16] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x16xf32>

// Two distinct tile buffers (loc=vec): %ub0 at addr 0 and %ub1 at addr 4096.
%ub0 = pto.alloc_tile addr = %c0_i64 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
%ub1 = pto.alloc_tile addr = %c4096_i64 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>

// pre-branch producer on PIPE_MTE2: writes %ub0 (dump node 0 in the header
// checks, which expect set_flag towards both PIPE_V and PIPE_MTE3 after it).
pto.tload ins(%src_pview : !pto.partition_tensor_view<16x16xf32>) outs(%ub0 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
// bridge producer: reads %ub0 twice and writes %ub1.
pto.tadd ins(%ub0, %ub0 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%ub1 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)

// conditional may-path with no explicit else region: the then-branch stores
// %ub1 only. The header checks expect IF_BEGIN at dump node 2 and an
// ELSE_BEGIN with a virtualElse entry at dump node 5.
scf.if %cond {
pto.tstore ins(%ub1 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%dst_pview : !pto.partition_tensor_view<16x16xf32>)
}

// post-if consumer: reads %ub0 written by the tload. The header checks expect
// this op at dump node 8 with a direct MTE2->MTE3 wait_flag whose idx matches
// the set_flag captured after the tload.
pto.tstore ins(%ub0 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%dst_pview : !pto.partition_tensor_view<16x16xf32>)
return
}
}
Loading