diff --git a/test/lit/pto/issue_syncfinder_zero_trip_nested_loop_debug.pto b/test/lit/pto/issue_syncfinder_zero_trip_nested_loop_debug.pto
new file mode 100644
index 000000000..d3c89aa32
--- /dev/null
+++ b/test/lit/pto/issue_syncfinder_zero_trip_nested_loop_debug.pto
@@ -0,0 +1,57 @@
+// RUN: ptoas --pto-arch=a3 --pto-level=level3 --enable-insert-sync --pto-insert-sync-debug=2 %s -o - 2>&1 | FileCheck %s
+//
+// Nested-loop reproducer for current main-branch loop syncFinder escape:
+// - A top-level MTE2->V chain feeds a top-level V producer.
+// - A nested may-path consumes that V result with MTE3.
+// - A post-loop MTE3 consumer still directly depends on the original MTE2 result.
+// - Correct zero-trip handling would keep a direct MTE2->MTE3 sync for the
+//   post-loop tstore, because the outer loop may execute zero times.
+// - Current main drops that direct sync during After Analysis after loop
+//   syncFinder escape rebuilds a hidden MTE2->V->MTE3 chain at top level.
+//
+// CHECK: // === [PTOInsertSync Debug] After Analysis === //
+// CHECK: [ 0] COMPOUND pto.tload [PIPE_MTE2]
+// CHECK: POST: set_flag PIPE_V> idx=0
+// CHECK: [ 1] COMPOUND pto.tadd [PIPE_V]
+// CHECK: PRE : wait_flag PIPE_V> idx=0
+// CHECK: POST: set_flag PIPE_MTE3> idx=1
+// CHECK: [ 4] COMPOUND pto.tstore [PIPE_MTE3]
+// CHECK: PRE : wait_flag PIPE_MTE3> idx=1
+// CHECK: [ 7] COMPOUND pto.tstore [PIPE_MTE3]
+// CHECK-NOT: wait_flag PIPE_MTE3>
+// CHECK: PRE : pipe_barrier PIPE_MTE3>
+
+module attributes {pto.target_arch = "a2a3"} {
+  func.func @syncfinder_zero_trip_nested_loop(%arg0: !pto.ptr, %arg1: !pto.ptr, %outer: index, %inner: index) attributes {pto.kernel_kind = #pto.kernel_kind} {
+    %c0_i64 = arith.constant 0 : i64
+    %c4096_i64 = arith.constant 4096 : i64
+    %c16 = arith.constant 16 : index
+    %c1 = arith.constant 1 : index
+    %c0 = arith.constant 0 : index
+
+    %src_view = pto.make_tensor_view %arg0, shape = [%c16, %c16], strides = [%c16, %c1] {layout = #pto.layout} : !pto.tensor_view
+    %dst_view = pto.make_tensor_view %arg1, shape = [%c16, %c16], strides = [%c16, %c1] {layout = #pto.layout} : !pto.tensor_view
+    %src_pview = pto.partition_view %src_view, offsets = [%c0, %c0], sizes = [%c16, %c16] : !pto.tensor_view -> !pto.partition_tensor_view<16x16xf32>
+    %dst_pview = pto.partition_view %dst_view, offsets = [%c0, %c0], sizes = [%c16, %c16] : !pto.tensor_view -> !pto.partition_tensor_view<16x16xf32>
+
+    %ub0 = pto.alloc_tile addr = %c0_i64 : !pto.tile_buf
+    %ub1 = pto.alloc_tile addr = %c4096_i64 : !pto.tile_buf
+
+    // i0: top-level producer on PIPE_MTE2
+    pto.tload ins(%src_pview : !pto.partition_tensor_view<16x16xf32>) outs(%ub0 : !pto.tile_buf)
+    // i1: top-level bridge producer on PIPE_V
+    pto.tadd ins(%ub0, %ub0 : !pto.tile_buf, !pto.tile_buf) outs(%ub1 : !pto.tile_buf)
+
+    // nested may-path: old V->MTE3 sync only lives inside the loops
+    scf.for %o = %c0 to %outer step %c1 {
+      scf.for %i = %c0 to %inner step %c1 {
+        pto.tstore ins(%ub1 : !pto.tile_buf) outs(%dst_pview : !pto.partition_tensor_view<16x16xf32>)
+      }
+    }
+
+    // post-loop consumer: directly depends on ub0, so zero-trip-safe handling
+    // should keep a direct MTE2->MTE3 wait here.
+    pto.tstore ins(%ub0 : !pto.tile_buf) outs(%dst_pview : !pto.partition_tensor_view<16x16xf32>)
+    return
+  }
+}
diff --git a/test/lit/pto/issue_syncfinder_zero_trip_single_loop_debug.pto b/test/lit/pto/issue_syncfinder_zero_trip_single_loop_debug.pto
new file mode 100644
index 000000000..67a34d1f2
--- /dev/null
+++ b/test/lit/pto/issue_syncfinder_zero_trip_single_loop_debug.pto
@@ -0,0 +1,53 @@
+// RUN: ptoas --pto-arch=a3 --pto-level=level3 --enable-insert-sync --pto-insert-sync-debug=2 %s -o - 2>&1 | FileCheck %s
+//
+// Debug reproducer for the current main-branch loop behavior:
+// - A zero-trip loop contains an old V->MTE3 sync chain.
+// - The post-loop MTE3 consumer still directly depends on the pre-loop MTE2 producer.
+// - Correct zero-trip handling would keep a direct MTE2->MTE3 sync for the post-loop tstore.
+// - Current main drops that direct sync during After Analysis because loop syncFinder escapes.
+//
+// CHECK: // === [PTOInsertSync Debug] After Analysis === //
+// CHECK: [ 0] COMPOUND pto.tload [PIPE_MTE2]
+// CHECK: POST: set_flag PIPE_V> idx=0
+// CHECK: [ 1] COMPOUND pto.tadd [PIPE_V]
+// CHECK: PRE : wait_flag PIPE_V> idx=0
+// CHECK: POST: set_flag PIPE_MTE3> idx=1
+// CHECK: [ 2] LOOP LOOP_BEGIN
+// CHECK: [ 3] COMPOUND pto.tstore [PIPE_MTE3]
+// CHECK: PRE : wait_flag PIPE_MTE3> idx=1
+// CHECK: [ 5] COMPOUND pto.tstore [PIPE_MTE3]
+// CHECK-NOT: wait_flag PIPE_MTE3>
+// CHECK: PRE : pipe_barrier PIPE_MTE3>
+
+module attributes {pto.target_arch = "a2a3"} {
+  func.func @syncfinder_zero_trip_single_loop(%arg0: !pto.ptr, %arg1: !pto.ptr, %trip: index) attributes {pto.kernel_kind = #pto.kernel_kind} {
+    %c0_i64 = arith.constant 0 : i64
+    %c4096_i64 = arith.constant 4096 : i64
+    %c16 = arith.constant 16 : index
+    %c1 = arith.constant 1 : index
+    %c0 = arith.constant 0 : index
+
+    %src_view = pto.make_tensor_view %arg0, shape = [%c16, %c16], strides = [%c16, %c1] {layout = #pto.layout} : !pto.tensor_view
+    %dst_view = pto.make_tensor_view %arg1, shape = [%c16, %c16], strides = [%c16, %c1] {layout = #pto.layout} : !pto.tensor_view
+    %src_pview = pto.partition_view %src_view, offsets = [%c0, %c0], sizes = [%c16, %c16] : !pto.tensor_view -> !pto.partition_tensor_view<16x16xf32>
+    %dst_pview = pto.partition_view %dst_view, offsets = [%c0, %c0], sizes = [%c16, %c16] : !pto.tensor_view -> !pto.partition_tensor_view<16x16xf32>
+
+    %ub0 = pto.alloc_tile addr = %c0_i64 : !pto.tile_buf
+    %ub1 = pto.alloc_tile addr = %c4096_i64 : !pto.tile_buf
+
+    // pre-loop producer: PIPE_MTE2 writes ub0
+    pto.tload ins(%src_pview : !pto.partition_tensor_view<16x16xf32>) outs(%ub0 : !pto.tile_buf)
+    // bridge producer: PIPE_V reads ub0 and writes ub1
+    pto.tadd ins(%ub0, %ub0 : !pto.tile_buf, !pto.tile_buf) outs(%ub1 : !pto.tile_buf)
+
+    // zero-trip loop candidate: old V->MTE3 sync lives only on this may-path
+    scf.for %i = %c0 to %trip step %c1 {
+      pto.tstore ins(%ub1 : !pto.tile_buf) outs(%dst_pview : !pto.partition_tensor_view<16x16xf32>)
+    }
+
+    // post-loop consumer: directly depends on ub0, so zero-trip-safe insertion
+    // should keep a direct MTE2->MTE3 wait here.
+    pto.tstore ins(%ub0 : !pto.tile_buf) outs(%dst_pview : !pto.partition_tensor_view<16x16xf32>)
+    return
+  }
+}
diff --git a/test/lit/pto/syncfinder_if_virtual_else_safe_debug.pto b/test/lit/pto/syncfinder_if_virtual_else_safe_debug.pto
new file mode 100644
index 000000000..84c2310be
--- /dev/null
+++ b/test/lit/pto/syncfinder_if_virtual_else_safe_debug.pto
@@ -0,0 +1,46 @@
+// RUN: ptoas --pto-arch=a3 --pto-level=level3 --enable-insert-sync --pto-insert-sync-debug=2 %s -o - 2>&1 | FileCheck %s
+//
+// Branch contrast case for syncFinder escape analysis:
+// - Frontend lowering always materializes an ELSE_BEGIN plus a virtual else
+//   placeholder for scf.if without an explicit else.
+// - InsertBranchSync therefore takes the "merge both branches" path instead of
+//   the "no else: propagate syncFinder only" path.
+// - The post-if direct MTE2->MTE3 sync must remain present.
+//
+// CHECK: // === [PTOInsertSync Debug] After Analysis === //
+// CHECK: [ 0] COMPOUND pto.tload [PIPE_MTE2]
+// CHECK: POST: set_flag PIPE_V> idx=0
+// CHECK: POST: set_flag PIPE_MTE3> idx=[[DIRECT:[0-9]+]]
+// CHECK: [ 2] BRANCH IF_BEGIN
+// CHECK: [ 5] BRANCH ELSE_BEGIN
+// CHECK: virtualElse
+// CHECK: [ 8] COMPOUND pto.tstore [PIPE_MTE3]
+// CHECK: PRE : wait_flag PIPE_MTE3> idx=[[DIRECT]]
+
+module attributes {pto.target_arch = "a2a3"} {
+  func.func @syncfinder_if_virtual_else_safe(%arg0: !pto.ptr, %arg1: !pto.ptr, %cond: i1) attributes {pto.kernel_kind = #pto.kernel_kind} {
+    %c0_i64 = arith.constant 0 : i64
+    %c4096_i64 = arith.constant 4096 : i64
+    %c16 = arith.constant 16 : index
+    %c1 = arith.constant 1 : index
+    %c0 = arith.constant 0 : index
+
+    %src_view = pto.make_tensor_view %arg0, shape = [%c16, %c16], strides = [%c16, %c1] {layout = #pto.layout} : !pto.tensor_view
+    %dst_view = pto.make_tensor_view %arg1, shape = [%c16, %c16], strides = [%c16, %c1] {layout = #pto.layout} : !pto.tensor_view
+    %src_pview = pto.partition_view %src_view, offsets = [%c0, %c0], sizes = [%c16, %c16] : !pto.tensor_view -> !pto.partition_tensor_view<16x16xf32>
+    %dst_pview = pto.partition_view %dst_view, offsets = [%c0, %c0], sizes = [%c16, %c16] : !pto.tensor_view -> !pto.partition_tensor_view<16x16xf32>
+
+    %ub0 = pto.alloc_tile addr = %c0_i64 : !pto.tile_buf
+    %ub1 = pto.alloc_tile addr = %c4096_i64 : !pto.tile_buf
+
+    pto.tload ins(%src_pview : !pto.partition_tensor_view<16x16xf32>) outs(%ub0 : !pto.tile_buf)
+    pto.tadd ins(%ub0, %ub0 : !pto.tile_buf, !pto.tile_buf) outs(%ub1 : !pto.tile_buf)
+
+    scf.if %cond {
+      pto.tstore ins(%ub1 : !pto.tile_buf) outs(%dst_pview : !pto.partition_tensor_view<16x16xf32>)
+    }
+
+    pto.tstore ins(%ub0 : !pto.tile_buf) outs(%dst_pview : !pto.partition_tensor_view<16x16xf32>)
+    return
+  }
+}