-
Notifications
You must be signed in to change notification settings - Fork 249
BUG/TST: Multi-stream tests missing correct stream-ordering operations #288
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -25,6 +25,7 @@ def test_multiple_streams(): | |
| stream3 = cvcuda.Stream() # create a new stream | ||
| assert stream1 is not stream2 | ||
| assert stream1 is not stream3 | ||
| assert stream2 is not stream3 | ||
| assert cvcuda.Stream.current is cvcuda.Stream.default | ||
| assert cvcuda.Stream.current is not stream1 | ||
| assert cvcuda.Stream.current is not stream2 | ||
|
|
@@ -54,16 +55,18 @@ def test_stream_context_nested(): | |
| assert cvcuda.Stream.current is stream2 | ||
| assert cvcuda.Stream.current is cvcuda.Stream.default | ||
|
|
||
| class _CatchThisException(Exception): | ||
| """A test specific Exception to check that we raise the correct Exception.""" | ||
|
|
||
| def test_stream_context_exception(): | ||
| stream1 = cvcuda.Stream() # create a new stream | ||
| stream2 = cvcuda.Stream() # create a new stream | ||
| with t.raises(Exception): | ||
| with t.raises(_CatchThisException): | ||
| with stream1: | ||
| assert cvcuda.Stream.current is stream1 | ||
| with stream2: | ||
| assert cvcuda.Stream.current is stream2 | ||
| raise Exception() | ||
| raise _CatchThisException() | ||
| assert cvcuda.Stream.current is stream1 | ||
| assert cvcuda.Stream.current is cvcuda.Stream.default | ||
|
Comment on lines
-61
to
71
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I believe that the purpose of this test is to check that the Exception that we raised is the one that is raised. Since |
||
| with stream2: | ||
|
|
@@ -77,6 +80,7 @@ def test_operator_stream(): | |
| stream3 = cvcuda.Stream() # create a new stream | ||
| assert stream1 is not stream2 | ||
| assert stream1 is not stream3 | ||
| assert stream2 is not stream3 | ||
| assert cvcuda.Stream.current is cvcuda.Stream.default | ||
| assert cvcuda.Stream.current is not stream1 | ||
| assert cvcuda.Stream.current is not stream2 | ||
|
|
@@ -100,6 +104,9 @@ def test_operator_stream(): | |
| cvcuda.cvtcolor(img, cvcuda.ColorConversion.BGR2GRAY) | ||
| assert cvcuda.Stream.current is stream3 | ||
| assert cvcuda.Stream.current is cvcuda.Stream.default | ||
| stream1.sync() | ||
| stream2.sync() | ||
| stream3.sync() | ||
|
Comment on lines
+107
to
+109
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You want to wait for all the queued work to complete before the test returns. |
||
|
|
||
|
|
||
| def test_operator_changing_stream(): | ||
|
|
@@ -110,6 +117,7 @@ def test_operator_changing_stream(): | |
| C = 3 | ||
| Loop = 50 | ||
| streams = [cvcuda.Stream() for _ in range(4)] # create a list of streams | ||
| torch_streams = [torch.cuda.ExternalStream(s.handle) for s in streams] | ||
|
|
||
| inputTensor = torch.randint(0, 256, (N, H, W, C), dtype=torch.uint8).cuda() | ||
| outputTensor = torch.randint(0, 256, (N, H, W, C), dtype=torch.uint8).cuda() | ||
|
|
@@ -119,11 +127,16 @@ def test_operator_changing_stream(): | |
| inTensor = cvcuda.as_tensor(inputTensor.data, "NHWC") | ||
| outTensor = cvcuda.as_tensor(outputTensor.data, "NHWC") | ||
|
|
||
| prev_torch_stream = None | ||
| for _ in range(Loop): | ||
| for stream in streams: | ||
| for stream, torch_stream in zip(streams, torch_streams, strict=True): | ||
| if prev_torch_stream is not None: | ||
| torch_stream.wait_stream(prev_torch_stream) | ||
| cvcuda.flip_into(outTensor, inTensor, -1, stream=stream) # output x flipped | ||
| cvcuda.flip_into(inTensor, outTensor, -1, stream=stream) # output y flipped | ||
| prev_torch_stream = torch_stream | ||
|
Comment on lines
+130
to
+137
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The same buffer is shared by multiple streams. If you don't synchronize the streams using events or wait, the behavior is undefined because all of the streams will modify the data simultaneously. |
||
|
|
||
| torch_streams[-1].synchronize() | ||
| final_out = torch.as_tensor(inTensor.cuda()).cpu() | ||
| assert torch.equal(final_out, inputTensor_copy.cpu()) | ||
|
|
||
|
|
@@ -138,6 +151,9 @@ def test_operator_changing_stream_loaded(): | |
| stream1 = cvcuda.Stream() | ||
| stream2 = cvcuda.Stream() | ||
|
|
||
| torch_stream1 = torch.cuda.ExternalStream(stream1.handle) | ||
| torch_stream2 = torch.cuda.ExternalStream(stream2.handle) | ||
|
|
||
| inputTensor = torch.randint(0, 256, (N, H, W, C), dtype=torch.uint8).cuda() | ||
| inputTensorTmp = torch.randint(0, 256, (N, H, W, C), dtype=torch.uint8).cuda() | ||
| outputTensor = torch.randint(0, 256, (N, H, W, C), dtype=torch.uint8).cuda() | ||
|
|
@@ -156,9 +172,11 @@ def test_operator_changing_stream_loaded(): | |
| cvcuda.flip_into( | ||
| inTensorTmp, inTensor, -1, stream=stream1 | ||
| ) # output x/y flipped | ||
| torch_stream2.wait_stream(torch_stream1) | ||
| cvcuda.flip_into( | ||
| outTensor, inTensorTmp, -1, stream=stream2 | ||
| ) # output y/y flipped | ||
|
|
||
| torch_stream2.synchronize() | ||
|
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think that torch has an implicit wait when converting from cuda to cpu tensor, so we need to have the host wait for the stream to complete before getting the result. |
||
| final_out = torch.as_tensor(outTensor.cuda()).cpu() | ||
| assert torch.equal(final_out, inputTensor_copy.cpu()) | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`a != b` and `a != c` does not imply that `b != c`.