18 changes: 18 additions & 0 deletions CONTRIBUTING.md
@@ -0,0 +1,18 @@
Any contribution that you make to this repository will
be under the Apache 2 License, as dictated by that
[license](http://www.apache.org/licenses/LICENSE-2.0.html):

~~~
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
~~~

Contributors must sign off each commit by adding a `Signed-off-by: ...`
line to commit messages to certify that they have the right to submit
the code they are contributing to the project according to the
[Developer Certificate of Origin (DCO)](https://developercertificate.org/).
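
In practice, the trailer can be added automatically with `git commit -s`, which builds the `Signed-off-by:` line from your configured `user.name` and `user.email` (a usage sketch of standard git tooling, not project-specific commands):

```shell
# Commit with an automatic Signed-off-by trailer.
git commit -s -m "Fix buffer recycling race"

# Inspect the trailer on the latest commit.
git log -1 --format=%B | grep '^Signed-off-by:'
```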
93 changes: 92 additions & 1 deletion README.md
@@ -1,2 +1,93 @@
# rosidl_buffer_backends
Backend implementations for ROSIDL buffer types

CUDA and PyTorch buffer backend implementations for `rosidl::Buffer`,
enabling zero-copy GPU memory sharing between ROS 2 publishers and
subscribers.

## Packages

- **cuda_buffer** -- Core CUDA buffer library (VMM-backed IPC memory pool,
host endpoint manager, ReadHandle/WriteHandle with CUDA event sync).
- **cuda_buffer_backend** -- BufferBackend plugin for CUDA IPC transport.
- **cuda_buffer_backend_msgs** -- ROS 2 message definitions for CUDA buffer
descriptors.
- **libtorch_vendor** -- Vendor package that downloads and installs the
pre-built LibTorch C++ distribution.
- **torch_buffer** -- Device-agnostic PyTorch buffer library wrapping device
backends with tensor metadata (shape, strides, dtype).
- **torch_buffer_backend** -- BufferBackend plugin for PyTorch tensors.
- **torch_buffer_backend_msgs** -- ROS 2 message definitions for Torch buffer
descriptors.

## Prerequisites

- A ROS 2 Rolling development environment. See the upstream
[Building ROS 2 on Ubuntu](https://docs.ros.org/en/rolling/Installation/Alternatives/Ubuntu-Development-Setup.html)
guide for the canonical source-build flow, or use the pixi workflow
shipped by the [`ros2/ros2`](https://github.com/ros2/ros2) meta-repo.
- CUDA Toolkit (>= 11.8) on the host.
- LibTorch: provided automatically by `libtorch_vendor` at build time if a
system LibTorch isn't already visible.

Per-package build, test, and run details live in each backend's README:

- [`cuda_buffer_backend/README.md`](cuda_buffer_backend/README.md)
- [`torch_buffer_backend/README.md`](torch_buffer_backend/README.md)
- Demo: [`../rosidl_buffer_backends_tutorials/README.md`](../rosidl_buffer_backends_tutorials/README.md)

## API overview

### CUDA buffer backend (`cuda_buffer_backend`)

```cpp
#include "cuda_buffer/cuda_buffer_api.hpp"

// Publisher: allocate + write directly via kernel.
auto msg = cuda_buffer_backend::allocate_msg<sensor_msgs::msg::Image>(byte_count);
{
  auto wh = cuda_buffer_backend::from_buffer(msg.data, stream);
  my_kernel<<<...>>>(wh.get_ptr(), ...);
}  // wh destructor records the write event on `stream`

// Publisher: copy from an existing host/device pointer into a pre-allocated buffer.
{
  auto wh = cuda_buffer_backend::from_buffer(msg.data, stream);
  cuda_buffer_backend::to_buffer(host_ptr, byte_count, wh, stream,
                                 cudaMemcpyHostToDevice);
}

// Subscriber: read handle (waits on publisher's write event).
auto rh = cuda_buffer_backend::from_buffer(msg->data, stream);
use_data<<<...>>>(rh.get_ptr(), ...);

// Auto-promotion: passing a non-CUDA buffer allocates a fresh CUDA buffer
// and (for reads) copies H2D; the handle owns the new buffer via
// get_promoted_buffer().
auto rh_any = cuda_buffer_backend::from_buffer(cpu_or_other_buf, stream);
std::shared_ptr<rosidl::Buffer<uint8_t>> promoted = rh_any.get_promoted_buffer();
```

### Torch buffer backend (`torch_buffer_backend`)

```cpp
#include "torch_buffer/torch_buffer_api.hpp"

// Publisher: allocate + copy a tensor into the message.
auto msg = torch_buffer_backend::allocate_msg<sensor_msgs::msg::Image>(
    {H, W, C}, torch::kByte);
torch_buffer_backend::to_buffer(msg.data, tensor);

// Subscriber: safe default returns an independent clone.
at::Tensor t = torch_buffer_backend::from_buffer(msg->data);

// Subscriber: zero-copy view when the caller is certain it will not mutate
// the tensor in place. Caller must treat the returned tensor as read-only.
at::Tensor view = torch_buffer_backend::from_buffer(msg->data, /*clone=*/false);
```

The torch backend does not cross-device-promote: the returned tensor stays
on the same device as the underlying torch buffer (CUDA or CPU).

## License

Apache-2.0
159 changes: 159 additions & 0 deletions cuda_buffer_backend/README.md
@@ -0,0 +1,159 @@
# cuda_buffer_backend

CUDA buffer backend plugin for the ROS 2 Buffer system. Enables zero-copy GPU memory sharing between publishers and subscribers on the same host using CUDA VMM (Virtual Memory Management).

## Build

Requires a ROS 2 Rolling source workspace; see
[Building ROS 2 on Ubuntu](https://docs.ros.org/en/rolling/Installation/Alternatives/Ubuntu-Development-Setup.html)
for the canonical setup. After cloning this repo into your workspace's
`src/` directory:

```bash
# Install system dependencies (CUDA toolkit, etc.).
rosdep install --from-paths src --ignore-src -y \
  --skip-keys "fastcdr rti-connext-dds-7.7.0 urdfdom_headers qt6-svg-dev"

# Build the CUDA backend.
colcon build --symlink-install --packages-up-to cuda_buffer_backend
source install/setup.sh
```

## Test

```bash
colcon test --packages-select cuda_buffer cuda_buffer_backend
colcon test-result --verbose
```

## Packages

| Package | Description |
|---|---|
| `cuda_buffer` | Core CUDA buffer implementation: memory pool, IPC manager, host endpoint manager, and user-facing `allocate_msg`/`from_buffer`/`to_buffer` APIs |
| `cuda_buffer_backend` | Plugin registration via `pluginlib`, endpoint discovery, and descriptor serialization |
| `cuda_buffer_backend_msgs` | ROS 2 message definition for `CudaBufferDescriptor` |

## Usage

### Publisher (direct write, zero-copy)

```cpp
#include "cuda_buffer/cuda_buffer_api.hpp"
#include "sensor_msgs/msg/image.hpp"

const size_t data_size = 640 * 480 * 3;

sensor_msgs::msg::Image msg =
  cuda_buffer_backend::allocate_msg<sensor_msgs::msg::Image>(data_size);
msg.height = 480;
msg.width = 640;
msg.encoding = "rgb8";
msg.step = 640 * 3;

cuda_buffer_backend::WriteHandle wh =
  cuda_buffer_backend::from_buffer(msg.data, stream);
my_kernel<<<...>>>(wh.get_ptr(), ...);

publisher->publish(msg);
// wh destructor records write_event on stream when it goes out of scope
```

### Publisher (copy from existing pointer)

Use `to_buffer` to copy bytes from an existing pointer (host or device) into
a buffer that was already allocated (e.g. via `allocate_msg`). `to_buffer`
is a plain memcpy-through-a-WriteHandle and does **not** allocate.

```cpp
sensor_msgs::msg::Image msg =
  cuda_buffer_backend::allocate_msg<sensor_msgs::msg::Image>(data_size);
msg.height = 480;
msg.width = 640;
msg.encoding = "rgb8";
msg.step = 640 * 3;

{
  cuda_buffer_backend::WriteHandle wh =
    cuda_buffer_backend::from_buffer(msg.data, stream);

  // From a device pointer (D2D copy, default kind)
  cuda_buffer_backend::to_buffer(gpu_ptr, data_size, wh, stream);

  // Or from a host pointer (H2D copy)
  // cuda_buffer_backend::to_buffer(
  //   host_ptr, data_size, wh, stream, cudaMemcpyHostToDevice);
}  // wh destructor records the write event on `stream`

publisher->publish(msg);
```

### Subscriber (read from buffer, zero-copy)

```cpp
#include "cuda_buffer/cuda_buffer_api.hpp"

void callback(const sensor_msgs::msg::Image::SharedPtr msg) {
  const rosidl::Buffer<uint8_t> & data = msg->data;
  cuda_buffer_backend::ReadHandle rh =
    cuda_buffer_backend::from_buffer(data, stream);
  // ReadHandle constructor waits on the publisher's write_event

  my_kernel<<<...>>>(rh.get_ptr(), ...);
}  // ReadHandle destructor signals the publisher that GPU work is complete
```

### Auto-promoting non-CUDA buffers

`from_buffer` accepts any `rosidl::Buffer<T>`, not just
CUDA-backed ones. If the source is a non-CUDA buffer (e.g. the CPU fallback
path), `from_buffer` allocates a new CUDA-backed `rosidl::Buffer<uint8_t>`
and returns a handle for it.

```cpp
#include "cuda_buffer/cuda_buffer_api.hpp"

void callback(const sensor_msgs::msg::Image::SharedPtr msg) {
  const rosidl::Buffer<uint8_t> & data = msg->data;
  cuda_buffer_backend::ReadHandle rh =
    cuda_buffer_backend::from_buffer(data, stream);

  my_kernel<<<...>>>(rh.get_ptr(), ...);
}
```

### `from_buffer` handle rules

`from_buffer` returns a **WriteHandle** when called with a non-const buffer, or a
**ReadHandle** when called with a const buffer. The overload is selected at
compile time based on the const-ness of the reference:

```cpp
// Write path (publisher):
cuda_buffer_backend::WriteHandle wh = cuda_buffer_backend::from_buffer(msg.data, stream);

// Read path (subscriber):
const rosidl::Buffer<uint8_t> & data = msg->data;
cuda_buffer_backend::ReadHandle rh = cuda_buffer_backend::from_buffer(data, stream);
```

- A **WriteHandle** can only be acquired once per buffer. Attempting to acquire
a second WriteHandle (or acquiring one after finalization) throws `CudaError`.
- To read a received buffer, always pass a **const reference**.
- If the source buffer is non-CUDA, the handle owns the promoted CUDA buffer;
call `handle.get_promoted_buffer()` to retrieve it.

## IPC Behavior

The RMW layer calls `on_discovering_endpoint()` for each subscriber to decide between zero-copy IPC and CPU fallback:

| Condition | Path |
|---|---|
| Same host, same GPU, same user | Zero-copy via CUDA VMM IPC |
| Different GPU, different user, different host, or VMM unavailable | CPU fallback via `to_cpu()` |

The publisher's pool checks a shared-memory refcount before recycling a block, ensuring all IPC subscribers have released their handles.

## License

Apache-2.0
84 changes: 84 additions & 0 deletions cuda_buffer_backend/cuda_buffer/CMakeLists.txt
@@ -0,0 +1,84 @@
cmake_minimum_required(VERSION 3.20)
project(cuda_buffer)

if(NOT CMAKE_CXX_STANDARD)
  set(CMAKE_CXX_STANDARD 17)
endif()

if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
  add_compile_options(-Wall -Wextra -Wpedantic)
endif()

find_package(ament_cmake REQUIRED)
find_package(rosidl_buffer REQUIRED)
find_package(cuda_buffer_backend_msgs REQUIRED)
find_package(rmw REQUIRED)
find_package(rcutils REQUIRED)
find_package(CUDAToolkit REQUIRED)
> **Review comment:** We need to include this dependency in `package.xml`; the requirement is probably `nvcc`. Is this rosdep key enough: https://github.com/ros/rosdistro/blob/master/rosdep/base.yaml#L8367C1-L8367C12 ?
>
> **Author:** Thanks for the pointer -- added `nvidia-cuda` to the dependency list, which should provide everything `find_package(CUDAToolkit)` needs.


add_library(${PROJECT_NAME} SHARED
  src/cuda_buffer.cpp
  src/cuda_buffer_ipc_manager.cpp
  src/cuda_memory_pool.cpp
  src/host_endpoint_manager.cpp
)

target_include_directories(${PROJECT_NAME} PUBLIC
  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
  $<INSTALL_INTERFACE:include/${PROJECT_NAME}>
  ${CUDAToolkit_INCLUDE_DIRS}
)

target_link_libraries(${PROJECT_NAME}
  PUBLIC
    rosidl_buffer::rosidl_buffer
    rmw::rmw
    rcutils::rcutils
    ${cuda_buffer_backend_msgs_TARGETS}
  PRIVATE
    CUDA::cudart
    CUDA::cuda_driver
    rt
)

install(
  DIRECTORY include/
  DESTINATION include/${PROJECT_NAME}
)

install(
  TARGETS ${PROJECT_NAME}
  EXPORT ${PROJECT_NAME}
  LIBRARY DESTINATION lib
  ARCHIVE DESTINATION lib
  RUNTIME DESTINATION bin
  INCLUDES DESTINATION include/${PROJECT_NAME}
)

ament_export_targets(${PROJECT_NAME} HAS_LIBRARY_TARGET)
ament_export_dependencies(rosidl_buffer cuda_buffer_backend_msgs rmw rcutils)
ament_export_libraries(${PROJECT_NAME})
ament_export_include_directories(include/${PROJECT_NAME})

if(BUILD_TESTING)
  find_package(ament_lint_auto REQUIRED)
  ament_lint_auto_find_test_dependencies()

  find_package(ament_cmake_gtest REQUIRED)

  ament_add_gtest(test_cuda_buffer
    test/test_cuda_buffer.cpp
  )
  if(TARGET test_cuda_buffer)
    target_link_libraries(test_cuda_buffer
      ${PROJECT_NAME}
      rosidl_buffer::rosidl_buffer
      CUDA::cudart
      CUDA::cuda_driver
      rt
    )
  endif()
endif()

ament_package()