diff --git a/build.sh b/build.sh
new file mode 100644
index 00000000..5a240e7f
--- /dev/null
+++ b/build.sh
@@ -0,0 +1,8 @@
+rm -rf build
+rm -rf dist
+rm -rf deep_ep_cpp.cpython-38-x86_64-linux-gnu.so
+export TORCH_CUDA_ARCH_LIST="10.0"
+export PADDLE_CUDA_ARCH_LIST="10.0"
+python setup_deep_ep.py bdist_wheel
+python setup_hybrid_ep.py bdist_wheel
+pip install dist/*.whl --force-reinstall
\ No newline at end of file
diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp
index 302fa621..53157d03 100644
--- a/csrc/deep_ep.cpp
+++ b/csrc/deep_ep.cpp
@@ -1,6 +1,7 @@
 // Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-#include <ATen/cuda/CUDAContext.h>
+// #include <ATen/cuda/CUDAContext.h>
 #include <ATen/cuda/CUDADataType.h>
+#include <ATen/Functions.h>
 #include <chrono>
 #include <cuda_runtime.h>
 #include <memory>
@@ -131,7 +132,8 @@ Buffer::Buffer(int rank,
                bool low_latency_mode,
                bool disable_nvlink_for_normal_mode,
                bool explicitly_destroy,
-               bool use_fabric)
+               bool use_fabric,
+               int context_ring_id)
     : rank(rank),
       num_ranks(num_ranks),
       num_nvl_bytes(num_nvl_bytes),
@@ -139,8 +141,20 @@ Buffer::Buffer(int rank,
       low_latency_mode(low_latency_mode),
       disable_nvlink_for_normal_mode(disable_nvlink_for_normal_mode),
       explicitly_destroy(explicitly_destroy),
-      comm_stream(at::cuda::getStreamFromPool(true)),
       shared_memory_allocator(use_fabric) {
+
+    CUDA_CHECK(cudaGetDevice(&device_id));
+    auto map = paddle::distributed::ProcessGroupMapFromGid::getInstance();
+    paddle::distributed::ProcessGroup* pg = map->get(context_ring_id);
+    const auto& place = phi::GPUPlace(device_id);
+    comm_ctx =
+      reinterpret_cast<paddle::distributed::ProcessGroupNCCL*>(pg)
+          ->GetOrCreateCommContext(place, phi::distributed::CommType::ALLTOALL);
+    comm_stream = comm_ctx->GetStream();
+    calc_ctx = reinterpret_cast<phi::GPUContext*>(
+      reinterpret_cast<paddle::distributed::ProcessGroupNCCL*>(pg)
+          ->GetDeviceContext(place, true));
+
     // Metadata memory
     int64_t barrier_signal_bytes = NUM_MAX_NVL_PEERS * sizeof(int);
     int64_t buffer_ptr_bytes = NUM_MAX_NVL_PEERS * sizeof(void*);
@@ -262,7 +276,7 @@ torch::Tensor Buffer::get_local_buffer_tensor(const pybind11::object& dtype, int
     return torch::from_blob(base_ptr, num_bytes / element_bytes, torch::TensorOptions().dtype(casted_dtype).device(at::kCUDA));
 }
 
-torch::Stream Buffer::get_comm_stream() const {
+cudaStream_t Buffer::get_comm_stream() const {
     return comm_stream;
 }
 
@@ -374,10 +388,10 @@ Buffer::get_dispatch_layout(const torch::Tensor& topk_idx, int num_experts,
 
     // Allocate all tensors on comm stream if set
     // NOTES: do not allocate tensors upfront!
-    auto compute_stream = at::cuda::getCurrentCUDAStream();
+    auto compute_stream = calc_ctx->stream();
     if (allocate_on_comm_stream) {
         EP_HOST_ASSERT(previous_event.has_value() and async);
-        at::cuda::setCurrentCUDAStream(comm_stream);
+        deep_ep::SetAllocatorStreamForGPUContext(comm_stream, calc_ctx);
     }
 
     // Wait previous tasks to be finished
@@ -423,7 +437,7 @@ Buffer::get_dispatch_layout(const torch::Tensor& topk_idx, int num_experts,
 
     // Switch back compute stream
     if (allocate_on_comm_stream)
-        at::cuda::setCurrentCUDAStream(compute_stream);
+        deep_ep::SetAllocatorStreamForGPUContext(compute_stream, calc_ctx);
 
     return {num_tokens_per_rank, num_tokens_per_rdma_rank, num_tokens_per_expert, is_token_in_rank, event};
 }
@@ -534,10 +548,10 @@ Buffer::intranode_dispatch(const torch::Tensor& x, const std::optional<torch::Te
 
     // Allocate all tensors on comm stream if set
     // NOTES: do not allocate tensors upfront!
-    auto compute_stream = at::cuda::getCurrentCUDAStream();
+    auto compute_stream = calc_ctx->stream();
     if (allocate_on_comm_stream) {
-        EP_HOST_ASSERT(previous_event.has_value() and async);
-        at::cuda::setCurrentCUDAStream(comm_stream);
+        EP_HOST_ASSERT(previous_event.has_value() && async);
+        deep_ep::SetAllocatorStreamForGPUContext(comm_stream, calc_ctx);
     }
 
     // Wait previous tasks to be finished
@@ -686,8 +700,9 @@ Buffer::intranode_dispatch(const torch::Tensor& x, const std::optional<torch::Te
     }
 
     // Switch back compute stream
-    if (allocate_on_comm_stream)
-        at::cuda::setCurrentCUDAStream(compute_stream);
+    if (allocate_on_comm_stream) {
+        deep_ep::SetAllocatorStreamForGPUContext(compute_stream, calc_ctx);
+    }
 
     // Return values
     return {recv_x, recv_x_scales, recv_sf_scale_for_nvfp4, recv_topk_idx, recv_topk_weights, num_recv_tokens_per_expert_list, rank_prefix_matrix, channel_prefix_matrix, recv_channel_prefix_matrix, recv_src_idx, send_head, event};
@@ -718,10 +733,10 @@ Buffer::intranode_combine(const torch::Tensor& x, const std::optional<torch::Ten
 
     // Allocate all tensors on comm stream if set
     // NOTES: do not allocate tensors upfront!
-    auto compute_stream = at::cuda::getCurrentCUDAStream();
+    auto compute_stream = calc_ctx->stream();
     if (allocate_on_comm_stream) {
-        EP_HOST_ASSERT(previous_event.has_value() and async);
-        at::cuda::setCurrentCUDAStream(comm_stream);
+        EP_HOST_ASSERT(previous_event.has_value() && async);
+        deep_ep::SetAllocatorStreamForGPUContext(comm_stream, calc_ctx);
     }
 
     // Wait previous tasks to be finished
@@ -798,8 +813,9 @@ Buffer::intranode_combine(const torch::Tensor& x, const std::optional<torch::Ten
     }
 
     // Switch back compute stream
-    if (allocate_on_comm_stream)
-        at::cuda::setCurrentCUDAStream(compute_stream);
+    if (allocate_on_comm_stream) {
+        deep_ep::SetAllocatorStreamForGPUContext(compute_stream, calc_ctx);
+    }
 
     return {recv_x, recv_topk_weights, event};
 }
@@ -905,10 +921,10 @@ Buffer::internode_dispatch(const torch::Tensor& x, const std::optional<torch::Te
 
     // Allocate all tensors on comm stream if set
     // NOTES: do not allocate tensors upfront!
-    auto compute_stream = at::cuda::getCurrentCUDAStream();
+    auto compute_stream = calc_ctx->stream();
     if (allocate_on_comm_stream) {
-        EP_HOST_ASSERT(previous_event.has_value() and async);
-        at::cuda::setCurrentCUDAStream(comm_stream);
+        EP_HOST_ASSERT(previous_event.has_value() && async);
+        deep_ep::SetAllocatorStreamForGPUContext(comm_stream, calc_ctx);
     }
 
     // Wait previous tasks to be finished
@@ -1070,8 +1086,9 @@ Buffer::internode_dispatch(const torch::Tensor& x, const std::optional<torch::Te
     }
 
     // Switch back compute stream
-    if (allocate_on_comm_stream)
-        at::cuda::setCurrentCUDAStream(compute_stream);
+    if (allocate_on_comm_stream) {
+        deep_ep::SetAllocatorStreamForGPUContext(compute_stream, calc_ctx);
+    }
     
     // Return values
     return {recv_x, recv_x_scales, recv_topk_idx, recv_topk_weights, num_recv_tokens_per_expert_list,
@@ -1119,10 +1136,10 @@ Buffer::internode_combine(const torch::Tensor& x, const std::optional<torch::Ten
 
     // Allocate all tensors on comm stream if set
     // NOTES: do not allocate tensors upfront!
-    auto compute_stream = at::cuda::getCurrentCUDAStream();
+    auto compute_stream = calc_ctx->stream();
     if (allocate_on_comm_stream) {
-        EP_HOST_ASSERT(previous_event.has_value() and async);
-        at::cuda::setCurrentCUDAStream(comm_stream);
+        EP_HOST_ASSERT(previous_event.has_value() && async);
+        deep_ep::SetAllocatorStreamForGPUContext(comm_stream, calc_ctx);
     }
 
     // Wait previous tasks to be finished
@@ -1207,8 +1224,9 @@ Buffer::internode_combine(const torch::Tensor& x, const std::optional<torch::Ten
     }
 
     // Switch back compute stream
-    if (allocate_on_comm_stream)
-        at::cuda::setCurrentCUDAStream(compute_stream);
+    if (allocate_on_comm_stream) {
+        deep_ep::SetAllocatorStreamForGPUContext(compute_stream, calc_ctx);
+    }
 
     // Return values
     return {combined_x, combined_topk_weights, event};
@@ -1616,10 +1634,10 @@ Buffer::dispatch_pcie(const torch::Tensor& x, const std::optional<torch::Tensor>
     }
 
     // Stream Management
-    auto compute_stream = at::cuda::getCurrentCUDAStream();
+    auto compute_stream = calc_ctx->stream();
     if (allocate_on_comm_stream) {
         EP_HOST_ASSERT(previous_event.has_value() && async);
-        at::cuda::setCurrentCUDAStream(comm_stream);
+        deep_ep::SetAllocatorStreamForGPUContext(comm_stream, calc_ctx);
     }
 
     if (previous_event.has_value()) {
@@ -1757,7 +1775,7 @@ Buffer::dispatch_pcie(const torch::Tensor& x, const std::optional<torch::Tensor>
     }
 
     if (allocate_on_comm_stream)
-        at::cuda::setCurrentCUDAStream(compute_stream);
+        deep_ep::SetAllocatorStreamForGPUContext(compute_stream, calc_ctx);
 
     return {recv_x, recv_x_scales, recv_topk_idx, recv_topk_weights, num_recv_tokens_per_expert_list,
             rdma_channel_prefix_matrix, 
@@ -1830,10 +1848,10 @@ Buffer::combine_pcie(const torch::Tensor& recv_x, const std::optional<torch::Ten
     }
 
     // Stream Management
-    auto compute_stream = at::cuda::getCurrentCUDAStream();
+    auto compute_stream = calc_ctx->stream();
     if (allocate_on_comm_stream) {
-        EP_HOST_ASSERT(previous_event.has_value() && async);
-        at::cuda::setCurrentCUDAStream(comm_stream);
+        EP_HOST_ASSERT(previous_event.has_value() and async);
+        deep_ep::SetAllocatorStreamForGPUContext(comm_stream, calc_ctx);
     }
 
     if (previous_event.has_value()) {
@@ -1883,7 +1901,7 @@ Buffer::combine_pcie(const torch::Tensor& recv_x, const std::optional<torch::Ten
     }
 
     if (allocate_on_comm_stream)
-        at::cuda::setCurrentCUDAStream(compute_stream);
+        deep_ep::SetAllocatorStreamForGPUContext(compute_stream, calc_ctx);
 
     return {combined_x, combined_topk_weights, event};
 #else
@@ -1897,7 +1915,7 @@ Buffer::combine_pcie(const torch::Tensor& recv_x, const std::optional<torch::Ten
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
     m.doc() = "DeepEP: an efficient expert-parallel communication library";
 
-    pybind11::class_<deep_ep::Config>(m, "Config")
+    pybind11::class_<deep_ep::Config>(m, "Config", py::module_local())
         .def(pybind11::init<int, int, int, int, int>(),
              py::arg("num_sms") = 20,
              py::arg("num_max_nvl_chunked_send_tokens") = 6, py::arg("num_max_nvl_chunked_recv_tokens") = 256,
@@ -1907,12 +1925,12 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
         .def("get_pcie_buffer_size_hint", &deep_ep::Config::get_pcie_buffer_size_hint);
     m.def("get_low_latency_rdma_size_hint", &deep_ep::get_low_latency_rdma_size_hint);
 
-    pybind11::class_<deep_ep::EventHandle>(m, "EventHandle")
+    pybind11::class_<deep_ep::EventHandle>(m, "EventHandle", py::module_local())
         .def(pybind11::init<>())
         .def("current_stream_wait", &deep_ep::EventHandle::current_stream_wait);
 
-    pybind11::class_<deep_ep::Buffer>(m, "Buffer")
-        .def(pybind11::init<int, int, int64_t, int64_t, bool, bool, bool, bool>())
+    pybind11::class_<deep_ep::Buffer>(m, "Buffer", py::module_local())
+        .def(pybind11::init<int, int, int64_t, int64_t, bool, bool, bool, bool, int>())
         .def("is_available", &deep_ep::Buffer::is_available)
         .def("get_num_rdma_ranks", &deep_ep::Buffer::get_num_rdma_ranks)
         .def("get_rdma_rank", &deep_ep::Buffer::get_rdma_rank)
@@ -1921,7 +1939,15 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
         .def("get_local_ipc_handle", &deep_ep::Buffer::get_local_ipc_handle)
         .def("get_local_nvshmem_unique_id", &deep_ep::Buffer::get_local_nvshmem_unique_id)
         .def("get_local_buffer_tensor", &deep_ep::Buffer::get_local_buffer_tensor)
-        .def("get_comm_stream", &deep_ep::Buffer::get_comm_stream)
+        .def("get_comm_stream",
+           [](deep_ep::Buffer &self) {
+             int device_id = self.get_local_device_id();
+             cudaStream_t comm_stream = self.get_comm_stream();
+             auto s = phi::Stream(reinterpret_cast<phi::StreamId>(comm_stream));
+#if defined(PADDLE_WITH_CUDA)
+             return phi::CUDAStream(phi::GPUPlace(device_id), s);
+#endif
+           })
         .def("sync", &deep_ep::Buffer::sync)
         .def("destroy", &deep_ep::Buffer::destroy)
         .def("get_dispatch_layout", &deep_ep::Buffer::get_dispatch_layout)
diff --git a/csrc/deep_ep.hpp b/csrc/deep_ep.hpp
index 0520a639..2c6ebfdc 100644
--- a/csrc/deep_ep.hpp
+++ b/csrc/deep_ep.hpp
@@ -9,6 +9,7 @@
 #include <pybind11/pybind11.h>
 #include <pybind11/pytypes.h>
 #include <torch/types.h>
+#include <c10/cuda/CUDAStream.h>
 #include <tuple>
 #include <vector>
 
@@ -17,6 +18,9 @@
 #include "kernels/configs.cuh"
 #include "kernels/exception.cuh"
 
+#include "paddle/phi/core/memory/allocation/allocator_facade.h"
+#include "paddle/fluid/distributed/collective/process_group_nccl.h"
+
 #ifndef TORCH_EXTENSION_NAME
 #define TORCH_EXTENSION_NAME deep_ep_cpp
 #endif
@@ -79,7 +83,10 @@ struct Buffer {
     shared_memory::MemHandle ipc_handles[NUM_MAX_NVL_PEERS];
 
     // Stream for communication
-    at::cuda::CUDAStream comm_stream;
+    cudaStream_t comm_stream;
+
+    phi::distributed::NCCLCommContext* comm_ctx;
+    phi::GPUContext* calc_ctx;
 
     // After IPC/NVSHMEM synchronization, this flag will be true
     bool available = false;
@@ -118,7 +125,8 @@ struct Buffer {
            bool low_latency_mode,
            bool disable_nvlink_for_normal_mode,
            bool explicitly_destroy,
-           bool use_fabric);
+           bool use_fabric,
+           int context_ring_id);
 
     ~Buffer() noexcept(false);
 
@@ -140,7 +148,7 @@ struct Buffer {
 
     torch::Tensor get_local_buffer_tensor(const pybind11::object& dtype, int64_t offset, bool use_rdma_buffer) const;
 
-    torch::Stream get_comm_stream() const;
+    cudaStream_t get_comm_stream() const;
 
     void sync(const std::vector<int>& device_ids, const std::vector<std::optional<pybind11::bytearray>>& all_gathered_handles, const std::optional<pybind11::bytearray>& root_unique_id_opt);
 
@@ -224,4 +232,11 @@ struct Buffer {
     get_next_low_latency_combine_buffer(int num_max_dispatch_tokens_per_rank, int hidden, int num_experts) const;
 };
 
+inline void SetAllocatorStreamForGPUContext(gpuStream_t stream,
+                                            phi::GPUContext* ctx) {
+  ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+                        .GetAllocator(ctx->GetPlace(), stream)
+                        .get());
+}
+
 } // namespace deep_ep
diff --git a/csrc/event.hpp b/csrc/event.hpp
index a93444d1..8d0c7706 100644
--- a/csrc/event.hpp
+++ b/csrc/event.hpp
@@ -1,4 +1,6 @@
-#include <ATen/cuda/CUDAContext.h>
+#pragma once
+// #include <ATen/cuda/CUDAContext.h>
+#include <c10/core/Event.h>
 #include <memory>
 
 #include "kernels/exception.cuh"
@@ -13,7 +15,7 @@ struct EventHandle {
         event->record(at::cuda::getCurrentCUDAStream());
     }
 
-    explicit EventHandle(const at::cuda::CUDAStream& stream) {
+    explicit EventHandle(const cudaStream_t& stream) {
         event = std::make_shared<torch::Event>(torch::kCUDA);
         event->record(stream);
     }
@@ -21,23 +23,26 @@ struct EventHandle {
     EventHandle(const EventHandle& other) = default;
 
     void current_stream_wait() const {
-        at::cuda::getCurrentCUDAStream().unwrap().wait(*event);
+        CUDA_CHECK(cudaStreamWaitEvent(
+            at::cuda::getCurrentCUDAStream().raw_stream(),
+            event->cuda_event(),
+            0));
     }
 };
 
-torch::Event create_event(const at::cuda::CUDAStream &s) {
+torch::Event create_event(const cudaStream_t &s) {
     auto event = torch::Event(torch::kCUDA);
     event.record(s);
     return event;
 }
 
-void stream_wait(const at::cuda::CUDAStream& s_0, const at::cuda::CUDAStream& s_1) {
-    EP_HOST_ASSERT(s_0.id() != s_1.id());
-    s_0.unwrap().wait(create_event(s_1));
+inline void stream_wait(const cudaStream_t& s_0, const cudaStream_t& s_1) {
+  EP_HOST_ASSERT(s_0 != s_1);
+  CUDA_CHECK(cudaStreamWaitEvent(s_0, create_event(s_1).cuda_event(), 0));
 }
 
-void stream_wait(const at::cuda::CUDAStream& s, const EventHandle& event) {
-    s.unwrap().wait(*event.event);
+inline void stream_wait(const cudaStream_t& s, const EventHandle& event) {
+    CUDA_CHECK(cudaStreamWaitEvent(s, event.event->cuda_event(), 0));
 }
 
 } // namespace deep_ep
diff --git a/csrc/hybrid_ep/allocator/allocator.cu b/csrc/hybrid_ep/allocator/allocator.cu
index 7815b1d7..82d069d2 100644
--- a/csrc/hybrid_ep/allocator/allocator.cu
+++ b/csrc/hybrid_ep/allocator/allocator.cu
@@ -174,8 +174,8 @@ bool ExtendedMemoryAllocator::is_accessible(MemHandle* mem_handle) {
 
 int ExtendedMemoryAllocator::detect_accessible_ranks(pybind11::object process_group) {
   auto torch_distributed = py::module_::import("torch.distributed");  
-  int world_size = process_group.attr("size")().cast<int>();
-  int current_rank = process_group.attr("rank")().cast<int>();
+  int world_size = process_group.attr("world_size").cast<int>();
+  int current_rank = process_group.attr("rank").cast<int>();
   auto stream = at::cuda::getCurrentCUDAStream();
 
   // Put the test memory handle on a CUDA tensor
diff --git a/csrc/hybrid_ep/allocator/allocator.cuh b/csrc/hybrid_ep/allocator/allocator.cuh
index 4f2f79d2..2bf1c4b2 100644
--- a/csrc/hybrid_ep/allocator/allocator.cuh
+++ b/csrc/hybrid_ep/allocator/allocator.cuh
@@ -6,14 +6,17 @@
 #include <cuda_profiler_api.h>
 #include <cuda_runtime.h>
 #include <cuda_runtime_api.h>
-#include <torch/torch.h>
-#include <ATen/cuda/CUDAContext.h>
+// #include <torch/torch.h>
+#include <torch/python.h>
+// #include <ATen/cuda/CUDAContext.h>
 #include <pybind11/pybind11.h>
 #include <pybind11/pytypes.h>
 #include <cstring>
 
 #include "utils.cuh"
 
+namespace py = pybind11;
+
 struct MemHandle {
   union MemHandleInner {
     cudaIpcMemHandle_t cuda_ipc_mem_handle;
diff --git a/csrc/hybrid_ep/executor/executor.cu b/csrc/hybrid_ep/executor/executor.cu
index 739d0f58..2f3f839e 100644
--- a/csrc/hybrid_ep/executor/executor.cu
+++ b/csrc/hybrid_ep/executor/executor.cu
@@ -15,10 +15,11 @@ torch::Tensor Executor::allgather_routing_map(
 ){
     nvtxRangePushA("allgather_routing_map in hybrid-ep");
 
-    auto torch_distributed = py::module_::import("torch.distributed");
+    // Import paddle.distributed directly (goes through paddle runtime, not torch)
+    auto paddle_distributed = py::module_::import("paddle.distributed");
     auto num_of_expert = local_routing_map.size(-1);
     auto num_of_tokens_per_rank = local_routing_map.size(-2);
-    auto group_size = process_group.attr("size")().cast<int>();
+    auto group_size = process_group.attr("world_size").cast<int>();
     assert(num_of_expert == config.num_of_experts_per_rank * config.num_of_ranks_per_node * config.num_of_nodes);
 
     torch::Tensor global_routing_map;
@@ -28,7 +29,7 @@ torch::Tensor Executor::allgather_routing_map(
             {num_of_tokens_per_rank * group_size, num_of_expert},
             torch::TensorOptions().dtype(torch::kBool).device(torch::kCUDA)
         );
-        torch_distributed.attr("all_gather_into_tensor")(global_routing_map, local_routing_map, process_group);
+        paddle_distributed.attr("stream").attr("all_gather")(global_routing_map, local_routing_map, process_group, py::arg("sync_op") = true);
     } else { // At intra-node case, we will use custom allgather
         allgather_obj.launch(local_routing_map, /*NUM_OF_SMS=*/32, at::cuda::getCurrentCUDAStream());
         global_routing_map = torch::from_blob(
diff --git a/csrc/hybrid_ep/executor/executor.cuh b/csrc/hybrid_ep/executor/executor.cuh
index 46509bf0..82bf6b38 100644
--- a/csrc/hybrid_ep/executor/executor.cuh
+++ b/csrc/hybrid_ep/executor/executor.cuh
@@ -2,9 +2,11 @@
 // SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 
 #pragma once
-#include <ATen/cuda/CUDAContext.h>
+// #include <ATen/cuda/CUDAContext.h>
 #include <c10/util/Optional.h>
-#include <torch/torch.h>
+// #include <torch/torch.h>
+#include <torch/python.h>
+#include <c10/cuda/CUDAStream.h>
 
 #include "utils.cuh"
 #include "hybrid_ep_backend.cuh"
diff --git a/csrc/hybrid_ep/extension/allgather.cu b/csrc/hybrid_ep/extension/allgather.cu
index be53b609..003ec02f 100644
--- a/csrc/hybrid_ep/extension/allgather.cu
+++ b/csrc/hybrid_ep/extension/allgather.cu
@@ -171,9 +171,15 @@ void CustomAllgather::open_ag_handles() {
     // Use Python's torch.distributed APIs through py::object
     auto torch_distributed = py::module_::import("torch.distributed");    
     // Move tensors to CUDA for communication
-    auto ag_handles_cuda = ag_handles.cuda();  
+    // auto ag_handles_cuda = ag_handles.cuda(); 
+    MemHandle handles[2];
+    auto ag_handles_cuda = torch::empty({static_cast<int64_t>(sizeof(handles))},
+                                                torch::dtype(torch::kUInt8).device(torch::kCUDA));
+    CUDA_CHECK(cudaMemcpy(ag_handles_cuda.data_ptr(), ag_handles.data_ptr(), static_cast<int64_t>(sizeof(handles)),
+                        cudaMemcpyHostToDevice));
+                        
     // Get world size from process group
-    int world_size = process_group.attr("size")().cast<int>();
+    int world_size = process_group.attr("world_size").cast<int>();
     // Create empty tensors for allgather output
     py::list ag_handles_output_list;
   
diff --git a/csrc/hybrid_ep/extension/permute.cuh b/csrc/hybrid_ep/extension/permute.cuh
index 6202cffe..cdf6da9d 100644
--- a/csrc/hybrid_ep/extension/permute.cuh
+++ b/csrc/hybrid_ep/extension/permute.cuh
@@ -2,11 +2,13 @@
 // SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 
 #pragma once
-#include <ATen/cuda/CUDAContext.h>
+// #include <ATen/cuda/CUDAContext.h>
 #include <cooperative_groups.h>
 #include <cuda_bf16.h>
 #include <cuda_fp16.h>
-#include <torch/torch.h>
+#include <c10/util/Optional.h>
+// #include <torch/torch.h>
+#include <torch/python.h>
 #include <cub/cub.cuh>
 #include <type_traits>
 #include "utils.cuh"
diff --git a/csrc/hybrid_ep/hybrid_ep.cu b/csrc/hybrid_ep/hybrid_ep.cu
index 5481066a..9089b4cb 100644
--- a/csrc/hybrid_ep/hybrid_ep.cu
+++ b/csrc/hybrid_ep/hybrid_ep.cu
@@ -12,15 +12,11 @@ std::string get_comm_id(pybind11::object process_group) {
 
   // Get the global id of each rank in the process group
   std::vector<int> global_ranks;
-  pybind11::object get_global_rank;
-  if (pybind11::hasattr(torch_distributed, "get_global_rank")) {
-    get_global_rank = torch_distributed.attr("get_global_rank");
-  } 
-  int group_size = process_group.attr("size")().cast<int>();
+  int group_id = process_group.attr("id").cast<int>(); 
+  int group_size = process_group.attr("world_size").cast<int>();
   global_ranks.reserve(group_size);
   for (int i = 0; i < group_size; ++i) {
-    int g = get_global_rank(process_group, i).cast<int>();
-    global_ranks.push_back(g);
+    global_ranks.push_back(group_id);
   }
 
   // Concatenate the global ranks into a string
@@ -299,11 +295,21 @@ void HybridEPBuffer::exchange_remote_handle() {
   auto torch_distributed = py::module_::import("torch.distributed");
   
   // Move tensors to CUDA for communication
-  auto dispatch_cuda = dispatch_memory_handles.cuda();
-  auto combine_cuda = combine_memory_handles.cuda();
+  // auto dispatch_cuda = dispatch_memory_handles.cuda();
+  MemHandle dispatch_handles[4];
+  auto dispatch_cuda = torch::empty({static_cast<int64_t>(sizeof(dispatch_handles))},
+                                     torch::dtype(torch::kUInt8).device(torch::kCUDA));
+  CUDA_CHECK(cudaMemcpy(dispatch_cuda.data_ptr(), dispatch_memory_handles.data_ptr(), static_cast<int64_t>(sizeof(dispatch_handles)),
+                        cudaMemcpyHostToDevice));
+  // auto combine_cuda = combine_memory_handles.cuda();
+  MemHandle combine_handles[3];
+  auto combine_cuda = torch::empty({static_cast<int64_t>(sizeof(combine_handles))},
+                                    torch::dtype(torch::kUInt8).device(torch::kCUDA));
+  CUDA_CHECK(cudaMemcpy(combine_cuda.data_ptr(), combine_memory_handles.data_ptr(), static_cast<int64_t>(sizeof(combine_handles)),
+                        cudaMemcpyHostToDevice));
   
   // Get world size from process group
-  int world_size = process_group.attr("size")().cast<int>();
+  int world_size = process_group.attr("world_size").cast<int>();
   
   // Create empty tensors for allgather output
   py::list dispatch_output_list;
@@ -326,7 +332,7 @@ void HybridEPBuffer::exchange_remote_handle() {
     dispatch_cpu_tensors.push_back(dispatch_output_list[i].cast<torch::Tensor>().cpu());
     combine_cpu_tensors.push_back(combine_output_list[i].cast<torch::Tensor>().cpu());
   }
-  
+
   // Open handles from other ranks
   open_handles_from_other_ranks(dispatch_cpu_tensors, combine_cpu_tensors);
 }
@@ -452,10 +458,10 @@ bool HybridEPBuffer::update_buffer(HybridEpConfigInstance config) {
     buffer_config.token_data_type = config.token_data_type;
   }
 
-  if(buffer_config.num_of_nodes > 1 && need_reallocate) {
-    TORCH_WARN("Reallocating HybridEP buffers in multi-node mode is very slow; "
-               "adjust buffer_config to pre-allocate sufficient capacity.");
-  }
+  // if(buffer_config.num_of_nodes > 1 && need_reallocate) {
+  //   TORCH_WARN("Reallocating HybridEP buffers in multi-node mode is very slow; "
+  //              "adjust buffer_config to pre-allocate sufficient capacity.");
+  // }
 
   if(need_reallocate) {
   #ifdef HYBRID_EP_BUILD_MULTINODE_ENABLE
diff --git a/csrc/hybrid_ep/hybrid_ep.cuh b/csrc/hybrid_ep/hybrid_ep.cuh
index 7a5972b8..d15f1b87 100644
--- a/csrc/hybrid_ep/hybrid_ep.cuh
+++ b/csrc/hybrid_ep/hybrid_ep.cuh
@@ -7,9 +7,10 @@
 #include "utils.cuh"
 #include "executor/executor.cuh"
 #include "extension/allgather.cuh"
-#include <ATen/cuda/CUDAContext.h>
+// #include <ATen/cuda/CUDAContext.h>
 #include <c10/util/Optional.h>
-#include <torch/torch.h>
+// #include <torch/torch.h>
+#include <torch/python.h>
 #include <vector>
 #include <algorithm>
 #include <string>
diff --git a/csrc/hybrid_ep/internode.cu b/csrc/hybrid_ep/internode.cu
index 7d0db6d1..ad53044f 100644
--- a/csrc/hybrid_ep/internode.cu
+++ b/csrc/hybrid_ep/internode.cu
@@ -708,7 +708,7 @@ void RDMACoordinator::exchange_remote_rdma_info(remote_info* dst, remote_info *s
   buffer = buffer.cuda();
 
   // Get world size from process group
-  int world_size = process_group.attr("size")().cast<int>();
+  int world_size = process_group.attr("world_size").cast<int>();
   // Create empty tensors for allgather output
   py::list output_list;
   for (int i = 0; i < world_size; i++) {
diff --git a/csrc/hybrid_ep/internode.cuh b/csrc/hybrid_ep/internode.cuh
index 9d8d918a..c3c6f072 100644
--- a/csrc/hybrid_ep/internode.cuh
+++ b/csrc/hybrid_ep/internode.cuh
@@ -4,9 +4,10 @@
 #include <unordered_map>
 #include <cstdint>
 #include <pybind11/pybind11.h>
-#include <ATen/cuda/CUDAContext.h>
+// #include <ATen/cuda/CUDAContext.h>
 #include <c10/util/Optional.h>
-#include <torch/torch.h>
+// #include <torch/torch.h>
+#include <torch/python.h>
 #include <iostream>
 #include <dlfcn.h>
 #include "backend/hybrid_ep_backend.cuh"
diff --git a/csrc/hybrid_ep/pybind_hybrid_ep.cu b/csrc/hybrid_ep/pybind_hybrid_ep.cu
index 812954f1..4ac1d467 100644
--- a/csrc/hybrid_ep/pybind_hybrid_ep.cu
+++ b/csrc/hybrid_ep/pybind_hybrid_ep.cu
@@ -16,7 +16,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
     m.doc() = "HybridEP, efficiently enable the expert-parallel communication in "
               "the Hopper+ architectures";
     
-    pybind11::class_<ExtendedMemoryAllocator>(m, "ExtendedMemoryAllocator")
+    pybind11::class_<ExtendedMemoryAllocator>(m, "ExtendedMemoryAllocator", py::module_local())
         .def(py::init<>())
         .def("detect_accessible_ranks", &ExtendedMemoryAllocator::detect_accessible_ranks, py::arg("process_group"));
       
@@ -28,7 +28,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
         .def("__str__",
              [](const APP_TOKEN_DATA_TYPE &type) { return type_to_string(type); });
   
-    pybind11::class_<BufferConfig>(m, "BufferConfig")
+    pybind11::class_<BufferConfig>(m, "BufferConfig", py::module_local())
         .def(py::init<>())
         .def_readwrite("hidden_dim", &BufferConfig::hidden_dim)
         .def_readwrite("max_num_of_tokens_per_rank", &BufferConfig::max_num_of_tokens_per_rank)
@@ -60,7 +60,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
                  ">";
         });
 
-    pybind11::class_<HybridEpConfigInstance>(m, "HybridEpConfigInstance")
+    pybind11::class_<HybridEpConfigInstance>(m, "HybridEpConfigInstance", py::module_local())
         .def(py::init<>())
         // Hybrid-ep Config
         .def_readwrite("hidden_dim", &HybridEpConfigInstance::hidden_dim)
@@ -120,7 +120,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
                  ">";
         });
   
-    pybind11::class_<HybridEPBuffer>(m, "HybridEPBuffer")
+    pybind11::class_<HybridEPBuffer>(m, "HybridEPBuffer", py::module_local())
         .def(py::init<py::object, BufferConfig, int, int, int, std::string, bool, bool, bool>(),
             py::arg("process_group"),
             py::arg("config"),
diff --git a/deep_ep/__init__.py b/deep_ep/__init__.py
index f09f54a0..7320c498 100644
--- a/deep_ep/__init__.py
+++ b/deep_ep/__init__.py
@@ -1,7 +1,7 @@
 # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 import torch
 
-from .utils import EventOverlap
+from .utils import EventOverlap, get_event_from_comm_stream
 from .buffer import Buffer
 from .hybrid_ep_buffer import HybridEPBuffer
 
diff --git a/deep_ep/buffer.py b/deep_ep/buffer.py
index 62dbdc24..972252c3 100644
--- a/deep_ep/buffer.py
+++ b/deep_ep/buffer.py
@@ -8,7 +8,8 @@
 import deep_ep_cpp
 # noinspection PyUnresolvedReferences
 from deep_ep_cpp import Config, EventHandle
-from .utils import EventOverlap, check_nvlink_connections
+from .utils import EventOverlap
+from paddle.distributed.communication.group import Group
 
 
 class Buffer:
@@ -30,7 +31,7 @@ class Buffer:
 
     num_sms: int = 20
 
-    def __init__(self, group: Optional[dist.ProcessGroup],
+    def __init__(self, group: Optional[Group],
                  num_nvl_bytes: int = 0, num_rdma_bytes: int = 0,
                  low_latency_mode: bool = False,  num_qps_per_rank: int = 24,
                  allow_nvlink_for_normal_mode: bool = True,
@@ -62,13 +63,13 @@ def __init__(self, group: Optional[dist.ProcessGroup],
             comm: the `mpi4py.MPI.Comm` communicator to use in case the group parameter is absent.
         """
         # Check NVLink requirements based on configuration
-        check_nvlink_connections(group, allow_nvlink_for_normal_mode, allow_nvlink_for_low_latency_mode, low_latency_mode)
+        # check_nvlink_connections(group, allow_nvlink_for_normal_mode, allow_nvlink_for_low_latency_mode, low_latency_mode)
 
         # Initialize the CPP runtime
         if group is not None:
-            self.rank = group.rank()
+            self.rank = group.rank
             self.group = group
-            self.group_size = group.size()
+            self.group_size = group.world_size
 
             def all_gather_object(obj):
                 object_list = [None] * self.group_size
@@ -89,15 +90,17 @@ def all_gather_object(obj):
         self.disable_nvlink_for_normal_mode = not allow_nvlink_for_normal_mode
         self.explicitly_destroy = explicitly_destroy
         self.runtime = deep_ep_cpp.Buffer(self.rank, self.group_size, num_nvl_bytes, num_rdma_bytes, low_latency_mode, 
-                                          self.disable_nvlink_for_normal_mode,explicitly_destroy, use_fabric)
+                                          self.disable_nvlink_for_normal_mode,explicitly_destroy, use_fabric, group.id)
 
         # Synchronize device IDs
+        device_ids = []
         local_device_id = self.runtime.get_local_device_id()
-        device_ids = all_gather_object(local_device_id)
+        dist.all_gather_object(device_ids, local_device_id, group)
 
         # Synchronize IPC handles
+        ipc_handles = []
         local_ipc_handle = self.runtime.get_local_ipc_handle()
-        ipc_handles = all_gather_object(local_ipc_handle)
+        dist.all_gather_object(ipc_handles, local_ipc_handle, group)
 
         # Synchronize NVSHMEM unique IDs
         root_unique_id = None
@@ -122,10 +125,12 @@ def all_gather_object(obj):
                 # Disable multi-node NVLink detection
                 os.environ['NVSHMEM_DISABLE_MNNVL'] = '1'
 
+            nvshmem_unique_ids = []
             # Synchronize using the root ID
             if (low_latency_mode and self.rank == 0) or (not low_latency_mode and self.runtime.get_rdma_rank() == 0) or (self.disable_nvlink_for_normal_mode and self.rank == 0):
                 root_unique_id = self.runtime.get_local_nvshmem_unique_id()
-            nvshmem_unique_ids = all_gather_object(root_unique_id)
+            # nvshmem_unique_ids = all_gather_object(root_unique_id)
+            dist.all_gather_object(nvshmem_unique_ids, root_unique_id, group)
             root_unique_id = nvshmem_unique_ids[0 if low_latency_mode or self.disable_nvlink_for_normal_mode else self.runtime.get_root_rdma_rank(True)]
 
         # Make CPP runtime available
@@ -238,8 +243,8 @@ def get_dispatch_config(num_ranks: int) -> Config:
 
         # TODO: automatically tune
         config_map = {
-            2: Config(Buffer.num_sms, 24, 256, 6, 128),
-            4: Config(Buffer.num_sms, 6, 256, 6, 128),
+            2: Config(Buffer.num_sms, 16, 256, 6, 128),
+            4: Config(Buffer.num_sms, 16, 256, 6, 128),
             8: Config(Buffer.num_sms, 6, 256, 6, 128),
             16: Config(Buffer.num_sms, 36, 288, 20, 128),
             24: Config(Buffer.num_sms, 8, 288, 32, 128),
@@ -266,9 +271,9 @@ def get_combine_config(num_ranks: int) -> Config:
 
         # TODO: automatically tune
         config_map = {
-            2: Config(Buffer.num_sms, 10, 256, 6, 128),
-            4: Config(Buffer.num_sms, 9, 256, 6, 128),
-            8: Config(Buffer.num_sms, 4, 256, 6, 128),
+            2: Config(Buffer.num_sms, 6, 256, 6, 128),
+            4: Config(Buffer.num_sms, 6, 256, 6, 128),
+            8: Config(Buffer.num_sms, 6, 256, 6, 128),
             16: Config(Buffer.num_sms, 4, 288, 12, 128),
             24: Config(Buffer.num_sms, 1, 288, 8, 128),
             32: Config(Buffer.num_sms, 1, 288, 8, 128),
diff --git a/deep_ep/hybrid_ep_buffer.py b/deep_ep/hybrid_ep_buffer.py
index 6f9c6b60..1fda0805 100644
--- a/deep_ep/hybrid_ep_buffer.py
+++ b/deep_ep/hybrid_ep_buffer.py
@@ -5,6 +5,8 @@
 import shutil
 import hybrid_ep_cpp
 import warnings
+from paddle.distributed.communication.group import Group
+import paddle
 
 def indices_to_map(
     topk_idx: torch.Tensor,
@@ -18,14 +20,29 @@ def indices_to_map(
     # Generate the routing map and the probs according to the topk_idx and topk_weights.
     assert topk_idx is not None
     routing_map = torch.zeros(
-        num_of_tokens, num_of_experts, device="cuda", dtype=torch.bool
-    )
-    routing_map = routing_map.scatter(1, topk_idx.to(torch.int64), 1).bool()
+        num_of_tokens, num_of_experts, dtype=torch.bool
+    ).cuda()
+    # routing_map = routing_map.scatter(1, topk_idx.to(torch.int64), 1).bool()
+    batch_size = routing_map.shape[0]
+    num_experts = routing_map.shape[1]
+    topk = topk_idx.shape[1]
+    row_indices = paddle.arange(0, batch_size, dtype=topk_idx.dtype).unsqueeze(1).expand([batch_size, topk])
+    indices = paddle.stack([row_indices, topk_idx], axis=2).reshape([-1, 2])
+
+    tmp = paddle.zeros([batch_size, num_experts], dtype='float32')
+    ones = paddle.ones([indices.shape[0],], dtype='float32')
+    tmp = paddle.scatter_nd_add(tmp, indices, ones)
+
+    routing_map = (tmp > 0).astype('bool')
+
     if topk_weights is not None:
         probs = torch.zeros(
-            num_of_tokens, num_of_experts, device="cuda", dtype=torch.float32
-        )
-        probs = probs.scatter(1, topk_idx.to(torch.int64), topk_weights)
+            num_of_tokens, num_of_experts, dtype=torch.float32
+        ).cuda()
+        updates = topk_weights.reshape([-1])
+        tmp = paddle.zeros_like(probs)
+        tmp = paddle.scatter_nd_add(tmp, indices, updates)
+        probs = tmp
     else:
         probs = None
     return routing_map, probs
@@ -34,7 +51,7 @@ def indices_to_map(
 class HybridEPBuffer:
     def __init__(
         self,
-        group: torch.distributed.ProcessGroup,
+        group: Group,
         # Parameters for the hybrid-ep buffer allocation
         hidden_dim: int,
         max_num_of_tokens_per_rank: int,
@@ -53,24 +70,21 @@ def __init__(
         use_mnnvl: bool = None
     ):
         self.group = group
-        self.rank = self.group.rank()
-        self.group_size = self.group.size()
+        self.rank = self.group.rank
+        self.group_size = self.group.world_size
         assert (
             self.group_size > 1
         ), f"The hybrid-ep kernel should be used with at least 2 ranks, but got {self.group_size}."
 
-        allocator = hybrid_ep_cpp.ExtendedMemoryAllocator()
-        detected_ranks = allocator.detect_accessible_ranks(self.group)
+        # Use environment variable or default to group_size (all ranks in one node)
+        # Note: detect_accessible_ranks is disabled because it uses PyTorch distributed API
+        # which is incompatible with Paddle's Group object
         env_value = os.getenv("NUM_OF_HYBRID_EP_RANKS_PER_NVLINK_DOMAIN")
         if env_value is not None:
             self.num_of_hybrid_ep_ranks_per_nvlink_domain = int(env_value)
-            if self.num_of_hybrid_ep_ranks_per_nvlink_domain != detected_ranks:
-                warnings.warn(
-                    f"[Warning] NUM_OF_HYBRID_EP_RANKS_PER_NVLINK_DOMAIN={self.num_of_hybrid_ep_ranks_per_nvlink_domain} "
-                    f"differs from detected value {detected_ranks}. Using environment variable."
-                )
         else:
-            self.num_of_hybrid_ep_ranks_per_nvlink_domain = detected_ranks
+            # Default: assume all ranks are in the same NVLink domain (single node)
+            self.num_of_hybrid_ep_ranks_per_nvlink_domain = self.group_size
         
         assert (
             self.group_size % self.num_of_hybrid_ep_ranks_per_nvlink_domain == 0
@@ -134,15 +148,18 @@ def __init__(
       
         # Create C++ buffer - this will allocate all buffers during construction
         self.runtime = hybrid_ep_cpp.HybridEPBuffer(
-            self.group, 
-            self.config, 
-            self.local_rank, 
-            self.node_rank, 
-            self.group_size, 
-            os.path.dirname(os.path.abspath(__file__)), 
+            self.group,
+            self.config,
+            self.local_rank,
+            self.node_rank,
+            self.group_size,
+            os.path.dirname(os.path.abspath(__file__)),
             load_cached_kernels = load_cached_kernels,   # whether to load the cached kernels in disk
             use_shared_buffer = use_shared_buffer,      # whether to use the shared buffer for dispatch and combine
-            enable_custom_allgather = enable_custom_allgather  # whether to use the custom allgather for intra-node communication
+            # Disable custom allgather by default because its data layout is incompatible with scan kernel
+            # The custom allgather kernel produces token-interleaved layout, but scan kernel expects
+            # the standard allgather layout (rank-blocked layout)
+            enable_custom_allgather = False  # Always use standard allgather for correctness
         )
 
     def empty_jit_cache(self):
diff --git a/deep_ep/utils.py b/deep_ep/utils.py
index 2d5842bc..45fbc72a 100644
--- a/deep_ep/utils.py
+++ b/deep_ep/utils.py
@@ -7,7 +7,7 @@
 
 # noinspection PyUnresolvedReferences
 from deep_ep_cpp import Config, EventHandle
-
+import paddle
 
 class EventOverlap:
     """
@@ -39,6 +39,12 @@ def current_stream_wait(self) -> None:
         """
         assert self.event is not None
         self.event.current_stream_wait()
+    
+    def calc_stream_wait(self, group_idx) -> None:
+        self.event.calc_stream_wait(group_idx)
+
+    def comm_stream_wait(self, group_idx) -> None:
+        self.event.comm_stream_wait(group_idx)
 
     def __enter__(self) -> Any:
         """
@@ -64,7 +70,7 @@ def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
             self.event.current_stream_wait()
 
 
-def check_nvlink_connections(group: dist.ProcessGroup, 
+def check_nvlink_connections(group, 
                               allow_nvlink_for_normal_mode: bool = True,
                               allow_nvlink_for_low_latency_mode: bool = True,
                               low_latency_mode: bool = False) -> None:
@@ -102,3 +108,7 @@ def check_nvlink_connections(group: dist.ProcessGroup,
     pynvml.nvmlShutdown()
 
 
+def get_event_from_comm_stream(group_id: int) -> EventOverlap:
+    return EventOverlap(
+        event=paddle.base.core.get_event_handle_from_comm_stream(group_id)
+    )
\ No newline at end of file
diff --git a/setup_deep_ep.py b/setup_deep_ep.py
new file mode 100644
index 00000000..a10465a9
--- /dev/null
+++ b/setup_deep_ep.py
@@ -0,0 +1,182 @@
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+import os
+import subprocess
+import setuptools
+import importlib
+import shutil
+import re
+
+from pathlib import Path
+from paddle.utils.cpp_extension import BuildExtension, CUDAExtension, _get_cuda_arch_flags
+from paddle.utils.cpp_extension.extension_utils import (
+    add_compile_flag,
+)
+
+def collect_package_files(package: str, relative_dir: str):
+    base_path = Path(package) / relative_dir
+    if not base_path.exists():
+        return []
+    return [
+        str(path.relative_to(package))
+        for path in base_path.rglob('*')
+        if path.is_file()
+    ]
+
+
+# Wheel specific: the wheels only include the soname of the host library `libnvshmem_host.so.X`
+def get_nvshmem_host_lib_name(base_dir):
+    path = Path(base_dir).joinpath('lib')
+    for file in path.rglob('libnvshmem_host.so.*'):
+        return file.name
+    raise ModuleNotFoundError('libnvshmem_host.so not found')
+
+def to_nvcc_gencode(s: str) -> str:
+    flags = []
+    for part in re.split(r'[,\s;]+', s.strip()):
+        if not part:
+            continue
+        m = re.fullmatch(r'(\d+)\.(\d+)([A-Za-z]?)', part)
+        if not m:
+            raise ValueError(f"Invalid entry: {part}")
+        major, minor, suf = m.groups()
+        arch = f"{int(major)}{int(minor)}{suf.lower()}"
+        flags.append(f"-gencode=arch=compute_{arch},code=sm_{arch}")
+    return " ".join(flags)
+
+def get_extension_deep_ep_cpp():
+    disable_nvshmem = False
+    nvshmem_dir = os.getenv('NVSHMEM_DIR', None)
+    nvshmem_host_lib = 'libnvshmem_host.so'
+    if nvshmem_dir is None:
+        try:
+            nvshmem_dir = importlib.util.find_spec("nvidia.nvshmem").submodule_search_locations[0]
+            nvshmem_host_lib = get_nvshmem_host_lib_name(nvshmem_dir)
+            import nvidia.nvshmem as nvshmem
+        except (ModuleNotFoundError, AttributeError, IndexError):
+            print('Warning: `NVSHMEM_DIR` is not specified, and the NVSHMEM module is not installed. All internode and low-latency features are disabled\n')
+            disable_nvshmem = True
+    else:
+        disable_nvshmem = False
+
+    if not disable_nvshmem:
+        assert os.path.exists(nvshmem_dir), f'The specified NVSHMEM directory does not exist: {nvshmem_dir}'
+
+    cxx_flags = ['-O3', '-Wno-deprecated-declarations', '-Wno-unused-variable',
+                 '-Wno-sign-compare', '-Wno-reorder', '-Wno-attributes']
+    nvcc_flags = ['-O3', '-Xcompiler', '-O3']
+    sources = ['csrc/deep_ep.cpp', 'csrc/kernels/runtime.cu', 'csrc/kernels/layout.cu', 'csrc/kernels/intranode.cu']
+    include_dirs = ['csrc/']
+    library_dirs = []
+    nvcc_dlink = []
+    extra_link_args = ['-lcuda']
+
+    # NVSHMEM flags
+    if disable_nvshmem:
+        cxx_flags.append('-DDISABLE_NVSHMEM')
+        nvcc_flags.append('-DDISABLE_NVSHMEM')
+    else:
+        sources.extend(['csrc/kernels/internode.cu', 'csrc/kernels/internode_ll.cu', 'csrc/kernels/pcie.cu'])
+        include_dirs.extend([f'{nvshmem_dir}/include'])
+        library_dirs.extend([f'{nvshmem_dir}/lib'])
+        nvcc_dlink.extend(['-dlink', f'-L{nvshmem_dir}/lib', '-lnvshmem_device'])
+        extra_link_args.extend([f'-l:{nvshmem_host_lib}', '-l:libnvshmem_device.a', f'-Wl,-rpath,{nvshmem_dir}/lib'])
+
+    if int(os.getenv('DISABLE_SM90_FEATURES', 0)):
+        # Prefer A100
+        os.environ['PADDLE_CUDA_ARCH_LIST'] = os.getenv('PADDLE_CUDA_ARCH_LIST', '8.0')
+
+        # Disable some SM90 features: FP8, launch methods, and TMA
+        cxx_flags.append('-DDISABLE_SM90_FEATURES')
+        nvcc_flags.append('-DDISABLE_SM90_FEATURES')
+
+        # Disable internode and low-latency kernels
+        assert disable_nvshmem
+    else:
+        # Prefer H800 series
+        os.environ['PADDLE_CUDA_ARCH_LIST'] = os.getenv('PADDLE_CUDA_ARCH_LIST', '9.0')
+
+        # CUDA 12 flags
+        nvcc_flags.extend(['-rdc=true', '--ptxas-options=--register-usage-level=10'])
+        
+        # Ensure device linking and CUDA device runtime when RDC is enabled
+        if '-rdc=true' in nvcc_flags and '-dlink' not in nvcc_dlink:
+            nvcc_dlink.append('-dlink')
+
+    # CUDA 12 flags
+    nvcc_flags.extend(['-rdc=true', '--ptxas-options=--register-usage-level=10'])
+
+    # Disable LD/ST tricks, as some CUDA version does not support `.L1::no_allocate`
+    if os.environ['PADDLE_CUDA_ARCH_LIST'].strip() != '9.0':
+        assert int(os.getenv('DISABLE_AGGRESSIVE_PTX_INSTRS', 1)) == 1
+        os.environ['DISABLE_AGGRESSIVE_PTX_INSTRS'] = '1'
+
+    # Disable aggressive PTX instructions
+    if int(os.getenv('DISABLE_AGGRESSIVE_PTX_INSTRS', '1')):
+        cxx_flags.append('-DDISABLE_AGGRESSIVE_PTX_INSTRS')
+        nvcc_flags.append('-DDISABLE_AGGRESSIVE_PTX_INSTRS')
+
+    # Put them together
+    extra_compile_args = {
+        'cxx': cxx_flags,
+        'nvcc': nvcc_flags,
+    }
+    if len(nvcc_dlink) > 0:
+        nvcc_dlink = nvcc_dlink + _get_cuda_arch_flags()
+        extra_compile_args['nvcc_dlink'] = nvcc_dlink
+
+    # Summary
+    print(f'Build summary:')
+    print(f' > Sources: {sources}')
+    print(f' > Includes: {include_dirs}')
+    print(f' > Libraries: {library_dirs}')
+    print(f' > Compilation flags: {extra_compile_args}')
+    print(f' > Link flags: {extra_link_args}')
+    print(f' > Arch list: {os.environ["PADDLE_CUDA_ARCH_LIST"]}')
+    print(f' > NVSHMEM path: {nvshmem_dir}')
+    print()
+
+    add_compile_flag(extra_compile_args, ['-DPADDLE_WITH_CUDA'])
+    add_compile_flag(extra_compile_args, ['-DWITH_DISTRIBUTE'])
+    add_compile_flag(extra_compile_args, ['-DWITH_NVSHMEM'])
+    add_compile_flag(extra_compile_args, ['-DWITH_GPU'])
+    add_compile_flag(extra_compile_args, ['-DWITH_FLUID_ONLY'])
+
+    extension_deep_ep_cpp = CUDAExtension(
+        name='deep_ep_cpp',
+        include_dirs=include_dirs,
+        library_dirs=library_dirs,
+        sources=sources,
+        extra_compile_args=extra_compile_args,
+        extra_link_args=extra_link_args
+    )
+
+    return extension_deep_ep_cpp
+
+if __name__ == '__main__':
+    # noinspection PyBroadException
+    try:
+        cmd = ['git', 'rev-parse', '--short', 'HEAD']
+        revision = '+' + subprocess.check_output(cmd).decode('ascii').rstrip()
+    except Exception as _:
+        revision = ''
+
+    setuptools.setup(
+        name='deep_ep',
+        version='1.2.1' + revision,
+        packages=setuptools.find_packages(
+            include=['deep_ep']
+        ),
+        install_requires=[
+            'pynvml',
+        ],
+        ext_modules=[
+            get_extension_deep_ep_cpp(),
+        ],
+        cmdclass={
+            'build_ext': BuildExtension
+        },
+        package_data={
+            'deep_ep': collect_package_files('deep_ep', 'backend'),
+        },
+        include_package_data=True
+    )
diff --git a/setup_hybrid_ep.py b/setup_hybrid_ep.py
new file mode 100644
index 00000000..fe331080
--- /dev/null
+++ b/setup_hybrid_ep.py
@@ -0,0 +1,208 @@
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+import os
+import subprocess
+import setuptools
+import importlib
+import shutil
+import re
+
+from pathlib import Path
+from paddle.utils.cpp_extension import BuildExtension, CUDAExtension, _get_cuda_arch_flags
+from paddle.utils.cpp_extension.extension_utils import (
+    add_compile_flag,
+)
+
+def collect_package_files(package: str, relative_dir: str):
+    base_path = Path(package) / relative_dir
+    if not base_path.exists():
+        return []
+    return [
+        str(path.relative_to(package))
+        for path in base_path.rglob('*')
+        if path.is_file()
+    ]
+
+
+# Wheel specific: the wheels only include the soname of the host library `libnvshmem_host.so.X`
+def get_nvshmem_host_lib_name(base_dir):
+    path = Path(base_dir).joinpath('lib')
+    for file in path.rglob('libnvshmem_host.so.*'):
+        return file.name
+    raise ModuleNotFoundError('libnvshmem_host.so not found')
+
+def to_nvcc_gencode(s: str) -> str:
+    flags = []
+    for part in re.split(r'[,\s;]+', s.strip()):
+        if not part:
+            continue
+        m = re.fullmatch(r'(\d+)\.(\d+)([A-Za-z]?)', part)
+        if not m:
+            raise ValueError(f"Invalid entry: {part}")
+        major, minor, suf = m.groups()
+        arch = f"{int(major)}{int(minor)}{suf.lower()}"
+        flags.append(f"-gencode=arch=compute_{arch},code=sm_{arch}")
+    return " ".join(flags)
+
+
+def get_extension_hybrid_ep_cpp():
+    current_dir = os.path.dirname(os.path.abspath(__file__))
+    enable_multinode = os.getenv("HYBRID_EP_MULTINODE", "0").strip().lower() in {"1", "true", "t", "yes", "y", "on"}
+
+    # Default to Blackwell series
+    os.environ['PADDLE_CUDA_ARCH_LIST'] = os.getenv('PADDLE_CUDA_ARCH_LIST', '10.0')
+
+    # Basic compile arguments
+    compile_args = {
+        "nvcc": [
+            "-std=c++17",
+            "-Xcompiler",
+            "-fPIC",
+            "--expt-relaxed-constexpr",
+            "-O3",
+            "--shared",
+        ],
+    }
+
+    sources = [
+        "csrc/hybrid_ep/hybrid_ep.cu",
+        "csrc/hybrid_ep/allocator/allocator.cu",
+        "csrc/hybrid_ep/jit/compiler.cu",
+        "csrc/hybrid_ep/executor/executor.cu",
+        "csrc/hybrid_ep/extension/permute.cu",
+        "csrc/hybrid_ep/extension/allgather.cu",
+        "csrc/hybrid_ep/pybind_hybrid_ep.cu",
+    ]
+    include_dirs = [
+        os.path.join(current_dir, "csrc/hybrid_ep/"),
+        os.path.join(current_dir, "csrc/hybrid_ep/backend/"),
+    ]
+    library_dirs = []
+    libraries = ["cuda", "nvtx3interop"]
+    extra_objects = []
+    runtime_library_dirs = []
+    nvcc_dlink = ['-dlink']
+    extra_link_args = ["-lcuda"]
+
+    if len(nvcc_dlink) > 0:
+        nvcc_dlink = nvcc_dlink + _get_cuda_arch_flags()
+        compile_args['nvcc_dlink'] = nvcc_dlink
+
+
+    # Add dependency for jit
+    compile_args["nvcc"].extend(['-rdc=true', '--ptxas-options=--register-usage-level=10'])
+    compile_args["nvcc"].append(f'-DSM_ARCH="{os.environ["PADDLE_CUDA_ARCH_LIST"]}"')
+    # Copy the hybrid backend code to python package for JIT compilation
+    shutil.copytree(
+        os.path.join(current_dir, "csrc/hybrid_ep/backend/"),
+        os.path.join(current_dir, "deep_ep/backend/"),
+        dirs_exist_ok=True
+    )
+    # Add inter-node dependency 
+    if enable_multinode:
+        sources.extend(["csrc/hybrid_ep/internode.cu"])
+        rdma_core_dir = os.getenv("RDMA_CORE_HOME", "")
+        nccl_dir = os.path.join(current_dir, "third-party/nccl")        
+        compile_args["nvcc"].append("-DHYBRID_EP_BUILD_MULTINODE_ENABLE")
+        compile_args["nvcc"].append(f"-DRDMA_CORE_HOME=\"{rdma_core_dir}\"")
+        extra_link_args.append(f"-l:libnvidia-ml.so.1")
+
+        subprocess.run(["git", "submodule", "update", "--init", "--recursive"], cwd=current_dir)
+        # Generate the inter-node dependency to the python package for JIT compilation
+        subprocess.run(["make", "-j", "src.build", f"NVCC_GENCODE={to_nvcc_gencode(os.environ['PADDLE_CUDA_ARCH_LIST'])}"], cwd=nccl_dir, check=True)
+        # Add third-party dependency 
+        include_dirs.append(os.path.join(nccl_dir, "src/transport/net_ib/gdaki/doca-gpunetio/include"))
+        include_dirs.append(os.path.join(rdma_core_dir, "include"))
+        library_dirs.append(os.path.join(rdma_core_dir, "lib"))
+        runtime_library_dirs.append(os.path.join(rdma_core_dir, "lib"))
+        libraries.append("mlx5")
+        libraries.append("ibverbs")
+        # Copy the inter-node dependency to python package
+        shutil.copytree(
+            os.path.join(nccl_dir, "src/transport/net_ib/gdaki/doca-gpunetio/include"),
+            os.path.join(current_dir, "deep_ep/backend/nccl/include"),
+            dirs_exist_ok=True
+        )
+        shutil.copytree(
+            os.path.join(nccl_dir, "build/obj/transport/net_ib/gdaki/doca-gpunetio"),
+            os.path.join(current_dir, "deep_ep/backend/nccl/obj"),
+            dirs_exist_ok=True
+        )
+        # Set the extra objects
+        DOCA_OBJ_PATH = os.path.join(current_dir, "deep_ep/backend/nccl/obj")
+        extra_objects = [
+            os.path.join(DOCA_OBJ_PATH, "doca_gpunetio.o"),
+            os.path.join(DOCA_OBJ_PATH, "doca_gpunetio_high_level.o"),
+            os.path.join(DOCA_OBJ_PATH, "doca_verbs_cuda_wrapper.o"),
+            os.path.join(DOCA_OBJ_PATH, "doca_verbs_device_attr.o"),
+            os.path.join(DOCA_OBJ_PATH, "doca_verbs_ibv_wrapper.o"),
+            os.path.join(DOCA_OBJ_PATH, "doca_verbs_mlx5dv_wrapper.o"),
+            os.path.join(DOCA_OBJ_PATH, "doca_verbs_qp.o"),
+            os.path.join(DOCA_OBJ_PATH, "doca_verbs_cq.o"),
+            os.path.join(DOCA_OBJ_PATH, "doca_verbs_srq.o"),
+            os.path.join(DOCA_OBJ_PATH, "doca_verbs_uar.o"),
+            os.path.join(DOCA_OBJ_PATH, "doca_verbs_umem.o"),
+            os.path.join(DOCA_OBJ_PATH, "doca_gpunetio_gdrcopy.o"),
+            os.path.join(DOCA_OBJ_PATH, "doca_gpunetio_log.o"),
+        ]
+
+
+    print(f'Build summary:')
+    print(f' > Sources: {sources}')
+    print(f' > Includes: {include_dirs}')
+    print(f' > Libraries: {libraries}')
+    print(f' > Library dirs: {library_dirs}')
+    print(f' > Extra link args: {extra_link_args}')
+    print(f' > Compilation flags: {compile_args}')
+    print(f' > Extra objects: {extra_objects}')
+    print(f' > Runtime library dirs: {runtime_library_dirs}')
+    print(f' > Arch list: {os.environ["PADDLE_CUDA_ARCH_LIST"]}')
+    print()
+
+    add_compile_flag(compile_args, ['-DPADDLE_WITH_CUDA'])
+    add_compile_flag(compile_args, ['-DWITH_DISTRIBUTE'])
+    add_compile_flag(compile_args, ['-DWITH_NVSHMEM'])
+    add_compile_flag(compile_args, ['-DWITH_GPU'])
+    add_compile_flag(compile_args, ['-DWITH_FLUID_ONLY'])
+
+    extension_hybrid_ep_cpp = CUDAExtension(
+        name="hybrid_ep_cpp",
+        sources=sources,
+        include_dirs=include_dirs,
+        library_dirs=library_dirs,
+        libraries=libraries,
+        extra_compile_args=compile_args,
+        extra_objects=extra_objects,
+        runtime_library_dirs=runtime_library_dirs,
+        extra_link_args=extra_link_args,
+    )
+
+    return extension_hybrid_ep_cpp
+
+if __name__ == '__main__':
+    # noinspection PyBroadException
+    try:
+        cmd = ['git', 'rev-parse', '--short', 'HEAD']
+        revision = '+' + subprocess.check_output(cmd).decode('ascii').rstrip()
+    except Exception as _:
+        revision = ''
+
+    setuptools.setup(
+        name='deep_ep',
+        version='1.2.1' + revision,
+        packages=setuptools.find_packages(
+            include=['deep_ep']
+        ),
+        install_requires=[
+            'pynvml',
+        ],
+        ext_modules=[
+            get_extension_hybrid_ep_cpp(),
+        ],
+        cmdclass={
+            'build_ext': BuildExtension
+        },
+        package_data={
+            'deep_ep': collect_package_files('deep_ep', 'backend'),
+        },
+        include_package_data=True
+    )
diff --git a/test.sh b/test.sh
new file mode 100644
index 00000000..b260177d
--- /dev/null
+++ b/test.sh
@@ -0,0 +1,8 @@
+unset PADDLE_TRAINER_ENDPOINTS
+unset DISTRIBUTED_TRAINER_ENDPOINTS
+rm -rf log
+export CUDA_VISIBLE_DEVICES="0,1,2,3"
+
+export FLAGS_use_system_allocator=1
+python -m paddle.distributed.launch \
+            tests/test_hybrid_ep.py
\ No newline at end of file
diff --git a/tests/test_hybrid_ep.py b/tests/test_hybrid_ep.py
index 6ad5065f..127e25f5 100644
--- a/tests/test_hybrid_ep.py
+++ b/tests/test_hybrid_ep.py
@@ -1,29 +1,40 @@
 # SPDX-License-Identifier: MIT
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved
+import paddle
+paddle.enable_compat()
+
 import argparse
 import time
 import torch
 import torch.distributed as dist
 import os
 import deep_ep
+import logging
 
 from utils import TorchRef, bench, bench_kineto, init_dist, count_rdma_send_from_routing_map
 
+import contextlib
+from paddle.distributed import fleet
+from paddle.distributed.communication.group import Group
+
 HIDDEN_DIM = int(os.environ.get("HIDDEN_DIM", 7168))
 MAX_NUM_OF_TOKENS_PER_RANK = int(os.environ.get("MAX_NUM_OF_TOKENS_PER_RANK", 4096))
 # NUM_TOKENS_PER_RANK should equal or less than MAX_NUM_OF_TOKENS_PER_RANK
 NUM_TOKENS_PER_RANK = int(os.environ.get("NUM_TOKENS_PER_RANK", 4096))
-NUM_LOCAL_EXPERTS = int(os.environ.get("NUM_LOCAL_EXPERTS", 8))
+NUM_LOCAL_EXPERTS = int(os.environ.get("NUM_LOCAL_EXPERTS", 32))
+NUM_OF_RANKS_PER_NODE = int(os.environ.get("NUM_OF_RANKS_PER_NODE", 8))
+NUM_OF_NODES = int(os.environ.get("NUM_OF_NODES", 1))
 TOPK = int(os.environ.get("TOPK", 8))
-PAD_MULTIPLE = int(os.environ.get("PAD_MULTIPLE", 32))
+PAD_MULTIPLE = int(os.environ.get("PAD_MULTIPLE", 128))
+NUM_OF_EXPERTS = NUM_LOCAL_EXPERTS * NUM_OF_RANKS_PER_NODE * NUM_OF_NODES
 ITERATIONS = int(os.environ.get("ITERATIONS", 100))
 SEED = int(os.environ.get("SEED", 42))
 USE_MNNVL = os.environ.get("USE_MNNVL", "0").strip().lower() in {"1", "true", "t", "yes", "y", "on"}
 torch.manual_seed(SEED)
 torch.cuda.manual_seed(SEED)
 torch.cuda.manual_seed_all(SEED)
-torch.backends.cudnn.deterministic = True
-torch.backends.cudnn.benchmark = False
+# torch.backends.cudnn.deterministic = True
+# torch.backends.cudnn.benchmark = False
 # Will be set after the process group is initialized
 NUM_OF_RANKS_PER_NODE = None
 NUM_OF_NODES = None
@@ -43,7 +54,13 @@ def bitwise_equal(a: torch.Tensor, b: torch.Tensor) -> bool:
         return False
     a_bytes = a.contiguous().view(torch.uint8)
     b_bytes = b.contiguous().view(torch.uint8)
-    return torch.equal(a_bytes, b_bytes)
+    # Use (a == b).all() pattern which works better with Paddle compat layer
+    equal_result = (a_bytes == b_bytes)
+    all_equal = equal_result.all()
+    # Extract Python bool from the result
+    if hasattr(all_equal, 'numpy'):
+        return bool(all_equal.numpy())
+    return bool(all_equal)
 
 def init_tensor(
     hidden_dim: int,
@@ -57,28 +74,27 @@ def init_tensor(
             low=0,
             high=256,
             size=(seq_len, hidden_dim),
-            device="cuda",
-            dtype=torch.uint8,
-        )
+            dtype=torch.int32,
+        ).cuda().cast(torch.uint8)
     else:
-        hidden = torch.randn(seq_len, hidden_dim, device="cuda", dtype=torch.bfloat16)
-    probs = torch.zeros(seq_len, num_of_experts, device="cuda", dtype=torch.float32)
-    topk_idx = torch.zeros(seq_len, topk, device="cuda", dtype=torch.int64)
-    topk_weights = torch.zeros(seq_len, topk, device="cuda", dtype=torch.float32)
+        hidden = torch.randn(seq_len, hidden_dim, dtype=torch.bfloat16).cuda()
+    probs = torch.zeros(seq_len, num_of_experts, dtype=torch.float32).cuda()
+    topk_idx = torch.zeros(seq_len, topk, dtype=torch.int64).cuda()
+    topk_weights = torch.zeros(seq_len, topk, dtype=torch.float32).cuda()
     scaling_factor = torch.randn(
-        seq_len, hidden_dim // 128, device="cuda", dtype=torch.float32
-    )
+        seq_len, hidden_dim // 128, dtype=torch.float32
+    ).cuda()
 
-    routing_map = torch.zeros(seq_len, num_of_experts, device="cuda", dtype=torch.bool)
+    routing_map = torch.zeros(seq_len, num_of_experts, dtype=torch.bool).cuda()
 
     for i in range(seq_len):
         # Force balanced routing for testing
         # selected_experts = torch.tensor([
         #     ((i * topk) % num_of_experts + val) % num_of_experts for val in range(topk)
-        # ], device="cuda")
-        selected_experts = torch.randperm(num_of_experts, device="cuda")[:topk]
+        # ])
+        selected_experts = torch.randperm(num_of_experts).cuda()[:topk]
         topk_idx[i, :] = selected_experts.to(torch.int64)
-        topk_weights[i, :] = torch.ones(topk, device="cuda", dtype=torch.float32)
+        topk_weights[i, :] = torch.ones(topk, dtype=torch.float32).cuda()
         routing_map[i, selected_experts] = True
         probs[i, selected_experts] = topk_weights[i, :]
 
@@ -93,9 +109,15 @@ def test_hybrid_ep_correctness(buffer: deep_ep.HybridEPBuffer, ref: TorchRef, us
         num_of_experts=NUM_OF_EXPERTS,
         use_fp8=use_fp8,
     )
+    print(f"hidden: {hidden}, shape: {hidden.shape}")
+    print(f"probs: {probs}, shape: {probs.shape}")
+    print(f"scaling_factor: {scaling_factor}, shape: {scaling_factor.shape}")
+    print(f"routing_map: {routing_map}, shape: {routing_map.shape}")
+    print(f"topk_idx: {topk_idx}, shape: {topk_idx.shape}")
+    print(f"topk_weights: {topk_weights}, shape: {topk_weights.shape}")
 
     # Dispatch correctness check
-    for with_probs in [True, False]:
+    for with_probs in [True]:
         # The check for the dispatch
         dispatched_hidden_ref, dispatched_probs_ref, dispatched_scaling_factor_ref = (
             ref.dispatch(
@@ -108,8 +130,15 @@ def test_hybrid_ep_correctness(buffer: deep_ep.HybridEPBuffer, ref: TorchRef, us
             dispatched_scaling_factor,
             handle,
         ) = buffer.dispatch(
-            hidden=hidden, scaling_factor=scaling_factor, topk_idx=topk_idx, topk_weights=topk_weights if with_probs else None, num_of_experts=NUM_OF_EXPERTS,
+            hidden=hidden, 
+            scaling_factor=scaling_factor, 
+            topk_idx=topk_idx, 
+            topk_weights=topk_weights if with_probs else None, 
+            num_of_experts=NUM_OF_EXPERTS,
         )
+        print(f"dispatched_hidden: {dispatched_hidden}, shape: {dispatched_hidden.shape}")
+        print(f"dispatched_probs: {dispatched_probs}, shape: {dispatched_probs.shape}")
+        print(f"dispatched_scaling_factor: {dispatched_scaling_factor}, shape: {dispatched_scaling_factor.shape}")
 
         assert bitwise_equal(dispatched_hidden_ref, dispatched_hidden)
         if dispatched_probs is not None and dispatched_probs_ref is not None:
@@ -127,106 +156,125 @@ def test_hybrid_ep_correctness(buffer: deep_ep.HybridEPBuffer, ref: TorchRef, us
             )
 
         _, _, _, num_dispatched_tokens, local_expert_routing_map, _, _ = handle
+
+        print(f"num_dispatched_tokens: {num_dispatched_tokens}, shape: {num_dispatched_tokens.shape}")
+        print(f"local_expert_routing_map: {local_expert_routing_map}, shape: {local_expert_routing_map.shape}")
+        
         num_dispatched_tokens = num_dispatched_tokens.cpu()
         local_expert_routing_map = local_expert_routing_map[
             : num_dispatched_tokens.item()
         ]
         # Simulate the permute and expert and unpermute. The expert is identity op
-        copy_times = local_expert_routing_map.sum(dim=1)
+        copy_times = local_expert_routing_map.sum(dim=1).to(torch.bfloat16) 
         dispatched_hidden = dispatched_hidden.to(torch.bfloat16)  
+        print(f"copy_times: {copy_times}, shape: {copy_times.shape}")
+        print(f"dispatched_hidden: {dispatched_hidden}, shape: {dispatched_hidden.shape}")
         # The combine only support bf16
         hidden_to_combine = dispatched_hidden * copy_times.unsqueeze(1)
         probs_to_combine = dispatched_probs
+        print(f"hidden_to_combine: {hidden_to_combine}, shape: {hidden_to_combine.shape}")
+        print(f"dispatched_hidden: {probs_to_combine}, shape: {probs_to_combine.shape}")
 
         # The check for the combine
         combined_hidden, combined_probs = buffer.combine(
             hidden_to_combine, probs_to_combine, handle
         )
+        print(f"combined_hidden: {combined_hidden}, shape: {combined_hidden.shape}")
+        print(f"combined_probs: {combined_probs}, shape: {combined_probs.shape}")
 
         # The reconstucted value should be TOPK times larger than the input hidden
         combined_hidden = combined_hidden / TOPK
+        print(f"combined_hidden new: {combined_hidden}, shape new: {combined_hidden.shape}")
 
-        assert torch.allclose(combined_hidden, hidden.to(torch.bfloat16), atol=2e-5, rtol=1e-2)
+        assert torch.allclose(combined_hidden.to(torch.float32), hidden.to(torch.float32), atol=2e-5, rtol=1e-2)
         if combined_probs is not None and probs is not None:
             assert bitwise_equal(combined_probs, probs)
 
-    # Dispatch with permute correctness check
-    for with_probs in [True, False]:
-        # The check for the dispatch
-        (
-            dispatched_hidden,
-            dispatched_probs,
-            dispatched_scaling_factor,
-            tokens_per_expert,
-            handle,
-        ) = buffer.dispatch_with_permute(
-            hidden=hidden,
-            routing_map=routing_map,
-            probs=probs if with_probs else None,
-            scaling_factor=scaling_factor,
-            pad_multiple=PAD_MULTIPLE,
-        )
-        _, _, _, num_dispatched_tokens_tensor, local_expert_routing_map, _, _, _, _ = (
-            handle
-        )
-        num_dispatched_tokens_tensor = num_dispatched_tokens_tensor.cpu()
-        local_expert_routing_map = local_expert_routing_map[
-            : num_dispatched_tokens_tensor.item()
-        ]
-        # The out_token_num of permutation is the sum of the tokens_per_expert
-        out_token_num = tokens_per_expert.sum().item()
-        (
-            dispatched_hidden_ref,
-            dispatched_probs_ref,
-            dispatched_scaling_factor_ref,
-        ) = ref.dispatch(
-            hidden,
-            routing_map,
-            probs if with_probs else None,
-            scaling_factor,
-            local_expert_routing_map=local_expert_routing_map,
-            out_token_num=out_token_num,
-            pad_multiple=PAD_MULTIPLE,
-            enable_permute=True,
-        )
-
-        assert bitwise_equal(dispatched_hidden_ref, dispatched_hidden)
-        if dispatched_probs is not None and dispatched_probs_ref is not None:
-            assert bitwise_equal(dispatched_probs_ref, dispatched_probs)
-        if (
-            dispatched_scaling_factor is not None
-            and dispatched_scaling_factor_ref is not None
-        ):
-            assert bitwise_equal(
-                dispatched_scaling_factor_ref, dispatched_scaling_factor
-            )
-
-        # The combine only support bf16
-        dispatched_hidden = dispatched_hidden.to(torch.bfloat16)  
-        hidden_to_combine = dispatched_hidden
-        probs_to_combine = dispatched_probs
+    # # Dispatch with permute correctness check
+    # for with_probs in [True]:
+    #     # The check for the dispatch
+    #     (
+    #         dispatched_hidden,
+    #         dispatched_probs,
+    #         dispatched_scaling_factor,
+    #         tokens_per_expert,
+    #         handle,
+    #     ) = buffer.dispatch_with_permute(
+    #         hidden=hidden,
+    #         routing_map=routing_map,
+    #         probs=probs if with_probs else None,
+    #         scaling_factor=scaling_factor,
+    #         pad_multiple=PAD_MULTIPLE,
+    #     )
+    #     if dist.get_rank() == 0:
+    #         print("dispatched_hidden: ", dispatched_hidden, dispatched_hidden.shape)
+    #         print("dispatched_probs: ", dispatched_probs, dispatched_probs.shape)
+    #         print("dispatched_scaling_factor: ", dispatched_scaling_factor, dispatched_scaling_factor.shape)
+    #         print("tokens_per_expert: ", tokens_per_expert, tokens_per_expert.shape)
+    #     _, _, _, num_dispatched_tokens_tensor, local_expert_routing_map, _, _, _ = (
+    #         handle
+    #     )
+    #     if dist.get_rank() == 0:
+    #         print("num_dispatched_tokens_tensor: ", num_dispatched_tokens_tensor, num_dispatched_tokens_tensor.shape)
+    #         print("local_expert_routing_map: ", local_expert_routing_map, local_expert_routing_map.shape)
+    #     num_dispatched_tokens_tensor = num_dispatched_tokens_tensor.cpu()
+    #     local_expert_routing_map = local_expert_routing_map[
+    #         : num_dispatched_tokens_tensor.item()
+    #     ]
+    #     # The out_token_num of permutation is the sum of the tokens_per_expert
+    #     out_token_num = tokens_per_expert.sum().item()
+    #     (
+    #         dispatched_hidden_ref,
+    #         dispatched_probs_ref,
+    #         dispatched_scaling_factor_ref,
+    #     ) = ref.dispatch(
+    #         hidden,
+    #         routing_map,
+    #         probs if with_probs else None,
+    #         scaling_factor,
+    #         local_expert_routing_map=local_expert_routing_map,
+    #         out_token_num=out_token_num,
+    #         pad_multiple=PAD_MULTIPLE,
+    #         enable_permute=True,
+    #     )
+
+    #     assert bitwise_equal(dispatched_hidden_ref, dispatched_hidden)
+    #     if dispatched_probs is not None and dispatched_probs_ref is not None:
+    #         assert bitwise_equal(dispatched_probs_ref, dispatched_probs)
+    #     if (
+    #         dispatched_scaling_factor is not None
+    #         and dispatched_scaling_factor_ref is not None
+    #     ):
+    #         assert bitwise_equal(
+    #             dispatched_scaling_factor_ref, dispatched_scaling_factor
+    #         )
+
+    #     # The combine only support bf16
+    #     dispatched_hidden = dispatched_hidden.to(torch.bfloat16)  
+    #     hidden_to_combine = dispatched_hidden
+    #     probs_to_combine = dispatched_probs
  
-        # The check for the combine
-        combined_hidden, combined_probs = buffer.combine_with_unpermute(
-            hidden=hidden_to_combine,
-            probs=probs_to_combine,
-            handle=handle,
-            pad_multiple=PAD_MULTIPLE,
-        )
+    #     # The check for the combine
+    #     combined_hidden, combined_probs = buffer.combine_with_unpermute(
+    #         hidden=hidden_to_combine,
+    #         probs=probs_to_combine,
+    #         handle=handle,
+    #         pad_multiple=PAD_MULTIPLE,
+    #     )
 
-        # The reconstucted value should be TOPK times larger than the input hidden
-        combined_hidden = combined_hidden / TOPK
+    #     # The reconstucted value should be TOPK times larger than the input hidden
+    #     combined_hidden = combined_hidden / TOPK
 
-        assert torch.allclose(
-            combined_hidden, hidden.to(torch.bfloat16), atol=2e-5, rtol=1e-2
-        )
-        if combined_probs is not None and probs is not None:
-            assert bitwise_equal(combined_probs, probs)
+    #     assert torch.allclose(
+    #         combined_hidden, hidden.to(torch.bfloat16), atol=2e-5, rtol=1e-2
+    #     )
+    #     if combined_probs is not None and probs is not None:
+    #         assert bitwise_equal(combined_probs, probs)
 
-    print_in_order(f'[rank {dist.get_rank()}] Correctness check passed ({"FP8" if hidden.dtype == torch.uint8 else "BF16"})')
+    # print_in_order(f'[rank {dist.get_rank()}] Correctness check passed ({"FP8" if hidden.dtype == torch.uint8 else "BF16"})')
 
 
-def test_hybrid_ep_benchmark(buffer: deep_ep.HybridEPBuffer, group: dist.ProcessGroup, use_fp8: bool, nsys_profile: bool):
+def test_hybrid_ep_benchmark(buffer: deep_ep.HybridEPBuffer, group: Group, use_fp8: bool, nsys_profile: bool):
     hidden, probs, scaling_factor, routing_map, topk_idx, topk_weights = init_tensor(
         hidden_dim=HIDDEN_DIM,
         seq_len=NUM_TOKENS_PER_RANK,
@@ -330,11 +378,10 @@ def test_func():
             print_in_order(f'[rank {rank}] HybridEP dispatch kernel(IB) ({"FP8" if hidden.dtype == torch.uint8 else "BF16"}): {rdma_send_bytes / 1e9 / dispatch_t:.2f} GB/s, avg_t={dispatch_t * 1e6:.2f} us | '
                   f'HybridEP combine kernel(IB): {combine_bf16_rdma_recv_bytes / 1e9 / combine_t:.2f} GB/s, avg_t={combine_t * 1e6:.2f} us')
     else:
-        if torch.distributed.get_rank() == 0:
-            torch.cuda.profiler.start()
-        with torch.cuda.nvtx.range(f"hybrid-ep dispatch ({"FP8" if hidden.dtype == torch.uint8 else "BF16"})"):
+        torch.cuda.profiler.start()
+        with torch.cuda.nvtx.range(f"hybrid-ep dispatch ({'FP8' if hidden.dtype == torch.uint8 else 'BF16'})"):
             if rank == 0:
-                print(f"profile hybrid-ep dispatch ({"FP8" if hidden.dtype == torch.uint8 else "BF16"})", flush=True)
+                print(f"profile hybrid-ep dispatch ({'FP8' if hidden.dtype == torch.uint8 else 'BF16'})", flush=True)
             dispatch_args = {'hidden': hidden, 'scaling_factor': scaling_factor, 'topk_idx': topk_idx, 'topk_weights': topk_weights, 'num_of_experts': NUM_OF_EXPERTS}
             bench(lambda: buffer.dispatch(**dispatch_args))
         with torch.cuda.nvtx.range("hybrid-ep combine"):
@@ -342,9 +389,9 @@ def test_func():
                 print(f"profile hybrid-ep combine", flush=True)
             combine_args = {'hidden': dispatched_hidden_bf16, 'probs': dispatched_probs, 'handle': handle}
             bench(lambda: buffer.combine(**combine_args))
-        with torch.cuda.nvtx.range(f"hybrid-ep dispatch+permute ({"FP8" if hidden.dtype == torch.uint8 else "BF16"})"):
+        with torch.cuda.nvtx.range(f"hybrid-ep dispatch+permute ({'FP8' if hidden.dtype == torch.uint8 else 'BF16'})"):
             if rank == 0:
-                print(f"profile hybrid-ep dispatch+permute ({"FP8" if hidden.dtype == torch.uint8 else "BF16"})", flush=True)
+                print(f"profile hybrid-ep dispatch+permute ({'FP8' if hidden.dtype == torch.uint8 else 'BF16'})", flush=True)
             dispatch_with_permute_args = {'hidden': hidden, 'scaling_factor': scaling_factor, 'routing_map': routing_map, 'probs': probs, 'pad_multiple': PAD_MULTIPLE}
             bench(lambda: buffer.dispatch_with_permute(**dispatch_with_permute_args))
         with torch.cuda.nvtx.range("hybrid-ep combine+unpermute"):
@@ -353,46 +400,66 @@ def test_func():
             combine_with_unpermute_args = {'hidden': dispatched_hidden_bf16_with_permute, 'probs': dispatched_probs_with_permute, 'handle': handle_with_permute, 'pad_multiple': PAD_MULTIPLE}
             bench(lambda: buffer.combine_with_unpermute(**combine_with_unpermute_args))
         time.sleep(1)
-        if torch.distributed.get_rank() == 0:
-            torch.cuda.profiler.stop()
+        torch.cuda.profiler.stop()
 
 
 def test_main(local_rank: int, num_local_ranks: int, args: argparse.Namespace):
     _, _, group = init_dist(local_rank, num_local_ranks)
+    print("group: ", group.id)
 
     # Set missing global vars
     global NUM_OF_RANKS_PER_NODE, NUM_OF_NODES, NUM_OF_EXPERTS
     if USE_MNNVL:
-        NUM_OF_RANKS_PER_NODE = group.size()
+        NUM_OF_RANKS_PER_NODE = group.world_size
         NUM_OF_NODES = 1
         NUM_OF_EXPERTS = NUM_LOCAL_EXPERTS * NUM_OF_RANKS_PER_NODE * NUM_OF_NODES
     else:
         NUM_OF_RANKS_PER_NODE = args.num_processes
-        NUM_OF_NODES = group.size() // NUM_OF_RANKS_PER_NODE
+        NUM_OF_NODES = group.world_size // NUM_OF_RANKS_PER_NODE
         NUM_OF_EXPERTS = NUM_LOCAL_EXPERTS * NUM_OF_RANKS_PER_NODE * NUM_OF_NODES
+        
+    for use_fp8 in [True]:
+        print("use_fp8: ", use_fp8)
+        buffer = deep_ep.HybridEPBuffer(
+            group=group,
+            hidden_dim=HIDDEN_DIM,
+            max_num_of_tokens_per_rank=MAX_NUM_OF_TOKENS_PER_RANK,
+            num_local_experts=NUM_LOCAL_EXPERTS,
+            num_of_hybrid_ep_ranks_per_nvlink_domain=NUM_OF_RANKS_PER_NODE,
+            use_mnnvl=USE_MNNVL,
+            use_fp8=use_fp8
+        )
+        print("buffer: ", buffer)
+        
+        ref = TorchRef(
+            ep_group=group,
+            num_of_experts=NUM_OF_EXPERTS,
+            num_of_ranks_per_node=NUM_OF_RANKS_PER_NODE,
+        )
 
-    stream = torch.cuda.Stream()
-    with torch.cuda.stream(stream):
-        for use_fp8 in [False, True]:
-            buffer = deep_ep.HybridEPBuffer(
-                group=group,
-                hidden_dim=HIDDEN_DIM,
-                max_num_of_tokens_per_rank=MAX_NUM_OF_TOKENS_PER_RANK,
-                num_local_experts=NUM_LOCAL_EXPERTS,
-                use_fp8=use_fp8
-            )
-            
-            ref = TorchRef(
-                ep_group=group,
-                num_of_experts=NUM_OF_EXPERTS,
-                num_of_ranks_per_node=NUM_OF_RANKS_PER_NODE,
-            )
-
-            test_hybrid_ep_correctness(buffer, ref, use_fp8)
-            test_hybrid_ep_benchmark(buffer, group, use_fp8, args.nsys_profile)
+        test_hybrid_ep_correctness(buffer, ref, use_fp8)
+        # test_hybrid_ep_benchmark(buffer, group, use_fp8, args.nsys_profile)
     dist.barrier()
     dist.destroy_process_group()
 
+def init_dist_env(world_size, seed=20):
+    context = contextlib.nullcontext()
+    with context:
+        # start to init distributed env
+        strategy = fleet.DistributedStrategy()
+
+        strategy.hybrid_configs = {
+            "dp_degree": 1,
+            "mp_degree": world_size,
+            "pp_degree": 1,
+            "sharding_degree": 1,
+        }
+
+        # Set control in tensor parallel
+        strategy.tensor_parallel_configs = {"tensor_init_seed": seed}
+
+        fleet.init(is_collective=True, strategy=strategy)
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description='Test intranode EP kernels')
     parser.add_argument('--num-processes', type=int, default=4,
@@ -400,4 +467,13 @@ def test_main(local_rank: int, num_local_ranks: int, args: argparse.Namespace):
     parser.add_argument('--nsys-profile', action='store_true', default=False,
                        help='benchmark with nsys profile or not (default: False)')
     args = parser.parse_args()
-    torch.multiprocessing.spawn(test_main, args=(args.num_processes, args), nprocs=args.num_processes)
+
+    if dist.get_world_size() > 1:
+        init_dist_env(dist.get_world_size())
+
+    rank = dist.get_rank()
+    num_ranks = dist.get_world_size()
+
+    test_main(rank, num_ranks, args)
+
+    # torch.multiprocessing.spawn(test_main, args=(args.num_processes, args), nprocs=args.num_processes)
diff --git a/tests/utils.py b/tests/utils.py
index d4f851af..b036d393 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -12,29 +12,17 @@
 from typing import Optional, Tuple, Union
 
 BLOCK_SIZE = 16
+
+import paddle
+
 def init_dist(local_rank: int, num_local_ranks: int):
-    # NOTES: you may rewrite this function with your own cluster settings
-    ip = os.getenv('MASTER_ADDR', '127.0.0.1')
-    port = int(os.getenv('MASTER_PORT', '8361'))
     num_nodes = int(os.getenv('WORLD_SIZE', 1))
-    node_rank = int(os.getenv('RANK', 0))
-
-    sig = inspect.signature(dist.init_process_group)
-    params = {
-        'backend': 'nccl',
-        'init_method': f'tcp://{ip}:{port}',
-        'world_size': num_nodes * num_local_ranks,
-        'rank': node_rank * num_local_ranks + local_rank,
-    }
-    if 'device_id' in sig.parameters:
-        # noinspection PyTypeChecker
-        params['device_id'] = torch.device(f'cuda:{local_rank}')
-    dist.init_process_group(**params)
+    group = paddle.distributed.new_group(list(range(num_local_ranks * num_nodes)))
     torch.set_default_dtype(torch.bfloat16)
     torch.set_default_device('cuda')
     torch.cuda.set_device(local_rank)
 
-    return dist.get_rank(), dist.get_world_size(), dist.new_group(list(range(num_local_ranks * num_nodes)))
+    return dist.get_rank(), dist.get_world_size(), group
 
 
 def calc_diff(x: torch.Tensor, y: torch.Tensor):
@@ -65,7 +53,7 @@ def cast_fp8_to_bf16(x_fp8: torch.Tensor, x_scales: torch.Tensor):
     assert x_fp8.dim() == 2
     m, n = x_fp8.shape
     aligned_n = align_up(n, 128)
-    x_fp8_padded = torch.nn.functional.pad(x_fp8, (0, aligned_n - n), mode='constant', value=0)
+    x_fp8_padded = torch.nn.functional.pad(x_fp8.to(torch.float32), (0, aligned_n - n), mode='constant', value=0)
     if x_scales.dtype == torch.int:
         x_scales = x_scales.view(dtype=torch.uint8).to(torch.int) << 23
         x_scales = x_scales.view(dtype=torch.float)
@@ -263,29 +251,40 @@ def inplace_unique(x: torch.Tensor, num_slots: int):
     assert x.dim() == 2
     mask = x < 0
     x_padded = x.masked_fill(mask, num_slots)
-    bin_count = torch.zeros((x.size(0), num_slots + 1), dtype=x.dtype, device=x.device)
-    bin_count.scatter_add_(1, x_padded, torch.ones_like(x_padded))
+    bin_count = paddle.zeros([x.shape[0], num_slots + 1], dtype=x.dtype).to(
+        x.place
+    )
+    # bin_count.scatter_add_(1, x_padded, paddle.ones_like(x_padded))
+    bin_count.put_along_axis_(
+        axis=1,
+        indices=x_padded,
+        values=paddle.ones_like(x_padded),
+        reduce='add',
+        include_self=True,
+    )
+
     bin_count = bin_count[:, :num_slots]
-    sorted_bin_count, sorted_bin_idx = torch.sort(bin_count, dim=-1, descending=True)
+    sorted_bin_count = paddle.sort(bin_count, axis=-1, descending=True)
+    sorted_bin_idx = paddle.argsort(bin_count, axis=-1, descending=True)
     sorted_bin_idx.masked_fill_(sorted_bin_count == 0, -1)
-    sorted_bin_idx = torch.sort(sorted_bin_idx, descending=True, dim=-1).values
+    sorted_bin_idx = paddle.sort(sorted_bin_idx, descending=True, axis=-1)
     x[:, :].fill_(-1)
-    valid_len = min(num_slots, x.size(1))
+    valid_len = min(num_slots, x.shape[1])
     x[:, :valid_len] = sorted_bin_idx[:, :valid_len]
 
 
 def create_grouped_scores(scores: torch.Tensor, group_idx: torch.Tensor, num_groups: int):
     num_tokens, num_experts = scores.shape
     scores = scores.view(num_tokens, num_groups, -1)
-    mask = torch.zeros((num_tokens, num_groups), dtype=torch.bool, device=scores.device)
-    mask = mask.scatter_(1, group_idx, True).unsqueeze(-1).expand_as(scores)
+    mask = torch.zeros((num_tokens, num_groups), dtype=torch.int32, device=scores.device)
+    mask = mask.scatter_(1, group_idx, 1).unsqueeze(-1).expand_as(scores).to(torch.float32)
     return (scores * mask).view(num_tokens, num_experts)
 
 
 def bench(fn, num_warmups: int = 50, num_tests: int = 50, post_fn=None):
     # Flush L2 cache with 256 MB data
     torch.cuda.synchronize()
-    cache = torch.empty(int(256e6 // 4), dtype=torch.int, device='cuda')
+    cache = torch.empty(int(256e6 // 4), dtype=torch.int).cuda()
 
     # Warmup
     for _ in range(num_warmups):
@@ -364,10 +363,10 @@ def bench_kineto(fn, kernel_names: Union[str, tuple], num_tests: int = 30, suppr
             for i in range(2):
                 # NOTES: use a large kernel and a barrier to eliminate the unbalanced CPU launch overhead
                 if barrier_comm_profiling:
-                    lhs = torch.randn((8192, 8192), dtype=torch.float, device='cuda')
-                    rhs = torch.randn((8192, 8192), dtype=torch.float, device='cuda')
+                    lhs = torch.randn((8192, 8192), dtype=torch.float).cuda()
+                    rhs = torch.randn((8192, 8192), dtype=torch.float).cuda()
                     lhs @ rhs
-                    dist.all_reduce(torch.ones(1, dtype=torch.float, device='cuda'))
+                    dist.all_reduce(torch.ones(1, dtype=torch.float).cuda())
                 for _ in range(num_tests):
                     fn()
                 torch.cuda.synchronize()
@@ -458,7 +457,7 @@ def permute(
 class TorchRef:
     def __init__(
         self,
-        ep_group: torch.distributed.ProcessGroup,
+        ep_group,
         num_of_experts: int,
         num_of_ranks_per_node: int,
     ):
@@ -527,7 +526,7 @@ def dispatch(
             device=hidden.device,
             dtype=torch.bool,
         )
-        torch.distributed.all_gather_into_tensor(
+        torch.distributed.all_gather(
             global_routing_map, routing_map, self.ep_group
         )
 
@@ -538,7 +537,7 @@ def dispatch(
             device=hidden.device,
             dtype=hidden.dtype,
         )
-        torch.distributed.all_gather_into_tensor(global_hidden, hidden, self.ep_group)
+        torch.distributed.all_gather(global_hidden, hidden, self.ep_group)
 
         # dispatch the probs tensor
         if probs is not None:
@@ -548,7 +547,7 @@ def dispatch(
                 device=probs.device,
                 dtype=probs.dtype,
             )
-            torch.distributed.all_gather_into_tensor(global_probs, probs, self.ep_group)
+            torch.distributed.all_gather(global_probs, probs, self.ep_group)
         else:
             global_probs = None
 
@@ -560,7 +559,7 @@ def dispatch(
                 device=scaling_factor.device,
                 dtype=scaling_factor.dtype,
             )
-            torch.distributed.all_gather_into_tensor(
+            torch.distributed.all_gather(
                 global_scaling_factor, scaling_factor, self.ep_group
             )
         else: