ROCm · matthiasdiener · Dec 6, 2025 · Oct 30, 2025 · Jan 27, 2026 · Feb 24, 2026
@@ -65,7 +65,11 @@ std::vector<InputType> create_transpose(const InputType* const input, const size
 
 // Compute the global encode scale factor for a given global amax
 float compute_global_encode_scaling_factor_FP4(const float global_amax) {
+#ifdef __HIP_PLATFORM_AMD__
+  const float fp8_max = Numeric_Traits<fp8e4m3>::maxNorm;
+#else
   constexpr float fp8_max = 448.0f;     // 448.0f;
+#endif
   constexpr float fp4_max = 6.0f;       // 6.0f;
   float global_encode_scale = fp8_max * fp4_max / global_amax;
   // If scale is infinity, return max value of float32

@@ -108,7 +108,12 @@ void compute_ref(const fp4e2m1* input,
                  const size_t rows,
                  const size_t cols,
                  const size_t scale_stride) {
+#ifdef __HIP_PLATFORM_AMD__
+    const float fp8_max = Numeric_Traits<fp8e4m3>::maxNorm;
+    const float factor_inv = 1.0f / (6.0f * fp8_max);
+#else
     constexpr float factor_inv = 1.0f / (6.0f * 448.0f);
+#endif
 
     const size_t blocks_per_row = cols / kFP4BlockSize1D;
 

@@ -1,3 +1,5 @@
+# This file was modified for portability to AMDGPU
+# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved
 # Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # See LICENSE for license information.
@@ -10,7 +12,10 @@
 from transformer_engine.pytorch import NVFP4Quantizer
 from transformer_engine.pytorch.custom_recipes.quantization_nvfp4 import NVFP4QuantizerRef
 from transformer_engine.pytorch.custom_recipes import utils
+from torch.utils.cpp_extension import IS_HIP_EXTENSION
 
+if IS_HIP_EXTENSION:
+    from transformer_engine.pytorch.utils import get_torch_float8_e4m3_type
 
 recipe_available, reason_for_no_recipe = te.is_nvfp4_available(return_reason=True)
 
@@ -108,8 +113,13 @@ def check_nvfp4_gemm_versus_reference(
 
     # Native scales are stored as uint8 but need to be interpreted as float8_e4m3fn
     # for the reference GEMM to work correctly
-    sx_trimmed = sx_trimmed.view(torch.float8_e4m3fn)
-    sw_trimmed = sw_trimmed.view(torch.float8_e4m3fn)
+    if IS_HIP_EXTENSION:
+        fp8_dtype = get_torch_float8_e4m3_type()
+        sx_trimmed = sx_trimmed.view(fp8_dtype)
+        sw_trimmed = sw_trimmed.view(fp8_dtype)
+    else:
+        sx_trimmed = sx_trimmed.view(torch.float8_e4m3fn)
+        sw_trimmed = sw_trimmed.view(torch.float8_e4m3fn)
 
     # Create reference quantizer for reference GEMM
     ref_quantizer = NVFP4QuantizerRef(
@@ -150,7 +160,14 @@ def check_nvfp4_gemm_versus_reference(
 
     # Native TE GEMM using tex.generic_gemm (cuBLAS GEMM)
     # Allocate cuBLAS workspace
-    workspace = torch.empty(4, dtype=torch.uint8, device=device)
+    if IS_HIP_EXTENSION:
+        # On ROCm, FP4 is dequantized to BF16 in workspace before GEMM, so allocate enough space.
+        from transformer_engine.pytorch.cpp_extensions.gemm import get_cublas_workspace_size_bytes
+        bf16_size = torch.bfloat16.itemsize
+        ws_bytes = M * K * bf16_size + K * N * bf16_size + get_cublas_workspace_size_bytes()
+        workspace = torch.empty(ws_bytes, dtype=torch.uint8, device=device)
+    else:
+        workspace = torch.empty(4, dtype=torch.uint8, device=device)
 
     transa = True if not w_columnwise else False
     transb = False if not x_columnwise else True

@@ -1,3 +1,5 @@
+# This file was modified for portability to AMDGPU
+# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved
 # Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # See LICENSE for license information.
@@ -11,6 +13,9 @@
 import transformer_engine_torch as tex
 
 from transformer_engine.pytorch import NVFP4Quantizer
+from torch.utils.cpp_extension import IS_HIP_EXTENSION
+if IS_HIP_EXTENSION:
+    from transformer_engine.pytorch.utils import get_torch_float8_e4m3_type, is_fp8_fnuz
 
 recipe_available, reason_for_no_recipe = te.is_nvfp4_available(return_reason=True)
 
@@ -58,10 +63,18 @@ def fp4_to_fp32(fp4: torch.Tensor) -> torch.Tensor:
 
 
 def dequantize_fp4(qx: torch.Tensor, sx: torch.Tensor, amax: torch.Tensor) -> torch.Tensor:
-    sf = sx.repeat_interleave(16, dim=1).view(torch.float8_e4m3fn).to(torch.float32)
+    if IS_HIP_EXTENSION:
+        fp8_dtype = get_torch_float8_e4m3_type()
+        fp8_max = 240.0 if is_fp8_fnuz() else 448.0  # e4m3 max: 240 (FNUZ) vs 448 (e4m3fn)
+        sf = sx.repeat_interleave(16, dim=1).view(fp8_dtype).to(torch.float32)
+    else:
+        sf = sx.repeat_interleave(16, dim=1).view(torch.float8_e4m3fn).to(torch.float32)
     dqx = fp4_to_fp32(unpack_fp4(qx))
     sf = sf[: dqx.shape[0], : dqx.shape[1]]
-    dequant = dqx * sf * (amax / (6.0 * 448))
+    if IS_HIP_EXTENSION:
+        dequant = dqx * sf * (amax / (6.0 * fp8_max))  # fp8_max set in the HIP branch above
+    else:
+        dequant = dqx * sf * (amax / (6.0 * 448))
     return dequant
 
 

@@ -1773,7 +1773,10 @@ def test_clamped_swiglu(
         quantized_compute = quantization is not None
         if not quantized_compute and (quantize_forward or quantize_backward):
             pytest.skip("Quantization scheme has not been provided")
-        maybe_skip_quantization(quantization, dims=in_shape, device=device)
+        if IS_HIP_EXTENSION:
+            maybe_skip_quantization(quantization, dims=in_shape, device=device, dtype=dtype)
+        else:
+            maybe_skip_quantization(quantization, dims=in_shape, device=device)
 
         # Random data
         x_ref, x_test = make_reference_and_test_tensors(
@@ -2937,6 +2940,8 @@ def to_cpu(tensor: Optional[torch.Tensor]) -> Optional[torch.Tensor]:
 
         # Check values
         tols = {"rtol": 0.25, "atol": 0.5}  # Loose tols for sanity checking
+        if IS_HIP_EXTENSION:
+            tols["atol"] = 0.54
         torch.testing.assert_close(to_cpu(y_test), y_ref, **tols)
         torch.testing.assert_close(to_cpu(x_test.grad), x_ref.grad, **tols)
         torch.testing.assert_close(to_cpu(norm.weight.grad), norm_w_ref.grad, **tols)

@@ -778,7 +778,6 @@ def test_gpt_full_activation_recompute(
         if (dtype == torch.bfloat16
             and not fp8
             and not use_reentrant
-            and recipe.float8_per_tensor_scaling()
             ):
             pytest.skip("hipBLASLt does not provide suitable algorithms on GFX950 for this config.")
     if fp8 and recipe.nvfp4():

@@ -214,6 +214,7 @@ list(APPEND transformer_engine_cuda_sources
      recipe/current_scaling.cu
      recipe/delayed_scaling.cu
      recipe/fp8_block_scaling.cu
+     recipe/nvfp4.cu
      swizzle/swizzle.cu)
 
 list(APPEND transformer_engine_cuda_arch_specific_sources

@@ -66,7 +66,11 @@ __global__ void __launch_bounds__(512)
 #else
   float amax = (tensor_amax != nullptr) ? *tensor_amax : 1.0f;
 #endif
+#if defined(__HIP_DEVICE_COMPILE__)
+  constexpr float factor_inv = 1.0f / (detail::TypeExtrema<fp4e2m1>::max * detail::TypeExtrema<fp8e4m3>::max);
+#else
   constexpr float factor_inv = 1.0 / (6.0 * 448.0);
+#endif
   float final_scale = static_cast<float>(scale) * amax * factor_inv;
 #pragma unroll
   for (int i = 0; i < 4; i++) {