Skip to content

Commit 3ad2db2

Browse files
committed
Merge remote-tracking branch 'upstream/develop' into pylayer-tensor-wrapper-fix
2 parents 8bbcf5d + 2140638 commit 3ad2db2

21 files changed

Lines changed: 786 additions & 177 deletions

cmake/cupti.cmake

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,18 @@ endif()
55
include(${PROJECT_SOURCE_DIR}/cmake/architecture.cmake)
66

77
if(WITH_ROCM)
8+
if(EXISTS "${ROCM_PATH}/cuda/extras/CUPTI")
9+
set(ROCM_CUDA_DIR "${ROCM_PATH}/cuda")
10+
elseif(EXISTS "${ROCM_PATH}/cuda/cuda/extras/CUPTI")
11+
set(ROCM_CUDA_DIR "${ROCM_PATH}/cuda/cuda")
12+
else()
13+
message(
14+
FATAL_ERROR
15+
"CUPTI not found under ${ROCM_PATH}/cuda/extras/CUPTI or ${ROCM_PATH}/cuda/cuda/extras/CUPTI"
16+
)
17+
endif()
818
set(CUPTI_ROOT
9-
"${ROCM_PATH}/cuda/extras/CUPTI"
19+
"${ROCM_CUDA_DIR}/extras/CUPTI"
1020
CACHE PATH "CUPTI ROOT")
1121
else()
1222
set(CUPTI_ROOT
@@ -59,7 +69,7 @@ get_filename_component(CUPTI_LIBRARY_PATH ${CUPTI_LIBRARY} DIRECTORY)
5969
if(CUPTI_INCLUDE_DIR AND CUPTI_LIBRARY)
6070
set(CUPTI_FOUND ON)
6171
if(WITH_ROCM)
62-
include_directories(${ROCM_PATH}/cuda/include)
72+
include_directories(${ROCM_CUDA_DIR}/include)
6373
add_definitions(-D__CUDA_HIP_PLATFORM_AMD__)
6474
endif()
6575
else()

paddle/common/flags.cc

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,36 @@ PHI_DEFINE_EXPORTED_bool(
230230
"operator. The autotuning algorithm may be non-deterministic. If "
231231
"true, the algorithm is deterministic.");
232232

233+
/**
234+
* GPU RNG related FLAG
235+
* Name: FLAGS_deterministic_rng
236+
* Since Version: 3.4
237+
* Value Range: bool, default=false
238+
* Example: paddle.set_flags({'FLAGS_deterministic_rng': True})
239+
* Note: Fixes the RNG kernel launch configuration so that the same seed
240+
* produces identical results across GPU types.
241+
*/
242+
PHI_DEFINE_EXPORTED_bool(
243+
deterministic_rng,
244+
false,
245+
"Enable cross-device RNG consistency by fixing GPU kernel launch "
246+
"configuration. When true, RNG kernels use a fixed grid/block size "
247+
"so that the same seed produces identical results across GPU types.");
248+
249+
/**
250+
* GPU RNG related FLAG
251+
* Name: FLAGS_deterministic_rng_grid
252+
* Since Version: 3.4
253+
* Value Range: int32, default=1024
254+
* Example: paddle.set_flags({'FLAGS_deterministic_rng_grid': 4096})
255+
* Note: Grid size cap used when FLAGS_deterministic_rng is enabled.
256+
* Cross-device consistency requires the same value on all devices.
257+
*/
258+
PHI_DEFINE_EXPORTED_int32(
259+
deterministic_rng_grid,
260+
1024,
261+
"Grid size cap when FLAGS_deterministic_rng is enabled.");
262+
233263
/**
234264
* CUDA related FLAG
235265
* Name: FLAGS_embedding_deterministic

paddle/phi/CMakeLists.txt

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -372,9 +372,14 @@ if(WITH_CUTLASS)
372372
)# for memory_efficient_attention.h
373373
endif()
374374
# PADDLE_WARP_SIZE: warp size for the target GPU platform.
375-
# Default 32 (NVIDIA). Override via -DPADDLE_WARP_SIZE=64 for iluvatar (COREX).
375+
# Default 32 (NVIDIA). ROCm (AMD/Hygon) wavefront size is 64.
376+
# Override via -DPADDLE_WARP_SIZE for other platforms.
376377
if(NOT DEFINED PADDLE_WARP_SIZE)
377-
set(PADDLE_WARP_SIZE 32)
378+
if(WITH_ROCM)
379+
set(PADDLE_WARP_SIZE 64)
380+
else()
381+
set(PADDLE_WARP_SIZE 32)
382+
endif()
378383
endif()
379384
math(EXPR PADDLE_WARP_MASK "${PADDLE_WARP_SIZE} - 1")
380385
if(PADDLE_WARP_SIZE EQUAL 64)

paddle/phi/backends/dynload/rocm_driver.cc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@ void* rocm_dso_handle = nullptr;
2222

2323
#define DEFINE_WRAP(__name) DynLoad__##__name __name
2424

25+
ROCM_ROUTINE_EACH_VVM(DEFINE_WRAP);
26+
ROCM_ROUTINE_EACH_GPU_GRAPH(DEFINE_WRAP);
2527
ROCM_ROUTINE_EACH(DEFINE_WRAP);
2628

2729
bool HasCUDADriver() {

paddle/phi/kernels/cpu/elementwise.h

Lines changed: 1 addition & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -35,25 +35,7 @@ struct SameDimsAddFunctor {
3535
};
3636

3737
template <typename DevCtx, typename T>
38-
struct SameDimsAddFunctor<
39-
DevCtx,
40-
T,
41-
typename std::enable_if<std::is_floating_point<T>::value>::type> {
42-
void operator()(const DevCtx& dev_ctx,
43-
const DenseTensor& x,
44-
const DenseTensor& y,
45-
DenseTensor* z) {
46-
auto blas = funcs::GetBlas<DevCtx, T>(dev_ctx);
47-
blas.VADD(
48-
x.numel(), x.data<T>(), y.data<T>(), dev_ctx.template Alloc<T>(z));
49-
}
50-
};
51-
52-
template <typename DevCtx, typename T>
53-
struct SameDimsAddFunctor<
54-
DevCtx,
55-
T,
56-
typename std::enable_if<!std::is_floating_point<T>::value>::type> {
38+
struct SameDimsAddFunctor<DevCtx, T> {
5739
void operator()(const DevCtx& dev_ctx,
5840
const DenseTensor& x,
5941
const DenseTensor& y,

paddle/phi/kernels/funcs/distribution_helper.h

Lines changed: 29 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ limitations under the License. */
3030

3131
#if defined(__NVCC__) || defined(__HIPCC__)
3232
#include "paddle/phi/kernels/funcs/index_impl.cu.h"
33+
#include "paddle/phi/kernels/funcs/rng_launch_config.h"
3334
#include "paddle/phi/kernels/primitive/kernel_primitives.h"
3435
#endif
3536

@@ -311,22 +312,36 @@ void distribution_and_transform(const GPUContext &dev_ctx,
311312
if (size == 0) return;
312313
auto gen_cuda = dev_ctx.GetGenerator();
313314

314-
size_t block_size = 256;
315-
size_t expect_grid_size = (size + block_size - 1) / block_size;
316-
317-
int64_t device_id = dev_ctx.GetPlace().GetDeviceId();
318-
const auto &prop = phi::backends::gpu::GetDeviceProperties(device_id);
319-
320-
size_t max_grid_size = (prop.maxThreadsPerMultiProcessor / block_size) *
321-
prop.multiProcessorCount;
322-
size_t grid_size =
323-
expect_grid_size > max_grid_size ? max_grid_size : expect_grid_size;
315+
size_t block_size;
316+
size_t grid_size;
317+
uint64_t increment;
318+
319+
if (funcs::IsDeterministicRNG()) {
320+
constexpr int kCount = DistOp::kReturnsCount;
321+
auto cfg = funcs::GetDeterministicRNGConfig(size, kCount);
322+
block_size = cfg.block_size;
323+
grid_size = cfg.grid_size;
324+
increment = cfg.increment;
325+
} else {
326+
block_size = 256;
327+
size_t expect_grid_size = (size + block_size - 1) / block_size;
328+
329+
int64_t device_id = dev_ctx.GetPlace().GetDeviceId();
330+
const auto &prop = phi::backends::gpu::GetDeviceProperties(device_id);
331+
332+
size_t max_grid_size = (prop.maxThreadsPerMultiProcessor / block_size) *
333+
prop.multiProcessorCount;
334+
grid_size =
335+
expect_grid_size > max_grid_size ? max_grid_size : expect_grid_size;
336+
337+
size_t total_thread = block_size * grid_size;
338+
size_t curand4_loop_times =
339+
(size + 4 * total_thread - 1) / (4 * total_thread);
340+
// 'increment' should be multiple of 4
341+
increment = curand4_loop_times * 4;
342+
}
324343

325344
size_t total_thread = block_size * grid_size;
326-
size_t curand4_loop_times =
327-
(size + 4 * total_thread - 1) / (4 * total_thread);
328-
// 'increment' should be multiple of 4
329-
uint64_t increment = curand4_loop_times * 4;
330345

331346
auto seed_offset = gen_cuda->IncrementOffset(increment);
332347
uint64_t seed = seed_offset.first;

paddle/phi/kernels/funcs/dropout_impl.cu.h

Lines changed: 26 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -301,19 +301,32 @@ void DropoutFwGPUKernelDriver(
301301
uint64_t increment;
302302
// VectorizedRandomGenerator use curand_uniform4, so kVecSize is 4;
303303
constexpr int kVecSize = funcs::uniform_distribution<float>::kReturnsCount;
304-
auto gpu_config =
305-
phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_numel, kVecSize);
306-
size_t grid_size = gpu_config.GetGridSize();
307-
size_t block_size = gpu_config.GetBlockSize();
308-
309-
int64_t device_id = dev_ctx.GetPlace().GetDeviceId();
310-
const auto& prop = phi::backends::gpu::GetDeviceProperties(device_id);
311-
size_t max_grid_size = prop.maxThreadsPerMultiProcessor *
312-
prop.multiProcessorCount / block_size;
313-
grid_size = std::min(grid_size, max_grid_size);
314-
315-
auto offset =
316-
((x_numel - 1) / (grid_size * block_size * kVecSize) + 1) * kVecSize;
304+
305+
size_t grid_size;
306+
size_t block_size;
307+
size_t offset;
308+
309+
if (funcs::IsDeterministicRNG()) {
310+
auto cfg = funcs::GetDeterministicRNGConfig(x_numel, kVecSize);
311+
grid_size = cfg.grid_size;
312+
block_size = cfg.block_size;
313+
offset = cfg.increment;
314+
} else {
315+
auto gpu_config =
316+
phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_numel, kVecSize);
317+
grid_size = gpu_config.GetGridSize();
318+
block_size = gpu_config.GetBlockSize();
319+
320+
int64_t device_id = dev_ctx.GetPlace().GetDeviceId();
321+
const auto& prop = phi::backends::gpu::GetDeviceProperties(device_id);
322+
size_t max_grid_size = prop.maxThreadsPerMultiProcessor *
323+
prop.multiProcessorCount / block_size;
324+
grid_size = std::min(grid_size, max_grid_size);
325+
326+
offset =
327+
((x_numel - 1) / (grid_size * block_size * kVecSize) + 1) * kVecSize;
328+
}
329+
317330
size_t main_offset =
318331
size / (block_size * kVecSize) * (block_size * kVecSize);
319332

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
// Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
#pragma once
16+
17+
#include <algorithm>
18+
#include <cstddef>
19+
#include <cstdint>
20+
21+
#include "paddle/common/flags.h"
22+
23+
COMMON_DECLARE_bool(deterministic_rng);
24+
COMMON_DECLARE_int32(deterministic_rng_grid);
25+
26+
namespace phi {
27+
namespace funcs {
28+
29+
inline bool IsDeterministicRNG() { return FLAGS_deterministic_rng; }
30+
31+
struct RNGLaunchConfig {
32+
size_t grid_size;
33+
size_t block_size;
34+
uint64_t increment;
35+
};
36+
37+
// Cross-device consistency requires the same FLAGS_deterministic_rng_grid.
38+
// vec_size: elements per thread per loop iteration (kReturnsCount).
39+
inline RNGLaunchConfig GetDeterministicRNGConfig(int64_t numel,
40+
int vec_size = 4) {
41+
RNGLaunchConfig config;
42+
constexpr size_t kBlockSize = 256;
43+
size_t grid_cap = static_cast<size_t>(FLAGS_deterministic_rng_grid);
44+
size_t needed = (static_cast<size_t>(numel) + kBlockSize - 1) / kBlockSize;
45+
config.grid_size = std::min(needed, grid_cap);
46+
config.block_size = kBlockSize;
47+
48+
size_t total_thread = config.grid_size * config.block_size;
49+
size_t loop_times =
50+
(static_cast<size_t>(numel) + vec_size * total_thread - 1) /
51+
(vec_size * total_thread);
52+
config.increment = static_cast<uint64_t>(loop_times * vec_size);
53+
54+
return config;
55+
}
56+
57+
} // namespace funcs
58+
} // namespace phi

paddle/phi/kernels/fusion/gpu/fused_dropout_add_utils.h

Lines changed: 24 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#pragma once
1616

1717
#include "paddle/phi/kernels/funcs/distribution_helper.h"
18+
#include "paddle/phi/kernels/funcs/rng_launch_config.h"
1819

1920
namespace phi {
2021
namespace fusion {
@@ -23,17 +24,29 @@ template <typename Context>
2324
static inline std::vector<size_t> GetRandomCudaProp(int64_t numel,
2425
const Context& dev_ctx) {
2526
constexpr int kVecSize = funcs::uniform_distribution<float>::kReturnsCount;
26-
auto gpu_config =
27-
backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel, kVecSize);
28-
size_t grid_size = gpu_config.GetGridSize();
29-
size_t block_size = gpu_config.GetBlockSize();
30-
int64_t device_id = dev_ctx.GetPlace().GetDeviceId();
31-
const auto& prop = backends::gpu::GetDeviceProperties(device_id);
32-
size_t max_grid_size =
33-
prop.maxThreadsPerMultiProcessor * prop.multiProcessorCount / block_size;
34-
grid_size = std::min(grid_size, max_grid_size);
35-
auto offset =
36-
((numel - 1) / (grid_size * block_size * kVecSize) + 1) * kVecSize;
27+
28+
size_t grid_size;
29+
size_t block_size;
30+
size_t offset;
31+
32+
if (funcs::IsDeterministicRNG()) {
33+
auto cfg = funcs::GetDeterministicRNGConfig(numel, kVecSize);
34+
grid_size = cfg.grid_size;
35+
block_size = cfg.block_size;
36+
offset = cfg.increment;
37+
} else {
38+
auto gpu_config =
39+
backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel, kVecSize);
40+
grid_size = gpu_config.GetGridSize();
41+
block_size = gpu_config.GetBlockSize();
42+
int64_t device_id = dev_ctx.GetPlace().GetDeviceId();
43+
const auto& prop = backends::gpu::GetDeviceProperties(device_id);
44+
size_t max_grid_size = prop.maxThreadsPerMultiProcessor *
45+
prop.multiProcessorCount / block_size;
46+
grid_size = std::min(grid_size, max_grid_size);
47+
offset = ((numel - 1) / (grid_size * block_size * kVecSize) + 1) * kVecSize;
48+
}
49+
3750
size_t main_offset =
3851
numel / (block_size * kVecSize) * (block_size * kVecSize);
3952
return {grid_size, block_size, offset, main_offset};

paddle/phi/kernels/fusion/xpu/fused_rope_utils.h

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -282,7 +282,10 @@ void XPUFusedRotaryEveryTwo(const Context& dev_ctx,
282282
single_func_name = "rotary_embedding_everytwo_unary_freqs_grad";
283283
fusion_func_name = "rotary_embedding_everytwo_binary_freqs_grad";
284284
}
285-
if (!in_k) {
285+
// Check k/v numel > 0, matching GPU's guard (k && k->numel() > 0), to
286+
// avoid passing a zero-element tensor pointer to the XPU library which
287+
// causes undefined behavior and overflow outputs.
288+
if (!in_k || in_k->numel() == 0) {
286289
int ret = everytwo_func(dev_ctx.x_context(),
287290
reinterpret_cast<const XPUType*>(in_q.data()),
288291
nullptr,
@@ -310,7 +313,7 @@ void XPUFusedRotaryEveryTwo(const Context& dev_ctx,
310313
10000.0f);
311314
PADDLE_ENFORCE_XDNN_SUCCESS(ret, fusion_func_name);
312315
}
313-
if (in_v) {
316+
if (in_v && in_v->numel() > 0) {
314317
int64_t num_heads_v = in_v->dims()[2];
315318
int ret = everytwo_func(dev_ctx.x_context(),
316319
reinterpret_cast<const XPUType*>(in_v->data()),
@@ -352,7 +355,10 @@ void XPUFusedRotaryHalf(const Context& dev_ctx,
352355
fusion_func_name = "xpu::rotary_embedding_half_binary_freqs_grad";
353356
}
354357

355-
if (!in_k) {
358+
// Check k/v numel > 0, matching GPU's guard (k && k->numel() > 0), to
359+
// avoid passing a zero-element tensor pointer to the XPU library which
360+
// causes undefined behavior and overflow outputs.
361+
if (!in_k || in_k->numel() == 0) {
356362
int ret = half_func(dev_ctx.x_context(),
357363
reinterpret_cast<const XPUType*>(in_q.data()),
358364
nullptr,
@@ -389,7 +395,7 @@ void XPUFusedRotaryHalf(const Context& dev_ctx,
389395
PADDLE_ENFORCE_XDNN_SUCCESS(ret, fusion_func_name);
390396
}
391397

392-
if (in_v) {
398+
if (in_v && in_v->numel() > 0) {
393399
int64_t num_heads_v = in_v->dims()[2];
394400
int ret = half_func(dev_ctx.x_context(),
395401
reinterpret_cast<const XPUType*>(in_v->data()),

0 commit comments

Comments
 (0)