Skip to content

Commit fa7502c

Browse files
DongBaiYue and claude committed
Add deterministic RNG launch config for cross-device consistency
When FLAGS_deterministic_rng is enabled, RNG kernels use a fixed grid_size and block_size instead of device-dependent values, ensuring the same seed produces identical random sequences across GPU types. Two new flags: - FLAGS_deterministic_rng (bool, default=false): enable the feature - FLAGS_deterministic_rng_grid (int32, default=1024): grid size cap Modified files: - flags.cc: define the two flags - rng_launch_config.h: new helper (IsDeterministicRNG, GetDeterministicRNGConfig) - distribution_helper.h: if/else branch in distribution_and_transform - dropout_impl.cu.h: if/else branch in DropoutFwGPUKernelDriver - fused_dropout_add_utils.h: if/else branch in GetRandomCudaProp Default (flag off) behavior is unchanged. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 2b9f8b6 commit fa7502c

5 files changed

Lines changed: 166 additions & 38 deletions

File tree

paddle/common/flags.cc

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,36 @@ PHI_DEFINE_EXPORTED_bool(
230230
"operator. The autotuning algorithm may be non-deterministic. If "
231231
"true, the algorithm is deterministic.");
232232

233+
/**
234+
* GPU RNG related FLAG
235+
* Name: FLAGS_deterministic_rng
236+
* Since Version: 3.3
237+
* Value Range: bool, default=false
238+
* Example: paddle.set_flags({'FLAGS_deterministic_rng': True})
239+
* Note: Fix RNG kernel launch config so same seed gives same results
240+
* across GPU types.
241+
*/
242+
PHI_DEFINE_EXPORTED_bool(
243+
deterministic_rng,
244+
false,
245+
"Enable cross-device RNG consistency by fixing GPU kernel launch "
246+
"configuration. When true, RNG kernels use a fixed grid/block size "
247+
"so that the same seed produces identical results across GPU types.");
248+
249+
/**
250+
* GPU RNG related FLAG
251+
* Name: FLAGS_deterministic_rng_grid
252+
* Since Version: 3.3
253+
* Value Range: int32, default=1024
254+
* Example: paddle.set_flags({'FLAGS_deterministic_rng_grid': 4096})
255+
* Note: Grid size cap used when FLAGS_deterministic_rng is enabled.
256+
* Cross-device consistency requires the same value on all devices.
257+
*/
258+
PHI_DEFINE_EXPORTED_int32(
259+
deterministic_rng_grid,
260+
1024,
261+
"Grid size cap when FLAGS_deterministic_rng is enabled.");
262+
233263
/**
234264
* CUDA related FLAG
235265
* Name: FLAGS_embedding_deterministic

paddle/phi/kernels/funcs/distribution_helper.h

Lines changed: 29 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ limitations under the License. */
3030

3131
#if defined(__NVCC__) || defined(__HIPCC__)
3232
#include "paddle/phi/kernels/funcs/index_impl.cu.h"
33+
#include "paddle/phi/kernels/funcs/rng_launch_config.h"
3334
#include "paddle/phi/kernels/primitive/kernel_primitives.h"
3435
#endif
3536

@@ -311,22 +312,36 @@ void distribution_and_transform(const GPUContext &dev_ctx,
311312
if (size == 0) return;
312313
auto gen_cuda = dev_ctx.GetGenerator();
313314

314-
size_t block_size = 256;
315-
size_t expect_grid_size = (size + block_size - 1) / block_size;
316-
317-
int64_t device_id = dev_ctx.GetPlace().GetDeviceId();
318-
const auto &prop = phi::backends::gpu::GetDeviceProperties(device_id);
319-
320-
size_t max_grid_size = (prop.maxThreadsPerMultiProcessor / block_size) *
321-
prop.multiProcessorCount;
322-
size_t grid_size =
323-
expect_grid_size > max_grid_size ? max_grid_size : expect_grid_size;
315+
size_t block_size;
316+
size_t grid_size;
317+
uint64_t increment;
318+
319+
if (funcs::IsDeterministicRNG()) {
320+
constexpr int kCount = DistOp::kReturnsCount;
321+
auto cfg = funcs::GetDeterministicRNGConfig(size, kCount);
322+
block_size = cfg.block_size;
323+
grid_size = cfg.grid_size;
324+
increment = cfg.increment;
325+
} else {
326+
block_size = 256;
327+
size_t expect_grid_size = (size + block_size - 1) / block_size;
328+
329+
int64_t device_id = dev_ctx.GetPlace().GetDeviceId();
330+
const auto &prop = phi::backends::gpu::GetDeviceProperties(device_id);
331+
332+
size_t max_grid_size = (prop.maxThreadsPerMultiProcessor / block_size) *
333+
prop.multiProcessorCount;
334+
grid_size =
335+
expect_grid_size > max_grid_size ? max_grid_size : expect_grid_size;
336+
337+
size_t total_thread = block_size * grid_size;
338+
size_t curand4_loop_times =
339+
(size + 4 * total_thread - 1) / (4 * total_thread);
340+
// 'increment' should be multiple of 4
341+
increment = curand4_loop_times * 4;
342+
}
324343

325344
size_t total_thread = block_size * grid_size;
326-
size_t curand4_loop_times =
327-
(size + 4 * total_thread - 1) / (4 * total_thread);
328-
// 'increment' should be multiple of 4
329-
uint64_t increment = curand4_loop_times * 4;
330345

331346
auto seed_offset = gen_cuda->IncrementOffset(increment);
332347
uint64_t seed = seed_offset.first;

paddle/phi/kernels/funcs/dropout_impl.cu.h

Lines changed: 26 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -302,19 +302,32 @@ void DropoutFwGPUKernelDriver(
302302
// VectorizedRandomGenerator use curand_uniform4, so kVecSize is 4;
303303
constexpr int kVecSize =
304304
phi::funcs::uniform_distribution<float>::kReturnsCount;
305-
auto gpu_config =
306-
phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_numel, kVecSize);
307-
size_t grid_size = gpu_config.GetGridSize();
308-
size_t block_size = gpu_config.GetBlockSize();
309-
310-
int64_t device_id = dev_ctx.GetPlace().GetDeviceId();
311-
const auto& prop = phi::backends::gpu::GetDeviceProperties(device_id);
312-
size_t max_grid_size = prop.maxThreadsPerMultiProcessor *
313-
prop.multiProcessorCount / block_size;
314-
grid_size = std::min(grid_size, max_grid_size);
315-
316-
auto offset =
317-
((x_numel - 1) / (grid_size * block_size * kVecSize) + 1) * kVecSize;
305+
306+
size_t grid_size;
307+
size_t block_size;
308+
size_t offset;
309+
310+
if (phi::funcs::IsDeterministicRNG()) {
311+
auto cfg = phi::funcs::GetDeterministicRNGConfig(x_numel, kVecSize);
312+
grid_size = cfg.grid_size;
313+
block_size = cfg.block_size;
314+
offset = cfg.increment;
315+
} else {
316+
auto gpu_config =
317+
phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_numel, kVecSize);
318+
grid_size = gpu_config.GetGridSize();
319+
block_size = gpu_config.GetBlockSize();
320+
321+
int64_t device_id = dev_ctx.GetPlace().GetDeviceId();
322+
const auto& prop = phi::backends::gpu::GetDeviceProperties(device_id);
323+
size_t max_grid_size = prop.maxThreadsPerMultiProcessor *
324+
prop.multiProcessorCount / block_size;
325+
grid_size = std::min(grid_size, max_grid_size);
326+
327+
offset =
328+
((x_numel - 1) / (grid_size * block_size * kVecSize) + 1) * kVecSize;
329+
}
330+
318331
size_t main_offset =
319332
size / (block_size * kVecSize) * (block_size * kVecSize);
320333

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
// Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
#pragma once
16+
17+
#include <algorithm>
18+
#include <cstddef>
19+
#include <cstdint>
20+
21+
#include "paddle/common/flags.h"
22+
23+
COMMON_DECLARE_bool(deterministic_rng);
24+
COMMON_DECLARE_int32(deterministic_rng_grid);
25+
26+
namespace phi {
27+
namespace funcs {
28+
29+
// Whether deterministic RNG kernel launch configuration is enabled
// (controlled by FLAGS_deterministic_rng; default is off).
inline bool IsDeterministicRNG() {
  return FLAGS_deterministic_rng;
}
30+
31+
struct RNGLaunchConfig {
32+
size_t grid_size;
33+
size_t block_size;
34+
uint64_t increment;
35+
};
36+
37+
// Cross-device consistency requires the same FLAGS_deterministic_rng_grid.
38+
// vec_size: elements per thread per loop iteration (kReturnsCount).
39+
inline RNGLaunchConfig GetDeterministicRNGConfig(int64_t numel,
40+
int vec_size = 4) {
41+
RNGLaunchConfig config;
42+
constexpr size_t kBlockSize = 256;
43+
size_t grid_cap = static_cast<size_t>(FLAGS_deterministic_rng_grid);
44+
size_t needed = (static_cast<size_t>(numel) + kBlockSize - 1) / kBlockSize;
45+
config.grid_size = std::min(needed, grid_cap);
46+
config.block_size = kBlockSize;
47+
48+
size_t total_thread = config.grid_size * config.block_size;
49+
size_t loop_times =
50+
(static_cast<size_t>(numel) + vec_size * total_thread - 1) /
51+
(vec_size * total_thread);
52+
config.increment = static_cast<uint64_t>(loop_times * vec_size);
53+
54+
return config;
55+
}
56+
57+
} // namespace funcs
58+
} // namespace phi

paddle/phi/kernels/fusion/gpu/fused_dropout_add_utils.h

Lines changed: 23 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#pragma once
1616

1717
#include "paddle/phi/kernels/funcs/distribution_helper.h"
18+
#include "paddle/phi/kernels/funcs/rng_launch_config.h"
1819

1920
namespace phi {
2021
namespace fusion {
@@ -23,17 +24,28 @@ template <typename Context>
2324
static inline std::vector<size_t> GetRandomCudaProp(int64_t numel,
2425
const Context& dev_ctx) {
2526
constexpr int kVecSize = funcs::uniform_distribution<float>::kReturnsCount;
26-
auto gpu_config =
27-
backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel, kVecSize);
28-
size_t grid_size = gpu_config.GetGridSize();
29-
size_t block_size = gpu_config.GetBlockSize();
30-
int64_t device_id = dev_ctx.GetPlace().GetDeviceId();
31-
const auto& prop = phi::backends::gpu::GetDeviceProperties(device_id);
32-
size_t max_grid_size =
33-
prop.maxThreadsPerMultiProcessor * prop.multiProcessorCount / block_size;
34-
grid_size = std::min(grid_size, max_grid_size);
35-
auto offset =
36-
((numel - 1) / (grid_size * block_size * kVecSize) + 1) * kVecSize;
27+
28+
size_t grid_size;
29+
size_t block_size;
30+
size_t offset;
31+
32+
if (funcs::IsDeterministicRNG()) {
33+
auto cfg = funcs::GetDeterministicRNGConfig(numel, kVecSize);
34+
grid_size = cfg.grid_size;
35+
block_size = cfg.block_size;
36+
offset = cfg.increment;
37+
} else {
38+
auto gpu_config =
39+
backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel, kVecSize);
40+
grid_size = gpu_config.GetGridSize();
41+
block_size = gpu_config.GetBlockSize();
42+
int64_t device_id = dev_ctx.GetPlace().GetDeviceId();
43+
const auto& prop = phi::backends::gpu::GetDeviceProperties(device_id);
44+
size_t max_grid_size = prop.maxThreadsPerMultiProcessor *
45+
prop.multiProcessorCount / block_size;
46+
grid_size = std::min(grid_size, max_grid_size);
47+
offset = ((numel - 1) / (grid_size * block_size * kVecSize) + 1) * kVecSize;
48+
}
3749
size_t main_offset =
3850
numel / (block_size * kVecSize) * (block_size * kVecSize);
3951
return {grid_size, block_size, offset, main_offset};

0 commit comments

Comments
 (0)