Skip to content

Commit 3ad2db2

Browse files
committed
Merge remote-tracking branch 'upstream/develop' into pylayer-tensor-wrapper-fix
2 parents 8bbcf5d + 2140638 commit 3ad2db2

21 files changed

Lines changed: 786 additions & 177 deletions

cmake/cupti.cmake

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,18 @@ endif()
55
include(${PROJECT_SOURCE_DIR}/cmake/architecture.cmake)
66

77
if(WITH_ROCM)
8+
if(EXISTS "${ROCM_PATH}/cuda/extras/CUPTI")
9+
set(ROCM_CUDA_DIR "${ROCM_PATH}/cuda")
10+
elseif(EXISTS "${ROCM_PATH}/cuda/cuda/extras/CUPTI")
11+
set(ROCM_CUDA_DIR "${ROCM_PATH}/cuda/cuda")
12+
else()
13+
message(
14+
FATAL_ERROR
15+
"CUPTI not found under ${ROCM_PATH}/cuda/extras/CUPTI or ${ROCM_PATH}/cuda/cuda/extras/CUPTI"
16+
)
17+
endif()
818
set(CUPTI_ROOT
9-
"${ROCM_PATH}/cuda/extras/CUPTI"
19+
"${ROCM_CUDA_DIR}/extras/CUPTI"
1020
CACHE PATH "CUPTI ROOT")
1121
else()
1222
set(CUPTI_ROOT
@@ -59,7 +69,7 @@ get_filename_component(CUPTI_LIBRARY_PATH ${CUPTI_LIBRARY} DIRECTORY)
5969
if(CUPTI_INCLUDE_DIR AND CUPTI_LIBRARY)
6070
set(CUPTI_FOUND ON)
6171
if(WITH_ROCM)
62-
include_directories(${ROCM_PATH}/cuda/include)
72+
include_directories(${ROCM_CUDA_DIR}/include)
6373
add_definitions(-D__CUDA_HIP_PLATFORM_AMD__)
6474
endif()
6575
else()

paddle/common/flags.cc

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,36 @@ PHI_DEFINE_EXPORTED_bool(
230230
"operator. The autotuning algorithm may be non-deterministic. If "
231231
"true, the algorithm is deterministic.");
232232

233+
/**
234+
* GPU RNG related FLAG
235+
* Name: FLAGS_deterministic_rng
236+
* Since Version: 3.4
237+
* Value Range: bool, default=false
238+
* Example: paddle.set_flags({'FLAGS_deterministic_rng': True})
239+
* Note: Fixes the RNG kernel launch configuration so that the same seed
240+
* produces identical results across GPU types.
241+
*/
242+
PHI_DEFINE_EXPORTED_bool(
243+
deterministic_rng,
244+
false,
245+
"Enable cross-device RNG consistency by fixing GPU kernel launch "
246+
"configuration. When true, RNG kernels use a fixed grid/block size "
247+
"so that the same seed produces identical results across GPU types.");
248+
249+
/**
250+
* GPU RNG related FLAG
251+
* Name: FLAGS_deterministic_rng_grid
252+
* Since Version: 3.4
253+
* Value Range: int32, default=1024
254+
* Example: paddle.set_flags({'FLAGS_deterministic_rng_grid': 4096})
255+
* Note: Grid size cap used when FLAGS_deterministic_rng is enabled.
256+
* Cross-device consistency requires the same value on all devices.
257+
*/
258+
PHI_DEFINE_EXPORTED_int32(
259+
deterministic_rng_grid,
260+
1024,
261+
"Grid size cap when FLAGS_deterministic_rng is enabled.");
262+
233263
/**
234264
* CUDA related FLAG
235265
* Name: FLAGS_embedding_deterministic

paddle/phi/CMakeLists.txt

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -372,9 +372,14 @@ if(WITH_CUTLASS)
372372
)# for memory_efficient_attention.h
373373
endif()
374374
# PADDLE_WARP_SIZE: warp size for the target GPU platform.
375-
# Default 32 (NVIDIA). Override via -DPADDLE_WARP_SIZE=64 for iluvatar (COREX).
375+
# Default 32 (NVIDIA). ROCm (AMD/Hygon) wavefront size is 64.
376+
# Override via -DPADDLE_WARP_SIZE for other platforms.
376377
if(NOT DEFINED PADDLE_WARP_SIZE)
377-
set(PADDLE_WARP_SIZE 32)
378+
if(WITH_ROCM)
379+
set(PADDLE_WARP_SIZE 64)
380+
else()
381+
set(PADDLE_WARP_SIZE 32)
382+
endif()
378383
endif()
379384
math(EXPR PADDLE_WARP_MASK "${PADDLE_WARP_SIZE} - 1")
380385
if(PADDLE_WARP_SIZE EQUAL 64)

paddle/phi/backends/dynload/rocm_driver.cc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@ void* rocm_dso_handle = nullptr;
2222

2323
#define DEFINE_WRAP(__name) DynLoad__##__name __name
2424

25+
ROCM_ROUTINE_EACH_VVM(DEFINE_WRAP);
26+
ROCM_ROUTINE_EACH_GPU_GRAPH(DEFINE_WRAP);
2527
ROCM_ROUTINE_EACH(DEFINE_WRAP);
2628

2729
bool HasCUDADriver() {

paddle/phi/kernels/cpu/elementwise.h

Lines changed: 1 addition & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -35,25 +35,7 @@ struct SameDimsAddFunctor {
3535
};
3636

3737
template <typename DevCtx, typename T>
38-
struct SameDimsAddFunctor<
39-
DevCtx,
40-
T,
41-
typename std::enable_if<std::is_floating_point<T>::value>::type> {
42-
void operator()(const DevCtx& dev_ctx,
43-
const DenseTensor& x,
44-
const DenseTensor& y,
45-
DenseTensor* z) {
46-
auto blas = funcs::GetBlas<DevCtx, T>(dev_ctx);
47-
blas.VADD(
48-
x.numel(), x.data<T>(), y.data<T>(), dev_ctx.template Alloc<T>(z));
49-
}
50-
};
51-
52-
template <typename DevCtx, typename T>
53-
struct SameDimsAddFunctor<
54-
DevCtx,
55-
T,
56-
typename std::enable_if<!std::is_floating_point<T>::value>::type> {
38+
struct SameDimsAddFunctor<DevCtx, T> {
5739
void operator()(const DevCtx& dev_ctx,
5840
const DenseTensor& x,
5941
const DenseTensor& y,

paddle/phi/kernels/funcs/distribution_helper.h

Lines changed: 29 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ limitations under the License. */
3030

3131
#if defined(__NVCC__) || defined(__HIPCC__)
3232
#include "paddle/phi/kernels/funcs/index_impl.cu.h"
33+
#include "paddle/phi/kernels/funcs/rng_launch_config.h"
3334
#include "paddle/phi/kernels/primitive/kernel_primitives.h"
3435
#endif
3536

@@ -311,22 +312,36 @@ void distribution_and_transform(const GPUContext &dev_ctx,
311312
if (size == 0) return;
312313
auto gen_cuda = dev_ctx.GetGenerator();
313314

314-
size_t block_size = 256;
315-
size_t expect_grid_size = (size + block_size - 1) / block_size;
316-
317-
int64_t device_id = dev_ctx.GetPlace().GetDeviceId();
318-
const auto &prop = phi::backends::gpu::GetDeviceProperties(device_id);
319-
320-
size_t max_grid_size = (prop.maxThreadsPerMultiProcessor / block_size) *
321-
prop.multiProcessorCount;
322-
size_t grid_size =
323-
expect_grid_size > max_grid_size ? max_grid_size : expect_grid_size;
315+
size_t block_size;
316+
size_t grid_size;
317+
uint64_t increment;
318+
319+
if (funcs::IsDeterministicRNG()) {
320+
constexpr int kCount = DistOp::kReturnsCount;
321+
auto cfg = funcs::GetDeterministicRNGConfig(size, kCount);
322+
block_size = cfg.block_size;
323+
grid_size = cfg.grid_size;
324+
increment = cfg.increment;
325+
} else {
326+
block_size = 256;
327+
size_t expect_grid_size = (size + block_size - 1) / block_size;
328+
329+
int64_t device_id = dev_ctx.GetPlace().GetDeviceId();
330+
const auto &prop = phi::backends::gpu::GetDeviceProperties(device_id);
331+
332+
size_t max_grid_size = (prop.maxThreadsPerMultiProcessor / block_size) *
333+
prop.multiProcessorCount;
334+
grid_size =
335+
expect_grid_size > max_grid_size ? max_grid_size : expect_grid_size;
336+
337+
size_t total_thread = block_size * grid_size;
338+
size_t curand4_loop_times =
339+
(size + 4 * total_thread - 1) / (4 * total_thread);
340+
// 'increment' should be multiple of 4
341+
increment = curand4_loop_times * 4;
342+
}
324343

325344
size_t total_thread = block_size * grid_size;
326-
size_t curand4_loop_times =
327-
(size + 4 * total_thread - 1) / (4 * total_thread);
328-
// 'increment' should be multiple of 4
329-
uint64_t increment = curand4_loop_times * 4;
330345

331346
auto seed_offset = gen_cuda->IncrementOffset(increment);
332347
uint64_t seed = seed_offset.first;

paddle/phi/kernels/funcs/dropout_impl.cu.h

Lines changed: 26 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -301,19 +301,32 @@ void DropoutFwGPUKernelDriver(
301301
uint64_t increment;
302302
// VectorizedRandomGenerator use curand_uniform4, so kVecSize is 4;
303303
constexpr int kVecSize = funcs::uniform_distribution<float>::kReturnsCount;
304-
auto gpu_config =
305-
phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_numel, kVecSize);
306-
size_t grid_size = gpu_config.GetGridSize();
307-
size_t block_size = gpu_config.GetBlockSize();
308-
309-
int64_t device_id = dev_ctx.GetPlace().GetDeviceId();
310-
const auto& prop = phi::backends::gpu::GetDeviceProperties(device_id);
311-
size_t max_grid_size = prop.maxThreadsPerMultiProcessor *
312-
prop.multiProcessorCount / block_size;
313-
grid_size = std::min(grid_size, max_grid_size);
314-
315-
auto offset =
316-
((x_numel - 1) / (grid_size * block_size * kVecSize) + 1) * kVecSize;
304+
305+
size_t grid_size;
306+
size_t block_size;
307+
size_t offset;
308+
309+
if (funcs::IsDeterministicRNG()) {
310+
auto cfg = funcs::GetDeterministicRNGConfig(x_numel, kVecSize);
311+
grid_size = cfg.grid_size;
312+
block_size = cfg.block_size;
313+
offset = cfg.increment;
314+
} else {
315+
auto gpu_config =
316+
phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_numel, kVecSize);
317+
grid_size = gpu_config.GetGridSize();
318+
block_size = gpu_config.GetBlockSize();
319+
320+
int64_t device_id = dev_ctx.GetPlace().GetDeviceId();
321+
const auto& prop = phi::backends::gpu::GetDeviceProperties(device_id);
322+
size_t max_grid_size = prop.maxThreadsPerMultiProcessor *
323+
prop.multiProcessorCount / block_size;
324+
grid_size = std::min(grid_size, max_grid_size);
325+
326+
offset =
327+
((x_numel - 1) / (grid_size * block_size * kVecSize) + 1) * kVecSize;
328+
}
329+
317330
size_t main_offset =
318331
size / (block_size * kVecSize) * (block_size * kVecSize);
319332

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
// Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
#pragma once
16+
17+
#include <algorithm>
18+
#include <cstddef>
19+
#include <cstdint>
20+
21+
#include "paddle/common/flags.h"
22+
23+
COMMON_DECLARE_bool(deterministic_rng);
24+
COMMON_DECLARE_int32(deterministic_rng_grid);
25+
26+
namespace phi {
27+
namespace funcs {
28+
29+
inline bool IsDeterministicRNG() { return FLAGS_deterministic_rng; }
30+
31+
struct RNGLaunchConfig {
32+
size_t grid_size;
33+
size_t block_size;
34+
uint64_t increment;
35+
};
36+
37+
// Cross-device consistency requires the same FLAGS_deterministic_rng_grid.
38+
// vec_size: elements per thread per loop iteration (kReturnsCount).
39+
inline RNGLaunchConfig GetDeterministicRNGConfig(int64_t numel,
40+
int vec_size = 4) {
41+
RNGLaunchConfig config;
42+
constexpr size_t kBlockSize = 256;
43+
size_t grid_cap = static_cast<size_t>(FLAGS_deterministic_rng_grid);
44+
size_t needed = (static_cast<size_t>(numel) + kBlockSize - 1) / kBlockSize;
45+
config.grid_size = std::min(needed, grid_cap);
46+
config.block_size = kBlockSize;
47+
48+
size_t total_thread = config.grid_size * config.block_size;
49+
size_t loop_times =
50+
(static_cast<size_t>(numel) + vec_size * total_thread - 1) /
51+
(vec_size * total_thread);
52+
config.increment = static_cast<uint64_t>(loop_times * vec_size);
53+
54+
return config;
55+
}
56+
57+
} // namespace funcs
58+
} // namespace phi

paddle/phi/kernels/fusion/gpu/fused_dropout_add_utils.h

Lines changed: 24 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#pragma once
1616

1717
#include "paddle/phi/kernels/funcs/distribution_helper.h"
18+
#include "paddle/phi/kernels/funcs/rng_launch_config.h"
1819

1920
namespace phi {
2021
namespace fusion {
@@ -23,17 +24,29 @@ template <typename Context>
2324
static inline std::vector<size_t> GetRandomCudaProp(int64_t numel,
2425
const Context& dev_ctx) {
2526
constexpr int kVecSize = funcs::uniform_distribution<float>::kReturnsCount;
26-
auto gpu_config =
27-
backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel, kVecSize);
28-
size_t grid_size = gpu_config.GetGridSize();
29-
size_t block_size = gpu_config.GetBlockSize();
30-
int64_t device_id = dev_ctx.GetPlace().GetDeviceId();
31-
const auto& prop = backends::gpu::GetDeviceProperties(device_id);
32-
size_t max_grid_size =
33-
prop.maxThreadsPerMultiProcessor * prop.multiProcessorCount / block_size;
34-
grid_size = std::min(grid_size, max_grid_size);
35-
auto offset =
36-
((numel - 1) / (grid_size * block_size * kVecSize) + 1) * kVecSize;
27+
28+
size_t grid_size;
29+
size_t block_size;
30+
size_t offset;
31+
32+
if (funcs::IsDeterministicRNG()) {
33+
auto cfg = funcs::GetDeterministicRNGConfig(numel, kVecSize);
34+
grid_size = cfg.grid_size;
35+
block_size = cfg.block_size;
36+
offset = cfg.increment;
37+
} else {
38+
auto gpu_config =
39+
backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel, kVecSize);
40+
grid_size = gpu_config.GetGridSize();
41+
block_size = gpu_config.GetBlockSize();
42+
int64_t device_id = dev_ctx.GetPlace().GetDeviceId();
43+
const auto& prop = backends::gpu::GetDeviceProperties(device_id);
44+
size_t max_grid_size = prop.maxThreadsPerMultiProcessor *
45+
prop.multiProcessorCount / block_size;
46+
grid_size = std::min(grid_size, max_grid_size);
47+
offset = ((numel - 1) / (grid_size * block_size * kVecSize) + 1) * kVecSize;
48+
}
49+
3750
size_t main_offset =
3851
numel / (block_size * kVecSize) * (block_size * kVecSize);
3952
return {grid_size, block_size, offset, main_offset};

paddle/phi/kernels/fusion/xpu/fused_rope_utils.h

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -282,7 +282,10 @@ void XPUFusedRotaryEveryTwo(const Context& dev_ctx,
282282
single_func_name = "rotary_embedding_everytwo_unary_freqs_grad";
283283
fusion_func_name = "rotary_embedding_everytwo_binary_freqs_grad";
284284
}
285-
if (!in_k) {
285+
// Check k/v numel > 0, matching GPU's guard (k && k->numel() > 0), to
286+
// avoid passing a zero-element tensor pointer to the XPU library which
287+
// causes undefined behavior and overflow outputs.
288+
if (!in_k || in_k->numel() == 0) {
286289
int ret = everytwo_func(dev_ctx.x_context(),
287290
reinterpret_cast<const XPUType*>(in_q.data()),
288291
nullptr,
@@ -310,7 +313,7 @@ void XPUFusedRotaryEveryTwo(const Context& dev_ctx,
310313
10000.0f);
311314
PADDLE_ENFORCE_XDNN_SUCCESS(ret, fusion_func_name);
312315
}
313-
if (in_v) {
316+
if (in_v && in_v->numel() > 0) {
314317
int64_t num_heads_v = in_v->dims()[2];
315318
int ret = everytwo_func(dev_ctx.x_context(),
316319
reinterpret_cast<const XPUType*>(in_v->data()),
@@ -352,7 +355,10 @@ void XPUFusedRotaryHalf(const Context& dev_ctx,
352355
fusion_func_name = "xpu::rotary_embedding_half_binary_freqs_grad";
353356
}
354357

355-
if (!in_k) {
358+
// Check k/v numel > 0, matching GPU's guard (k && k->numel() > 0), to
359+
// avoid passing a zero-element tensor pointer to the XPU library which
360+
// causes undefined behavior and overflow outputs.
361+
if (!in_k || in_k->numel() == 0) {
356362
int ret = half_func(dev_ctx.x_context(),
357363
reinterpret_cast<const XPUType*>(in_q.data()),
358364
nullptr,
@@ -389,7 +395,7 @@ void XPUFusedRotaryHalf(const Context& dev_ctx,
389395
PADDLE_ENFORCE_XDNN_SUCCESS(ret, fusion_func_name);
390396
}
391397

392-
if (in_v) {
398+
if (in_v && in_v->numel() > 0) {
393399
int64_t num_heads_v = in_v->dims()[2];
394400
int ret = half_func(dev_ctx.x_context(),
395401
reinterpret_cast<const XPUType*>(in_v->data()),

0 commit comments

Comments
 (0)