Skip to content

Commit 29d789b

Browse files
add PADDLE_ENFORCE_GPU_SUCCESS
1 parent 28746cf commit 29d789b

1 file changed

Lines changed: 137 additions & 3 deletions

File tree

paddle/phi/kernels/gpu/rms_norm_cuda_kernel.h

Lines changed: 137 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -430,6 +430,11 @@ void launch_vectorized_rms_norm_kernel_driver(int N,
430430
vectorized_rms_norm_kernel<T, T_ACC, kVecSize>
431431
<<<blocks, threads, nshared, stream>>>(
432432
N, eps, X_data, scale_data, rstd_data, Y_data);
433+
#ifdef PADDLE_WITH_HIP
434+
PADDLE_ENFORCE_GPU_SUCCESS(hipGetLastError());
435+
#else
436+
PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
437+
#endif
433438
}
434439

435440
struct WelfordDataLN {
@@ -709,6 +714,11 @@ void launch_vectorized_layer_norm_kernel_driver(int N,
709714
vectorized_layer_norm_kernel<T, T_ACC, kVecSize>
710715
<<<blocks, threads, nshared, stream>>>(
711716
N, eps, X_data, gamma_data, beta_data, mean_data, var_data, Y_data);
717+
#ifdef PADDLE_WITH_HIP
718+
PADDLE_ENFORCE_GPU_SUCCESS(hipGetLastError());
719+
#else
720+
PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
721+
#endif
712722
}
713723

714724
template <typename T, typename Context>
@@ -723,6 +733,11 @@ void LayerNormFwdCompatKernel(
723733
T* y_data,
724734
typename phi::dtype::MPTypeTrait<T>::Type* mean_data,
725735
typename phi::dtype::MPTypeTrait<T>::Type* var_data) {
736+
#ifdef PADDLE_WITH_HIP
737+
PADDLE_ENFORCE_GPU_SUCCESS(hipGetLastError());
738+
#else
739+
PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
740+
#endif
726741
using T_ACC = typename phi::dtype::MPTypeTrait<T>::Type;
727742

728743
if (rows == 0 || cols == 0) {
@@ -763,7 +778,11 @@ void LayerNormFwdCompatKernel(
763778
LayerNormRowwiseMomentsCUDAKernel<T, T_ACC>
764779
<<<rows, kCUDABlockReduceNumThreads, 0, stream>>>(
765780
cols, static_cast<T_ACC>(epsilon), x_data, mean_data, var_data);
766-
781+
#ifdef PADDLE_WITH_HIP
782+
PADDLE_ENFORCE_GPU_SUCCESS(hipGetLastError());
783+
#else
784+
PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
785+
#endif
767786
LayerNormForwardCUDAKernel<T, T_ACC>
768787
<<<rows, kCUDANumThreads, 0, stream>>>(cols,
769788
x_data,
@@ -773,6 +792,11 @@ void LayerNormFwdCompatKernel(
773792
gamma_data,
774793
beta_data,
775794
y_data);
795+
#ifdef PADDLE_WITH_HIP
796+
PADDLE_ENFORCE_GPU_SUCCESS(hipGetLastError());
797+
#else
798+
PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
799+
#endif
776800
}
777801
}
778802

@@ -1184,6 +1208,11 @@ void ConfigureAndLaunchScaleBackwardKernel(const T* dY_data,
11841208
true>
11851209
<<<blocks, threads, shmem_sz, cuda_stream>>>(
11861210
M, N, dY_data, X_data, rstd_data, dscale_data);
1211+
#ifdef PADDLE_WITH_HIP
1212+
PADDLE_ENFORCE_GPU_SUCCESS(hipGetLastError());
1213+
#else
1214+
PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
1215+
#endif
11871216
} else {
11881217
ScaleBackwardCUDAKernelTemplate<T,
11891218
T_ACC,
@@ -1194,6 +1223,11 @@ void ConfigureAndLaunchScaleBackwardKernel(const T* dY_data,
11941223
false>
11951224
<<<blocks, threads, shmem_sz, cuda_stream>>>(
11961225
M, N, dY_data, X_data, rstd_data, dscale_data);
1226+
#ifdef PADDLE_WITH_HIP
1227+
PADDLE_ENFORCE_GPU_SUCCESS(hipGetLastError());
1228+
#else
1229+
PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
1230+
#endif
11971231
}
11981232
} else {
11991233
if (aligned_grid) {
@@ -1206,6 +1240,11 @@ void ConfigureAndLaunchScaleBackwardKernel(const T* dY_data,
12061240
true>
12071241
<<<blocks, threads, shmem_sz, cuda_stream>>>(
12081242
M, N, dY_data, X_data, rstd_data, dscale_data);
1243+
#ifdef PADDLE_WITH_HIP
1244+
PADDLE_ENFORCE_GPU_SUCCESS(hipGetLastError());
1245+
#else
1246+
PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
1247+
#endif
12091248
} else {
12101249
ScaleBackwardCUDAKernelTemplate<T,
12111250
T_ACC,
@@ -1216,6 +1255,11 @@ void ConfigureAndLaunchScaleBackwardKernel(const T* dY_data,
12161255
false>
12171256
<<<blocks, threads, shmem_sz, cuda_stream>>>(
12181257
M, N, dY_data, X_data, rstd_data, dscale_data);
1258+
#ifdef PADDLE_WITH_HIP
1259+
PADDLE_ENFORCE_GPU_SUCCESS(hipGetLastError());
1260+
#else
1261+
PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
1262+
#endif
12191263
}
12201264
}
12211265
}
@@ -1232,7 +1276,12 @@ void RMSNormFwdKernel(const Context& dev_ctx,
12321276
double epsilon,
12331277
DenseTensor* y,
12341278
DenseTensor* invvar) {
1235-
using T_ACC = typename phi::dtype::MPTypeTrait<T>::Type;
1279+
#ifdef PADDLE_WITH_HIP
1280+
PADDLE_ENFORCE_GPU_SUCCESS(hipGetLastError());
1281+
#else
1282+
PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
1283+
#endif
1284+
using T_ACC = typename dtype::MPTypeTrait<T>::Type;
12361285

12371286
if (x.numel() == 0) {
12381287
dev_ctx.template Alloc<T>(y);
@@ -1313,9 +1362,19 @@ void RMSNormFwdKernel(const Context& dev_ctx,
13131362
RowwiseMomentsCUDAKernel<T, T_ACC>
13141363
<<<rows, kCUDABlockReduceNumThreads, 0, stream>>>(
13151364
cols, static_cast<T_ACC>(epsilon), x_data, rstd_data);
1365+
#ifdef PADDLE_WITH_HIP
1366+
PADDLE_ENFORCE_GPU_SUCCESS(hipGetLastError());
1367+
#else
1368+
PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
1369+
#endif
13161370

13171371
RMSNormForwardCUDAKernel<T, T_ACC><<<rows, kCUDANumThreads, 0, stream>>>(
13181372
cols, x_data, rstd_data, scale_data, y_data);
1373+
#ifdef PADDLE_WITH_HIP
1374+
PADDLE_ENFORCE_GPU_SUCCESS(hipGetLastError());
1375+
#else
1376+
PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
1377+
#endif
13191378
}
13201379
}
13211380

@@ -1329,7 +1388,12 @@ void RMSNormBwdKernel(const Context& dev_ctx,
13291388
double epsilon,
13301389
DenseTensor* dX,
13311390
DenseTensor* dscale) {
1332-
using T_ACC = typename phi::dtype::MPTypeTrait<T>::Type;
1391+
#ifdef PADDLE_WITH_HIP
1392+
PADDLE_ENFORCE_GPU_SUCCESS(hipGetLastError());
1393+
#else
1394+
PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
1395+
#endif
1396+
using T_ACC = typename dtype::MPTypeTrait<T>::Type;
13331397

13341398
if (X.numel() == 0) {
13351399
if (dX) {
@@ -1396,14 +1460,29 @@ void RMSNormBwdKernel(const Context& dev_ctx,
13961460
rms_norm_grad_input_kernel_vectorized<T, T_ACC, 8>
13971461
<<<blocks, num_threads, nshared, stream>>>(
13981462
dY_data, X_data, invvar_data, scale_data, dX_data, N);
1463+
#ifdef PADDLE_WITH_HIP
1464+
PADDLE_ENFORCE_GPU_SUCCESS(hipGetLastError());
1465+
#else
1466+
PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
1467+
#endif
13991468
} else if (is_supported_type && bAlignedBuffers && bVectorSizeMultiple) {
14001469
rms_norm_grad_input_kernel_vectorized<T, T_ACC, kVecSize>
14011470
<<<blocks, num_threads, nshared, stream>>>(
14021471
dY_data, X_data, invvar_data, scale_data, dX_data, N);
1472+
#ifdef PADDLE_WITH_HIP
1473+
PADDLE_ENFORCE_GPU_SUCCESS(hipGetLastError());
1474+
#else
1475+
PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
1476+
#endif
14031477
} else {
14041478
rms_norm_grad_input_kernel<T, T_ACC>
14051479
<<<blocks, num_threads, nshared, stream>>>(
14061480
dY_data, X_data, invvar_data, scale_data, dX_data, N);
1481+
#ifdef PADDLE_WITH_HIP
1482+
PADDLE_ENFORCE_GPU_SUCCESS(hipGetLastError());
1483+
#else
1484+
PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
1485+
#endif
14071486
}
14081487
}
14091488

@@ -1437,6 +1516,11 @@ void RMSNormBwdKernel(const Context& dev_ctx,
14371516
true,
14381517
true><<<blocks, threads, 0, stream>>>(
14391518
M, N, dY_data, X_data, invvar_data, dscale_blocks_ptr);
1519+
#ifdef PADDLE_WITH_HIP
1520+
PADDLE_ENFORCE_GPU_SUCCESS(hipGetLastError());
1521+
#else
1522+
PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
1523+
#endif
14401524
} else {
14411525
ScaleBackwardCUDAKernelTemplate<T,
14421526
T_ACC,
@@ -1446,6 +1530,11 @@ void RMSNormBwdKernel(const Context& dev_ctx,
14461530
true,
14471531
false><<<blocks, threads, 0, stream>>>(
14481532
M, N, dY_data, X_data, invvar_data, dscale_blocks_ptr);
1533+
#ifdef PADDLE_WITH_HIP
1534+
PADDLE_ENFORCE_GPU_SUCCESS(hipGetLastError());
1535+
#else
1536+
PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
1537+
#endif
14491538
}
14501539

14511540
// Sum reduction along blocks.y dimension to get final dscale
@@ -1943,6 +2032,11 @@ void ConfigureAndLaunchGammaBetaBackwardKernel(const T* dY_data,
19432032
rstd_data,
19442033
dgamma_data,
19452034
dbeta_data);
2035+
#ifdef PADDLE_WITH_HIP
2036+
PADDLE_ENFORCE_GPU_SUCCESS(hipGetLastError());
2037+
#else
2038+
PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
2039+
#endif
19462040
} else {
19472041
GammaBetaBackwardCUDAKernelTemplate<T,
19482042
T_ACC,
@@ -1959,6 +2053,11 @@ void ConfigureAndLaunchGammaBetaBackwardKernel(const T* dY_data,
19592053
rstd_data,
19602054
dgamma_data,
19612055
dbeta_data);
2056+
#ifdef PADDLE_WITH_HIP
2057+
PADDLE_ENFORCE_GPU_SUCCESS(hipGetLastError());
2058+
#else
2059+
PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
2060+
#endif
19622061
}
19632062
} else {
19642063
if (aligned_grid) {
@@ -1977,6 +2076,11 @@ void ConfigureAndLaunchGammaBetaBackwardKernel(const T* dY_data,
19772076
rstd_data,
19782077
dgamma_data,
19792078
dbeta_data);
2079+
#ifdef PADDLE_WITH_HIP
2080+
PADDLE_ENFORCE_GPU_SUCCESS(hipGetLastError());
2081+
#else
2082+
PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
2083+
#endif
19802084
} else {
19812085
GammaBetaBackwardCUDAKernelTemplate<T,
19822086
T_ACC,
@@ -1993,6 +2097,11 @@ void ConfigureAndLaunchGammaBetaBackwardKernel(const T* dY_data,
19932097
rstd_data,
19942098
dgamma_data,
19952099
dbeta_data);
2100+
#ifdef PADDLE_WITH_HIP
2101+
PADDLE_ENFORCE_GPU_SUCCESS(hipGetLastError());
2102+
#else
2103+
PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
2104+
#endif
19962105
}
19972106
}
19982107
}
@@ -2011,6 +2120,11 @@ void LayerNormBwdCompatKernel(
20112120
double epsilon,
20122121
int64_t rows,
20132122
int64_t cols) {
2123+
#ifdef PADDLE_WITH_HIP
2124+
PADDLE_ENFORCE_GPU_SUCCESS(hipGetLastError());
2125+
#else
2126+
PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
2127+
#endif
20142128
using T_ACC = typename phi::dtype::MPTypeTrait<T>::Type;
20152129
if (rows == 0 || cols == 0) return;
20162130
auto stream = dev_ctx.stream();
@@ -2028,6 +2142,11 @@ void LayerNormBwdCompatKernel(
20282142
int64_t num_blocks = (M + kBlockSize - 1) / kBlockSize;
20292143
VarToRstdKernel<T_ACC><<<num_blocks, kBlockSize, 0, stream>>>(
20302144
var_data, static_cast<T_ACC>(epsilon), rstd_data, M);
2145+
#ifdef PADDLE_WITH_HIP
2146+
PADDLE_ENFORCE_GPU_SUCCESS(hipGetLastError());
2147+
#else
2148+
PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
2149+
#endif
20312150
}
20322151

20332152
// Step 2: Compute dX using vectorized or non-vectorized kernel
@@ -2059,6 +2178,11 @@ void LayerNormBwdCompatKernel(
20592178
layer_norm_grad_input_kernel_vectorized<T, T_ACC, kVecSize>
20602179
<<<blocks, num_threads, nshared, stream>>>(
20612180
dY_data, X_data, mean_data, rstd_data, gamma_data, dX_data, N);
2181+
#ifdef PADDLE_WITH_HIP
2182+
PADDLE_ENFORCE_GPU_SUCCESS(hipGetLastError());
2183+
#else
2184+
PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
2185+
#endif
20622186
} else {
20632187
layer_norm_grad_input_kernel<T, T_ACC>
20642188
<<<blocks, num_threads, nshared, stream>>>(
@@ -2112,6 +2236,11 @@ void LayerNormBwdCompatKernel(
21122236
rstd_data,
21132237
dgamma_blocks_ptr,
21142238
dbeta_blocks_ptr);
2239+
#ifdef PADDLE_WITH_HIP
2240+
PADDLE_ENFORCE_GPU_SUCCESS(hipGetLastError());
2241+
#else
2242+
PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
2243+
#endif
21152244
} else {
21162245
GammaBetaBackwardCUDAKernelTemplate<T,
21172246
T_ACC,
@@ -2128,6 +2257,11 @@ void LayerNormBwdCompatKernel(
21282257
rstd_data,
21292258
dgamma_blocks_ptr,
21302259
dbeta_blocks_ptr);
2260+
#ifdef PADDLE_WITH_HIP
2261+
PADDLE_ENFORCE_GPU_SUCCESS(hipGetLastError());
2262+
#else
2263+
PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
2264+
#endif
21312265
}
21322266

21332267
// Sum reduction along blocks.y dimension to get final dgamma/dbeta.

0 commit comments

Comments
 (0)