@@ -430,6 +430,11 @@ void launch_vectorized_rms_norm_kernel_driver(int N,
   vectorized_rms_norm_kernel<T, T_ACC, kVecSize>
       <<<blocks, threads, nshared, stream>>>(
           N, eps, X_data, scale_data, rstd_data, Y_data);
+#ifdef PADDLE_WITH_HIP
+  PADDLE_ENFORCE_GPU_SUCCESS(hipGetLastError());
+#else
+  PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
+#endif
 }
 
 struct WelfordDataLN {
@@ -709,6 +714,11 @@ void launch_vectorized_layer_norm_kernel_driver(int N,
   vectorized_layer_norm_kernel<T, T_ACC, kVecSize>
       <<<blocks, threads, nshared, stream>>>(
           N, eps, X_data, gamma_data, beta_data, mean_data, var_data, Y_data);
+#ifdef PADDLE_WITH_HIP
+  PADDLE_ENFORCE_GPU_SUCCESS(hipGetLastError());
+#else
+  PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
+#endif
 }
 
 template <typename T, typename Context>
@@ -723,6 +733,11 @@ void LayerNormFwdCompatKernel(
     T* y_data,
     typename phi::dtype::MPTypeTrait<T>::Type* mean_data,
     typename phi::dtype::MPTypeTrait<T>::Type* var_data) {
+#ifdef PADDLE_WITH_HIP
+  PADDLE_ENFORCE_GPU_SUCCESS(hipGetLastError());
+#else
+  PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
+#endif
   using T_ACC = typename phi::dtype::MPTypeTrait<T>::Type;
 
   if (rows == 0 || cols == 0) {
@@ -763,7 +778,11 @@ void LayerNormFwdCompatKernel(
     LayerNormRowwiseMomentsCUDAKernel<T, T_ACC>
         <<<rows, kCUDABlockReduceNumThreads, 0, stream>>>(
             cols, static_cast<T_ACC>(epsilon), x_data, mean_data, var_data);
-
+#ifdef PADDLE_WITH_HIP
+    PADDLE_ENFORCE_GPU_SUCCESS(hipGetLastError());
+#else
+    PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
+#endif
     LayerNormForwardCUDAKernel<T, T_ACC>
         <<<rows, kCUDANumThreads, 0, stream>>>(cols,
                                                x_data,
@@ -773,6 +792,11 @@ void LayerNormFwdCompatKernel(
                                                gamma_data,
                                                beta_data,
                                                y_data);
+#ifdef PADDLE_WITH_HIP
+    PADDLE_ENFORCE_GPU_SUCCESS(hipGetLastError());
+#else
+    PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
+#endif
   }
 }
 
@@ -1184,6 +1208,11 @@ void ConfigureAndLaunchScaleBackwardKernel(const T* dY_data,
                                       true>
           <<<blocks, threads, shmem_sz, cuda_stream>>>(
               M, N, dY_data, X_data, rstd_data, dscale_data);
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_GPU_SUCCESS(hipGetLastError());
+#else
+      PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
+#endif
     } else {
       ScaleBackwardCUDAKernelTemplate<T,
                                       T_ACC,
@@ -1194,6 +1223,11 @@ void ConfigureAndLaunchScaleBackwardKernel(const T* dY_data,
                                       false>
           <<<blocks, threads, shmem_sz, cuda_stream>>>(
               M, N, dY_data, X_data, rstd_data, dscale_data);
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_GPU_SUCCESS(hipGetLastError());
+#else
+      PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
+#endif
     }
   } else {
     if (aligned_grid) {
@@ -1206,6 +1240,11 @@ void ConfigureAndLaunchScaleBackwardKernel(const T* dY_data,
                                       true>
           <<<blocks, threads, shmem_sz, cuda_stream>>>(
               M, N, dY_data, X_data, rstd_data, dscale_data);
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_GPU_SUCCESS(hipGetLastError());
+#else
+      PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
+#endif
     } else {
       ScaleBackwardCUDAKernelTemplate<T,
                                       T_ACC,
@@ -1216,6 +1255,11 @@ void ConfigureAndLaunchScaleBackwardKernel(const T* dY_data,
                                       false>
           <<<blocks, threads, shmem_sz, cuda_stream>>>(
               M, N, dY_data, X_data, rstd_data, dscale_data);
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_GPU_SUCCESS(hipGetLastError());
+#else
+      PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
+#endif
     }
   }
 }
@@ -1232,7 +1276,12 @@ void RMSNormFwdKernel(const Context& dev_ctx,
     double epsilon,
     DenseTensor* y,
     DenseTensor* invvar) {
-  using T_ACC = typename phi::dtype::MPTypeTrait<T>::Type;
+#ifdef PADDLE_WITH_HIP
+  PADDLE_ENFORCE_GPU_SUCCESS(hipGetLastError());
+#else
+  PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
+#endif
+  using T_ACC = typename dtype::MPTypeTrait<T>::Type;
 
   if (x.numel() == 0) {
     dev_ctx.template Alloc<T>(y);
@@ -1313,9 +1362,19 @@ void RMSNormFwdKernel(const Context& dev_ctx,
     RowwiseMomentsCUDAKernel<T, T_ACC>
         <<<rows, kCUDABlockReduceNumThreads, 0, stream>>>(
             cols, static_cast<T_ACC>(epsilon), x_data, rstd_data);
+#ifdef PADDLE_WITH_HIP
+    PADDLE_ENFORCE_GPU_SUCCESS(hipGetLastError());
+#else
+    PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
+#endif
 
     RMSNormForwardCUDAKernel<T, T_ACC><<<rows, kCUDANumThreads, 0, stream>>>(
         cols, x_data, rstd_data, scale_data, y_data);
+#ifdef PADDLE_WITH_HIP
+    PADDLE_ENFORCE_GPU_SUCCESS(hipGetLastError());
+#else
+    PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
+#endif
   }
 }
 
@@ -1329,7 +1388,12 @@ void RMSNormBwdKernel(const Context& dev_ctx,
     double epsilon,
     DenseTensor* dX,
     DenseTensor* dscale) {
-  using T_ACC = typename phi::dtype::MPTypeTrait<T>::Type;
+#ifdef PADDLE_WITH_HIP
+  PADDLE_ENFORCE_GPU_SUCCESS(hipGetLastError());
+#else
+  PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
+#endif
+  using T_ACC = typename dtype::MPTypeTrait<T>::Type;
 
   if (X.numel() == 0) {
     if (dX) {
@@ -1396,14 +1460,29 @@ void RMSNormBwdKernel(const Context& dev_ctx,
       rms_norm_grad_input_kernel_vectorized<T, T_ACC, 8>
           <<<blocks, num_threads, nshared, stream>>>(
               dY_data, X_data, invvar_data, scale_data, dX_data, N);
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_GPU_SUCCESS(hipGetLastError());
+#else
+      PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
+#endif
     } else if (is_supported_type && bAlignedBuffers && bVectorSizeMultiple) {
       rms_norm_grad_input_kernel_vectorized<T, T_ACC, kVecSize>
           <<<blocks, num_threads, nshared, stream>>>(
               dY_data, X_data, invvar_data, scale_data, dX_data, N);
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_GPU_SUCCESS(hipGetLastError());
+#else
+      PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
+#endif
     } else {
       rms_norm_grad_input_kernel<T, T_ACC>
           <<<blocks, num_threads, nshared, stream>>>(
               dY_data, X_data, invvar_data, scale_data, dX_data, N);
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_GPU_SUCCESS(hipGetLastError());
+#else
+      PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
+#endif
     }
   }
 
@@ -1437,6 +1516,11 @@ void RMSNormBwdKernel(const Context& dev_ctx,
                                       true,
                                       true><<<blocks, threads, 0, stream>>>(
           M, N, dY_data, X_data, invvar_data, dscale_blocks_ptr);
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_GPU_SUCCESS(hipGetLastError());
+#else
+      PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
+#endif
     } else {
       ScaleBackwardCUDAKernelTemplate<T,
                                       T_ACC,
@@ -1446,6 +1530,11 @@ void RMSNormBwdKernel(const Context& dev_ctx,
                                       true,
                                       false><<<blocks, threads, 0, stream>>>(
           M, N, dY_data, X_data, invvar_data, dscale_blocks_ptr);
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_GPU_SUCCESS(hipGetLastError());
+#else
+      PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
+#endif
     }
 
     // Sum reduction along blocks.y dimension to get final dscale
@@ -1943,6 +2032,11 @@ void ConfigureAndLaunchGammaBetaBackwardKernel(const T* dY_data,
                                            rstd_data,
                                            dgamma_data,
                                            dbeta_data);
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_GPU_SUCCESS(hipGetLastError());
+#else
+      PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
+#endif
     } else {
       GammaBetaBackwardCUDAKernelTemplate<T,
                                           T_ACC,
@@ -1959,6 +2053,11 @@ void ConfigureAndLaunchGammaBetaBackwardKernel(const T* dY_data,
                                            rstd_data,
                                            dgamma_data,
                                            dbeta_data);
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_GPU_SUCCESS(hipGetLastError());
+#else
+      PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
+#endif
     }
   } else {
     if (aligned_grid) {
@@ -1977,6 +2076,11 @@ void ConfigureAndLaunchGammaBetaBackwardKernel(const T* dY_data,
                                            rstd_data,
                                            dgamma_data,
                                            dbeta_data);
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_GPU_SUCCESS(hipGetLastError());
+#else
+      PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
+#endif
     } else {
       GammaBetaBackwardCUDAKernelTemplate<T,
                                           T_ACC,
@@ -1993,6 +2097,11 @@ void ConfigureAndLaunchGammaBetaBackwardKernel(const T* dY_data,
                                            rstd_data,
                                            dgamma_data,
                                            dbeta_data);
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_GPU_SUCCESS(hipGetLastError());
+#else
+      PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
+#endif
     }
   }
 }
@@ -2011,6 +2120,11 @@ void LayerNormBwdCompatKernel(
     double epsilon,
     int64_t rows,
     int64_t cols) {
+#ifdef PADDLE_WITH_HIP
+  PADDLE_ENFORCE_GPU_SUCCESS(hipGetLastError());
+#else
+  PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
+#endif
   using T_ACC = typename phi::dtype::MPTypeTrait<T>::Type;
   if (rows == 0 || cols == 0) return;
   auto stream = dev_ctx.stream();
@@ -2028,6 +2142,11 @@ void LayerNormBwdCompatKernel(
     int64_t num_blocks = (M + kBlockSize - 1) / kBlockSize;
     VarToRstdKernel<T_ACC><<<num_blocks, kBlockSize, 0, stream>>>(
         var_data, static_cast<T_ACC>(epsilon), rstd_data, M);
+#ifdef PADDLE_WITH_HIP
+    PADDLE_ENFORCE_GPU_SUCCESS(hipGetLastError());
+#else
+    PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
+#endif
   }
 
   // Step 2: Compute dX using vectorized or non-vectorized kernel
@@ -2059,6 +2178,11 @@ void LayerNormBwdCompatKernel(
     layer_norm_grad_input_kernel_vectorized<T, T_ACC, kVecSize>
         <<<blocks, num_threads, nshared, stream>>>(
             dY_data, X_data, mean_data, rstd_data, gamma_data, dX_data, N);
+#ifdef PADDLE_WITH_HIP
+    PADDLE_ENFORCE_GPU_SUCCESS(hipGetLastError());
+#else
+    PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
+#endif
   } else {
     layer_norm_grad_input_kernel<T, T_ACC>
         <<<blocks, num_threads, nshared, stream>>>(
@@ -2112,6 +2236,11 @@ void LayerNormBwdCompatKernel(
                                            rstd_data,
                                            dgamma_blocks_ptr,
                                            dbeta_blocks_ptr);
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_GPU_SUCCESS(hipGetLastError());
+#else
+      PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
+#endif
     } else {
       GammaBetaBackwardCUDAKernelTemplate<T,
                                           T_ACC,
@@ -2128,6 +2257,11 @@ void LayerNormBwdCompatKernel(
                                            rstd_data,
                                            dgamma_blocks_ptr,
                                            dbeta_blocks_ptr);
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_GPU_SUCCESS(hipGetLastError());
+#else
+      PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
+#endif
     }
 
     // Sum reduction along blocks.y dimension to get final dgamma/dbeta.
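
Note on the pattern: every launch site above gains the same five-line guard. `cudaGetLastError()` / `hipGetLastError()` returns and clears the last error recorded on the calling host thread, so an invalid launch configuration is reported at the launch site instead of surfacing later at the next synchronizing call. As a minimal sketch (not part of this patch), the repetition could be folded into a single helper macro; the name `CHECK_KERNEL_LAUNCH` is hypothetical and the sketch assumes only the `PADDLE_WITH_HIP` guard and the `PADDLE_ENFORCE_GPU_SUCCESS` macro already used in the diff:

    // Hypothetical helper, not in this patch: one macro per launch site
    // instead of the repeated five-line #ifdef block.
    // PADDLE_ENFORCE_GPU_SUCCESS throws on any non-success status.
    #ifdef PADDLE_WITH_HIP
    #define CHECK_KERNEL_LAUNCH() PADDLE_ENFORCE_GPU_SUCCESS(hipGetLastError())
    #else
    #define CHECK_KERNEL_LAUNCH() PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError())
    #endif

    // Usage after any launch:
    //   some_kernel<<<blocks, threads, 0, stream>>>(args...);
    //   CHECK_KERNEL_LAUNCH();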