@@ -341,6 +341,7 @@ void thd_read_half_tensor(const Tensor &tensor, const Tensor &cu_seqlens, Tensor
341341 thd_read_half_tensor_kernel<<<grid, block, sizeof (int ) * (batch + 1 ), stream>>> (
342342 half.data .dptr , tensor.data .dptr , reinterpret_cast <int *>(cu_seqlens.data .dptr ), batch,
343343 hidden_size_in_bytes, half_idx, tensor_shape[seq_dim]);
344+ NVTE_CHECK_CUDA (cudaGetLastError ());
344345}
345346
346347/* **************************************************************************************************
@@ -397,11 +398,13 @@ void thd_second_half_lse_correction(Tensor lse, const Tensor &lse_per_step,
397398 reinterpret_cast <float *>(lse.data .dptr ), reinterpret_cast <float *>(lse_per_step.data .dptr ),
398399 reinterpret_cast <int *>(cu_seqlens.data .dptr ), batch, num_heads, lse_seqlen,
399400 second_half_lse_seqlen);
401+ NVTE_CHECK_CUDA (cudaGetLastError ());
400402 } else {
401403 thd_lse_kernel<false , LseCorrectionFunctor><<<grid, block, sizeof (int ) * (batch + 1 ), stream>>> (
402404 reinterpret_cast <float *>(lse.data .dptr ), reinterpret_cast <float *>(lse_per_step.data .dptr ),
403405 reinterpret_cast <int *>(cu_seqlens.data .dptr ), batch, num_heads, lse_seqlen,
404406 second_half_lse_seqlen);
407+ NVTE_CHECK_CUDA (cudaGetLastError ());
405408 }
406409}
407410
@@ -446,11 +449,13 @@ void thd_read_second_half_lse(const Tensor &lse, const Tensor &cu_seqlens, Tenso
446449 reinterpret_cast <float *>(lse.data .dptr ), reinterpret_cast <float *>(half_lse.data .dptr ),
447450 reinterpret_cast <int *>(cu_seqlens.data .dptr ), batch, num_heads, lse_seqlen,
448451 second_half_lse_seqlen);
452+ NVTE_CHECK_CUDA (cudaGetLastError ());
449453 } else {
450454 thd_lse_kernel<false , ReadLseFunctor><<<grid, block, sizeof (int ) * (batch + 1 ), stream>>> (
451455 reinterpret_cast <float *>(lse.data .dptr ), reinterpret_cast <float *>(half_lse.data .dptr ),
452456 reinterpret_cast <int *>(cu_seqlens.data .dptr ), batch, num_heads, lse_seqlen,
453457 second_half_lse_seqlen);
458+ NVTE_CHECK_CUDA (cudaGetLastError ());
454459 }
455460}
456461
@@ -519,6 +524,7 @@ static void thd_out_correction_helper(Tensor out, const Tensor &out_per_step, co
519524 reinterpret_cast <float *>(lse_per_step.data .dptr ),
520525 reinterpret_cast <int *>(cu_seqlens.data .dptr ), batch, num_heads, dim_per_head,
521526 lse_seqlen, lse_per_step_seqlen);
527+ NVTE_CHECK_CUDA (cudaGetLastError ());
522528 } else {
523529 thd_out_correction_kernel<dtype, only_second_half, tile, false >
524530 <<<grid, block, sizeof (int ) * (batch + 1 ), stream>>> (
@@ -528,6 +534,7 @@ static void thd_out_correction_helper(Tensor out, const Tensor &out_per_step, co
528534 reinterpret_cast <float *>(lse_per_step.data .dptr ),
529535 reinterpret_cast <int *>(cu_seqlens.data .dptr ), batch, num_heads, dim_per_head,
530536 lse_seqlen, lse_per_step_seqlen);
537+ NVTE_CHECK_CUDA (cudaGetLastError ());
531538 }
532539}
533540
@@ -602,6 +609,7 @@ static void thd_grad_correction_helper(Tensor grad, const Tensor &grad_per_step,
602609 reinterpret_cast <dtype *>(grad.data .dptr ),
603610 reinterpret_cast <dtype *>(grad_per_step.data .dptr ),
604611 reinterpret_cast <int *>(cu_seqlens.data .dptr ), batch, hidden_size, total_tokens);
612+ NVTE_CHECK_CUDA (cudaGetLastError ());
605613}
606614
607615template <typename dtype>
@@ -667,6 +675,7 @@ void thd_get_partitioned_indices(const Tensor &cu_seqlens, Tensor output, int to
667675 thd_partition_indices_kernel<<<grid, block, sizeof (int ) * (batch + 1 ), stream>>> (
668676 reinterpret_cast <int *>(output.data .dptr ), reinterpret_cast <int *>(cu_seqlens.data .dptr ),
669677 batch, total_tokens, world_size, rank);
678+ NVTE_CHECK_CUDA (cudaGetLastError ());
670679}
671680
672681} // namespace context_parallel
0 commit comments