diff --git a/csrc/flash_attn/flash_api.cpp b/csrc/flash_attn/flash_api.cpp
index a6e9245d4..41d963cd8 100644
--- a/csrc/flash_attn/flash_api.cpp
+++ b/csrc/flash_attn/flash_api.cpp
@@ -436,8 +436,7 @@ mha_fwd(at::Tensor &q, // batch_size x seqlen_q x num_heads x round_mult
     const int seqlen_k_rounded = round_multiple(seqlen_k, 128);
 
     // Otherwise the kernel will be launched from cuda:0 device
-    // Cast to char to avoid compiler warning about narrowing
-    at::cuda::CUDAGuard device_guard{(char)q.get_device()};
+    at::cuda::CUDAGuard device_guard{q.device()};
 
     auto opts = q.options();
 
@@ -656,8 +655,7 @@ mha_varlen_fwd(at::Tensor &q, // total_q x num_heads x head_size, total_q := \s
     const int seqlen_k_rounded = round_multiple(max_seqlen_k, 128);
 
     // Otherwise the kernel will be launched from cuda:0 device
-    // Cast to char to avoid compiler warning about narrowing
-    at::cuda::CUDAGuard device_guard{(char)q.get_device()};
+    at::cuda::CUDAGuard device_guard{q.device()};
 
     auto opts = q.options();
     auto softmax_lse = torch::empty({num_heads, total_q}, opts.dtype(at::kFloat));
@@ -898,8 +896,7 @@ mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x multipl
     bool loop = true;
 
     // Otherwise the kernel will be launched from cuda:0 device
-    // Cast to char to avoid compiler warning about narrowing
-    at::cuda::CUDAGuard device_guard{(char)q.get_device()};
+    at::cuda::CUDAGuard device_guard{q.device()};
 
     auto opts = q.options();
     auto softmax_d = torch::empty({batch_size, num_heads, seqlen_q_rounded}, opts.dtype(at::kFloat));
@@ -1126,8 +1123,7 @@ mha_varlen_bwd(const at::Tensor &dout, // total_q x num_heads, x head_size
     bool loop = true;
 
     // Otherwise the kernel will be launched from cuda:0 device
-    // Cast to char to avoid compiler warning about narrowing
-    at::cuda::CUDAGuard device_guard{(char)q.get_device()};
+    at::cuda::CUDAGuard device_guard{q.device()};
 
     auto opts = q.options();
     auto softmax_d = torch::empty({num_heads, total_q + 128 * batch_size}, opts.dtype(at::kFloat));
@@ -1363,8 +1359,7 @@ mha_fwd_kvcache(at::Tensor &q, // batch_size x seqlen_q x num_he
     const int seqlen_k_rounded = round_multiple(seqlen_k, 128);
 
     // Otherwise the kernel will be launched from cuda:0 device
-    // Cast to char to avoid compiler warning about narrowing
-    at::cuda::CUDAGuard device_guard{(char)q.get_device()};
+    at::cuda::CUDAGuard device_guard{q.device()};
 
     auto opts = q.options();
 
diff --git a/csrc/flash_attn_ck/mha_bwd.cpp b/csrc/flash_attn_ck/mha_bwd.cpp
index e4a4b2a6b..0f17a71d6 100644
--- a/csrc/flash_attn_ck/mha_bwd.cpp
+++ b/csrc/flash_attn_ck/mha_bwd.cpp
@@ -309,8 +309,7 @@ mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num
         dv = torch::empty_like(v);
     }
 
-    // Cast to char to avoid compiler warning about narrowing
-    at::cuda::CUDAGuard device_guard{(char)q.get_device()};
+    at::cuda::CUDAGuard device_guard{q.device()};
 
     auto opts = q.options();
     auto softmax_d = torch::empty({batch_size, num_heads, seqlen_q}, opts.dtype(at::kFloat));
diff --git a/csrc/flash_attn_ck/mha_fwd.cpp b/csrc/flash_attn_ck/mha_fwd.cpp
index 7202cf2c8..013045a54 100644
--- a/csrc/flash_attn_ck/mha_fwd.cpp
+++ b/csrc/flash_attn_ck/mha_fwd.cpp
@@ -234,8 +234,7 @@ mha_fwd(at::Tensor &q, // batch_size x seqlen_q x num
     }
 
     // Otherwise the kernel will be launched from cuda:0 device
-    // Cast to char to avoid compiler warning about narrowing
-    at::cuda::CUDAGuard device_guard{(char)q.get_device()};
+    at::cuda::CUDAGuard device_guard{q.device()};
 
     auto opts = q.options();
     bool has_lse = true;
diff --git a/csrc/flash_attn_ck/mha_fwd_kvcache.cpp b/csrc/flash_attn_ck/mha_fwd_kvcache.cpp
index 2f8b64363..6b2163ed7 100644
--- a/csrc/flash_attn_ck/mha_fwd_kvcache.cpp
+++ b/csrc/flash_attn_ck/mha_fwd_kvcache.cpp
@@ -399,8 +399,7 @@ mha_fwd_kvcache(at::Tensor &q, // batch_siz
     const int head_size_8x = round_multiple(head_size_og, 8);
 
     // Otherwise the kernel will be launched from cuda:0 device
-    // Cast to char to avoid compiler warning about narrowing
-    at::cuda::CUDAGuard device_guard{(char)q.get_device()};
+    at::cuda::CUDAGuard device_guard{q.device()};
 
     auto opts = q.options();
 
diff --git a/csrc/flash_attn_ck/mha_varlen_bwd.cpp b/csrc/flash_attn_ck/mha_varlen_bwd.cpp
index 2e5dd7b51..233ea9375 100644
--- a/csrc/flash_attn_ck/mha_varlen_bwd.cpp
+++ b/csrc/flash_attn_ck/mha_varlen_bwd.cpp
@@ -327,8 +327,7 @@ mha_varlen_bwd(const at::Tensor &dout, // total_q x num_heads
         dv = torch::empty_like(v);
     }
 
-    // Cast to char to avoid compiler warning about narrowing
-    at::cuda::CUDAGuard device_guard{(char)q.get_device()};
+    at::cuda::CUDAGuard device_guard{q.device()};
 
     auto opts = q.options();
     auto softmax_d = torch::empty({batch_size, num_heads, max_seqlen_q}, opts.dtype(at::kFloat));
diff --git a/csrc/flash_attn_ck/mha_varlen_fwd.cpp b/csrc/flash_attn_ck/mha_varlen_fwd.cpp
index 7e8a347d4..76f2d660c 100644
--- a/csrc/flash_attn_ck/mha_varlen_fwd.cpp
+++ b/csrc/flash_attn_ck/mha_varlen_fwd.cpp
@@ -253,8 +253,7 @@ mha_varlen_fwd(at::Tensor &q, // total_q x num_heads x head_si
     }
 
     // Otherwise the kernel will be launched from cuda:0 device
-    // Cast to char to avoid compiler warning about narrowing
-    at::cuda::CUDAGuard device_guard{(char)q.get_device()};
+    at::cuda::CUDAGuard device_guard{q.device()};
 
     auto opts = q.options();
     bool has_lse = true;
diff --git a/csrc/ft_attention/ft_attention.cpp b/csrc/ft_attention/ft_attention.cpp
index b307cffc2..c1f832b7c 100644
--- a/csrc/ft_attention/ft_attention.cpp
+++ b/csrc/ft_attention/ft_attention.cpp
@@ -190,8 +190,7 @@ torch::Tensor single_query_attention(const torch::Tensor q,
     }
 
     // Otherwise the kernel will be launched from cuda:0 device
-    // Cast to char to avoid compiler warning about narrowing
-    at::cuda::CUDAGuard device_guard{(char)q.get_device()};
+    at::cuda::CUDAGuard device_guard{q.device()};
 
     torch::Tensor out = torch::empty_like(q);
 
diff --git a/csrc/fused_dense_lib/fused_dense.cpp b/csrc/fused_dense_lib/fused_dense.cpp
index 52a203889..55e3beca6 100644
--- a/csrc/fused_dense_lib/fused_dense.cpp
+++ b/csrc/fused_dense_lib/fused_dense.cpp
@@ -53,8 +53,7 @@ std::vector<at::Tensor> linear_bias_wgrad(at::Tensor input, at::Tensor d_output,
     CHECK_SHAPE(d_output, batch_size, out_features);
 
     // Otherwise the kernel will be launched from cuda:0 device
-    // Cast to char to avoid compiler warning about narrowing
-    at::cuda::CUDAGuard device_guard{(char)input.get_device()};
+    at::cuda::CUDAGuard device_guard{input.device()};
 
     // create output/workspace tensor
     auto opts = input.options();
@@ -115,8 +114,7 @@ std::vector<at::Tensor> linear_act_forward(at::Tensor input, at::Tensor weight,
     }
 
     // Otherwise the kernel will be launched from cuda:0 device
-    // Cast to char to avoid compiler warning about narrowing
-    at::cuda::CUDAGuard device_guard{(char)input.get_device()};
+    at::cuda::CUDAGuard device_guard{input.device()};
 
     // create output/workspace tensor
     auto opts = input.options();
@@ -176,8 +174,7 @@ std::vector<at::Tensor> bias_act_linear_dgrad_bgrad(
     CHECK_SHAPE(pre_act, batch_size, is_gelu ? in_features : in_features / 8);
 
     // Otherwise the kernel will be launched from cuda:0 device
-    // Cast to char to avoid compiler warning about narrowing
-    at::cuda::CUDAGuard device_guard{(char)weight.get_device()};
+    at::cuda::CUDAGuard device_guard{weight.device()};
 
     // create output/workspace tensor
     auto opts = weight.options();
diff --git a/csrc/layer_norm/ln_api.cpp b/csrc/layer_norm/ln_api.cpp
index 3981bbad5..612aa7248 100644
--- a/csrc/layer_norm/ln_api.cpp
+++ b/csrc/layer_norm/ln_api.cpp
@@ -194,8 +194,7 @@ std::vector<at::Tensor> dropout_add_ln_fwd(const at::Tensor &x0, // Input:
     TORCH_CHECK(epsilon >= 0.f);
 
     // Otherwise the kernel will be launched from cuda:0 device
-    // Cast to char to avoid compiler warning about narrowing
-    at::cuda::CUDAGuard device_guard{(char)x0.get_device()};
+    at::cuda::CUDAGuard device_guard{x0.device()};
 
     auto opts = x0.options();
 
@@ -398,8 +397,7 @@ std::vector<at::Tensor> dropout_add_ln_bwd(const at::Tensor &dz, // BxSxhidd
     TORCH_CHECK(gamma.numel() == cols);
 
     // Otherwise the kernel will be launched from cuda:0 device
-    // Cast to char to avoid compiler warning about narrowing
-    at::cuda::CUDAGuard device_guard{(char)dz.get_device()};
+    at::cuda::CUDAGuard device_guard{dz.device()};
 
     auto opts = x.options();
 
@@ -558,8 +556,7 @@ std::vector<at::Tensor> dropout_add_ln_parallel_residual_fwd(
     TORCH_CHECK(epsilon >= 0.f);
 
     // Otherwise the kernel will be launched from cuda:0 device
-    // Cast to char to avoid compiler warning about narrowing
-    at::cuda::CUDAGuard device_guard{(char)x0.get_device()};
+    at::cuda::CUDAGuard device_guard{x0.device()};
 
     auto opts = x0.options();
 
@@ -744,8 +741,7 @@ std::vector<at::Tensor> dropout_add_ln_parallel_residual_bwd(
     TORCH_CHECK(mu.sizes() == rsigma.sizes());
 
     // Otherwise the kernel will be launched from cuda:0 device
-    // Cast to char to avoid compiler warning about narrowing
-    at::cuda::CUDAGuard device_guard{(char)dz0.get_device()};
+    at::cuda::CUDAGuard device_guard{dz0.device()};
 
     auto opts = x.options();
 
diff --git a/csrc/rotary/rotary.cpp b/csrc/rotary/rotary.cpp
index b2a3cf0f7..640eea423 100644
--- a/csrc/rotary/rotary.cpp
+++ b/csrc/rotary/rotary.cpp
@@ -30,8 +30,7 @@ void apply_rotary(const torch::Tensor x1, const torch::Tensor x2,
     TORCH_CHECK(out1.sizes() == out2.sizes());
 
     // Otherwise the kernel will be launched from cuda:0 device
-    // Cast to char to avoid compiler warning about narrowing
-    at::cuda::CUDAGuard device_guard{(char)x1.get_device()};
+    at::cuda::CUDAGuard device_guard{x1.device()};
 
     apply_rotary_cuda(x1, x2, cos, sin, out1, out2, conj);
 }
diff --git a/csrc/xentropy/xentropy_kernel.cu b/csrc/xentropy/xentropy_kernel.cu
index 8d8836e6e..66aab0007 100644
--- a/csrc/xentropy/xentropy_kernel.cu
+++ b/csrc/xentropy/xentropy_kernel.cu
@@ -631,8 +631,7 @@ std::vector<Tensor> host_softmax_xentropy(
     AT_ASSERTM(labels_.scalar_type() == ScalarType::Long,"Label type should be CUDA Long");
 
     // Otherwise the kernel will be launched from cuda:0 device
-    // Cast to char to avoid compiler warning about narrowing
-    at::cuda::CUDAGuard device_guard{(char)input_.get_device()};
+    at::cuda::CUDAGuard device_guard{input_.device()};
 
     auto input = input_.contiguous();
     Tensor max_log_sum_exp = at::empty_like(labels_, input.options().dtype(ScalarType::Float));
@@ -690,8 +689,7 @@ Tensor host_softmax_xentropy_backward(
     bool inplace,
     const int total_classes) {
     // Otherwise the kernel will be launched from cuda:0 device
-    // Cast to char to avoid compiler warning about narrowing
-    at::cuda::CUDAGuard device_guard{(char)grad_loss.get_device()};
+    at::cuda::CUDAGuard device_guard{grad_loss.device()};
 
     const int64_t dim = 1;
     Tensor gI = inplace ? logits_ : at::empty_like(logits_);
diff --git a/hopper/flash_api.cpp b/hopper/flash_api.cpp
index 6a0abf6f8..a9b4fba4c 100644
--- a/hopper/flash_api.cpp
+++ b/hopper/flash_api.cpp
@@ -551,8 +551,7 @@ mha_fwd(at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size
     if (is_causal) { window_size_right = 0; }
 
     // Otherwise the kernel will be launched from cuda:0 device
-    // Cast to char to avoid compiler warning about narrowing
-    at::cuda::CUDAGuard device_guard{(char)q.get_device()};
+    at::cuda::CUDAGuard device_guard{q.device()};
 
     auto opts = q.options();
 
@@ -758,8 +757,7 @@ mha_varlen_fwd(at::Tensor &q, // total_q x num_heads x head_size, total_q := \s
     if (is_causal) { window_size_right = 0; }
 
     // Otherwise the kernel will be launched from cuda:0 device
-    // Cast to char to avoid compiler warning about narrowing
-    at::cuda::CUDAGuard device_guard{(char)q.get_device()};
+    at::cuda::CUDAGuard device_guard{q.device()};
 
     auto opts = q.options();
     auto softmax_lse = torch::empty({num_heads, total_q}, opts.dtype(at::kFloat));
@@ -948,8 +946,7 @@ mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x head_si
     }
 
     // Otherwise the kernel will be launched from cuda:0 device
-    // Cast to char to avoid compiler warning about narrowing
-    at::cuda::CUDAGuard device_guard{(char)q.get_device()};
+    at::cuda::CUDAGuard device_guard{q.device()};
 
     auto opts = q.options();
     // Need softmax_d to have seqlen_q_rounded since we want its address to be aligned by 16/8 bytes for TMA / LDG.64
@@ -1168,8 +1165,7 @@ mha_varlen_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x
     if (is_causal) { window_size_right = 0; }
 
     // Otherwise the kernel will be launched from cuda:0 device
-    // Cast to char to avoid compiler warning about narrowing
-    at::cuda::CUDAGuard device_guard{(char)q.get_device()};
+    at::cuda::CUDAGuard device_guard{q.device()};
 
     auto opts = q.options();
    // Need softmax_d to have total_q_padded_rounded since we want its address to be aligned by 16/8 bytes for TMA / LDG.64
@@ -1393,8 +1389,7 @@ mha_fwd_kvcache(at::Tensor &q, // batch_size x seqlen_q x num_he
     const int seqlen_k_rounded = round_multiple(seqlen_k, 128);
 
     // Otherwise the kernel will be launched from cuda:0 device
-    // Cast to char to avoid compiler warning about narrowing
-    at::cuda::CUDAGuard device_guard{(char)q.get_device()};
+    at::cuda::CUDAGuard device_guard{q.device()};
 
     auto opts = q.options();
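
For reference, a minimal standalone sketch (not part of the patch) of the two `at::cuda::CUDAGuard` constructor forms this change switches between: the old code passed the integer index from `Tensor::get_device()`, which narrows to `c10::DeviceIndex` and needed the `(char)` cast, while the new code passes the `c10::Device` returned by `Tensor::device()` directly. The helper name `launch_on_tensor_device` is made up for illustration.

```cpp
// Illustrative sketch only; assumes a CUDA build of PyTorch.
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAGuard.h>

void launch_on_tensor_device(const at::Tensor &t) {  // hypothetical helper
    // Old pattern: get_device() returns an integer index; brace-initializing
    // CUDAGuard (which takes a c10::DeviceIndex) with it triggers a narrowing
    // warning, hence the explicit cast that the patch removes:
    //   at::cuda::CUDAGuard device_guard{(char)t.get_device()};

    // New pattern: device() returns a c10::Device, which CUDAGuard accepts
    // directly, so no cast (and no narrowing warning) is needed.
    at::cuda::CUDAGuard device_guard{t.device()};

    // ... launch kernels here; they run on t's device instead of cuda:0,
    // and the guard restores the previous current device on scope exit.
}
```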