diff --git a/csrc/flash_attn/flash_api.cpp b/csrc/flash_attn/flash_api.cpp
index a6e9245d4..41d963cd8 100644
--- a/csrc/flash_attn/flash_api.cpp
+++ b/csrc/flash_attn/flash_api.cpp
@@ -436,8 +436,7 @@ mha_fwd(at::Tensor &q, // batch_size x seqlen_q x num_heads x round_mult
     const int seqlen_k_rounded = round_multiple(seqlen_k, 128);
 
     // Otherwise the kernel will be launched from cuda:0 device
-    // Cast to char to avoid compiler warning about narrowing
-    at::cuda::CUDAGuard device_guard{(char)q.get_device()};
+    at::cuda::CUDAGuard device_guard{q.device()};
 
     auto opts = q.options();
 
@@ -656,8 +655,7 @@ mha_varlen_fwd(at::Tensor &q, // total_q x num_heads x head_size, total_q := \s
     const int seqlen_k_rounded = round_multiple(max_seqlen_k, 128);
 
     // Otherwise the kernel will be launched from cuda:0 device
-    // Cast to char to avoid compiler warning about narrowing
-    at::cuda::CUDAGuard device_guard{(char)q.get_device()};
+    at::cuda::CUDAGuard device_guard{q.device()};
 
     auto opts = q.options();
     auto softmax_lse = torch::empty({num_heads, total_q}, opts.dtype(at::kFloat));
@@ -898,8 +896,7 @@ mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x multipl
     bool loop = true;
 
     // Otherwise the kernel will be launched from cuda:0 device
-    // Cast to char to avoid compiler warning about narrowing
-    at::cuda::CUDAGuard device_guard{(char)q.get_device()};
+    at::cuda::CUDAGuard device_guard{q.device()};
 
     auto opts = q.options();
     auto softmax_d = torch::empty({batch_size, num_heads, seqlen_q_rounded}, opts.dtype(at::kFloat));
@@ -1126,8 +1123,7 @@ mha_varlen_bwd(const at::Tensor &dout, // total_q x num_heads, x head_size
     bool loop = true;
 
     // Otherwise the kernel will be launched from cuda:0 device
-    // Cast to char to avoid compiler warning about narrowing
-    at::cuda::CUDAGuard device_guard{(char)q.get_device()};
+    at::cuda::CUDAGuard device_guard{q.device()};
 
     auto opts = q.options();
     auto softmax_d = torch::empty({num_heads, total_q + 128 * batch_size}, opts.dtype(at::kFloat));
@@ -1363,8 +1359,7 @@ mha_fwd_kvcache(at::Tensor &q, // batch_size x seqlen_q x num_he
     const int seqlen_k_rounded = round_multiple(seqlen_k, 128);
 
     // Otherwise the kernel will be launched from cuda:0 device
-    // Cast to char to avoid compiler warning about narrowing
-    at::cuda::CUDAGuard device_guard{(char)q.get_device()};
+    at::cuda::CUDAGuard device_guard{q.device()};
 
     auto opts = q.options();
 
diff --git a/csrc/flash_attn_ck/mha_bwd.cpp b/csrc/flash_attn_ck/mha_bwd.cpp
index e4a4b2a6b..0f17a71d6 100644
--- a/csrc/flash_attn_ck/mha_bwd.cpp
+++ b/csrc/flash_attn_ck/mha_bwd.cpp
@@ -309,8 +309,7 @@ mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num
         dv = torch::empty_like(v);
     }
 
-    // Cast to char to avoid compiler warning about narrowing
-    at::cuda::CUDAGuard device_guard{(char)q.get_device()};
+    at::cuda::CUDAGuard device_guard{q.device()};
 
     auto opts = q.options();
     auto softmax_d = torch::empty({batch_size, num_heads, seqlen_q}, opts.dtype(at::kFloat));
diff --git a/csrc/flash_attn_ck/mha_fwd.cpp b/csrc/flash_attn_ck/mha_fwd.cpp
index 7202cf2c8..013045a54 100644
--- a/csrc/flash_attn_ck/mha_fwd.cpp
+++ b/csrc/flash_attn_ck/mha_fwd.cpp
@@ -234,8 +234,7 @@ mha_fwd(at::Tensor &q, // batch_size x seqlen_q x num
     }
 
     // Otherwise the kernel will be launched from cuda:0 device
-    // Cast to char to avoid compiler warning about narrowing
-    at::cuda::CUDAGuard device_guard{(char)q.get_device()};
+    at::cuda::CUDAGuard device_guard{q.device()};
 
     auto opts = q.options();
     bool has_lse = true;
diff --git a/csrc/flash_attn_ck/mha_fwd_kvcache.cpp b/csrc/flash_attn_ck/mha_fwd_kvcache.cpp
index 2f8b64363..6b2163ed7 100644
--- a/csrc/flash_attn_ck/mha_fwd_kvcache.cpp
+++ b/csrc/flash_attn_ck/mha_fwd_kvcache.cpp
@@ -399,8 +399,7 @@ mha_fwd_kvcache(at::Tensor &q, // batch_siz
     const int head_size_8x = round_multiple(head_size_og, 8);
 
     // Otherwise the kernel will be launched from cuda:0 device
-    // Cast to char to avoid compiler warning about narrowing
-    at::cuda::CUDAGuard device_guard{(char)q.get_device()};
+    at::cuda::CUDAGuard device_guard{q.device()};
 
     auto opts = q.options();
 
diff --git a/csrc/flash_attn_ck/mha_varlen_bwd.cpp b/csrc/flash_attn_ck/mha_varlen_bwd.cpp
index 2e5dd7b51..233ea9375 100644
--- a/csrc/flash_attn_ck/mha_varlen_bwd.cpp
+++ b/csrc/flash_attn_ck/mha_varlen_bwd.cpp
@@ -327,8 +327,7 @@ mha_varlen_bwd(const at::Tensor &dout, // total_q x num_heads
         dv = torch::empty_like(v);
     }
 
-    // Cast to char to avoid compiler warning about narrowing
-    at::cuda::CUDAGuard device_guard{(char)q.get_device()};
+    at::cuda::CUDAGuard device_guard{q.device()};
 
     auto opts = q.options();
     auto softmax_d = torch::empty({batch_size, num_heads, max_seqlen_q}, opts.dtype(at::kFloat));
diff --git a/csrc/flash_attn_ck/mha_varlen_fwd.cpp b/csrc/flash_attn_ck/mha_varlen_fwd.cpp
index 7e8a347d4..76f2d660c 100644
--- a/csrc/flash_attn_ck/mha_varlen_fwd.cpp
+++ b/csrc/flash_attn_ck/mha_varlen_fwd.cpp
@@ -253,8 +253,7 @@ mha_varlen_fwd(at::Tensor &q, // total_q x num_heads x head_si
     }
 
     // Otherwise the kernel will be launched from cuda:0 device
-    // Cast to char to avoid compiler warning about narrowing
-    at::cuda::CUDAGuard device_guard{(char)q.get_device()};
+    at::cuda::CUDAGuard device_guard{q.device()};
 
     auto opts = q.options();
     bool has_lse = true;
diff --git a/csrc/ft_attention/ft_attention.cpp b/csrc/ft_attention/ft_attention.cpp
index b307cffc2..c1f832b7c 100644
--- a/csrc/ft_attention/ft_attention.cpp
+++ b/csrc/ft_attention/ft_attention.cpp
@@ -190,8 +190,7 @@ torch::Tensor single_query_attention(const torch::Tensor q,
     }
 
     // Otherwise the kernel will be launched from cuda:0 device
-    // Cast to char to avoid compiler warning about narrowing
-    at::cuda::CUDAGuard device_guard{(char)q.get_device()};
+    at::cuda::CUDAGuard device_guard{q.device()};
 
     torch::Tensor out = torch::empty_like(q);
 
diff --git a/csrc/fused_dense_lib/fused_dense.cpp b/csrc/fused_dense_lib/fused_dense.cpp
index 52a203889..55e3beca6 100644
--- a/csrc/fused_dense_lib/fused_dense.cpp
+++ b/csrc/fused_dense_lib/fused_dense.cpp
@@ -53,8 +53,7 @@ std::vector<at::Tensor> linear_bias_wgrad(at::Tensor input, at::Tensor d_output,
     CHECK_SHAPE(d_output, batch_size, out_features);
 
     // Otherwise the kernel will be launched from cuda:0 device
-    // Cast to char to avoid compiler warning about narrowing
-    at::cuda::CUDAGuard device_guard{(char)input.get_device()};
+    at::cuda::CUDAGuard device_guard{input.device()};
 
     // create output/workspace tensor
     auto opts = input.options();
@@ -115,8 +114,7 @@ std::vector<at::Tensor> linear_act_forward(at::Tensor input, at::Tensor weight,
     }
 
     // Otherwise the kernel will be launched from cuda:0 device
-    // Cast to char to avoid compiler warning about narrowing
-    at::cuda::CUDAGuard device_guard{(char)input.get_device()};
+    at::cuda::CUDAGuard device_guard{input.device()};
 
     // create output/workspace tensor
     auto opts = input.options();
@@ -176,8 +174,7 @@ std::vector<at::Tensor> bias_act_linear_dgrad_bgrad(
     CHECK_SHAPE(pre_act, batch_size, is_gelu ? in_features : in_features / 8);
 
     // Otherwise the kernel will be launched from cuda:0 device
-    // Cast to char to avoid compiler warning about narrowing
-    at::cuda::CUDAGuard device_guard{(char)weight.get_device()};
+    at::cuda::CUDAGuard device_guard{weight.device()};
 
     // create output/workspace tensor
     auto opts = weight.options();
diff --git a/csrc/layer_norm/ln_api.cpp b/csrc/layer_norm/ln_api.cpp
index 3981bbad5..612aa7248 100644
--- a/csrc/layer_norm/ln_api.cpp
+++ b/csrc/layer_norm/ln_api.cpp
@@ -194,8 +194,7 @@ std::vector<at::Tensor> dropout_add_ln_fwd(const at::Tensor &x0, // Input:
     TORCH_CHECK(epsilon >= 0.f);
 
     // Otherwise the kernel will be launched from cuda:0 device
-    // Cast to char to avoid compiler warning about narrowing
-    at::cuda::CUDAGuard device_guard{(char)x0.get_device()};
+    at::cuda::CUDAGuard device_guard{x0.device()};
 
     auto opts = x0.options();
 
@@ -398,8 +397,7 @@ std::vector<at::Tensor> dropout_add_ln_bwd(const at::Tensor &dz, // BxSxhidd
     TORCH_CHECK(gamma.numel() == cols);
 
     // Otherwise the kernel will be launched from cuda:0 device
-    // Cast to char to avoid compiler warning about narrowing
-    at::cuda::CUDAGuard device_guard{(char)dz.get_device()};
+    at::cuda::CUDAGuard device_guard{dz.device()};
 
     auto opts = x.options();
 
@@ -558,8 +556,7 @@ std::vector<at::Tensor> dropout_add_ln_parallel_residual_fwd(
     TORCH_CHECK(epsilon >= 0.f);
 
     // Otherwise the kernel will be launched from cuda:0 device
-    // Cast to char to avoid compiler warning about narrowing
-    at::cuda::CUDAGuard device_guard{(char)x0.get_device()};
+    at::cuda::CUDAGuard device_guard{x0.device()};
 
     auto opts = x0.options();
 
@@ -744,8 +741,7 @@ std::vector<at::Tensor> dropout_add_ln_parallel_residual_bwd(
     TORCH_CHECK(mu.sizes() == rsigma.sizes());
 
     // Otherwise the kernel will be launched from cuda:0 device
-    // Cast to char to avoid compiler warning about narrowing
-    at::cuda::CUDAGuard device_guard{(char)dz0.get_device()};
+    at::cuda::CUDAGuard device_guard{dz0.device()};
 
     auto opts = x.options();
 
diff --git a/csrc/rotary/rotary.cpp b/csrc/rotary/rotary.cpp
index b2a3cf0f7..640eea423 100644
--- a/csrc/rotary/rotary.cpp
+++ b/csrc/rotary/rotary.cpp
@@ -30,8 +30,7 @@ void apply_rotary(const torch::Tensor x1, const torch::Tensor x2,
     TORCH_CHECK(out1.sizes() == out2.sizes());
 
     // Otherwise the kernel will be launched from cuda:0 device
-    // Cast to char to avoid compiler warning about narrowing
-    at::cuda::CUDAGuard device_guard{(char)x1.get_device()};
+    at::cuda::CUDAGuard device_guard{x1.device()};
 
     apply_rotary_cuda(x1, x2, cos, sin, out1, out2, conj);
 }
diff --git a/csrc/xentropy/xentropy_kernel.cu b/csrc/xentropy/xentropy_kernel.cu
index 8d8836e6e..66aab0007 100644
--- a/csrc/xentropy/xentropy_kernel.cu
+++ b/csrc/xentropy/xentropy_kernel.cu
@@ -631,8 +631,7 @@ std::vector<Tensor> host_softmax_xentropy(
     AT_ASSERTM(labels_.scalar_type() == ScalarType::Long,"Label type should be CUDA Long");
 
     // Otherwise the kernel will be launched from cuda:0 device
-    // Cast to char to avoid compiler warning about narrowing
-    at::cuda::CUDAGuard device_guard{(char)input_.get_device()};
+    at::cuda::CUDAGuard device_guard{input_.device()};
 
     auto input = input_.contiguous();
     Tensor max_log_sum_exp = at::empty_like(labels_, input.options().dtype(ScalarType::Float));
@@ -690,8 +689,7 @@ Tensor host_softmax_xentropy_backward(
     bool inplace,
     const int total_classes) {
     // Otherwise the kernel will be launched from cuda:0 device
-    // Cast to char to avoid compiler warning about narrowing
-    at::cuda::CUDAGuard device_guard{(char)grad_loss.get_device()};
+    at::cuda::CUDAGuard device_guard{grad_loss.device()};
 
     const int64_t dim = 1;
     Tensor gI = inplace ? logits_ : at::empty_like(logits_);
diff --git a/hopper/flash_api.cpp b/hopper/flash_api.cpp
index 6a0abf6f8..a9b4fba4c 100644
--- a/hopper/flash_api.cpp
+++ b/hopper/flash_api.cpp
@@ -551,8 +551,7 @@ mha_fwd(at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size
     if (is_causal) { window_size_right = 0; }
 
     // Otherwise the kernel will be launched from cuda:0 device
-    // Cast to char to avoid compiler warning about narrowing
-    at::cuda::CUDAGuard device_guard{(char)q.get_device()};
+    at::cuda::CUDAGuard device_guard{q.device()};
 
     auto opts = q.options();
 
@@ -758,8 +757,7 @@ mha_varlen_fwd(at::Tensor &q, // total_q x num_heads x head_size, total_q := \s
     if (is_causal) { window_size_right = 0; }
 
     // Otherwise the kernel will be launched from cuda:0 device
-    // Cast to char to avoid compiler warning about narrowing
-    at::cuda::CUDAGuard device_guard{(char)q.get_device()};
+    at::cuda::CUDAGuard device_guard{q.device()};
 
     auto opts = q.options();
     auto softmax_lse = torch::empty({num_heads, total_q}, opts.dtype(at::kFloat));
@@ -948,8 +946,7 @@ mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x head_si
     }
 
     // Otherwise the kernel will be launched from cuda:0 device
-    // Cast to char to avoid compiler warning about narrowing
-    at::cuda::CUDAGuard device_guard{(char)q.get_device()};
+    at::cuda::CUDAGuard device_guard{q.device()};
 
     auto opts = q.options();
     // Need softmax_d to have seqlen_q_rounded since we want its address to be aligned by 16/8 bytes for TMA / LDG.64
@@ -1168,8 +1165,7 @@ mha_varlen_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x
     if (is_causal) { window_size_right = 0; }
 
     // Otherwise the kernel will be launched from cuda:0 device
-    // Cast to char to avoid compiler warning about narrowing
-    at::cuda::CUDAGuard device_guard{(char)q.get_device()};
+    at::cuda::CUDAGuard device_guard{q.device()};
 
     auto opts = q.options();
    // Need softmax_d to have total_q_padded_rounded since we want its address to be aligned by 16/8 bytes for TMA / LDG.64
@@ -1393,8 +1389,7 @@ mha_fwd_kvcache(at::Tensor &q, // batch_size x seqlen_q x num_he
     const int seqlen_k_rounded = round_multiple(seqlen_k, 128);
 
     // Otherwise the kernel will be launched from cuda:0 device
-    // Cast to char to avoid compiler warning about narrowing
-    at::cuda::CUDAGuard device_guard{(char)q.get_device()};
+    at::cuda::CUDAGuard device_guard{q.device()};
 
     auto opts = q.options();
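
For reference, a minimal standalone sketch (not part of the patch) of the two `at::cuda::CUDAGuard` constructor forms this change switches between: the old code passed the integer index from `Tensor::get_device()`, which narrows to `c10::DeviceIndex` and needed the `(char)` cast, while the new code passes the `c10::Device` returned by `Tensor::device()` directly. The helper name `launch_on_tensor_device` is made up for illustration.

```cpp
// Illustrative sketch only; assumes a CUDA build of PyTorch.
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAGuard.h>

void launch_on_tensor_device(const at::Tensor &t) {  // hypothetical helper
    // Old pattern: get_device() returns an integer index; brace-initializing
    // CUDAGuard (which takes a c10::DeviceIndex) with it triggers a narrowing
    // warning, hence the explicit cast that the patch removes:
    //   at::cuda::CUDAGuard device_guard{(char)t.get_device()};

    // New pattern: device() returns a c10::Device, which CUDAGuard accepts
    // directly, so no cast (and no narrowing warning) is needed.
    at::cuda::CUDAGuard device_guard{t.device()};

    // ... launch kernels here; they run on t's device instead of cuda:0,
    // and the guard restores the previous current device on scope exit.
}
```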