Merged
Changes from all commits
21 commits
7560ca5  Adding dd factory_kwargs to modules in timm/layers, initial model WIP… (rwightman, Sep 26, 2025)
90a35c8  Add dd factory kwargs to eva, resnet (rwightman, Sep 27, 2025)
325a6cc  Add dd to other ResNet based models, Res2Net, ResNeSt, SKNet (rwightman, Sep 27, 2025)
b94c221  Add dd factory kwargs to maxxvit and regnet (rwightman, Sep 27, 2025)
ee751ef  Add dd factory kwargs to nfnet and resnetv2 (rwightman, Sep 28, 2025)
10e7020  dd factory kwargs for fastvit, convnext, mambaout (rwightman, Sep 28, 2025)
60db539  Add dd factory kwargs to all EfficientNetBuilder models, MobileNet V1… (rwightman, Sep 28, 2025)
4d19b34  Fix typo for s2d norm (rwightman, Sep 28, 2025)
f15f7c9  Add dd factory kwargs to byobnet, cspnet, davit, edgenext (rwightman, Sep 29, 2025)
4c35b78  Add device/dtype factory kwargs to beit, efficientformer*, efficientv… (rwightman, Sep 29, 2025)
3a85ed4  avg pool should not have been passed dd (rwightman, Sep 29, 2025)
8cbbf39  Fix DarkStage device kwargs (rwightman, Sep 29, 2025)
1e172a0  dd kwargs for naflexvit, needs revisit for nn.Parameters (rwightman, Sep 29, 2025)
a7dc50f  A whack of classic convnets converted with dd factory kwargs. densene… (rwightman, Sep 29, 2025)
068e6d4  Remove **dd from two inception reset_classifier calls (rwightman, Sep 29, 2025)
6a3342c  dd factory kwargs added to a bunch of vit/vit-hybrids. cait, coat, co… (rwightman, Sep 30, 2025)
c7955eb  Add dd factory kwargs to all swin transformers and volo (rwightman, Sep 30, 2025)
53caeb0  Add some more dd kwarg updates, crossvit, ghostnet, rdnet, repghost, … (rwightman, Oct 1, 2025)
21b1ae7  More dd factory kwargs updates. hiera, hieradet_sam2, metaformer, mlp… (rwightman, Oct 1, 2025)
5cadf13  More dd arg conversions. fasternet, gcvit, hgnet, nextvit, starnet, v… (rwightman, Oct 1, 2025)
d3fdea8  Typing, super(), buffer dtype fixes for timm/layers and timm/models (rwightman, Oct 2, 2025)
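Taken together, these commits thread PyTorch's device/dtype factory kwargs through timm's module constructors: each __init__ collects the two arguments into a dd dict and splats it into every parameter-creating submodule. A minimal sketch of the pattern, using a hypothetical MyBlock module (illustration only, not a timm class):

# Sketch of the device/dtype ("dd") factory-kwarg pattern this PR applies.
# MyBlock is a made-up example module, not part of timm.
import torch
import torch.nn as nn


class MyBlock(nn.Module):
    def __init__(self, dim: int, device=None, dtype=None):
        super().__init__()
        dd = {'device': device, 'dtype': dtype}
        # dd is forwarded only to submodules that own parameters or buffers.
        self.fc = nn.Linear(dim, dim, **dd)
        self.norm = nn.LayerNorm(dim, **dd)
        self.drop = nn.Dropout(0.1)  # parameter-free, so no dd


# Parameters are created directly on the requested device/dtype; the 'meta'
# device skips allocation entirely, which is handy for deferred init.
block = MyBlock(64, device='meta', dtype=torch.bfloat16)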
22 changes: 11 additions & 11 deletions timm/layers/activations.py
@@ -19,7 +19,7 @@ def swish(x, inplace: bool = False):
 
 class Swish(nn.Module):
     def __init__(self, inplace: bool = False):
-        super(Swish, self).__init__()
+        super().__init__()
         self.inplace = inplace
 
     def forward(self, x):
@@ -37,7 +37,7 @@ class Mish(nn.Module):
     """Mish: A Self Regularized Non-Monotonic Neural Activation Function - https://arxiv.org/abs/1908.08681
     """
     def __init__(self, inplace: bool = False):
-        super(Mish, self).__init__()
+        super().__init__()
 
     def forward(self, x):
         return mish(x)
@@ -50,7 +50,7 @@ def sigmoid(x, inplace: bool = False):
 # PyTorch has this, but not with a consistent inplace argument interface
 class Sigmoid(nn.Module):
     def __init__(self, inplace: bool = False):
-        super(Sigmoid, self).__init__()
+        super().__init__()
         self.inplace = inplace
 
     def forward(self, x):
@@ -64,7 +64,7 @@ def tanh(x, inplace: bool = False):
 # PyTorch has this, but not with a consistent inplace argument interface
 class Tanh(nn.Module):
     def __init__(self, inplace: bool = False):
-        super(Tanh, self).__init__()
+        super().__init__()
         self.inplace = inplace
 
     def forward(self, x):
@@ -78,7 +78,7 @@ def hard_swish(x, inplace: bool = False):
 
 class HardSwish(nn.Module):
     def __init__(self, inplace: bool = False):
-        super(HardSwish, self).__init__()
+        super().__init__()
         self.inplace = inplace
 
     def forward(self, x):
@@ -94,7 +94,7 @@ def hard_sigmoid(x, inplace: bool = False):
 
 class HardSigmoid(nn.Module):
     def __init__(self, inplace: bool = False):
-        super(HardSigmoid, self).__init__()
+        super().__init__()
         self.inplace = inplace
 
     def forward(self, x):
@@ -114,7 +114,7 @@ def hard_mish(x, inplace: bool = False):
 
 class HardMish(nn.Module):
     def __init__(self, inplace: bool = False):
-        super(HardMish, self).__init__()
+        super().__init__()
         self.inplace = inplace
 
     def forward(self, x):
@@ -125,7 +125,7 @@ class PReLU(nn.PReLU):
     """Applies PReLU (w/ dummy inplace arg)
     """
     def __init__(self, num_parameters: int = 1, init: float = 0.25, inplace: bool = False) -> None:
-        super(PReLU, self).__init__(num_parameters=num_parameters, init=init)
+        super().__init__(num_parameters=num_parameters, init=init)
 
     def forward(self, input: torch.Tensor) -> torch.Tensor:
         return F.prelu(input, self.weight)
@@ -139,7 +139,7 @@ class GELU(nn.Module):
     """Applies the Gaussian Error Linear Units function (w/ dummy inplace arg)
     """
    def __init__(self, inplace: bool = False):
-        super(GELU, self).__init__()
+        super().__init__()
 
     def forward(self, input: torch.Tensor) -> torch.Tensor:
         return F.gelu(input)
@@ -153,7 +153,7 @@ class GELUTanh(nn.Module):
     """Applies the Gaussian Error Linear Units function (w/ dummy inplace arg)
     """
     def __init__(self, inplace: bool = False):
-        super(GELUTanh, self).__init__()
+        super().__init__()
 
     def forward(self, input: torch.Tensor) -> torch.Tensor:
         return F.gelu(input, approximate='tanh')
@@ -167,7 +167,7 @@ class QuickGELU(nn.Module):
     """Applies the Gaussian Error Linear Units function (w/ dummy inplace arg)
     """
     def __init__(self, inplace: bool = False):
-        super(QuickGELU, self).__init__()
+        super().__init__()
 
     def forward(self, input: torch.Tensor) -> torch.Tensor:
         return quick_gelu(input)
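The activation changes above are purely mechanical: every legacy super(ClassName, self).__init__() call becomes the zero-argument form, which resolves the class through the compiler-provided __class__ cell and survives class renames. A standalone illustration of the equivalence (hypothetical classes, not timm code):

import torch.nn as nn


class SwishOld(nn.Module):
    def __init__(self, inplace: bool = False):
        super(SwishOld, self).__init__()  # legacy Python 2-style call
        self.inplace = inplace


class SwishNew(nn.Module):
    def __init__(self, inplace: bool = False):
        super().__init__()  # equivalent zero-argument form
        self.inplace = inplace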
10 changes: 5 additions & 5 deletions timm/layers/activations_me.py
@@ -49,7 +49,7 @@ def swish_me(x, inplace=False):
 
 class SwishMe(nn.Module):
     def __init__(self, inplace: bool = False):
-        super(SwishMe, self).__init__()
+        super().__init__()
 
     def forward(self, x):
         return SwishAutoFn.apply(x)
@@ -86,7 +86,7 @@ def mish_me(x, inplace=False):
 
 class MishMe(nn.Module):
     def __init__(self, inplace: bool = False):
-        super(MishMe, self).__init__()
+        super().__init__()
 
     def forward(self, x):
         return MishAutoFn.apply(x)
@@ -119,7 +119,7 @@ def hard_sigmoid_me(x, inplace: bool = False):
 
 class HardSigmoidMe(nn.Module):
     def __init__(self, inplace: bool = False):
-        super(HardSigmoidMe, self).__init__()
+        super().__init__()
 
     def forward(self, x):
         return HardSigmoidAutoFn.apply(x)
@@ -161,7 +161,7 @@ def hard_swish_me(x, inplace=False):
 
 class HardSwishMe(nn.Module):
     def __init__(self, inplace: bool = False):
-        super(HardSwishMe, self).__init__()
+        super().__init__()
 
     def forward(self, x):
         return HardSwishAutoFn.apply(x)
@@ -199,7 +199,7 @@ def hard_mish_me(x, inplace: bool = False):
 
 class HardMishMe(nn.Module):
     def __init__(self, inplace: bool = False):
-        super(HardMishMe, self).__init__()
+        super().__init__()
 
     def forward(self, x):
         return HardMishAutoFn.apply(x)
14 changes: 7 additions & 7 deletions timm/layers/adaptive_avgmax_pool.py
@@ -57,7 +57,7 @@ def select_adaptive_pool2d(x, pool_type='avg', output_size: _int_tuple_2_t = 1):
 
 class FastAdaptiveAvgPool(nn.Module):
     def __init__(self, flatten: bool = False, input_fmt: F = 'NCHW'):
-        super(FastAdaptiveAvgPool, self).__init__()
+        super().__init__()
         self.flatten = flatten
         self.dim = get_spatial_dim(input_fmt)
 
@@ -67,7 +67,7 @@ def forward(self, x):
 
 class FastAdaptiveMaxPool(nn.Module):
     def __init__(self, flatten: bool = False, input_fmt: str = 'NCHW'):
-        super(FastAdaptiveMaxPool, self).__init__()
+        super().__init__()
         self.flatten = flatten
         self.dim = get_spatial_dim(input_fmt)
 
@@ -77,7 +77,7 @@ def forward(self, x):
 
 class FastAdaptiveAvgMaxPool(nn.Module):
     def __init__(self, flatten: bool = False, input_fmt: str = 'NCHW'):
-        super(FastAdaptiveAvgMaxPool, self).__init__()
+        super().__init__()
         self.flatten = flatten
         self.dim = get_spatial_dim(input_fmt)
 
@@ -89,7 +89,7 @@ def forward(self, x):
 
 class FastAdaptiveCatAvgMaxPool(nn.Module):
     def __init__(self, flatten: bool = False, input_fmt: str = 'NCHW'):
-        super(FastAdaptiveCatAvgMaxPool, self).__init__()
+        super().__init__()
         self.flatten = flatten
         self.dim_reduce = get_spatial_dim(input_fmt)
         if flatten:
@@ -105,7 +105,7 @@ def forward(self, x):
 
 class AdaptiveAvgMaxPool2d(nn.Module):
     def __init__(self, output_size: _int_tuple_2_t = 1):
-        super(AdaptiveAvgMaxPool2d, self).__init__()
+        super().__init__()
         self.output_size = output_size
 
     def forward(self, x):
@@ -114,7 +114,7 @@ def forward(self, x):
 
 class AdaptiveCatAvgMaxPool2d(nn.Module):
     def __init__(self, output_size: _int_tuple_2_t = 1):
-        super(AdaptiveCatAvgMaxPool2d, self).__init__()
+        super().__init__()
         self.output_size = output_size
 
     def forward(self, x):
@@ -131,7 +131,7 @@ def __init__(
             flatten: bool = False,
             input_fmt: str = 'NCHW',
     ):
-        super(SelectAdaptivePool2d, self).__init__()
+        super().__init__()
         assert input_fmt in ('NCHW', 'NHWC')
         self.pool_type = pool_type or ''  # convert other falsy values to empty string for consistent TS typing
         pool_type = pool_type.lower()
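The pooling modules above only pick up the super() cleanup; as the "avg pool should not have been passed dd" commit indicates, pooling layers are parameter-free, so there is nothing for device/dtype kwargs to place. A quick standalone check (not from the PR):

import torch.nn as nn

pool = nn.AdaptiveAvgPool2d(1)       # no weights or buffers to allocate
fc = nn.Linear(16, 8, device='cpu')  # dd-style factory kwargs matter here

print(len(list(pool.parameters())))  # 0 -> nothing for dd to do
print(len(list(fc.parameters())))    # 2 (weight, bias) -> dd applies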
32 changes: 19 additions & 13 deletions timm/layers/attention.py
@@ -36,6 +36,8 @@ def __init__(
             attn_drop: float = 0.,
             proj_drop: float = 0.,
             norm_layer: Optional[Type[nn.Module]] = None,
+            device=None,
+            dtype=None
     ) -> None:
         """Initialize the Attention module.
 
@@ -50,6 +52,7 @@ def __init__(
             norm_layer: Normalization layer constructor for QK normalization if enabled
         """
         super().__init__()
+        dd = {'device': device, 'dtype': dtype}
         assert dim % num_heads == 0, 'dim should be divisible by num_heads'
         if qk_norm or scale_norm:
             assert norm_layer is not None, 'norm_layer must be provided if qk_norm or scale_norm is True'
@@ -58,12 +61,12 @@ def __init__(
         self.scale = self.head_dim ** -0.5
         self.fused_attn = use_fused_attn()
 
-        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
-        self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
-        self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias, **dd)
+        self.q_norm = norm_layer(self.head_dim, **dd) if qk_norm else nn.Identity()
+        self.k_norm = norm_layer(self.head_dim, **dd) if qk_norm else nn.Identity()
         self.attn_drop = nn.Dropout(attn_drop)
-        self.norm = norm_layer(dim) if scale_norm else nn.Identity()
-        self.proj = nn.Linear(dim, dim, bias=proj_bias)
+        self.norm = norm_layer(dim, **dd) if scale_norm else nn.Identity()
+        self.proj = nn.Linear(dim, dim, bias=proj_bias, **dd)
         self.proj_drop = nn.Dropout(proj_drop)
 
     def forward(
@@ -122,6 +125,8 @@ def __init__(
             scale_norm: bool = False,
             proj_bias: bool = True,
             rotate_half: bool = False,
+            device=None,
+            dtype=None,
     ):
         """Initialize the Attention module.
 
@@ -140,6 +145,7 @@ def __init__(
             rotate_half: Use 'half' ROPE layout instead of default 'interleaved'
         """
         super().__init__()
+        dd = {'device': device, 'dtype': dtype}
         if scale_norm or qk_norm:
             assert norm_layer is not None, 'norm_layer must be provided if qk_norm or scale_norm is True'
         self.num_heads = num_heads
@@ -153,19 +159,19 @@ def __init__(
         self.rotate_half = rotate_half
 
         if qkv_fused:
-            self.qkv = nn.Linear(dim, attn_dim * 3, bias=qkv_bias)
+            self.qkv = nn.Linear(dim, attn_dim * 3, bias=qkv_bias, **dd)
             self.q_proj = self.k_proj = self.v_proj = None
         else:
             self.qkv = None
-            self.q_proj = nn.Linear(dim, attn_dim, bias=qkv_bias)
-            self.k_proj = nn.Linear(dim, attn_dim, bias=qkv_bias)
-            self.v_proj = nn.Linear(dim, attn_dim, bias=qkv_bias)
+            self.q_proj = nn.Linear(dim, attn_dim, bias=qkv_bias, **dd)
+            self.k_proj = nn.Linear(dim, attn_dim, bias=qkv_bias, **dd)
+            self.v_proj = nn.Linear(dim, attn_dim, bias=qkv_bias, **dd)
 
-        self.q_norm = norm_layer(head_dim) if qk_norm else nn.Identity()
-        self.k_norm = norm_layer(head_dim) if qk_norm else nn.Identity()
+        self.q_norm = norm_layer(head_dim, **dd) if qk_norm else nn.Identity()
+        self.k_norm = norm_layer(head_dim, **dd) if qk_norm else nn.Identity()
         self.attn_drop = nn.Dropout(attn_drop)
-        self.norm = norm_layer(attn_dim) if scale_norm else nn.Identity()
-        self.proj = nn.Linear(attn_dim, dim, bias=proj_bias)
+        self.norm = norm_layer(attn_dim, **dd) if scale_norm else nn.Identity()
+        self.proj = nn.Linear(attn_dim, dim, bias=proj_bias, **dd)
         self.proj_drop = nn.Dropout(proj_drop)
 
     def forward(
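With the kwargs in place, the attention block can be materialized directly in a target dtype and on a target device, or on the meta device for allocation-free construction. A usage sketch, assuming the constructor signature shown in the diff above:

import torch
from timm.layers.attention import Attention

# Real allocation in bfloat16 on CPU (swap in 'cuda' where available).
attn = Attention(dim=768, num_heads=12, device='cpu', dtype=torch.bfloat16)

# Meta-device construction creates no storage; useful for building large
# models before loading a checkpoint onto the real device.
attn_meta = Attention(dim=768, num_heads=12, device='meta')

x = torch.randn(1, 197, 768, dtype=torch.bfloat16)
print(attn(x).shape)  # torch.Size([1, 197, 768])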