Commit d3fdea8

Typing, super(), buffer dtype fixes for timm/layers and timm/models
1 parent: 5cadf13


70 files changed: +643, -481 lines
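Most of the hunks below are mechanical: Python 2-style super(Class, self).__init__() calls become the zero-argument super().__init__() form, a few signatures gain type annotations, and device/dtype factory kwargs are forwarded to submodules that previously ignored them. As orientation, here is a minimal before/after sketch of the super() change; the class name is illustrative, not taken from the diff.

import torch.nn as nn

# Before: Python 2-style super(), repeats the class name.
class GateOld(nn.Module):
    def __init__(self, inplace: bool = False):
        super(GateOld, self).__init__()
        self.inplace = inplace

# After: zero-argument super(), equivalent on Python 3 and robust to class renames.
class GateNew(nn.Module):
    def __init__(self, inplace: bool = False):
        super().__init__()
        self.inplace = inplace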

timm/layers/activations.py

Lines changed: 11 additions & 11 deletions
@@ -19,7 +19,7 @@ def swish(x, inplace: bool = False):

 class Swish(nn.Module):
     def __init__(self, inplace: bool = False):
-        super(Swish, self).__init__()
+        super().__init__()
         self.inplace = inplace

     def forward(self, x):
@@ -37,7 +37,7 @@ class Mish(nn.Module):
     """Mish: A Self Regularized Non-Monotonic Neural Activation Function - https://arxiv.org/abs/1908.08681
     """
     def __init__(self, inplace: bool = False):
-        super(Mish, self).__init__()
+        super().__init__()

     def forward(self, x):
         return mish(x)
@@ -50,7 +50,7 @@ def sigmoid(x, inplace: bool = False):
 # PyTorch has this, but not with a consistent inplace argument interface
 class Sigmoid(nn.Module):
     def __init__(self, inplace: bool = False):
-        super(Sigmoid, self).__init__()
+        super().__init__()
         self.inplace = inplace

     def forward(self, x):
@@ -64,7 +64,7 @@ def tanh(x, inplace: bool = False):
 # PyTorch has this, but not with a consistent inplace argument interface
 class Tanh(nn.Module):
     def __init__(self, inplace: bool = False):
-        super(Tanh, self).__init__()
+        super().__init__()
         self.inplace = inplace

     def forward(self, x):
@@ -78,7 +78,7 @@ def hard_swish(x, inplace: bool = False):

 class HardSwish(nn.Module):
     def __init__(self, inplace: bool = False):
-        super(HardSwish, self).__init__()
+        super().__init__()
         self.inplace = inplace

     def forward(self, x):
@@ -94,7 +94,7 @@ def hard_sigmoid(x, inplace: bool = False):

 class HardSigmoid(nn.Module):
     def __init__(self, inplace: bool = False):
-        super(HardSigmoid, self).__init__()
+        super().__init__()
         self.inplace = inplace

     def forward(self, x):
@@ -114,7 +114,7 @@ def hard_mish(x, inplace: bool = False):

 class HardMish(nn.Module):
     def __init__(self, inplace: bool = False):
-        super(HardMish, self).__init__()
+        super().__init__()
         self.inplace = inplace

     def forward(self, x):
@@ -125,7 +125,7 @@ class PReLU(nn.PReLU):
     """Applies PReLU (w/ dummy inplace arg)
     """
     def __init__(self, num_parameters: int = 1, init: float = 0.25, inplace: bool = False) -> None:
-        super(PReLU, self).__init__(num_parameters=num_parameters, init=init)
+        super().__init__(num_parameters=num_parameters, init=init)

     def forward(self, input: torch.Tensor) -> torch.Tensor:
         return F.prelu(input, self.weight)
@@ -139,7 +139,7 @@ class GELU(nn.Module):
     """Applies the Gaussian Error Linear Units function (w/ dummy inplace arg)
     """
     def __init__(self, inplace: bool = False):
-        super(GELU, self).__init__()
+        super().__init__()

     def forward(self, input: torch.Tensor) -> torch.Tensor:
         return F.gelu(input)
@@ -153,7 +153,7 @@ class GELUTanh(nn.Module):
     """Applies the Gaussian Error Linear Units function (w/ dummy inplace arg)
     """
     def __init__(self, inplace: bool = False):
-        super(GELUTanh, self).__init__()
+        super().__init__()

     def forward(self, input: torch.Tensor) -> torch.Tensor:
         return F.gelu(input, approximate='tanh')
@@ -167,7 +167,7 @@ class QuickGELU(nn.Module):
     """Applies the Gaussian Error Linear Units function (w/ dummy inplace arg)
     """
     def __init__(self, inplace: bool = False):
-        super(QuickGELU, self).__init__()
+        super().__init__()

     def forward(self, input: torch.Tensor) -> torch.Tensor:
         return quick_gelu(input)
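All of the activation classes above are thin wrappers whose purpose, as the in-diff comments note, is a consistent inplace argument so activations can be swapped via config even when the underlying op ignores the flag. A minimal sketch of that wrapper pattern after the change; the forward body shown here is an assumption, since the hunks above only display the constructors.

import torch
import torch.nn as nn

class SigmoidLike(nn.Module):
    """Activation wrapper with a uniform inplace argument (name is illustrative)."""
    def __init__(self, inplace: bool = False):
        super().__init__()
        self.inplace = inplace

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Honor inplace where the op supports it; otherwise it is a dummy flag.
        return x.sigmoid_() if self.inplace else x.sigmoid()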

timm/layers/activations_me.py

Lines changed: 5 additions & 5 deletions
@@ -49,7 +49,7 @@ def swish_me(x, inplace=False):

 class SwishMe(nn.Module):
     def __init__(self, inplace: bool = False):
-        super(SwishMe, self).__init__()
+        super().__init__()

     def forward(self, x):
         return SwishAutoFn.apply(x)
@@ -86,7 +86,7 @@ def mish_me(x, inplace=False):

 class MishMe(nn.Module):
     def __init__(self, inplace: bool = False):
-        super(MishMe, self).__init__()
+        super().__init__()

     def forward(self, x):
         return MishAutoFn.apply(x)
@@ -119,7 +119,7 @@ def hard_sigmoid_me(x, inplace: bool = False):

 class HardSigmoidMe(nn.Module):
     def __init__(self, inplace: bool = False):
-        super(HardSigmoidMe, self).__init__()
+        super().__init__()

     def forward(self, x):
         return HardSigmoidAutoFn.apply(x)
@@ -161,7 +161,7 @@ def hard_swish_me(x, inplace=False):

 class HardSwishMe(nn.Module):
     def __init__(self, inplace: bool = False):
-        super(HardSwishMe, self).__init__()
+        super().__init__()

     def forward(self, x):
         return HardSwishAutoFn.apply(x)
@@ -199,7 +199,7 @@ def hard_mish_me(x, inplace: bool = False):

 class HardMishMe(nn.Module):
     def __init__(self, inplace: bool = False):
-        super(HardMishMe, self).__init__()
+        super().__init__()

     def forward(self, x):
         return HardMishAutoFn.apply(x)
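The *Me ("memory-efficient") variants dispatch to custom torch.autograd.Function implementations such as SwishAutoFn.apply(x); only their super() calls change here. For context, a hedged sketch of what such an autograd.Function typically looks like, showing the general pattern rather than the exact timm implementation:

import torch

class SwishFnSketch(torch.autograd.Function):
    """Memory-efficient swish: save only the input, recompute sigmoid in backward."""

    @staticmethod
    def forward(ctx, x):
        ctx.save_for_backward(x)
        return x * torch.sigmoid(x)

    @staticmethod
    def backward(ctx, grad_output):
        x, = ctx.saved_tensors
        s = torch.sigmoid(x)
        # d/dx [x * sigmoid(x)] = sigmoid(x) * (1 + x * (1 - sigmoid(x)))
        return grad_output * s * (1 + x * (1 - s))

# usage: y = SwishFnSketch.apply(torch.randn(4, requires_grad=True))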

timm/layers/adaptive_avgmax_pool.py

Lines changed: 7 additions & 7 deletions
@@ -57,7 +57,7 @@ def select_adaptive_pool2d(x, pool_type='avg', output_size: _int_tuple_2_t = 1):

 class FastAdaptiveAvgPool(nn.Module):
     def __init__(self, flatten: bool = False, input_fmt: F = 'NCHW'):
-        super(FastAdaptiveAvgPool, self).__init__()
+        super().__init__()
         self.flatten = flatten
         self.dim = get_spatial_dim(input_fmt)

@@ -67,7 +67,7 @@ def forward(self, x):

 class FastAdaptiveMaxPool(nn.Module):
     def __init__(self, flatten: bool = False, input_fmt: str = 'NCHW'):
-        super(FastAdaptiveMaxPool, self).__init__()
+        super().__init__()
         self.flatten = flatten
         self.dim = get_spatial_dim(input_fmt)

@@ -77,7 +77,7 @@ def forward(self, x):

 class FastAdaptiveAvgMaxPool(nn.Module):
     def __init__(self, flatten: bool = False, input_fmt: str = 'NCHW'):
-        super(FastAdaptiveAvgMaxPool, self).__init__()
+        super().__init__()
         self.flatten = flatten
         self.dim = get_spatial_dim(input_fmt)

@@ -89,7 +89,7 @@ def forward(self, x):

 class FastAdaptiveCatAvgMaxPool(nn.Module):
     def __init__(self, flatten: bool = False, input_fmt: str = 'NCHW'):
-        super(FastAdaptiveCatAvgMaxPool, self).__init__()
+        super().__init__()
         self.flatten = flatten
         self.dim_reduce = get_spatial_dim(input_fmt)
         if flatten:
@@ -105,7 +105,7 @@ def forward(self, x):

 class AdaptiveAvgMaxPool2d(nn.Module):
     def __init__(self, output_size: _int_tuple_2_t = 1):
-        super(AdaptiveAvgMaxPool2d, self).__init__()
+        super().__init__()
         self.output_size = output_size

     def forward(self, x):
@@ -114,7 +114,7 @@ def forward(self, x):

 class AdaptiveCatAvgMaxPool2d(nn.Module):
     def __init__(self, output_size: _int_tuple_2_t = 1):
-        super(AdaptiveCatAvgMaxPool2d, self).__init__()
+        super().__init__()
         self.output_size = output_size

     def forward(self, x):
@@ -131,7 +131,7 @@ def __init__(
             flatten: bool = False,
             input_fmt: str = 'NCHW',
     ):
-        super(SelectAdaptivePool2d, self).__init__()
+        super().__init__()
         assert input_fmt in ('NCHW', 'NHWC')
         self.pool_type = pool_type or ''  # convert other falsy values to empty string for consistent TS typing
         pool_type = pool_type.lower()
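The FastAdaptive*Pool classes only receive the super() cleanup; their behavior of reducing over the spatial dimensions returned by get_spatial_dim(input_fmt), optionally flattening, is untouched. For NCHW input the average variant amounts to a mean over dims (2, 3), as this small sketch shows; the exact forward is not part of the hunks above.

import torch

x = torch.randn(2, 64, 7, 7)                # NCHW feature map

pooled = x.mean(dim=(2, 3), keepdim=True)   # fast adaptive avg pool to 1x1 -> (2, 64, 1, 1)
flat = x.mean(dim=(2, 3))                   # flatten=True behavior -> (2, 64)
print(pooled.shape, flat.shape)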

timm/layers/cbam.py

Lines changed: 8 additions & 8 deletions
@@ -34,7 +34,7 @@ def __init__(
             dtype=None,
     ):
         dd = {'device': device, 'dtype': dtype}
-        super(ChannelAttn, self).__init__()
+        super().__init__()
         if not rd_channels:
             rd_channels = make_divisible(channels * rd_ratio, rd_divisor, round_limit=0.)
         self.fc1 = nn.Conv2d(channels, rd_channels, 1, bias=mlp_bias, **dd)
@@ -63,7 +63,7 @@ def __init__(
             device=None,
             dtype=None
     ):
-        super(LightChannelAttn, self).__init__(
+        super().__init__(
             channels, rd_ratio, rd_channels, rd_divisor, act_layer, gate_layer, mlp_bias, device=device, dtype=dtype)

     def forward(self, x):
@@ -82,8 +82,8 @@ def __init__(
             device=None,
             dtype=None,
     ):
-        super(SpatialAttn, self).__init__()
-        self.conv = ConvNormAct(2, 1, kernel_size, apply_act=False)
+        super().__init__()
+        self.conv = ConvNormAct(2, 1, kernel_size, apply_act=False, device=device, dtype=dtype)
         self.gate = create_act_layer(gate_layer)

     def forward(self, x):
@@ -102,8 +102,8 @@ def __init__(
             device=None,
             dtype=None,
     ):
-        super(LightSpatialAttn, self).__init__()
-        self.conv = ConvNormAct(1, 1, kernel_size, apply_act=False)
+        super().__init__()
+        self.conv = ConvNormAct(1, 1, kernel_size, apply_act=False, device=device, dtype=dtype)
         self.gate = create_act_layer(gate_layer)

     def forward(self, x):
@@ -127,7 +127,7 @@ def __init__(
             dtype=None,
     ):
         dd = {'device': device, 'dtype': dtype}
-        super(CbamModule, self).__init__()
+        super().__init__()
         self.channel = ChannelAttn(
             channels,
             rd_ratio=rd_ratio,
@@ -161,7 +161,7 @@ def __init__(
             dtype=None,
     ):
         dd = {'device': device, 'dtype': dtype}
-        super(LightCbamModule, self).__init__()
+        super().__init__()
         self.channel = LightChannelAttn(
             channels,
             rd_ratio=rd_ratio,

timm/layers/classifier.py

Lines changed: 2 additions & 2 deletions
@@ -95,7 +95,7 @@ def __init__(
             pool_type: Global pooling type, pooling disabled if empty string ('').
             drop_rate: Pre-classifier dropout rate.
         """
-        super(ClassifierHead, self).__init__()
+        super().__init__()
         self.in_features = in_features
         self.use_conv = use_conv
         self.input_fmt = input_fmt
@@ -258,7 +258,7 @@ def __init__(
         norm_layer = get_norm_layer(norm_layer)
         act_layer = get_act_layer(act_layer)

-        self.norm = norm_layer(in_features)
+        self.norm = norm_layer(in_features, **dd)
         if hidden_size:
             self.pre_logits = nn.Sequential(OrderedDict([
                 ('fc', nn.Linear(in_features, hidden_size, **dd)),
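The substantive fixes in cbam.py and classifier.py are the device/dtype ones hinted at in the commit title: SpatialAttn and LightSpatialAttn now forward device/dtype into ConvNormAct, and the norm_layer in classifier.py receives the **dd factory kwargs, so submodules are created on the requested device and dtype rather than the defaults. A sketch of that dd-dict pattern in isolation; the module and layer names are illustrative, not from the diff.

import torch
import torch.nn as nn

class Head(nn.Module):
    """Collect the factory kwargs once in 'dd' and pass them to every
    parameter-holding submodule, so nothing silently falls back to the
    default device/dtype."""
    def __init__(self, in_features: int, num_classes: int, device=None, dtype=None):
        dd = {'device': device, 'dtype': dtype}
        super().__init__()
        self.norm = nn.LayerNorm(in_features, **dd)
        self.fc = nn.Linear(in_features, num_classes, **dd)

    def forward(self, x):
        return self.fc(self.norm(x))

# e.g. build directly in half precision on the meta device:
head = Head(768, 1000, device='meta', dtype=torch.float16)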

timm/layers/cond_conv2d.py

Lines changed: 12 additions & 10 deletions
@@ -8,6 +8,8 @@

 import math
 from functools import partial
+from typing import Union, Tuple
+
 import torch
 from torch import nn as nn
 from torch.nn import functional as F
@@ -43,20 +45,20 @@ class CondConv2d(nn.Module):

     def __init__(
             self,
-            in_channels,
-            out_channels,
-            kernel_size=3,
-            stride=1,
-            padding='',
-            dilation=1,
-            groups=1,
-            bias=False,
-            num_experts=4,
+            in_channels: int,
+            out_channels: int,
+            kernel_size: Union[int, Tuple[int, int]] = 3,
+            stride: Union[int, Tuple[int, int]] = 1,
+            padding: Union[int, Tuple[int, int], str] = '',
+            dilation: Union[int, Tuple[int, int]] = 1,
+            groups: int = 1,
+            bias: bool = False,
+            num_experts: int = 4,
             device=None,
             dtype=None,
     ):
         dd = {'device': device, 'dtype': dtype}
-        super(CondConv2d, self).__init__()
+        super().__init__()

         self.in_channels = in_channels
         self.out_channels = out_channels
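The cond_conv2d.py change is additive typing: the CondConv2d constructor gains explicit annotations, with Union[int, Tuple[int, int]] for arguments that accept either a single int or a 2-tuple, plus the matching typing import. A sketch of the same annotation style, using a hypothetical alias so the Union is only spelled once; the alias and class below are illustrative, not part of the diff.

from typing import Tuple, Union

import torch.nn as nn

# Hypothetical alias (not in the diff) to avoid repeating the Union spelling.
_int_or_pair = Union[int, Tuple[int, int]]

class TypedConvStub(nn.Module):
    """Illustrates the annotation style added to CondConv2d; body is a stub."""
    def __init__(
            self,
            in_channels: int,
            out_channels: int,
            kernel_size: _int_or_pair = 3,
            stride: _int_or_pair = 1,
            padding: Union[int, Tuple[int, int], str] = '',
            dilation: _int_or_pair = 1,
            groups: int = 1,
            bias: bool = False,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels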
