
Commit 21b1ae7

More dd factory kwargs updates: hiera, hieradet_sam2, metaformer, mlp_mixer, mobilevit, pnasnet, rexnet, sequencer, shvit model files. Fixed blur_pool dtype/device handling and updated Mlp modules with annotations and a fix.
1 parent: 53caeb0

File tree: 12 files changed, +919 / -533 lines


timm/layers/blur_pool.py

Lines changed: 62 additions & 19 deletions
@@ -7,7 +7,7 @@
 """
 from functools import partial
 from math import comb  # Python 3.8
-from typing import Optional, Type
+from typing import Callable, Optional, Type, Union
 
 import torch
 import torch.nn as nn
@@ -39,8 +39,7 @@ def __init__(
             device=None,
             dtype=None
     ) -> None:
-        dd = {'device': device, 'dtype': dtype}
-        super(BlurPool2d, self).__init__()
+        super().__init__()
         assert filt_size > 1
         self.channels = channels
         self.filt_size = filt_size
@@ -51,12 +50,18 @@ def __init__(
         # (0.5 + 0.5 x)^N => coefficients = C(N,k) / 2^N, k = 0..N
         coeffs = torch.tensor(
            [comb(filt_size - 1, k) for k in range(filt_size)],
-            **dd,
+            device='cpu',
+            dtype=torch.float32,
        ) / (2 ** (filt_size - 1))  # normalise so coefficients sum to 1
        blur_filter = (coeffs[:, None] * coeffs[None, :])[None, None, :, :]
        if channels is not None:
            blur_filter = blur_filter.repeat(self.channels, 1, 1, 1)
-        self.register_buffer('filt', blur_filter, persistent=False)
+
+        self.register_buffer(
+            'filt',
+            blur_filter.to(device=device, dtype=dtype),
+            persistent=False,
+        )
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         x = F.pad(x, self.padding, mode=self.pad_mode)
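
For reference (not part of the diff), a quick check of the binomial construction above with the default filt_size=3; this is an illustrative sketch only:

import torch
from math import comb

filt_size = 3
# (0.5 + 0.5x)^2 => C(2, k) / 2^2 for k = 0..2 -> [0.25, 0.5, 0.25]
coeffs = torch.tensor(
    [comb(filt_size - 1, k) for k in range(filt_size)],
    dtype=torch.float32,
) / (2 ** (filt_size - 1))
print(coeffs)                  # tensor([0.2500, 0.5000, 0.2500])

# The outer product is the separable 2D blur kernel; its entries sum to 1.
blur_filter = coeffs[:, None] * coeffs[None, :]
print(blur_filter.sum())       # tensor(1.)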
@@ -69,6 +74,39 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         return F.conv2d(x, weight, stride=self.stride, groups=channels)
 
 
+def _normalize_aa_layer(aa_layer: LayerType) -> Callable[..., nn.Module]:
+    """Map string shorthands to callables (class or partial)."""
+    if isinstance(aa_layer, str):
+        key = aa_layer.lower().replace('_', '').replace('-', '')
+        if key in ('avg', 'avgpool'):
+            return nn.AvgPool2d
+        if key in ('blur', 'blurpool'):
+            return BlurPool2d
+        if key == 'blurpc':
+            # preconfigure a constant-pad BlurPool2d
+            return partial(BlurPool2d, pad_mode='constant')
+        raise AssertionError(f"Unknown anti-aliasing layer ({aa_layer}).")
+    return aa_layer
+
+
+def _underlying_cls(layer_callable: Callable[..., nn.Module]):
+    """Return the class behind a callable (unwrap partial), else None."""
+    if isinstance(layer_callable, partial):
+        return layer_callable.func
+    return layer_callable if isinstance(layer_callable, type) else None
+
+
+def _is_blurpool(layer_callable: Callable[..., nn.Module]) -> bool:
+    """True if callable is BlurPool2d or a partial of it."""
+    cls = _underlying_cls(layer_callable)
+    try:
+        return issubclass(cls, BlurPool2d)  # cls may be None, protect below
+    except TypeError:
+        return False
+    except Exception:
+        return False
+
+
 def create_aa(
         aa_layer: LayerType,
         channels: Optional[int] = None,
@@ -77,24 +115,29 @@ def create_aa(
         noop: Optional[Type[nn.Module]] = nn.Identity,
         device=None,
         dtype=None,
-) -> nn.Module:
-    """ Anti-aliasing """
+) -> Optional[nn.Module]:
+    """ Anti-aliasing factory that supports strings, classes, and partials. """
     if not aa_layer or not enable:
         return noop() if noop is not None else None
 
-    if isinstance(aa_layer, str):
-        aa_layer = aa_layer.lower().replace('_', '').replace('-', '')
-        if aa_layer == 'avg' or aa_layer == 'avgpool':
-            aa_layer = nn.AvgPool2d
-        elif aa_layer == 'blur' or aa_layer == 'blurpool':
-            aa_layer = partial(BlurPool2d, device=device, dtype=dtype)
-        elif aa_layer == 'blurpc':
-            aa_layer = partial(BlurPool2d, pad_mode='constant', device=device, dtype=dtype)
+    # Resolve strings to callables
+    aa_layer = _normalize_aa_layer(aa_layer)
 
-        else:
-            assert False, f"Unknown anti-aliasing layer ({aa_layer})."
+    # Build kwargs we *intend* to pass
+    call_kwargs = {"channels": channels, "stride": stride}
+
+    # Only add device/dtype for BlurPool2d (or partial of it) and don't override if already provided in the partial.
+    if _is_blurpool(aa_layer):
+        # Check if aa_layer is a partial and already has device/dtype set
+        existing_kw = aa_layer.keywords if isinstance(aa_layer, partial) and aa_layer.keywords else {}
+        if "device" not in existing_kw and device is not None:
+            call_kwargs["device"] = device
+        if "dtype" not in existing_kw and dtype is not None:
+            call_kwargs["dtype"] = dtype
 
+    # Try (channels, stride, [device, dtype]) first; fall back to (stride) only
     try:
-        return aa_layer(channels=channels, stride=stride)
-    except TypeError as e:
+        return aa_layer(**call_kwargs)
+    except TypeError:
+        # Some layers (e.g., AvgPool2d) may not accept 'channels' and need stride passed as kernel
         return aa_layer(stride)
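
A minimal usage sketch of the reworked factory (hedged: it assumes the post-commit blur_pool.py shown above and imports directly from that module):

from functools import partial
import torch
import torch.nn as nn
from timm.layers.blur_pool import BlurPool2d, create_aa

# String shorthand resolves via _normalize_aa_layer; dtype is forwarded
# because the target is BlurPool2d.
aa1 = create_aa('blur', channels=64, stride=2, dtype=torch.bfloat16)

# A partial with pad_mode pre-bound works too; keywords already present in
# the partial are left untouched.
aa2 = create_aa(partial(BlurPool2d, pad_mode='constant'), channels=64, stride=2)

# nn.AvgPool2d is not a BlurPool2d, so device/dtype are skipped; the call with
# channels raises TypeError and the fallback builds nn.AvgPool2d(stride).
aa3 = create_aa('avg', channels=64, stride=2)

# Disabled anti-aliasing returns the no-op module.
assert isinstance(create_aa('blur', channels=64, stride=2, enable=False), nn.Identity)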

timm/layers/mlp.py

Lines changed: 48 additions & 47 deletions
@@ -3,6 +3,7 @@
 Hacked together by / Copyright 2020 Ross Wightman
 """
 from functools import partial
+from typing import Optional, Type, Union, Tuple
 
 from torch import nn as nn
 
@@ -17,14 +18,14 @@ class Mlp(nn.Module):
     """
     def __init__(
             self,
-            in_features,
-            hidden_features=None,
-            out_features=None,
-            act_layer=nn.GELU,
-            norm_layer=None,
-            bias=True,
-            drop=0.,
-            use_conv=False,
+            in_features: int,
+            hidden_features: Optional[int] = None,
+            out_features: Optional[int] = None,
+            act_layer: Type[nn.Module] = nn.GELU,
+            norm_layer: Optional[Type[nn.Module]] = None,
+            bias: Union[bool, Tuple[bool, bool]] = True,
+            drop: Union[float, Tuple[float, float]] = 0.,
+            use_conv: bool = False,
             device=None,
             dtype=None,
     ):
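
As a quick illustration of the annotated signature plus the device/dtype factory kwargs this commit series is threading through (a sketch assuming the post-commit module):

import torch
from timm.layers.mlp import Mlp

mlp = Mlp(
    in_features=192,
    hidden_features=768,     # defaults to in_features when left as None
    bias=(True, False),      # per-layer: fc1 with bias, fc2 without
    drop=(0.1, 0.0),         # per-layer dropout probabilities
    device='cpu',
    dtype=torch.float32,
)
x = torch.randn(8, 196, 192)
print(mlp(x).shape)          # torch.Size([8, 196, 192])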
@@ -61,15 +62,15 @@ class GluMlp(nn.Module):
     """
     def __init__(
             self,
-            in_features,
-            hidden_features=None,
-            out_features=None,
-            act_layer=nn.Sigmoid,
-            norm_layer=None,
-            bias=True,
-            drop=0.,
-            use_conv=False,
-            gate_last=True,
+            in_features: int,
+            hidden_features: Optional[int] = None,
+            out_features: Optional[int] = None,
+            act_layer: Type[nn.Module] = nn.Sigmoid,
+            norm_layer: Optional[Type[nn.Module]] = None,
+            bias: Union[bool, Tuple[bool, bool]] = True,
+            drop: Union[float, Tuple[float, float]] = 0.,
+            use_conv: bool = False,
+            gate_last: bool = True,
             device=None,
             dtype=None,
     ):
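
The gate_last flag above controls which half of the doubled hidden projection acts as the gate; a toy, self-contained sketch of that GLU pattern (assumed behaviour, not the module's exact code):

import torch
import torch.nn as nn

x = torch.randn(4, 16)
fc1 = nn.Linear(16, 2 * 32)   # project to twice the hidden width
act = nn.Sigmoid()

h = fc1(x)
x1, x2 = h.chunk(2, dim=-1)
gate_last = True              # gate with the second half (first half when False)
gated = x1 * act(x2) if gate_last else act(x1) * x2
print(gated.shape)            # torch.Size([4, 32])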
@@ -118,14 +119,14 @@ class SwiGLU(nn.Module):
     """
     def __init__(
             self,
-            in_features,
-            hidden_features=None,
-            out_features=None,
-            act_layer=nn.SiLU,
-            norm_layer=None,
-            bias=True,
-            drop=0.,
-            align_to=0,
+            in_features: int,
+            hidden_features: Optional[int] = None,
+            out_features: Optional[int] = None,
+            act_layer: Type[nn.Module] = nn.SiLU,
+            norm_layer: Optional[Type[nn.Module]] = None,
+            bias: Union[bool, Tuple[bool, bool]] = True,
+            drop: Union[float, Tuple[float, float]] = 0.,
+            align_to: int = 0,
             device=None,
             dtype=None,
     ):
@@ -169,14 +170,14 @@ class GatedMlp(nn.Module):
     """
     def __init__(
             self,
-            in_features,
-            hidden_features=None,
-            out_features=None,
-            act_layer=nn.GELU,
-            norm_layer=None,
-            gate_layer=None,
-            bias=True,
-            drop=0.,
+            in_features: int,
+            hidden_features: Optional[int] = None,
+            out_features: Optional[int] = None,
+            act_layer: Type[nn.Module] = nn.GELU,
+            norm_layer: Optional[Type[nn.Module]] = None,
+            gate_layer: Optional[Type[nn.Module]] = None,
+            bias: Union[bool, Tuple[bool, bool]] = True,
+            drop: Union[float, Tuple[float, float]] = 0.,
             device=None,
             dtype=None,
     ):
@@ -216,13 +217,13 @@ class ConvMlp(nn.Module):
     """
     def __init__(
             self,
-            in_features,
-            hidden_features=None,
-            out_features=None,
-            act_layer=nn.ReLU,
-            norm_layer=None,
-            bias=True,
-            drop=0.,
+            in_features: int,
+            hidden_features: Optional[int] = None,
+            out_features: Optional[int] = None,
+            act_layer: Type[nn.Module] = nn.ReLU,
+            norm_layer: Optional[Type[nn.Module]] = None,
+            bias: Union[bool, Tuple[bool, bool]] = True,
+            drop: float = 0.,
             device=None,
             dtype=None,
     ):
@@ -254,13 +255,13 @@ class GlobalResponseNormMlp(nn.Module):
     """
     def __init__(
             self,
-            in_features,
-            hidden_features=None,
-            out_features=None,
-            act_layer=nn.GELU,
-            bias=True,
-            drop=0.,
-            use_conv=False,
+            in_features: int,
+            hidden_features: Optional[int] = None,
+            out_features: Optional[int] = None,
+            act_layer: Type[nn.Module] = nn.GELU,
+            bias: Union[bool, Tuple[bool, bool]] = True,
+            drop: Union[float, Tuple[float, float]] = 0.,
+            use_conv: bool = False,
             device=None,
             dtype=None,
     ):

timm/models/byobnet.py

Lines changed: 6 additions & 6 deletions
@@ -31,7 +31,7 @@
 import math
 from dataclasses import dataclass, field, replace
 from functools import partial
-from typing import Tuple, List, Dict, Optional, Union, Any, Callable, Sequence
+from typing import Tuple, List, Dict, Optional, Union, Any, Callable, Sequence, Type
 
 import torch
 import torch.nn as nn
@@ -245,11 +245,11 @@ def num_groups(group_size: Optional[int], channels: int) -> int:
 @dataclass
 class LayerFn:
     """Container for layer factory functions."""
-    conv_norm_act: Callable = ConvNormAct
-    norm_act: Callable = BatchNormAct2d
-    act: Callable = nn.ReLU
-    attn: Optional[Callable] = None
-    self_attn: Optional[Callable] = None
+    conv_norm_act: Type[nn.Module] = ConvNormAct
+    norm_act: Type[nn.Module] = BatchNormAct2d
+    act: Type[nn.Module] = nn.ReLU
+    attn: Optional[Type[nn.Module]] = None
+    self_attn: Optional[Type[nn.Module]] = None
 
 
 class DownsampleAvg(nn.Module):
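
For orientation (an illustrative sketch, not part of the commit): LayerFn is the bundle of factories that byobnet blocks call to build their conv/norm/act/attention layers, and its fields are often populated with functools.partial objects, which is why they were previously typed as Callable:

from dataclasses import replace
import torch.nn as nn
from timm.models.byobnet import LayerFn

layers = LayerFn()                            # defaults: ConvNormAct, BatchNormAct2d, nn.ReLU
silu_layers = replace(layers, act=nn.SiLU)    # every block built from this container now uses SiLU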
