4242 NormMlpClassifierHead ,
4343 ConvNormAct ,
4444 BatchNormAct2d ,
45+ DropBlock2d ,
4546 EvoNorm2dS0a ,
4647 AttentionPool2d ,
4748 RotAttentionPool2d ,
@@ -1339,11 +1340,42 @@ def update_block_kwargs(block_kwargs: Dict[str, Any], block_cfg: ByoBlockCfg, mo
13391340 block_kwargs .update (override_kwargs (block_cfg .block_kwargs , model_cfg .block_kwargs ))
13401341
13411342
def drop_blocks(
        drop_prob: float = 0.,
        block_size: int = 3,
        num_stages: int = 4,
) -> List[Optional[partial]]:
    """Create per-stage DropBlock layer partials.

    DropBlock is applied to the last two stages only, following common practice.
    ``block_size`` is the size used for the final stage; the second-to-last
    stage uses a larger block size (``block_size * 2 - 1``) to account for its
    2x larger feature maps. The second-to-last stage partial is created with
    ``gamma_scale=0.25`` and the final stage partial with ``gamma_scale=1.00``.

    When ``drop_prob`` is falsy (``0.`` or ``None``) DropBlock is disabled and
    a list of ``None`` is returned for any ``num_stages``, so models with fewer
    than two stages can still be built without DropBlock.

    Args:
        drop_prob: Drop probability for DropBlock. A falsy value disables it.
        block_size: Block size for the final stage. The second-to-last stage
            uses ``block_size * 2 - 1`` to scale with feature map size.
        num_stages: Number of stages in the model.

    Returns:
        List with one entry per stage: a ``partial`` of ``DropBlock2d`` for
        the last two stages when enabled, ``None`` everywhere else.

    Raises:
        ValueError: If DropBlock is enabled (``drop_prob`` is truthy) and
            ``num_stages`` is less than 2.
    """
    # Every stage starts with DropBlock disabled.
    dbs: List[Optional[partial]] = [None] * num_stages
    if not drop_prob:
        # Disabled: nothing to validate, stage count is irrelevant.
        return dbs

    # Explicit check instead of `assert` so it survives `python -O` and does
    # not degrade into an opaque IndexError on `dbs[-2]` below.
    if num_stages < 2:
        raise ValueError(
            f'DropBlock is applied to the last two stages and requires num_stages >= 2, got {num_stages}.'
        )

    # Scale block size for second-to-last stage (2x larger feature maps).
    dbs[-2] = partial(DropBlock2d, drop_prob=drop_prob, block_size=block_size * 2 - 1, gamma_scale=0.25)
    dbs[-1] = partial(DropBlock2d, drop_prob=drop_prob, block_size=block_size, gamma_scale=1.00)
    return dbs
1370+
1371+
13421372def create_byob_stages (
13431373 cfg : ByoModelCfg ,
13441374 drop_path_rate : float ,
13451375 output_stride : int ,
13461376 stem_feat : Dict [str , Any ],
1377+ drop_block_rate : float = 0. ,
1378+ drop_block_size : int = 3 ,
13471379 feat_size : Optional [int ] = None ,
13481380 layers : Optional [LayerFn ] = None ,
13491381 block_kwargs_fn : Optional [Callable ] = update_block_kwargs ,
@@ -1353,8 +1385,10 @@ def create_byob_stages(
13531385 layers = layers or LayerFn ()
13541386 feature_info = []
13551387 block_cfgs = [expand_blocks_cfg (s ) for s in cfg .blocks ]
1388+ num_stages = len (block_cfgs )
13561389 depths = [sum ([bc .d for bc in stage_bcs ]) for stage_bcs in block_cfgs ]
13571390 dpr = calculate_drop_path_rates (drop_path_rate , depths , stagewise = True )
1391+ dbs = drop_blocks (drop_block_rate , drop_block_size , num_stages )
13581392 dilation = 1
13591393 net_stride = stem_feat ['reduction' ]
13601394 prev_chs = stem_feat ['num_chs' ]
@@ -1384,6 +1418,7 @@ def create_byob_stages(
13841418 group_size = group_size ,
13851419 bottle_ratio = block_cfg .br ,
13861420 downsample = cfg .downsample ,
1421+ drop_block = dbs [stage_idx ],
13871422 drop_path_rate = dpr [stage_idx ][block_idx ],
13881423 layers = layers ,
13891424 device = device ,
@@ -1437,6 +1472,8 @@ def __init__(
14371472 output_stride : int = 32 ,
14381473 img_size : Optional [Union [int , Tuple [int , int ]]] = None ,
14391474 drop_rate : float = 0. ,
1475+ drop_block_rate : float = 0. ,
1476+ drop_block_size : int = 3 ,
14401477 drop_path_rate : float = 0. ,
14411478 zero_init_last : bool = True ,
14421479 device = None ,
@@ -1452,6 +1489,8 @@ def __init__(
14521489 output_stride: Output stride of network, one of (8, 16, 32).
14531490 img_size: Image size for fixed image size models (i.e. self-attn).
14541491 drop_rate: Classifier dropout rate.
1492+ drop_block_rate: DropBlock drop rate.
1493+ drop_block_size: DropBlock block size for final stage (scales up for earlier stages).
14551494 drop_path_rate: Stochastic depth drop-path rate.
14561495 zero_init_last: Zero-init last weight of residual path.
14571496 **kwargs: Extra kwargs overlayed onto cfg.
@@ -1490,6 +1529,8 @@ def __init__(
14901529 drop_path_rate ,
14911530 output_stride ,
14921531 stem_feat [- 1 ],
1532+ drop_block_rate = drop_block_rate ,
1533+ drop_block_size = drop_block_size ,
14931534 layers = stage_layers ,
14941535 feat_size = feat_size ,
14951536 ** dd ,
0 commit comments