diff --git a/README.md b/README.md
index a5709ff..f625cdc 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,10 @@
+# New features
+## ResNeSt and Xception can now be used as the backbone (xception65 has no pretrained weights)
+- resnest50
+- resnest101
+- resnest200
+- resnest269
+- xception65
 # FastFCN: Rethinking Dilated Convolution in the Backbone for Semantic Segmentation
 [[Project]](http://wuhuikai.me/FastFCNProject/) [[Paper]](http://wuhuikai.me/FastFCNProject/fast_fcn.pdf) [[arXiv]](https://arxiv.org/abs/1903.11816) [[Home]](http://wuhuikai.me)
diff --git a/encoding/dilated/__init__.py b/encoding/dilated/__init__.py
index ed88810..2b35db5 100644
--- a/encoding/dilated/__init__.py
+++ b/encoding/dilated/__init__.py
@@ -1,2 +1,4 @@
 """Dilated ResNet and DenseNet"""
 from .resnet import *
+from .resnest import *
+from .xception import *
diff --git a/encoding/dilated/resnest.py b/encoding/dilated/resnest.py
new file mode 100644
index 0000000..0398397
--- /dev/null
+++ b/encoding/dilated/resnest.py
@@ -0,0 +1,77 @@
+##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+## Created by: Hang Zhang
+## Email: zhanghang0704@gmail.com
+## Copyright (c) 2020
+##
+## LICENSE file in the root directory of this source tree
+##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+"""ResNeSt models"""
+
+import torch
+from .resnet import ResNet, Bottleneck
+from ..models.model_store import get_model_file
+
+__all__ = ['resnest50', 'resnest101', 'resnest200', 'resnest269']
+
+_url_format = 'https://hangzh.s3.amazonaws.com/encoding/models/{}-{}.pth'
+
+
+def resnest50(pretrained=False, root='~/.encoding/models', **kwargs):
+    model = ResNet(Bottleneck, [3, 4, 6, 3],
+                   radix=2, groups=1, bottleneck_width=64,
+                   deep_stem=True, stem_width=32, avg_down=True,
+                   avd=True, avd_first=False, **kwargs)
+    if pretrained:
+        model.load_state_dict(torch.load(
+            get_model_file('resnest50', root=root)), strict=True)
+    return model
+
+def resnest101(pretrained=False, root='~/.encoding/models', **kwargs):
+    model = ResNet(Bottleneck, [3, 4, 23, 3],
+                   radix=2, groups=1, bottleneck_width=64,
+                   deep_stem=True, stem_width=64, avg_down=True,
+                   avd=True, avd_first=False, **kwargs)
+    if pretrained:
+        model.load_state_dict(torch.load(
+            get_model_file('resnest101', root=root)), strict=True)
+    return model
+
+def resnest200(pretrained=False, root='~/.encoding/models', **kwargs):
+    model = ResNet(Bottleneck, [3, 24, 36, 3],
+                   radix=2, groups=1, bottleneck_width=64,
+                   deep_stem=True, stem_width=64, avg_down=True,
+                   avd=True, avd_first=False, **kwargs)
+    if pretrained:
+        model.load_state_dict(torch.load(
+            get_model_file('resnest200', root=root)), strict=False)
+    return model
+
+def resnest269(pretrained=False, root='~/.encoding/models', **kwargs):
+    model = ResNet(Bottleneck, [3, 30, 48, 8],
+                   radix=2, groups=1, bottleneck_width=64,
+                   deep_stem=True, stem_width=64, avg_down=True,
+                   avd=True, avd_first=False, **kwargs)
+    if pretrained:
+        model.load_state_dict(torch.load(
+            get_model_file('resnest269', root=root)), strict=True)
+    return model
+
+def resnest50_fast(pretrained=False, root='~/.encoding/models', **kwargs):
+    model = ResNet(Bottleneck, [3, 4, 6, 3],
+                   radix=2, groups=1, bottleneck_width=64,
+                   deep_stem=True, stem_width=32, avg_down=True,
+                   avd=True, avd_first=True, **kwargs)
+    if pretrained:
+        model.load_state_dict(torch.load(
+            get_model_file('resnest50fast', root=root)), strict=True)
+    return model
+
+def resnest101_fast(pretrained=False, root='~/.encoding/models', **kwargs):
+    model = 
ResNet(Bottleneck, [3, 4, 23, 3], + radix=2, groups=1, bottleneck_width=64, + deep_stem=True, stem_width=64, avg_down=True, + avd=True, avd_first=True, **kwargs) + if pretrained: + model.load_state_dict(torch.load( + get_model_file('resnest101fast', root=root)), strict=True) + return model diff --git a/encoding/dilated/resnet.py b/encoding/dilated/resnet.py index 996b987..b114947 100644 --- a/encoding/dilated/resnet.py +++ b/encoding/dilated/resnet.py @@ -1,102 +1,102 @@ -"""Dilated ResNet""" +##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +## Created by: Hang Zhang +## Email: zhanghang0704@gmail.com +## Copyright (c) 2020 +## +## LICENSE file in the root directory of this source tree +##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +"""ResNet variants""" import math import torch import torch.nn as nn -import torch.utils.model_zoo as model_zoo -__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101', - 'resnet152', 'BasicBlock', 'Bottleneck'] - -model_urls = { - 'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth', - 'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth', -} - - -def conv3x3(in_planes, out_planes, stride=1): - "3x3 convolution with padding" - return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, - padding=1, bias=False) - - -class BasicBlock(nn.Module): - """ResNet BasicBlock - """ - expansion = 1 - def __init__(self, inplanes, planes, stride=1, dilation=1, downsample=None, previous_dilation=1, - norm_layer=None): - super(BasicBlock, self).__init__() - self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, - padding=dilation, dilation=dilation, bias=False) - self.bn1 = norm_layer(planes) - self.relu = nn.ReLU(inplace=True) - self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, - padding=previous_dilation, dilation=previous_dilation, bias=False) - self.bn2 = norm_layer(planes) - self.downsample = downsample - self.stride = stride - - def forward(self, x): - residual = x - - out = self.conv1(x) - out = self.bn1(out) - out = self.relu(out) - - out = self.conv2(out) - out = self.bn2(out) - - if self.downsample is not None: - residual = self.downsample(x) - - out += residual - out = self.relu(out) - - return out +from ..nn import SplAtConv2d, DropBlock2D, GlobalAvgPool2d +from ..models.model_store import get_model_file +__all__ = ['ResNet', 'Bottleneck', + 'resnet50', 'resnet101', 'resnet152'] class Bottleneck(nn.Module): """ResNet Bottleneck """ # pylint: disable=unused-argument expansion = 4 - def __init__(self, inplanes, planes, stride=1, dilation=1, - downsample=None, previous_dilation=1, norm_layer=None): + def __init__(self, inplanes, planes, stride=1, downsample=None, + radix=1, cardinality=1, bottleneck_width=64, + avd=False, avd_first=False, dilation=1, is_first=False, + norm_layer=None, dropblock_prob=0.0, last_gamma=False): super(Bottleneck, self).__init__() - self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) - self.bn1 = norm_layer(planes) - self.conv2 = nn.Conv2d( - planes, planes, kernel_size=3, stride=stride, - padding=dilation, dilation=dilation, bias=False) - self.bn2 = norm_layer(planes) + group_width = int(planes * (bottleneck_width / 64.)) * cardinality + self.conv1 = nn.Conv2d(inplanes, group_width, kernel_size=1, bias=False) + self.bn1 = norm_layer(group_width) + self.dropblock_prob = dropblock_prob + self.radix = radix + self.avd = avd and (stride > 1 or is_first) + self.avd_first = 
avd_first + + if self.avd: + self.avd_layer = nn.AvgPool2d(3, stride, padding=1) + stride = 1 + + if dropblock_prob > 0.0: + self.dropblock1 = DropBlock2D(dropblock_prob, 3) + if radix == 1: + self.dropblock2 = DropBlock2D(dropblock_prob, 3) + self.dropblock3 = DropBlock2D(dropblock_prob, 3) + + if radix > 1: + self.conv2 = SplAtConv2d( + group_width, group_width, kernel_size=3, + stride=stride, padding=dilation, + dilation=dilation, groups=cardinality, bias=False, + radix=radix, + norm_layer=norm_layer, + dropblock_prob=dropblock_prob) + else: + self.conv2 = nn.Conv2d( + group_width, group_width, kernel_size=3, stride=stride, + padding=dilation, dilation=dilation, + groups=cardinality, bias=False) + self.bn2 = norm_layer(group_width) + self.conv3 = nn.Conv2d( - planes, planes * 4, kernel_size=1, bias=False) - self.bn3 = norm_layer(planes * 4) + group_width, planes * 4, kernel_size=1, bias=False) + self.bn3 = norm_layer(planes*4) + + if last_gamma: + from torch.nn.init import zeros_ + zeros_(self.bn3.weight) self.relu = nn.ReLU(inplace=True) self.downsample = downsample self.dilation = dilation self.stride = stride - def _sum_each(self, x, y): - assert(len(x) == len(y)) - z = [] - for i in range(len(x)): - z.append(x[i]+y[i]) - return z - def forward(self, x): residual = x out = self.conv1(x) out = self.bn1(out) + if self.dropblock_prob > 0.0: + out = self.dropblock1(out) out = self.relu(out) + if self.avd and self.avd_first: + out = self.avd_layer(out) + out = self.conv2(out) - out = self.bn2(out) - out = self.relu(out) + if self.radix == 1: + out = self.bn2(out) + if self.dropblock_prob > 0.0: + out = self.dropblock2(out) + out = self.relu(out) + + if self.avd and not self.avd_first: + out = self.avd_layer(out) out = self.conv3(out) out = self.bn3(out) + if self.dropblock_prob > 0.0: + out = self.dropblock3(out) if self.downsample is not None: residual = self.downsample(x) @@ -106,10 +106,8 @@ def forward(self, x): return out - class ResNet(nn.Module): - """Dilated Pre-trained ResNet Model, which preduces the stride of 8 featuremaps at conv5. - + """ResNet Variants Parameters ---------- block : Block @@ -124,56 +122,73 @@ class ResNet(nn.Module): norm_layer : object Normalization layer used in backbone network (default: :class:`mxnet.gluon.nn.BatchNorm`; for Synchronized Cross-GPU BachNormalization). - Reference: - - He, Kaiming, et al. "Deep residual learning for image recognition." Proceedings of the IEEE conference on computer vision and pattern recognition. 2016. - - Yu, Fisher, and Vladlen Koltun. "Multi-scale context aggregation by dilated convolutions." 
""" # pylint: disable=unused-variable - def __init__(self, block, layers, num_classes=1000, dilated=True, - deep_base=True, norm_layer=nn.BatchNorm2d, output_size=8): - self.inplanes = 128 if deep_base else 64 + def __init__(self, block, layers, radix=1, groups=1, bottleneck_width=64, + num_classes=1000, dilated=False, dilation=1, + deep_stem=True, stem_width=64, avg_down=False, + avd=False, avd_first=False, + final_drop=0.0, dropblock_prob=0, + last_gamma=False, norm_layer=nn.BatchNorm2d): + self.cardinality = groups + self.bottleneck_width = bottleneck_width + # ResNet-D params + self.inplanes = stem_width*2 if deep_stem else 64 + self.avg_down = avg_down + self.last_gamma = last_gamma + # ResNeSt params + self.radix = radix + self.avd = avd + self.avd_first = avd_first + super(ResNet, self).__init__() - if deep_base: + conv_layer = nn.Conv2d + if deep_stem: self.conv1 = nn.Sequential( - nn.Conv2d(3, 64, kernel_size=3, stride=2, padding=1, bias=False), - norm_layer(64), + conv_layer(3, stem_width, kernel_size=3, stride=2, padding=1, bias=False), + norm_layer(stem_width), nn.ReLU(inplace=True), - nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1, bias=False), - norm_layer(64), + conv_layer(stem_width, stem_width, kernel_size=3, stride=1, padding=1, bias=False), + norm_layer(stem_width), nn.ReLU(inplace=True), - nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1, bias=False), + conv_layer(stem_width, stem_width*2, kernel_size=3, stride=1, padding=1, bias=False), ) else: - self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, - bias=False) + self.conv1 = conv_layer(3, 64, kernel_size=7, stride=2, padding=3, + bias=False, **conv_kwargs) self.bn1 = norm_layer(self.inplanes) self.relu = nn.ReLU(inplace=True) self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) - self.layer1 = self._make_layer(block, 64, layers[0], norm_layer=norm_layer) + self.layer1 = self._make_layer(block, 64, layers[0], norm_layer=norm_layer, is_first=False) self.layer2 = self._make_layer(block, 128, layers[1], stride=2, norm_layer=norm_layer) - - dilation_rate = 2 - if dilated and output_size <= 8: + if dilated or dilation == 4: self.layer3 = self._make_layer(block, 256, layers[2], stride=1, - dilation=dilation_rate, norm_layer=norm_layer) - dilation_rate *= 2 - else: + dilation=2, norm_layer=norm_layer, + dropblock_prob=dropblock_prob) + self.layer4 = self._make_layer(block, 512, layers[3], stride=1, + dilation=4, norm_layer=norm_layer, + dropblock_prob=dropblock_prob) + elif dilation==2: self.layer3 = self._make_layer(block, 256, layers[2], stride=2, - norm_layer=norm_layer) - - if dilated and output_size <= 16: + dilation=1, norm_layer=norm_layer, + dropblock_prob=dropblock_prob) self.layer4 = self._make_layer(block, 512, layers[3], stride=1, - dilation=dilation_rate, norm_layer=norm_layer) + dilation=2, norm_layer=norm_layer, + dropblock_prob=dropblock_prob) else: + self.layer3 = self._make_layer(block, 256, layers[2], stride=2, + norm_layer=norm_layer, + dropblock_prob=dropblock_prob) self.layer4 = self._make_layer(block, 512, layers[3], stride=2, - norm_layer=norm_layer) - - self.avgpool = nn.AvgPool2d(7, stride=1) + norm_layer=norm_layer, + dropblock_prob=dropblock_prob) + self.avgpool = GlobalAvgPool2d() + self.drop = nn.Dropout(final_drop) if final_drop > 0.0 else None self.fc = nn.Linear(512 * block.expansion, num_classes) - + self.layers =[]# add for saving c1,c2,c3,c4 for m in self.modules(): if isinstance(m, nn.Conv2d): n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels @@ 
-181,84 +196,91 @@ def __init__(self, block, layers, num_classes=1000, dilated=True, elif isinstance(m, norm_layer): m.weight.data.fill_(1) m.bias.data.zero_() - - def _make_layer(self, block, planes, blocks, stride=1, dilation=1, norm_layer=None): + + def _make_layer(self, block, planes, blocks, stride=1, dilation=1, norm_layer=None, + dropblock_prob=0.0, is_first=True): downsample = None if stride != 1 or self.inplanes != planes * block.expansion: - downsample = nn.Sequential( - nn.Conv2d(self.inplanes, planes * block.expansion, - kernel_size=1, stride=stride, bias=False), - norm_layer(planes * block.expansion), - ) + down_layers = [] + if self.avg_down: + if dilation == 1: + down_layers.append(nn.AvgPool2d(kernel_size=stride, stride=stride, + ceil_mode=True, count_include_pad=False)) + else: + down_layers.append(nn.AvgPool2d(kernel_size=1, stride=1, + ceil_mode=True, count_include_pad=False)) + down_layers.append(nn.Conv2d(self.inplanes, planes * block.expansion, + kernel_size=1, stride=1, bias=False)) + else: + down_layers.append(nn.Conv2d(self.inplanes, planes * block.expansion, + kernel_size=1, stride=stride, bias=False)) + down_layers.append(norm_layer(planes * block.expansion)) + downsample = nn.Sequential(*down_layers) layers = [] if dilation == 1 or dilation == 2: - layers.append(block(self.inplanes, planes, stride, dilation=1, - downsample=downsample, previous_dilation=dilation, norm_layer=norm_layer)) + layers.append(block(self.inplanes, planes, stride, downsample=downsample, + radix=self.radix, cardinality=self.cardinality, + bottleneck_width=self.bottleneck_width, + avd=self.avd, avd_first=self.avd_first, + dilation=1, is_first=is_first, + norm_layer=norm_layer, dropblock_prob=dropblock_prob, + last_gamma=self.last_gamma)) elif dilation == 4: - layers.append(block(self.inplanes, planes, stride, dilation=2, - downsample=downsample, previous_dilation=dilation, norm_layer=norm_layer)) + layers.append(block(self.inplanes, planes, stride, downsample=downsample, + radix=self.radix, cardinality=self.cardinality, + bottleneck_width=self.bottleneck_width, + avd=self.avd, avd_first=self.avd_first, + dilation=2, is_first=is_first, + norm_layer=norm_layer, dropblock_prob=dropblock_prob, + last_gamma=self.last_gamma)) else: raise RuntimeError("=> unknown dilation size: {}".format(dilation)) self.inplanes = planes * block.expansion for i in range(1, blocks): - layers.append(block(self.inplanes, planes, dilation=dilation, previous_dilation=dilation, - norm_layer=norm_layer)) + layers.append(block(self.inplanes, planes, + radix=self.radix, cardinality=self.cardinality, + bottleneck_width=self.bottleneck_width, + avd=self.avd, avd_first=self.avd_first, + dilation=dilation, + norm_layer=norm_layer, dropblock_prob=dropblock_prob, + last_gamma=self.last_gamma)) return nn.Sequential(*layers) def forward(self, x): + self.layers = [] x = self.conv1(x) x = self.bn1(x) x = self.relu(x) x = self.maxpool(x) x = self.layer1(x) + self.layers.append(x) x = self.layer2(x) + self.layers.append(x) x = self.layer3(x) + self.layers.append(x) x = self.layer4(x) - + self.layers.append(x) + x = self.avgpool(x) - x = x.view(x.size(0), -1) + #x = x.view(x.size(0), -1) + x = torch.flatten(x, 1) + if self.drop: + x = self.drop(x) x = self.fc(x) return x - -def resnet18(pretrained=False, **kwargs): - """Constructs a ResNet-18 model. 
- - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - """ - model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs) - if pretrained: - model.load_state_dict(model_zoo.load_url(model_urls['resnet18'])) - return model - - -def resnet34(pretrained=False, **kwargs): - """Constructs a ResNet-34 model. - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - """ - model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs) - if pretrained: - model.load_state_dict(model_zoo.load_url(model_urls['resnet34'])) - return model - - def resnet50(pretrained=False, root='~/.encoding/models', **kwargs): """Constructs a ResNet-50 model. - Args: pretrained (bool): If True, returns a model pre-trained on ImageNet """ model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs) if pretrained: - from ..models.model_store import get_model_file model.load_state_dict(torch.load( get_model_file('resnet50', root=root)), strict=False) return model @@ -266,13 +288,11 @@ def resnet50(pretrained=False, root='~/.encoding/models', **kwargs): def resnet101(pretrained=False, root='~/.encoding/models', **kwargs): """Constructs a ResNet-101 model. - Args: pretrained (bool): If True, returns a model pre-trained on ImageNet """ model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs) if pretrained: - from ..models.model_store import get_model_file model.load_state_dict(torch.load( get_model_file('resnet101', root=root)), strict=False) return model @@ -280,13 +300,11 @@ def resnet101(pretrained=False, root='~/.encoding/models', **kwargs): def resnet152(pretrained=False, root='~/.encoding/models', **kwargs): """Constructs a ResNet-152 model. - Args: pretrained (bool): If True, returns a model pre-trained on ImageNet """ model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs) if pretrained: - from ..models.model_store import get_model_file model.load_state_dict(torch.load( get_model_file('resnet152', root=root)), strict=False) return model diff --git a/encoding/dilated/xception.py b/encoding/dilated/xception.py new file mode 100644 index 0000000..1008bc0 --- /dev/null +++ b/encoding/dilated/xception.py @@ -0,0 +1,350 @@ +# code adapted from https://github.com/jfzhang95/pytorch-deeplab-xception/ +import math +from collections import OrderedDict +import torch +import torch.nn as nn +import torch.nn.functional as F +from ..nn import SyncBatchNorm, GlobalAvgPool2d +from ..models.model_store import get_model_file + +__all__ = ['Xception65', 'Xception71', 'xception65'] + +def fixed_padding(inputs, kernel_size, dilation): + kernel_size_effective = kernel_size + (kernel_size - 1) * (dilation - 1) + pad_total = kernel_size_effective - 1 + pad_beg = pad_total // 2 + pad_end = pad_total - pad_beg + padded_inputs = F.pad(inputs, (pad_beg, pad_end, pad_beg, pad_end)) + return padded_inputs + + +class SeparableConv2d(nn.Module): + def __init__(self, inplanes, planes, kernel_size=3, stride=1, dilation=1, bias=False, norm_layer=None): + super(SeparableConv2d, self).__init__() + + self.conv1 = nn.Conv2d(inplanes, inplanes, kernel_size, stride, 0, dilation, + groups=inplanes, bias=bias) + self.bn = norm_layer(inplanes) + self.pointwise = nn.Conv2d(inplanes, planes, 1, 1, 0, 1, 1, bias=bias) + + def forward(self, x): + x = fixed_padding(x, self.conv1.kernel_size[0], dilation=self.conv1.dilation[0]) + x = self.conv1(x) + x = self.bn(x) + x = self.pointwise(x) + return x + + +class Block(nn.Module): + def __init__(self, inplanes, planes, reps, stride=1, dilation=1, norm_layer=None, + start_with_relu=True, grow_first=True, 
is_last=False): + super(Block, self).__init__() + if planes != inplanes or stride != 1: + self.skip = nn.Conv2d(inplanes, planes, 1, stride=stride, bias=False) + self.skipbn = norm_layer(planes) + else: + self.skip = None + self.relu = nn.ReLU(inplace=True) + self.hook_layer = None + rep = [] + filters = inplanes + if grow_first: + if start_with_relu: + rep.append(self.relu) + rep.append(SeparableConv2d(inplanes, planes, 3, 1, dilation, norm_layer=norm_layer)) + rep.append(norm_layer(planes)) + filters = planes + for i in range(reps - 1): + if grow_first or start_with_relu: + rep.append(self.relu) + rep.append(SeparableConv2d(filters, filters, 3, 1, dilation, norm_layer=norm_layer)) + rep.append(norm_layer(filters)) + if not grow_first: + rep.append(self.relu) + rep.append(SeparableConv2d(inplanes, planes, 3, 1, dilation, norm_layer=norm_layer)) + rep.append(norm_layer(planes)) + if stride != 1: + rep.append(self.relu) + rep.append(SeparableConv2d(planes, planes, 3, 2, norm_layer=norm_layer)) + rep.append(norm_layer(planes)) + elif is_last: + rep.append(self.relu) + rep.append(SeparableConv2d(planes, planes, 3, 1, dilation, norm_layer=norm_layer)) + rep.append(norm_layer(planes)) + #if not start_with_relu: + # rep = rep[1:] + self.rep = nn.Sequential(*rep) + + def forward(self, inp): + x = inp + for i in range(len(self.rep)): + if hasattr(self.rep[i], 'conv1') and self.rep[i].conv1.stride == (2,2): + self.hook_layer = x + x = self.rep[i](x) + + if self.skip is not None: + skip = self.skip(inp) + skip = self.skipbn(skip) + else: + skip = inp + x = x + skip + return x + +class Xception65(nn.Module): + """Modified Aligned Xception + """ + def __init__(self, dilated = False, norm_layer=nn.BatchNorm2d): + super(Xception65, self).__init__() + output_stride = 32 + if dilated: + output_stride = 16 + if output_stride == 32: + entry_block3_stride = 2 + middle_block_dilation = 1 + exit_block20_stride = 2 + exit_block_dilations = (1, 1) + elif output_stride == 16: + entry_block3_stride = 2 + middle_block_dilation = 1 + exit_block20_stride = 1 + exit_block_dilations = (1, 2) + elif output_stride == 8: + entry_block3_stride = 1 + middle_block_dilation = 2 + exit_block20_stride = 1 + exit_block_dilations = (2, 4) + else: + raise NotImplementedError + + # Entry flow + self.conv1 = nn.Conv2d(3, 32, 3, stride=2, padding=1, bias=False) + self.bn1 = norm_layer(32) + self.relu = nn.ReLU(inplace=True) + + self.conv2 = nn.Conv2d(32, 64, 3, stride=1, padding=1, bias=False) + self.bn2 = norm_layer(64) + + self.block1 = Block(64, 128, reps=2, stride=2, norm_layer=norm_layer, start_with_relu=False) + self.block2 = Block(128, 256, reps=2, stride=2, norm_layer=norm_layer, start_with_relu=False, + grow_first=True) + #print('self.block2', self.block2) + self.block3 = Block(256, 728, reps=2, stride=entry_block3_stride, norm_layer=norm_layer, + start_with_relu=True, grow_first=True, is_last=True) + + # Middle flow + midflowblocks = [] + for i in range(4, 20): + midflowblocks.append(('block%d'%i, Block(728, 728, reps=3, stride=1, + dilation=middle_block_dilation, + norm_layer=norm_layer, start_with_relu=True, + grow_first=True))) + self.midflow = nn.Sequential(OrderedDict(midflowblocks)) + + # Exit flow + self.block20 = Block(728, 1024, reps=2, stride=exit_block20_stride, dilation=exit_block_dilations[0], + norm_layer=norm_layer, start_with_relu=True, grow_first=False, is_last=True) + + self.conv3 = SeparableConv2d(1024, 1536, 3, stride=1, dilation=exit_block_dilations[1], norm_layer=norm_layer) + self.bn3 = 
norm_layer(1536) + + self.conv4 = SeparableConv2d(1536, 1536, 3, stride=1, dilation=exit_block_dilations[1], norm_layer=norm_layer) + self.bn4 = norm_layer(1536) + + self.conv5 = SeparableConv2d(1536, 2048, 3, stride=1, dilation=exit_block_dilations[1], norm_layer=norm_layer) + self.bn5 = norm_layer(2048) + + self.avgpool = GlobalAvgPool2d() + self.fc = nn.Linear(2048, 1000) + + # Init weights + self._init_weight() + self.layers = [] + def forward(self, x): + self.layers = [] + # Entry flow + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + + x = self.conv2(x) + x = self.bn2(x) + x = self.relu(x) + + x = self.block1(x) + # add relu here + x = self.relu(x) + x = self.block2(x) + self.layers.append(self.block2.hook_layer) + x = self.block3(x) + self.layers.append(self.block3.hook_layer) + # Middle flow + x = self.midflow(x) + #c3 = x + + # Exit flow + x = self.block20(x) + self.layers.append(self.block20.hook_layer) + x = self.relu(x) + x = self.conv3(x) + x = self.bn3(x) + x = self.relu(x) + + x = self.conv4(x) + x = self.bn4(x) + x = self.relu(x) + + x = self.conv5(x) + x = self.bn5(x) + x = self.relu(x) + self.layers.append(x) + + x = self.avgpool(x) + x = x.view(x.size(0), -1) + x = self.fc(x) + return x + + def _init_weight(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + m.weight.data.normal_(0, math.sqrt(2. / n)) + elif isinstance(m, SyncBatchNorm): + m.weight.data.fill_(1) + m.bias.data.zero_() + elif isinstance(m, nn.BatchNorm2d): + m.weight.data.fill_(1) + m.bias.data.zero_() + +class Xception71(nn.Module): + """Modified Aligned Xception + """ + def __init__(self, output_stride=32, norm_layer=nn.BatchNorm2d): + super(Xception71, self).__init__() + + if output_stride == 32: + entry_block3_stride = 2 + middle_block_dilation = 1 + exit_block20_stride = 2 + exit_block_dilations = (1, 1) + elif output_stride == 16: + entry_block3_stride = 2 + middle_block_dilation = 1 + exit_block20_stride = 1 + exit_block_dilations = (1, 2) + elif output_stride == 8: + entry_block3_stride = 1 + middle_block_dilation = 2 + exit_block20_stride = 1 + exit_block_dilations = (2, 4) + else: + raise NotImplementedError + + # Entry flow + self.conv1 = nn.Conv2d(3, 32, 3, stride=2, padding=1, bias=False) + self.bn1 = norm_layer(32) + self.relu = nn.ReLU(inplace=True) + + self.conv2 = nn.Conv2d(32, 64, 3, stride=1, padding=1, bias=False) + self.bn2 = norm_layer(64) + + self.block1 = Block(64, 128, reps=2, stride=2, norm_layer=norm_layer, start_with_relu=False) + block2 = [] + block2.append(Block(128, 256, reps=2, stride=1, norm_layer=norm_layer, start_with_relu=False, + grow_first=True)) + block2.append(Block(256, 256, reps=2, stride=2, norm_layer=norm_layer, start_with_relu=False, + grow_first=True)) + block2.append(Block(256, 728, reps=2, stride=1, norm_layer=norm_layer, start_with_relu=False, + grow_first=True)) + self.block2 = nn.Sequential(*block2) + self.block3 = Block(728, 728, reps=2, stride=entry_block3_stride, norm_layer=norm_layer, + start_with_relu=True, grow_first=True, is_last=True) + + # Middle flow + midflowblocks = [] + for i in range(4, 20): + midflowblocks.append(('block%d'%i, Block(728, 728, reps=3, stride=1, + dilation=middle_block_dilation, + norm_layer=norm_layer, start_with_relu=True, + grow_first=True))) + self.midflow = nn.Sequential(OrderedDict(midflowblocks)) + + # Exit flow + self.block20 = Block(728, 1024, reps=2, stride=exit_block20_stride, dilation=exit_block_dilations[0], + norm_layer=norm_layer, 
start_with_relu=True, grow_first=False, is_last=True)
+
+        self.conv3 = SeparableConv2d(1024, 1536, 3, stride=1, dilation=exit_block_dilations[1], norm_layer=norm_layer)
+        self.bn3 = norm_layer(1536)
+
+        self.conv4 = SeparableConv2d(1536, 1536, 3, stride=1, dilation=exit_block_dilations[1], norm_layer=norm_layer)
+        self.bn4 = norm_layer(1536)
+
+        self.conv5 = SeparableConv2d(1536, 2048, 3, stride=1, dilation=exit_block_dilations[1], norm_layer=norm_layer)
+        self.bn5 = norm_layer(2048)
+
+        self.avgpool = GlobalAvgPool2d()
+        self.fc = nn.Linear(2048, 1000)
+
+        # Init weights
+        self._init_weight()
+
+    def forward(self, x):
+        # Entry flow
+        x = self.conv1(x)
+        x = self.bn1(x)
+        x = self.relu(x)
+
+        x = self.conv2(x)
+        x = self.bn2(x)
+        x = self.relu(x)
+
+        x = self.block1(x)
+        # add relu here
+        x = self.relu(x)
+        low_level_feat = x
+        x = self.block2(x)
+        x = self.block3(x)
+
+        # Middle flow
+        x = self.midflow(x)
+
+        # Exit flow
+        x = self.block20(x)
+        x = self.relu(x)
+        x = self.conv3(x)
+        x = self.bn3(x)
+        x = self.relu(x)
+
+        x = self.conv4(x)
+        x = self.bn4(x)
+        x = self.relu(x)
+
+        x = self.conv5(x)
+        x = self.bn5(x)
+        x = self.relu(x)
+
+        x = self.avgpool(x)
+        x = x.view(x.size(0), -1)
+        x = self.fc(x)
+        return x  #, low_level_feat
+
+    def _init_weight(self):
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+                m.weight.data.normal_(0, math.sqrt(2. / n))
+            elif isinstance(m, SyncBatchNorm):
+                m.weight.data.fill_(1)
+                m.bias.data.zero_()
+            elif isinstance(m, nn.BatchNorm2d):
+                m.weight.data.fill_(1)
+                m.bias.data.zero_()
+
+def xception65(pretrained=False, root='~/.encoding/models', **kwargs):
+    """Constructs an Xception-65 model.
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet
+    """
+    model = Xception65(**kwargs)
+    if pretrained:
+        model.load_state_dict(torch.load(get_model_file('xception65', root=root)))
+    return model
diff --git a/encoding/models/base.py b/encoding/models/base.py
index 840de7e..7ad0aea 100644
--- a/encoding/models/base.py
+++ b/encoding/models/base.py
@@ -14,7 +14,7 @@ from torch.nn.parallel.data_parallel import DataParallel
 from ..nn import JPU, JPU_X
-from .. import dilated as resnet
+from .. 
import dilated as backbonemodels from ..utils import batch_pix_accuracy, batch_intersection_union up_kwargs = {'mode': 'bilinear', 'align_corners': True} @@ -35,13 +35,28 @@ def __init__(self, nclass, backbone, aux, se_loss, jpu=True, dilated=False, norm self.crop_size = crop_size # copying modules from pretrained models if backbone == 'resnet50': - self.pretrained = resnet.resnet50(pretrained=True, dilated=dilated, + self.pretrained = backbonemodels.resnet50(pretrained=True, dilated=dilated, norm_layer=norm_layer, root=root) elif backbone == 'resnet101': - self.pretrained = resnet.resnet101(pretrained=True, dilated=dilated, + self.pretrained = backbonemodels.resnet101(pretrained=True, dilated=dilated, norm_layer=norm_layer, root=root) elif backbone == 'resnet152': - self.pretrained = resnet.resnet152(pretrained=True, dilated=dilated, + self.pretrained = backbonemodels.resnet152(pretrained=True, dilated=dilated, + norm_layer=norm_layer, root=root) + elif backbone == 'resnest50': + self.pretrained = backbonemodels.resnest50(pretrained=True, dilated=dilated, + norm_layer=norm_layer, root=root) + elif backbone == 'resnest101': + self.pretrained = backbonemodels.resnest101(pretrained=True, dilated=dilated, + norm_layer=norm_layer, root=root) + elif backbone == 'resnest200': + self.pretrained = backbonemodels.resnest200(pretrained=True, dilated=dilated, + norm_layer=norm_layer, root=root) + elif backbone == 'resnest269': + self.pretrained = backbonemodels.resnest269(pretrained=True, dilated=dilated, + norm_layer=norm_layer, root=root) + elif backbone == 'xception65': + self.pretrained = backbonemodels.xception65(pretrained=False, dilated=dilated, norm_layer=norm_layer, root=root) else: raise RuntimeError('unknown backbone: {}'.format(backbone)) @@ -49,20 +64,24 @@ def __init__(self, nclass, backbone, aux, se_loss, jpu=True, dilated=False, norm self._up_kwargs = up_kwargs self.backbone = backbone self.jpu = None - if jpu == 'JPU': - self.jpu = JPU([512, 1024, 2048], width=512, norm_layer=norm_layer, up_kwargs=up_kwargs) - elif jpu == 'JPU_X': - self.jpu = JPU_X([512, 1024, 2048], width=512, norm_layer=norm_layer, up_kwargs=up_kwargs) + if 'xception' in self.backbone: + if jpu == 'JPU': + self.jpu = JPU([728, 1024, 2048], width=512, norm_layer=norm_layer, up_kwargs=up_kwargs) + elif jpu == 'JPU_X': + self.jpu = JPU_X([728, 1024, 2048], width=512, norm_layer=norm_layer, up_kwargs=up_kwargs) + else: + if jpu == 'JPU': + self.jpu = JPU([512, 1024, 2048], width=512, norm_layer=norm_layer, up_kwargs=up_kwargs) + elif jpu == 'JPU_X': + self.jpu = JPU_X([512, 1024, 2048], width=512, norm_layer=norm_layer, up_kwargs=up_kwargs) + def base_forward(self, x): - x = self.pretrained.conv1(x) - x = self.pretrained.bn1(x) - x = self.pretrained.relu(x) - x = self.pretrained.maxpool(x) - c1 = self.pretrained.layer1(x) - c2 = self.pretrained.layer2(c1) - c3 = self.pretrained.layer3(c2) - c4 = self.pretrained.layer4(c3) + x = self.pretrained(x) + c1 = self.pretrained.layers[0] + c2 = self.pretrained.layers[1] + c3 = self.pretrained.layers[2] + c4 = self.pretrained.layers[3] if self.jpu: return self.jpu(c1, c2, c3, c4) diff --git a/encoding/models/model_store.py b/encoding/models/model_store.py index cc68a86..4e4de90 100644 --- a/encoding/models/model_store.py +++ b/encoding/models/model_store.py @@ -9,9 +9,16 @@ __all__ = ['get_model_file', 'purge'] _model_sha1 = {name: checksum for checksum, name in [ + #resnest + ('fb9de5b360976e3e8bd3679d3e93c5409a5eff3c', 'resnest50'), + 
('966fb78c22323b0c68097c5c1242bd16d3e07fd5', 'resnest101'), + ('d7fd712f5a1fcee5b3ce176026fbb6d0d278454a', 'resnest200'), + ('51ae5f19032e22af4ec08e695496547acdba5ce5', 'resnest269'), + #resnet ('ebb6acbbd1d1c90b7f446ae59d30bf70c74febc1', 'resnet50'), ('2a57e44de9c853fa015b172309a1ee7e2d0e4e2a', 'resnet101'), ('0d43d698c66aceaa2bc0309f55efdd7ff4b143af', 'resnet152'), + #resnet segmentation models ('662e979de25a389f11c65e9f1df7e06c2c356381', 'fcn_resnet50_ade'), ('eeed8e582f0fdccdba8579e7490570adc6d85c7c', 'fcn_resnet50_pcontext'), ('54f70c772505064e30efd1ddd3a14e1759faa363', 'psp_resnet50_ade'), @@ -21,6 +28,17 @@ ('9f27ea13d514d7010e59988341bcbd4140fcc33d', 'encnet_resnet101_pcontext'), ('07ac287cd77e53ea583f37454e17d30ce1509a4a', 'encnet_resnet50_ade'), ('3f54fa3b67bac7619cd9b3673f5c8227cf8f4718', 'encnet_resnet101_ade'), + # resnest segmentation models + ('4aba491aaf8e4866a9c9981b210e3e3266ac1f2a', 'fcn_resnest50_ade'), + ('2225f09d0f40b9a168d9091652194bc35ec2a5a9', 'deeplab_resnest50_ade'), + ('06ca799c8cc148fe0fafb5b6d052052935aa3cc8', 'deeplab_resnest101_ade'), + ('7b9e7d3e6f0e2c763c7d77cad14d306c0a31fe05', 'deeplab_resnest200_ade'), + ('0074dd10a6e6696f6f521653fb98224e75955496', 'deeplab_resnest269_ade'), + ('77a2161deeb1564e8b9c41a4bb7a3f33998b00ad', 'fcn_resnest50_pcontext'), + ('08dccbc4f4694baab631e037a374d76d8108c61f', 'deeplab_resnest50_pcontext'), + ('faf5841853aae64bd965a7bdc2cdc6e7a2b5d898', 'deeplab_resnest101_pcontext'), + ('fe76a26551dd5dcf2d474fd37cba99d43f6e984e', 'deeplab_resnest200_pcontext'), + ('b661fd26c49656e01e9487cd9245babb12f37449', 'deeplab_resnest269_pcontext'), ]} encoding_repo_url = 'https://hangzh.s3.amazonaws.com/' diff --git a/encoding/models/model_zoo.py b/encoding/models/model_zoo.py index c97bb42..a523529 100644 --- a/encoding/models/model_zoo.py +++ b/encoding/models/model_zoo.py @@ -24,13 +24,37 @@ def get_model(name, **kwargs): The model. 
""" models = { - 'fcn_resnet50_pcontext': get_fcn_resnet50_pcontext, - 'encnet_resnet50_pcontext': get_encnet_resnet50_pcontext, - 'encnet_resnet101_pcontext': get_encnet_resnet101_pcontext, - 'encnet_resnet50_ade': get_encnet_resnet50_ade, - 'encnet_resnet101_ade': get_encnet_resnet101_ade, - 'fcn_resnet50_ade': get_fcn_resnet50_ade, - 'psp_resnet50_ade': get_psp_resnet50_ade, + # resnet + 'resnet50': resnet50, + 'resnet101': resnet101, + 'resnet152': resnet152, + # resnest + 'resnest50': resnest50, + 'resnest101': resnest101, + 'resnest200': resnest200, + 'resnest269': resnest269, + # other segmentation backbones + 'xception65': xception65, + # segmentation resnet models + 'encnet_resnet101s_coco': get_encnet_resnet101_coco, + 'fcn_resnet50s_pcontext': get_fcn_resnet50_pcontext, + 'encnet_resnet50s_pcontext': get_encnet_resnet50_pcontext, + 'encnet_resnet101s_pcontext': get_encnet_resnet101_pcontext, + 'encnet_resnet50s_ade': get_encnet_resnet50_ade, + 'encnet_resnet101s_ade': get_encnet_resnet101_ade, + 'fcn_resnet50s_ade': get_fcn_resnet50_ade, + 'psp_resnet50s_ade': get_psp_resnet50_ade, + # segmentation resnest models + 'fcn_resnest50_ade': get_fcn_resnest50_ade, + 'deeplab_resnest50_ade': get_deeplab_resnest50_ade, + 'deeplab_resnest101_ade': get_deeplab_resnest101_ade, + 'deeplab_resnest200_ade': get_deeplab_resnest200_ade, + 'deeplab_resnest269_ade': get_deeplab_resnest269_ade, + 'fcn_resnest50_pcontext': get_fcn_resnest50_pcontext, + 'deeplab_resnest50_pcontext': get_deeplab_resnest50_pcontext, + 'deeplab_resnest101_pcontext': get_deeplab_resnest101_pcontext, + 'deeplab_resnest200_pcontext': get_deeplab_resnest200_pcontext, + 'deeplab_resnest269_pcontext': get_deeplab_resnest269_pcontext, } name = name.lower() if name not in models: diff --git a/encoding/nn/__init__.py b/encoding/nn/__init__.py index ff1e3d8..8395324 100644 --- a/encoding/nn/__init__.py +++ b/encoding/nn/__init__.py @@ -12,3 +12,5 @@ from .syncbn import * from .encoding import * from .customize import * +from .splat import SplAtConv2d +from .dropblock import * diff --git a/encoding/nn/customize.py b/encoding/nn/customize.py index 6d027e3..f039654 100644 --- a/encoding/nn/customize.py +++ b/encoding/nn/customize.py @@ -19,7 +19,7 @@ torch_ver = torch.__version__[:3] -__all__ = ['SegmentationLosses', 'PyramidPooling', 'JPU', 'JPU_X', 'Mean'] +__all__ = ['SegmentationLosses', 'GlobalAvgPool2d', 'PyramidPooling', 'JPU', 'JPU_X', 'Mean'] class SegmentationLosses(CrossEntropyLoss): """2D Cross Entropy Loss with Auxilary Loss""" @@ -96,7 +96,14 @@ def __init__(self, p=2, dim=1): def forward(self, x): return F.normalize(x, self.p, self.dim, eps=1e-8) - + +class GlobalAvgPool2d(nn.Module): + def __init__(self): + """Global average pooling over the input's spatial dimensions""" + super(GlobalAvgPool2d, self).__init__() + + def forward(self, inputs): + return F.adaptive_avg_pool2d(inputs, 1).view(inputs.size(0), -1) class PyramidPooling(Module): """ diff --git a/encoding/nn/dropblock.py b/encoding/nn/dropblock.py new file mode 100644 index 0000000..3612c50 --- /dev/null +++ b/encoding/nn/dropblock.py @@ -0,0 +1,126 @@ +# https://github.com/Randl/MobileNetV3-pytorch/blob/master/dropblock.py +import torch +import torch.nn.functional as F +from torch import nn + +__all__ = ['DropBlock2D', 'reset_dropblock'] + +class DropBlock2D(nn.Module): + r"""Randomly zeroes 2D spatial blocks of the input tensor. 
+    As described in the paper
+    `DropBlock: A regularization method for convolutional networks`_ ,
+    dropping whole blocks of feature map allows to remove semantic
+    information as compared to regular dropout.
+    Args:
+        drop_prob (float): probability of an element to be dropped.
+        block_size (int): size of the block to drop
+    Shape:
+        - Input: `(N, C, H, W)`
+        - Output: `(N, C, H, W)`
+    .. _DropBlock: A regularization method for convolutional networks:
+       https://arxiv.org/abs/1810.12890
+    """
+
+    def __init__(self, drop_prob, block_size, share_channel=False):
+        super(DropBlock2D, self).__init__()
+        self.register_buffer('i', torch.zeros(1, dtype=torch.int64))
+        self.register_buffer('drop_prob', drop_prob * torch.ones(1, dtype=torch.float32))
+        self.inited = False
+        self.step_size = 0.0
+        self.start_step = 0
+        self.nr_steps = 0
+        self.block_size = block_size
+        self.share_channel = share_channel
+
+    def reset(self):
+        """stop DropBlock"""
+        self.inited = True
+        self.i[0] = 0
+        self.drop_prob[0] = 0.0  # zero the registered buffer in place
+
+    def reset_steps(self, start_step, nr_steps, start_value=0, stop_value=None):
+        self.inited = True
+        stop_value = self.drop_prob.item() if stop_value is None else stop_value
+        self.i[0] = 0
+        self.drop_prob[0] = start_value
+        self.step_size = (stop_value - start_value) / nr_steps
+        self.nr_steps = nr_steps
+        self.start_step = start_step
+
+    def forward(self, x):
+        if not self.training or self.drop_prob.item() == 0.:
+            return x
+        else:
+            self.step()
+
+            # get gamma value
+            gamma = self._compute_gamma(x)
+
+            # sample mask and place on input device
+            if self.share_channel:
+                mask = (torch.rand(*x.shape[2:], device=x.device, dtype=x.dtype) < gamma).unsqueeze(0).unsqueeze(0)
+            else:
+                mask = (torch.rand(*x.shape[1:], device=x.device, dtype=x.dtype) < gamma).unsqueeze(0)
+
+            # compute block mask
+            block_mask, keeped = self._compute_block_mask(mask)
+
+            # apply block mask
+            out = x * block_mask
+
+            # scale output
+            out = out * (block_mask.numel() / keeped).to(out)
+            return out
+
+    def _compute_block_mask(self, mask):
+        block_mask = F.max_pool2d(mask,
+                                  kernel_size=(self.block_size, self.block_size),
+                                  stride=(1, 1),
+                                  padding=self.block_size // 2)
+
+        keeped = block_mask.numel() - block_mask.sum().to(torch.float32)
+        block_mask = 1 - block_mask
+
+        return block_mask, keeped
+
+    def _compute_gamma(self, x):
+        _, c, h, w = x.size()
+        gamma = self.drop_prob.item() / (self.block_size ** 2) * (h * w) / \
+            ((w - self.block_size + 1) * (h - self.block_size + 1))
+        return gamma
+
+    def step(self):
+        assert self.inited
+        idx = self.i.item()
+        if idx > self.start_step and idx < self.start_step + self.nr_steps:
+            self.drop_prob += self.step_size
+        self.i += 1
+
+    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
+                              missing_keys, unexpected_keys, error_msgs):
+        # provide default buffers when loading checkpoints that do not contain them
+        idx_key = prefix + 'i'
+        drop_prob_key = prefix + 'drop_prob'
+        if idx_key not in state_dict:
+            state_dict[idx_key] = torch.zeros(1, dtype=torch.int64)
+        if drop_prob_key not in state_dict:
+            state_dict[drop_prob_key] = torch.ones(1, dtype=torch.float32)
+        super(DropBlock2D, self)._load_from_state_dict(
+            state_dict, prefix, local_metadata, strict,
+            missing_keys, unexpected_keys, error_msgs)
+
+    def _save_to_state_dict(self, destination, prefix, keep_vars):
+        """overwrite save method"""
+        pass
+
+    def extra_repr(self):
+        return 'drop_prob={}, step_size={}'.format(self.drop_prob, self.step_size)
+
+def reset_dropblock(start_step, nr_steps, start_value, stop_value, m):
+    """
+    Example:
+        from functools import partial
+        apply_drop_prob = 
partial(reset_dropblock, 0, epochs*iters_per_epoch, 0.0, 0.1) + net.apply(apply_drop_prob) + """ + if isinstance(m, DropBlock2D): + m.reset_steps(start_step, nr_steps, start_value, stop_value) diff --git a/encoding/nn/splat.py b/encoding/nn/splat.py new file mode 100644 index 0000000..67cf648 --- /dev/null +++ b/encoding/nn/splat.py @@ -0,0 +1,85 @@ +"""Split-Attention""" + +import torch +from torch import nn +import torch.nn.functional as F +from torch.nn import Conv2d, Module, Linear, BatchNorm2d, ReLU +from torch.nn.modules.utils import _pair + +from .dropblock import DropBlock2D + +__all__ = ['SplAtConv2d'] + +class SplAtConv2d(Module): + """Split-Attention Conv2d + """ + def __init__(self, in_channels, channels, kernel_size, stride=(1, 1), padding=(0, 0), + dilation=(1, 1), groups=1, bias=True, + radix=2, reduction_factor=4, + norm_layer=None, + dropblock_prob=0.0, **kwargs): + super(SplAtConv2d, self).__init__() + padding = _pair(padding) + inter_channels = max(in_channels*radix//reduction_factor, 32) + self.radix = radix + self.cardinality = groups + self.channels = channels + self.dropblock_prob = dropblock_prob + self.conv = Conv2d(in_channels, channels*radix, kernel_size, stride, padding, dilation, + groups=groups*radix, bias=bias, **kwargs) + self.use_bn = norm_layer is not None + self.bn0 = norm_layer(channels*radix) + self.relu = ReLU(inplace=True) + self.fc1 = Conv2d(channels, inter_channels, 1, groups=self.cardinality) + self.bn1 = norm_layer(inter_channels) + self.fc2 = Conv2d(inter_channels, channels*radix, 1, groups=self.cardinality) + if dropblock_prob > 0.0: + self.dropblock = DropBlock2D(dropblock_prob, 3) + self.rsoftmax = rSoftMax(radix, groups) + + def forward(self, x): + x = self.conv(x) + if self.use_bn: + x = self.bn0(x) + if self.dropblock_prob > 0.0: + x = self.dropblock(x) + x = self.relu(x) + + batch, channel = x.shape[:2] + if self.radix > 1: + splited = torch.split(x, channel//self.radix, dim=1) + gap = sum(splited) + else: + gap = x + gap = F.adaptive_avg_pool2d(gap, 1) + gap = self.fc1(gap) + + if self.use_bn: + gap = self.bn1(gap) + gap = self.relu(gap) + + atten = self.fc2(gap) + atten = self.rsoftmax(atten).view(batch, -1, 1, 1) + + if self.radix > 1: + atten = torch.split(atten, channel//self.radix, dim=1) + out = sum([att*split for (att, split) in zip(atten, splited)]) + else: + out = atten * x + return out.contiguous() + +class rSoftMax(nn.Module): + def __init__(self, radix, cardinality): + super().__init__() + self.radix = radix + self.cardinality = cardinality + + def forward(self, x): + batch = x.size(0) + if self.radix > 1: + x = x.view(batch, self.cardinality, self.radix, -1).transpose(1, 2) + x = F.softmax(x, dim=1) + x = x.reshape(batch, -1) + else: + x = torch.sigmoid(x) + return x
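
The patch above changes `BaseNet.base_forward` to run the backbone's own `forward` and then read the c1-c4 feature maps from its `self.layers` list, instead of calling `layer1`-`layer4` explicitly. The snippet below is a minimal usage sketch, not part of the patch: it assumes the `encoding` package from this repository (including its compiled extensions) is importable, and it uses `pretrained=False` so no checkpoint is downloaded.

```python
import torch
import torch.nn as nn

# Backbones added by this patch; the defaults (dilated=False) match the JPU setting in BaseNet.
from encoding.dilated import resnest50, xception65

resnest = resnest50(pretrained=False, norm_layer=nn.BatchNorm2d)
xception = xception65(pretrained=False, norm_layer=nn.BatchNorm2d)

x = torch.randn(2, 3, 224, 224)  # arbitrary example input
with torch.no_grad():
    for name, backbone in [('resnest50', resnest), ('xception65', xception)]:
        backbone.eval()
        backbone(x)  # the forward pass fills backbone.layers with c1..c4
        print(name, [tuple(c.shape) for c in backbone.layers])
        # c2..c4 channel counts match the JPU configs in base.py:
        # [512, 1024, 2048] for ResNeSt, [728, 1024, 2048] for Xception.
```

Stashing the intermediate maps on the module keeps `base_forward` backbone-agnostic, at the cost of each backbone holding references to its last activations between forward calls.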