
Commit 27c78c4

Bi, Poly and Cross encoder not dependent on Hugging Face code (facebookresearch#1791)
* first draft for cross encoder
* first actual draft of crossencoder
* biencoder 0.86 on convai2
* same performance for biencoder as in paper
* removing logs
* adding a mask to the basic attention
* stable version of crossencoder
* first draft of polyencoder
* polyencoder gives the exact same results as the HF version
* (merge from facebookresearch#1790) possibility to not share embeddings
* add or remove the residual in the basic attention
* a few corrections to the polyencoder
* operation-type was not working properly with data-parallel=true
* some warnings + deleting this modification of the dictionary
* flake8
* no mutable structure in argument
* remove this 'surround' function, generalizes _add_start_end_tokens
* interactive with polyencoder seems to work. It's not a great conversationalist though...
* remove pdb
* zoo files
* flake8
* remove debug line. Adding polyencoder to model_list
* syntax error
* unifying APIs of BasicAttention and MultiheadAttention
* some corrections from Emily
* some other nits
* factorizing some ifs
* highlighted the --init-model param a bit more
* removing useless arguments
* change basic_sqrt to sqrt
* ahem
* useless line
* blacked!
* black again
* ahem... adding the biencoder
* no need for that line
* right, I was a bit rude there
* black is beautiful
1 parent 9489d4b commit 27c78c4

12 files changed: +805 −23 lines
parlai/agents/transformer/biencoder.py (new file, +36 lines)

@@ -0,0 +1,36 @@
# Copyright (c) Facebook, Inc. and its affiliates.
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from .transformer import TransformerRankerAgent
from parlai.core.torch_ranker_agent import TorchRankerAgent
import torch


class BiencoderAgent(TransformerRankerAgent):
    """ Equivalent of bert_ranker/biencoder but does not rely on an external
    library (hugging face).
    """

    def __init__(self, opt, shared=None):
        super().__init__(opt, shared)
        # favor average instead of sum for the loss.
        self.rank_loss = torch.nn.CrossEntropyLoss(reduce=True, size_average=True)
        if self.use_cuda:
            self.rank_loss.cuda()

    def vectorize(self, *args, **kwargs):
        """ Add the start and end token to the text.
        """
        kwargs['add_start'] = True
        kwargs['add_end'] = True
        obs = TorchRankerAgent.vectorize(self, *args, **kwargs)
        return obs

    def _set_text_vec(self, *args, **kwargs):
        """ Add the start and end token to the text.
        """
        obs = super()._set_text_vec(*args, **kwargs)
        if 'text_vec' in obs:
            obs['text_vec'] = self._add_start_end_tokens(obs['text_vec'], True, True)
        return obs
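
For illustration only (not part of the diff): the "favor average instead of sum" comment switches the ranking loss to an averaged cross-entropy; reduce=True, size_average=True is the older spelling of reduction='mean' in current PyTorch. A minimal sketch with made-up scores:

# Illustration only -- not part of the patch. size_average=True (reduction='mean')
# averages the candidate-ranking loss over the batch instead of summing it.
import torch

scores = torch.tensor([[9.0, 1.0, 0.5],
                       [0.2, 8.0, 1.0]])   # batch x n_candidates (hypothetical values)
targets = torch.tensor([0, 1])             # index of the gold candidate in each row

loss_mean = torch.nn.CrossEntropyLoss(reduction='mean')(scores, targets)
loss_sum = torch.nn.CrossEntropyLoss(reduction='sum')(scores, targets)
assert torch.allclose(loss_mean, loss_sum / scores.size(0))
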
parlai/agents/transformer/crossencoder.py (new file, +146 lines)

@@ -0,0 +1,146 @@
# Copyright (c) Facebook, Inc. and its affiliates.
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# hack to make sure -m transformer/generator works as expected
from .modules import TransformerEncoder
from .modules import get_n_positions_from_options
from parlai.core.torch_ranker_agent import TorchRankerAgent
from .transformer import TransformerRankerAgent
import torch


class CrossencoderAgent(TorchRankerAgent):
    """ Equivalent of bert_ranker/crossencoder but does not rely on an external
    library (hugging face).
    """

    def __init__(self, opt, shared=None):
        super().__init__(opt, shared)
        self.rank_loss = torch.nn.CrossEntropyLoss(reduce=True, size_average=True)
        if self.use_cuda:
            self.rank_loss.cuda()
        self.data_parallel = opt.get('data_parallel') and self.use_cuda
        if self.data_parallel:
            from parlai.core.distributed_utils import is_distributed

            if is_distributed():
                raise ValueError('Cannot combine --data-parallel and distributed mode')
            self.model = torch.nn.DataParallel(self.model)

    @classmethod
    def add_cmdline_args(cls, argparser):
        """Add command-line arguments specifically for this agent."""
        TransformerRankerAgent.add_cmdline_args(argparser)
        return argparser

    def build_model(self, states=None):
        self.model = CrossEncoderModule(self.opt, self.dict, self.NULL_IDX)
        return self.model

    def vectorize(self, *args, **kwargs):
        """ Add the start and end token to the text.
        """
        kwargs['add_start'] = True
        kwargs['add_end'] = True
        obs = super().vectorize(*args, **kwargs)
        return obs

    def _set_text_vec(self, *args, **kwargs):
        """ Add the start and end token to the text.
        """
        obs = super()._set_text_vec(*args, **kwargs)
        if 'text_vec' in obs:
            obs['text_vec'] = self._add_start_end_tokens(obs['text_vec'], True, True)
        return obs

    def concat_without_padding(self, text_idx, cand_idx, null_idx=0):
        """ if text_idx = [[1, 2, 3, 4, 0, 0 ]]
            and cand_idx = [[5, 6, 7, 8, 0, 0 ]]
            then result = (tokens, segments) where
            tokens = [[1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0]]
            segments = [[0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0]]
        """
        assert text_idx.size(0) == cand_idx.size(0)
        assert len(text_idx.size()) == 2
        assert len(cand_idx.size()) == 2
        segments_idx = [0, 1]
        text_idx = text_idx.cpu()
        cand_idx = cand_idx.cpu()
        cand_len = cand_idx.size(1)
        concat_len = text_idx.size(1) + cand_idx.size(1)
        tokens = text_idx.new_zeros(text_idx.size(0), concat_len) + null_idx
        segments = text_idx.new_zeros(text_idx.size(0), concat_len) + null_idx
        for i in range(len(tokens)):
            non_nuls = torch.sum(text_idx[i, :] != null_idx)
            tokens[i, 0:non_nuls] = text_idx[i, 0:non_nuls]
            segments[i, 0:non_nuls] = segments_idx[0]
            tokens[i, non_nuls : non_nuls + cand_len] = cand_idx[i, :]
            segments[i, non_nuls : non_nuls + cand_len] = segments_idx[1]
        if self.use_cuda:
            tokens = tokens.cuda()
            segments = segments.cuda()
        return tokens, segments

    def score_candidates(self, batch, cand_vecs, cand_encs=None):
        if cand_encs is not None:
            raise Exception(
                'Candidate pre-computation is impossible on the crossencoder'
            )
        num_cands_per_sample = cand_vecs.size(1)
        bsz = cand_vecs.size(0)
        text_idx = (
            batch.text_vec.unsqueeze(1)
            .expand(-1, num_cands_per_sample, -1)
            .contiguous()
            .view(num_cands_per_sample * bsz, -1)
        )
        cand_idx = cand_vecs.view(num_cands_per_sample * bsz, -1)
        tokens, segments = self.concat_without_padding(
            text_idx, cand_idx, self.NULL_IDX
        )
        scores = self.model(tokens, segments)
        scores = scores.view(bsz, num_cands_per_sample)
        return scores


class CrossEncoderModule(torch.nn.Module):
    """ A simple wrapper around the transformer encoder which adds a linear
    layer.
    """

    def __init__(self, opt, dict, null_idx):
        super(CrossEncoderModule, self).__init__()
        n_positions = get_n_positions_from_options(opt)
        embeddings = torch.nn.Embedding(
            len(dict), opt['embedding_size'], padding_idx=null_idx
        )
        torch.nn.init.normal_(embeddings.weight, 0, opt['embedding_size'] ** -0.5)
        self.encoder = TransformerEncoder(
            n_heads=opt['n_heads'],
            n_layers=opt['n_layers'],
            embedding_size=opt['embedding_size'],
            ffn_size=opt['ffn_size'],
            vocabulary_size=len(dict),
            embedding=embeddings,
            dropout=opt['dropout'],
            attention_dropout=opt['attention_dropout'],
            relu_dropout=opt['relu_dropout'],
            padding_idx=null_idx,
            learn_positional_embeddings=opt['learn_positional_embeddings'],
            embeddings_scale=opt['embeddings_scale'],
            reduction_type=opt.get('reduction_type', 'first'),
            n_positions=n_positions,
            n_segments=2,
            activation=opt['activation'],
            variant=opt['variant'],
            output_scaling=opt['output_scaling'],
        )
        self.linear_layer = torch.nn.Linear(opt['embedding_size'], 1)

    def forward(self, tokens, segments):
        """ Scores each concatenation text + candidate.
        """
        encoded = self.encoder(tokens, None, segments)
        res = self.linear_layer(encoded)
        return res
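
For illustration (not part of the diff), here is a standalone sketch of the packing that concat_without_padding performs: each candidate is appended right after the last non-padding context token, and segment id 1 marks the candidate tokens. The helper below is a simplified re-implementation for the docstring example, not the agent's method; the agent's version additionally moves tensors between CPU and GPU and marks segment 1 over the candidate's full padded width (those extra positions hold the null token and are treated as padding by the encoder).

# Illustration only -- simplified re-implementation of the packing idea.
import torch

def pack_pair(text_idx, cand_idx, null_idx=0):
    bsz, cand_len = cand_idx.size()
    concat_len = text_idx.size(1) + cand_len
    tokens = text_idx.new_full((bsz, concat_len), null_idx)
    segments = text_idx.new_zeros(bsz, concat_len)
    for i in range(bsz):
        n_text = int((text_idx[i] != null_idx).sum())   # non-padding context length
        n_cand = int((cand_idx[i] != null_idx).sum())   # non-padding candidate length
        tokens[i, :n_text] = text_idx[i, :n_text]
        tokens[i, n_text:n_text + cand_len] = cand_idx[i]
        segments[i, n_text:n_text + n_cand] = 1          # segment 1 = candidate tokens
    return tokens, segments

text = torch.tensor([[1, 2, 3, 4, 0, 0]])
cand = torch.tensor([[5, 6, 7, 8, 0, 0]])
tokens, segments = pack_pair(text, cand)
print(tokens.tolist())    # [[1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0]]
print(segments.tolist())  # [[0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0]]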

parlai/agents/transformer/modules.py (+49 −20 lines)

@@ -76,6 +76,7 @@ def _build_encoder(
        n_segments=n_segments,
        activation=opt['activation'],
        variant=opt['variant'],
+        output_scaling=opt['output_scaling'],
    )


@@ -111,6 +112,22 @@ def gelu(tensor):
    return 0.5 * tensor * (1.0 + torch.erf(tensor / math.sqrt(2.0)))


+def get_n_positions_from_options(opt):
+    if opt.get('n_positions'):
+        # if the number of positions is explicitly provided, use that
+        n_positions = opt['n_positions']
+    else:
+        # else, use the worst case from truncate
+        n_positions = max(
+            opt.get('truncate') or 0,
+            opt.get('text_truncate') or 0,
+            opt.get('label_truncate') or 0,
+        )
+    if n_positions == 0:
+        n_positions = 1024
+    return n_positions
+
+
class TransformerMemNetModel(nn.Module):
    """Model which takes context, memories, candidates and encodes them."""

@@ -135,19 +152,7 @@ def __init__(self, opt, dictionary):
        if not self.share_word_embedding:
            self.cand_embeddings.weight.requires_grad = False

-        if opt.get('n_positions'):
-            # if the number of positions is explicitly provided, use that
-            n_positions = opt['n_positions']
-        else:
-            # else, use the worst case from truncate
-            n_positions = max(
-                opt.get('truncate') or 0,
-                opt.get('text_truncate') or 0,
-                opt.get('label_truncate') or 0,
-            )
-            if n_positions == 0:
-                # default to 1024
-                n_positions = 1024
+        n_positions = get_n_positions_from_options(opt)

        if n_positions < 0:
            raise ValueError('n_positions must be positive')
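
A quick illustration of the fallback order the new helper encodes, assuming it is imported from the patched parlai.agents.transformer.modules (as crossencoder.py above does): an explicit n_positions wins, otherwise the largest truncate value is used, and 1024 is the final default.

# Illustration only -- resolving n_positions from an opt dict.
from parlai.agents.transformer.modules import get_n_positions_from_options

assert get_n_positions_from_options({'n_positions': 2048}) == 2048   # explicit value wins
assert get_n_positions_from_options({'text_truncate': 360, 'label_truncate': 72}) == 360
assert get_n_positions_from_options({}) == 1024                      # nothing set: default
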
@@ -192,7 +197,9 @@ def __init__(self, opt, dictionary):
        else:
            self.memory_transformer = self.context_encoder

-        self.attender = BasicAttention(dim=2, attn=opt['memory_attention'])
+        self.attender = BasicAttention(
+            dim=2, attn=opt['memory_attention'], residual=True
+        )

    def encode_cand(self, words):
        """Encode the candidates."""

@@ -318,6 +325,8 @@ class TransformerEncoder(nn.Module):
    :param variant:
        Which transformer architecture to use. Could be AIAYN or XLM.
        Future versions may support things like GPT-2, ...
+    :param output_scaling:
+        Scale the outputs by a given scalar
    """

    def __init__(

@@ -339,6 +348,7 @@ def __init__(
        activation='relu',
        variant='aiayn',
        n_segments=0,
+        output_scaling=1.0,
    ):
        super(TransformerEncoder, self).__init__()

@@ -414,6 +424,7 @@ def __init__(
                    activation=activation,
                )
            )
+        self.output_scaling = output_scaling

    def forward(self, input, positions=None, segments=None):
        """

@@ -457,6 +468,7 @@ def forward(self, input, positions=None, segments=None):
        for i in range(self.n_layers):
            tensor = self.layers[i](tensor, mask)

+        tensor *= self.output_scaling
        if self.reduction_type == 'first':
            return tensor[:, 0, :]
        elif self.reduction_type == 'max':
@@ -805,29 +817,46 @@ def output(self, tensor):
class BasicAttention(nn.Module):
    """Implements simple/classical attention."""

-    def __init__(self, dim=1, attn='cosine'):
+    def __init__(self, dim=1, attn='cosine', residual=False, get_weights=True):
        super().__init__()
        self.softmax = nn.Softmax(dim=dim)
        if attn == 'cosine':
            self.cosine = nn.CosineSimilarity(dim=dim)
        self.attn = attn
        self.dim = dim
+        self.get_weights = get_weights
+        self.residual = residual

-    def forward(self, xs, ys):
-        """Forward pass."""
+    def forward(self, xs, ys, mask_ys=None):
+        """ xs: B x query_len x dim
+            ys: B x key_len x dim
+            TODO: Document this
+        """
+        bsz = xs.size(0)
+        y_len = ys.size(1)
+        x_len = xs.size(1)
        if self.attn == 'cosine':
            l1 = self.cosine(xs, ys).unsqueeze(self.dim - 1)
        else:
            l1 = torch.bmm(xs, ys.transpose(1, 2))
            if self.attn == 'sqrt':
                d_k = ys.size(-1)
                l1 = l1 / math.sqrt(d_k)
+        if mask_ys is not None:
+            attn_mask = (mask_ys == 0).view(bsz, 1, y_len)
+            attn_mask = attn_mask.repeat(1, x_len, 1)
+            l1.masked_fill_(attn_mask, -float('inf'))
        l2 = self.softmax(l1)
        lhs_emb = torch.bmm(l2, ys)
-        # add back the query
-        lhs_emb = lhs_emb.add(xs)

-        return lhs_emb.squeeze(self.dim - 1), l2
+        # add back the query
+        if self.residual:
+            lhs_emb = lhs_emb.add(xs)
+
+        if self.get_weights:
+            return lhs_emb.squeeze(self.dim - 1), l2
+        else:
+            return lhs_emb.squeeze(self.dim - 1)


class MultiHeadAttention(nn.Module):
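
For illustration (not part of the diff): a minimal use of the reworked BasicAttention, assuming it is imported from the patched parlai.agents.transformer.modules. mask_ys zeroes out attention over padded key positions, while residual and get_weights make the old add-back-the-query and return-the-weights behaviours optional. Shapes follow the docstring (xs: B x query_len x dim, ys: B x key_len x dim).

# Illustration only -- exercising the new mask_ys / residual / get_weights options.
import torch
from parlai.agents.transformer.modules import BasicAttention

attn = BasicAttention(dim=2, attn='sqrt', residual=False, get_weights=True)
xs = torch.randn(2, 1, 8)                 # B x query_len x dim
ys = torch.randn(2, 5, 8)                 # B x key_len x dim
mask_ys = torch.tensor([[1, 1, 1, 0, 0],  # 0 marks padded key positions
                        [1, 1, 1, 1, 1]])

out, weights = attn(xs, ys, mask_ys=mask_ys)
assert out.shape == (2, 8)                # the query_len of 1 is squeezed out
assert torch.all(weights[0, :, 3:] == 0)  # padded keys get exactly zero weight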
