This repository has been archived by the owner on Dec 16, 2022. It is now read-only.

[NOT FOR MERGING] Transfer learning bug #4209

Open
wants to merge 2 commits into base: main

Changes from all commits
1 change: 1 addition & 0 deletions allennlp/common/from_params.py
@@ -275,6 +275,7 @@ def pop_and_construct_arg(
f"The module from model at {archive_file} at path {module_path} "
f"was expected of type {annotation} but is of type {type(result)}"
)
print("_PRETRAINED FINISHED")
return result

popped_params = params.pop(name, default) if default != _NO_DEFAULT else params.pop(name)
4 changes: 4 additions & 0 deletions allennlp/data/vocabulary.py
@@ -277,6 +277,7 @@ def from_instances(
of what the other parameters do.
"""
logger.info("Fitting token dictionary from dataset.")
print("FROM INSTANCES")
padding_token = padding_token if padding_token is not None else DEFAULT_PADDING_TOKEN
oov_token = oov_token if oov_token is not None else DEFAULT_OOV_TOKEN
namespace_token_counts: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(int))
@@ -361,6 +362,7 @@ def from_files_and_instances(
"""
vocab = cls.from_files(directory, padding_token, oov_token)
logger.info("Fitting token dictionary from dataset.")
print("FROM FILES AND INSTANCES")
namespace_token_counts: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(int))
for instance in Tqdm.tqdm(instances):
instance.count_vocab_items(namespace_token_counts)
@@ -374,6 +376,7 @@ def from_files_and_instances(
tokens_to_add=tokens_to_add,
min_pretrained_embeddings=min_pretrained_embeddings,
)
print(f"\n\nVOCABULARY: {vocab}\n\n")
return vocab

@classmethod
@@ -446,6 +449,7 @@ def set_from_file(

def extend_from_instances(self, instances: Iterable["adi.Instance"]) -> None:
logger.info("Fitting token dictionary from dataset.")
print("EXTEND FROM INSTANCES")
namespace_token_counts: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(int))
for instance in Tqdm.tqdm(instances):
instance.count_vocab_items(namespace_token_counts)
1 change: 1 addition & 0 deletions allennlp/models/basic_classifier.py
@@ -92,6 +92,7 @@ def __init__(
self._accuracy = CategoricalAccuracy()
self._loss = torch.nn.CrossEntropyLoss()
initializer(self)
self.extend_embedder_vocab()

def forward( # type: ignore
self, tokens: TextFieldTensors, label: torch.IntTensor = None
8 changes: 8 additions & 0 deletions allennlp/modules/token_embedders/embedding.py
@@ -241,6 +241,7 @@ def extend_vocab(
to give a helpful error message when extend_vocab is implicitly called
by train or any other command.
"""
print("\n\nEXTENDING VOCAB\n\n")
# Caveat: For allennlp v0.8.1 and below, we weren't storing vocab_namespace as an attribute,
# knowing which is necessary at time of embedding vocab extension. So old archive models are
# currently unextendable.
@@ -255,6 +256,13 @@ def extend_vocab(
return

extended_num_embeddings = extended_vocab.get_vocab_size(vocab_namespace)
print(f"VOCAB: {extended_vocab}")
print(f"NAMESPACE: {vocab_namespace}")
print(f"EXTENDED_NUM_EMBEDDINGS: {extended_num_embeddings}")
print(f"NUM EMBEDDINGS: {self.num_embeddings}")
import traceback
for line in traceback.format_stack()[:-5]:
print(line)
if extended_num_embeddings == self.num_embeddings:
# It's already been extended. No need to initialize / read pretrained file in first place (no-op)
return
50 changes: 50 additions & 0 deletions allennlp/tests/fixtures/data/esnli_train.jsonl

Large diffs are not rendered by default.

50 changes: 50 additions & 0 deletions allennlp/tests/fixtures/data/movies_train.jsonl

Large diffs are not rendered by default.

72 changes: 72 additions & 0 deletions allennlp/tests/transfer_learning/fixtures/esnli.jsonnet
@@ -0,0 +1,72 @@
local embedding_dim = 100;
local seq_encoder = {
"type": "lstm",
"input_size": embedding_dim,
"hidden_size": embedding_dim,
"num_layers": 1,
"bidirectional": true
};

{
"dataset_reader": {
"type": "snli",
"token_indexers": {
"tokens": {
"type": "single_id",
"lowercase_tokens": true
}
}
},
"train_data_path": "allennlp/tests/fixtures/data/esnli_train.jsonl",
"validation_data_path": "allennlp/tests/fixtures/data/esnli_train.jsonl",
"model": {
"type": "esim",
"text_field_embedder": {
"token_embedders": {
"tokens": {
"type": "embedding",
"pretrained_file": "allennlp/tests/fixtures/embeddings/glove.6B.100d.sample.txt.gz", //"https://s3-us-west-2.amazonaws.com/allennlp/datasets/glove/glove.6B.100d.txt.gz",
"embedding_dim": embedding_dim,
"trainable": true
}
}
},
"encoder": seq_encoder,
"matrix_attention": {
"type": "dot_product"
},
"projection_feedforward": {
"input_dim": 8*embedding_dim,
"hidden_dims": embedding_dim,
"num_layers": 1,
"activations": "relu"
},
"inference_encoder": seq_encoder,
"output_feedforward": {
"input_dim": 8*embedding_dim,
"num_layers": 1,
"hidden_dims": embedding_dim,
"activations": "relu",
"dropout": 0.5
},
"output_logit": {
"input_dim": embedding_dim,
"num_layers": 1,
"hidden_dims": 3,
"activations": "linear"
},
},
"data_loader": {
"type": "default",
"batch_size": 10
},
"trainer": {
"num_epochs": 1,
"cuda_device": -1,
"validation_metric": "+accuracy",
"optimizer": {
"type": "adam",
"lr": 5e-4
},
}
}
52 changes: 52 additions & 0 deletions allennlp/tests/transfer_learning/fixtures/movies.jsonnet
@@ -0,0 +1,52 @@
local pretrained = function(module_path, frozen=false) {"_pretrained": {
"archive_file": std.extVar("ARCHIVE_PATH"),
"module_path": module_path,
"freeze": frozen
}};

{
"dataset_reader": {
"type": "text_classification_json",
"tokenizer": {
"type": "whitespace"
}
},

"train_data_path": "allennlp/tests/fixtures/data/movies_train.jsonl",
"validation_data_path": "allennlp/tests/fixtures/data/movies_train.jsonl",
"vocabulary": {
"type": "extend",
"directory": "/tmp/taskA/vocabulary"
},
"model": {
"type": "basic_classifier",
"text_field_embedder": pretrained("_text_field_embedder"),
"seq2seq_encoder": pretrained("_encoder"),
"seq2vec_encoder": {
"type": "boe",
"embedding_dim": 200
},
"feedforward": {
"input_dim": 200,
"num_layers": 2,
"hidden_dims": [200, 2],
"activations": ["relu", "linear"],
"dropout": [0.15, 0.0]
},
},
"data_loader": {
"type": "default",
"batch_size": 10
},
"trainer": {
"num_epochs": 1,
"patience": 5,
"cuda_device": -1,
"grad_norm": 40,
"validation_metric": "+accuracy",
"optimizer": {
"type": "adam",
"lr": 5e-3
},
}
}
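
Note: the pretrained helper above wraps a module path in a "_pretrained" spec, which pop_and_construct_arg (instrumented in from_params.py above) resolves by loading that submodule from the archived taskA model. With ARCHIVE_PATH set to /tmp/taskA, pretrained("_text_field_embedder") evaluates to roughly:

{
  "_pretrained": {
    "archive_file": "/tmp/taskA",
    "module_path": "_text_field_embedder",
    "freeze": false
  }
}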
Empty file.
22 changes: 22 additions & 0 deletions allennlp/tests/transfer_learning/models/transfer_learning_test.py
@@ -0,0 +1,22 @@
# pylint: disable=invalid-name,protected-access
import pathlib, json, os

from allennlp_models.nli import snli_reader
from allennlp.common.testing import ModelTestCase
from allennlp.common.testing.test_case import TEST_DIR
from allennlp.commands.train import train_model, train_model_from_file

os.environ['ARCHIVE_PATH'] = "/tmp/taskA"

class TransferLearningTest(ModelTestCase):
def setUp(self):
super().setUp()
self.set_up_model('allennlp/tests/transfer_learning/fixtures/esnli.jsonnet',
'allennlp/tests/fixtures/data/esnli_train.jsonl')

def test_taskA_end_to_end(self):
train_model_from_file("allennlp/tests/transfer_learning/fixtures/esnli.jsonnet", serialization_dir="/tmp/taskA", force=True)

def test_taskB_end_to_end(self):
train_model_from_file("allennlp/tests/transfer_learning/fixtures/movies.jsonnet", serialization_dir="/tmp/taskB", force=True)
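
Note: the two tests are order-dependent. test_taskA_end_to_end must run first so that /tmp/taskA (including /tmp/taskA/vocabulary, which movies.jsonnet extends) exists before test_taskB_end_to_end builds the classifier from the archived modules and hits the instrumented extend_vocab path.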