Continuous evaluations init commit (facebookresearch#325)

Summary: Create a script that continuously evaluates benchmarks as they become available from a pretraining. ![Uploading Screen Shot 2021-06-02 at 10.22.01 AM.png…]() ![Uploading Screen Shot 2021-06-02 at 10.22.19 AM.png…]() <img width="593" alt="Screen Shot 2021-06-02 at 10 22 37 AM" src="https://user-images.githubusercontent.com/25669348/120497511-7888c880-c38c-11eb-8bc1-78bacc5d968b.png"> <img width="1237" alt="Screen Shot 2021-06-02 at 10 22 59 AM" src="https://user-images.githubusercontent.com/25669348/120497575-85a5b780-c38c-11eb-9445-2076e15be888.png"> Next Steps: 1. Deal with sharded checkpoints and their conversion 1. Improve max_iteration logic 1. Extend to FB infra. 1. Write unit tests 1. Think about how these tricky evaluation tests: facebookresearch#325 (comment) 1. Try not to replicate so much logic in the class (e.g. get path names from vissl code, requires some refactoring). 1. Look into email notifications. Testing: 1. Run 8node Swav with 10 epochs with 3 different benchmark evaluations with different resource requirements. SUCCESS. 
json config: ``` { "params": { "training_checkpoint_dir": "/checkpoint/iseessel/vissl/2021-06-09-11-19-12/checkpoints", "benchmarks": [ { "evaluation_name": "clevr_count_linear", "config_files": [ "config=config_local/eval_resnet_8gpu_transfer_clevr_count_linear_benchmark_suite_scheduler_test.yaml" ] }, { "evaluation_name": "clevr_dist_linear", "config_files": [ "config=config_local/eval_resnet_8gpu_transfer_clevr_dist_linear_benchmark_suite_scheduler_test.yaml" ] }, { "evaluation_name": "in1k_linear", "config_files": [ "config=config_local/eval_resnet_8gpu_transfer_in1k_linear_benchmark_suite_scheduler_test.yaml" ] } ], "evaluation_iter_freq": 600, "evaluation_phase_freq": 2, "evaluate_final_phase": true, "autoload_slurm_evaluator_checkpoint": false, "slurm_evaluator_checkpoint": null, "auto_retry_evaluations": true, "retry_evaluation_job_ids": [], "max_retries": 3, "pytorch_ports": [40050, 40051, 40052, 40053, 40054, 40055, 40056, 40057, 40058, 40059, 40060, 40061, 40062, 40063] }, "slurm_options": { "PARTITION": "learnfair" } } ``` Example snippet from `evaluation_metrics.json`: ``` { "model_final_checkpoint_phase9": [ { "checkpoint_dir": "/checkpoint/iseessel/vissl/2021-06-09-11-19-12/checkpoints/evaluations/model_final_checkpoint_phase9/clevr_count_linear/checkpoints", "config_files": [ "config=config_local/eval_resnet_8gpu_transfer_clevr_count_linear_benchmark_suite_scheduler_test.yaml", "hydra.run.dir='/checkpoint/iseessel/vissl/2021-06-09-11-19-12/checkpoints/evaluations/model_final_checkpoint_phase9/clevr_count_linear'", "config.CHECKPOINT.DIR='/checkpoint/iseessel/vissl/2021-06-09-11-19-12/checkpoints/evaluations/model_final_checkpoint_phase9/clevr_count_linear/checkpoints'", "config.SLURM.LOG_FOLDER='/checkpoint/iseessel/vissl/2021-06-09-11-19-12/checkpoints/evaluations/model_final_checkpoint_phase9/clevr_count_linear'", 
"config.SLURM.LOG_FOLDER='/checkpoint/iseessel/vissl/2021-06-09-11-19-12/checkpoints/evaluations/model_final_checkpoint_phase9/clevr_count_linear'", "config.SLURM.USE_SLURM=true", "config.MODEL.WEIGHTS_INIT.PARAMS_FILE='/checkpoint/iseessel/vissl/2021-06-09-11-19-12/checkpoints/model_final_checkpoint_phase9.torch'" ], "evaluation_name": "clevr_count_linear", "job_id": "42410489", "metrics": { "test_accuracy_list_meter_top_1_res5": { "iteration": 822, "metric": 34.62, "train_phase_idx": 2 }, "train_accuracy_list_meter_top_1_res5": { "iteration": 822, "metric": 33.8514, "train_phase_idx": 2 } }, "num_retries": 1, "slurm_checkpoint_dir": "/checkpoint/iseessel/vissl/2021-06-09-11-19-12/checkpoints/evaluations/model_final_checkpoint_phase9/clevr_count_linear/checkpoints", "slurm_log_dir": "/checkpoint/iseessel/vissl/2021-06-09-11-19-12/checkpoints/evaluations/model_final_checkpoint_phase9/clevr_count_linear", "slurm_state": "COMPLETED", "weights_init_params_file": "/checkpoint/iseessel/vissl/2021-06-09-11-19-12/checkpoints/model_final_checkpoint_phase9.torch" }, ... ``` The following hold: 1. Training completes appropriately, w/o errors. 1. Able to resume checkpoints. 1. Evaluation folder structure is as expected above. 1. Best Metrics are extracted. Pull Request resolved: facebookresearch#325 Reviewed By: prigoyal Differential Revision: D28901750 Pulled By: iseessel fbshipit-source-id: 732074043200ac51f3e709d5e67e686f26d36835
iseessel · Jun 14, 2021 · b9c5b34 · b9c5b34
1 parent 5376448
commit b9c5b34
Show file tree

Hide file tree

Showing 11 changed files with 944 additions and 2 deletions.
diff --git a/.gitignore b/.gitignore
@@ -97,5 +97,5 @@ website/pages/tutorials/*
 **/.ipynb_checkpoints/**
 
 # Configs for local development
-configs/config_local/*
+configs/config/config_local/*
 train_config.yaml
diff --git a/dev/__init__.py b/dev/__init__.py
diff --git a/dev/benchmark_suite/benchmark_suite_scheduler_defaults.json b/dev/benchmark_suite/benchmark_suite_scheduler_defaults.json
@@ -0,0 +1,22 @@
+{
+    "params": {
+           "evaluation_iter_freq": -1,
+           "evaluation_phase_freq": -1,
+           "evaluate_final_phase": true,
+           "autoload_slurm_evaluator_checkpoint": false,
+           "slurm_evaluator_checkpoint": null,
+           "auto_retry_evaluations": false,
+           "retry_evaluation_job_ids": [],
+           "max_retries": 3,
+           "pytorch_ports": [40050]
+       },
+       "slurm_options": {
+            "NAME": "vissl",
+            "COMMENT": "vissl evaluation job",
+            "CONSTRAINT": "",
+            "TIMEOUT_MIN": 4320,
+            "CPUS_PER_TASK": 8,
+            "MEM_GB": 16,
+            "ADDITIONAL_PARAMETERS": {}
+        }
+}
diff --git a/dev/benchmark_suite/benchmark_suite_scheduler_template.json b/dev/benchmark_suite/benchmark_suite_scheduler_template.json
@@ -0,0 +1,33 @@
+{
+    "params": {
+           "training_checkpoint_dir": "(str) Training checkpoint directory. That is the CHECKPOINT.DIR of the training config",
+           "benchmarks": [
+               {
+                   "evaluation_name": "(str) Name of benchmark for convenience",
+                   "config_files": [
+                       "config=path/to/evaluation/config",
+                       "config.OVERRIDES=new_value"
+                   ]
+               }
+           ],
+           "evaluation_iter_freq": "(int, default=-1) Evaluate the checkpoint every N iterations",
+           "evaluation_phase_freq": "(int, default=-1) Evaluate the checkpoint every N phases",
+           "evaluate_final_phase": "(bool, default=True) Evaluate the final phase",
+           "autoload_slurm_evaluator_checkpoint": "(bool, default=False) Whether or not to automatically load the benchmark checkpoint",
+           "slurm_evaluator_checkpoint": "(str, default=None) Path to load the benchmark checkpoint",
+           "auto_retry_evaluations": "(bool, default=False) Whether or not to automatically retry the evaluations",
+           "retry_evaluation_job_ids": "(array[int], default=[]) Array of job_ids to retry",
+           "max_retries": "(int, default=3) Maximum number of retries",
+           "pytorch_ports": "(List[int], default=[40050]) List of pytorch ports to cycle through as you are launching your evaluations, in order to prevent PyTorch DDP port collisions."
+       },
+       "slurm_options": {
+            "PARTITION": "(str) Partition",
+            "NAME": "(str, default=vissl) Name of slurm job",
+            "COMMENT": "(str, default=vissl evaluation job) Comment of slurm job",
+            "CONSTRAINT": "(str, default='') Constraint of slurm job",
+            "TIMEOUT_MIN": "(int, default=72 * 60) Timeout of the slurm job, in minutes",
+            "CPUS_PER_TASK": "(int, default=8) Number of cpus per task.",
+            "MEM_GB": "(int, default=16) Amount of RAM, in GB, to request from slurm",
+            "ADDITIONAL_PARAMETERS": "(Dict[str, Any], default={}) Any additional slurm options to pass to submitit"
+    }
+}
diff --git a/dev/launch_benchmark_suite_scheduler_slurm.sh b/dev/launch_benchmark_suite_scheduler_slurm.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+# This benchmark suite script launches a benchmark suite scheduler slurm job.
+# The job takes an absolute json config path (see benchmark_suite_scheduler_template.json for info)
+# The job continuously monitors training benchmarks, and dynamically launches evaluation jobs
+# and amalgamates the results.
+
+######################### EXAMPLE USAGE #################################
+
+# cd into vissl root directory.
+#
+# bash ./dev/launch_benchmark_suite_scheduler_slurm.sh /path/to/benchmark_suite_scheduler.json
+
+# See benchmark_suite_scheduler_template.json for config information, or vissl/utils/benchmark_suite_scheduler.py for class structure.
+######################### INPUT PARAMS ##################################
+
+# All command-line arguments; the first one is the json config path.
+FILE=( "$@" )
+
+# Fail early with a usage message: without this check the whole repository is
+# copied to the experiment dir before the python launcher crashes on a missing
+# argument.
+if [ "${#FILE[@]}" -lt 1 ]; then
+    echo "Usage: bash ./dev/launch_benchmark_suite_scheduler_slurm.sh /path/to/benchmark_suite_scheduler.json" >&2
+    exit 1
+fi
+
+####################### setup experiment dir ###################################
+
+# create a temporary experiment folder to run the SLURM job in isolation
+RUN_ID=$(date +'%Y-%m-%d-%H-%M-%S')
+EXP_ROOT_DIR="/checkpoint/$USER/vissl/$RUN_ID"
+
+echo "EXP_ROOT_DIR: $EXP_ROOT_DIR"
+echo "CONFIG_FILE: ${FILE[0]}"
+
+# start from an empty directory, then snapshot the current working tree into it
+rm -rf "$EXP_ROOT_DIR"
+mkdir -p "$EXP_ROOT_DIR"
+cp -r . "$EXP_ROOT_DIR"
+
+####################### launch the scheduler ###################################
+# run the launcher from the snapshot so later edits to the working tree do not
+# affect the submitted job
+export PYTHONPATH="$EXP_ROOT_DIR/:$PYTHONPATH"
+python -u "$EXP_ROOT_DIR/tools/launch_benchmark_suite_scheduler_slurm.py" \
+    "${FILE[@]}"
diff --git a/tools/launch_benchmark_suite_scheduler_slurm.py b/tools/launch_benchmark_suite_scheduler_slurm.py
@@ -0,0 +1,107 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import sys
+
+import pkg_resources
+import submitit
+from fvcore.common.file_io import PathManager
+from vissl.config.attr_dict import AttrDict
+from vissl.utils.benchmark_suite_scheduler import BenchmarkSuiteScheduler
+from vissl.utils.hydra_config import is_hydra_available
+from vissl.utils.io import load_file
+from vissl.utils.misc import recursive_dict_merge
+from vissl.utils.slurm import is_submitit_available
+
+
+# Default config options, loaded from the json file packaged under the `dev`
+# package (benchmark_suite/benchmark_suite_scheduler_defaults.json). The user
+# config is merged on top of these in launch_benchmark_suite_scheduler.
+default_config_file = pkg_resources.resource_filename(
+    "dev", "benchmark_suite/benchmark_suite_scheduler_defaults.json"
+)
+_DEFAULT_CONFIG = load_file(default_config_file)
+
+
+class SlurmEvaluatorJob:
+    """
+    The slurm evaluator job is a thin wrapper around BenchmarkSuiteScheduler
+    used by submitit. It's main function is to run multiple evaluations
+    on a single training.
+    """
+
+    def __init__(self, benchmark_suite_scheduler: BenchmarkSuiteScheduler):
+        self.benchmark_suite_scheduler = benchmark_suite_scheduler
+
+    def __call__(self):
+        self.benchmark_suite_scheduler.evaluate()
+
+    def checkpoint(self):
+        """
+        This method is called whenever a job is pre-empted, timedout, etc,.
+        Here we save the evaluation benchmarks, so that we can reload them
+        and continue where we left off.
+        """
+        self.benchmark_suite_scheduler.save_evaluation_benchmarks()
+        # Forces the benchmark_suite_scheduler to automatically reload it's
+        # checkpoint, the benchmark results.
+        self.benchmark_suite_scheduler.autoload_benchmark_suite_scheduler_checkpoint = (
+            True
+        )
+
+        trainer = SlurmEvaluatorJob(
+            benchmark_suite_scheduler=self.benchmark_suite_scheduler
+        )
+        return submitit.helpers.DelayedSubmission(trainer)
+
+
+def launch_benchmark_suite_scheduler(config_file):
+    assert PathManager.exists(config_file), "Slurm evaluator config file must exist"
+
+    user_config = load_file(config_file)
+    config = _DEFAULT_CONFIG.copy()
+    recursive_dict_merge(config, user_config)
+
+    benchmark_suite_scheduler = BenchmarkSuiteScheduler(**config["params"])
+    benchmark_suite_scheduler_job = SlurmEvaluatorJob(
+        benchmark_suite_scheduler=benchmark_suite_scheduler
+    )
+    executor = submitit.AutoExecutor(folder=benchmark_suite_scheduler.evaluation_dir())
+
+    assert "slurm_options" in config, "slurm_options must be specified"
+    assert (
+        "PARTITION" in config["slurm_options"]
+    ), "slurm_options.PARTITION is a required field to launch the benchmark suite on slurm"
+
+    slurm_options = AttrDict(config["slurm_options"])
+    executor.update_parameters(
+        name=slurm_options.NAME,
+        slurm_comment=slurm_options.COMMENT,
+        slurm_partition=slurm_options.PARTITION,
+        slurm_constraint=slurm_options.CONSTRAINT,
+        timeout_min=slurm_options.TIMEOUT_MIN,
+        nodes=1,
+        cpus_per_task=slurm_options.CPUS_PER_TASK,
+        tasks_per_node=1,
+        mem_gb=slurm_options.MEM_GB,
+        slurm_additional_parameters=slurm_options.ADDITIONAL_PARAMETERS,
+    )
+
+    job = executor.submit(benchmark_suite_scheduler_job)
+    print(f"SUBMITTED EVALUATION JOB: {job.job_id}")
+
+
+if __name__ == "__main__":
+    """
+    Example usage:
+    python -u "./vissl/engines/benchmark_suite_scheduler.py" \
+        "/path/to/benchmark_suite_scheduler_example.json"
+    """
+    assert is_hydra_available(), "Make sure to install hydra"
+
+    assert (
+        is_submitit_available()
+    ), "Please 'pip install submitit' to schedule jobs on SLURM"
+
+    config_file = sys.argv[1]
+    launch_benchmark_suite_scheduler(config_file)
diff --git a/vissl/hooks/log_hooks.py b/vissl/hooks/log_hooks.py
@@ -245,6 +245,11 @@ def on_update(self, task: "tasks.ClassyTask") -> None:
                     "eta": eta_string,
                     "peak_mem(M)": peak_mem_used,
                 }
+
+                if iteration == 1:
+                    # Set max iterations. Currently used in benchmark_suite_scheduler.py
+                    log_data["max_iterations"] = task.max_iteration
+
                 if self.btime_freq and len(batch_times) >= self.btime_freq:
                     rolling_avg_time = (
                         sum(batch_times[-self.btime_freq :]) / self.btime_freq