Skip to content

Commit

Permalink
Nemo 2 Model Checkpoint Load Test (#270)
Browse files Browse the repository at this point in the history
Adds a test that ensures a Geneformer model checkpoint saved with nemo 2 can be loaded.

Also adds the top-level `results/` directory to the docker and git ignore files.
---------

Co-authored-by: John St John <[email protected]>
  • Loading branch information
malcolmgreaves and jstjohn authored Oct 7, 2024
1 parent b28c7a7 commit fb33522
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# -- Docker-specific ignores --

# bionemo related
results/

# we copy over docs/ into the image for notebook testing, but can skip the images for space
docs/docs/assets

Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ docs/site/
*.nemo
protein/
singlecell/
results/

# Local configs
.gitconfig
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@

# Test resources used by this module. Each name is bound once via `load(<resource tag>)`;
# presumably `load` resolves the tag to a local path (the annotations say `Path`) — TODO confirm.

# Checkpoints: two NeMo 1 entries and the NeMo 2 entry. The two release entries share
# the resource name "geneformer/10M_240530" and differ only in the version tag.
nemo1_checkpoint_path: Path = load("geneformer/qa")
nemo1_release_checkpoint_path: Path = load("geneformer/10M_240530:1.0")
nemo2_release_checkpoint_path: Path = load("geneformer/10M_240530:2.0")
# Reference artifacts from NeMo 1 (per-layer outputs and golden values, going by the resource names).
nemo_1_per_layer_outputs_path: Path = load("single_cell/nemo1-geneformer-per-layer-outputs")
nemo_1_expected_values_path: Path = load("single_cell/nemo1-geneformer-golden-vals")
# Processed single-cell test data: a fixed sub-directory inside the loaded test-data resource.
data_path: Path = load("single_cell/testdata-20240506") / "cellxgene_2023-12-15_small" / "processed_data"
Expand Down Expand Up @@ -747,6 +748,32 @@ def test_inference_loss_10m_released_checkpoint(geneformer_config: GeneformerCon
assert mean_loss < TARGET_MEAN_LOSS or mean_loss == pytest.approx(TARGET_MEAN_LOSS, abs=1e-2, rel=None)


def test_inference_loss_10m_nemo2_released_checkpoint(geneformer_config: GeneformerConfig, seed: int = 42):
    """Test that we get a good loss when loading the released 10M NeMo 2 checkpoint.

    A copy of ``geneformer_config`` is configured to return logits and to start from
    ``nemo2_release_checkpoint_path`` (via the ``initial_ckpt_path`` hyperparameter).
    The mean loss from ``_get_loss_from_model`` must then be below ``TARGET_MEAN_LOSS``
    or within an absolute tolerance of 1e-2 of it.
    """
    # Work on a private copy so the overrides below never touch the config passed in.
    logit_config = deepcopy(geneformer_config)
    # Return logits instead of only the hidden states.
    logit_config.set_hparam("return_only_hidden_states", False)
    # Start from the released 10M checkpoint; using the release checkpoint is important.
    logit_config.set_hparam("initial_ckpt_path", nemo2_release_checkpoint_path)

    mean_loss = _get_loss_from_model(logit_config, seed)

    # Provenance of TARGET_MEAN_LOSS (notes describe how the 10M target was established):
    #   Experiment 1) Loaded the 10M model and took the mean of per-batch mean losses
    #     with batches of size 8. This gives 2.3558831214904785 vs 2.357126723703872,
    #     so we actually do better!
    #     For NVIDIA employees see work here:
    #     https://docs.google.com/document/d/1CofamqHbQlp5U8SjmW7NR7PbTbF72Lj9L9xz1W5t3ZI/edit
    #   Experiment 2) With a proper loss (sum/n), limited to 200 _random_ batches of
    #     size 8 for speed, we get a number in a similar range: 2.368649959564209.
    # A small change that has lower impact than the change between models is probably
    # acceptable. The target is defined as described above for the 10M checkpoint, based
    # on the first pass of the megatron implementation. Since experiment 1 was passed
    # manually, this experiment defines the initial "golden value" test target.
    assert mean_loss < TARGET_MEAN_LOSS or mean_loss == pytest.approx(TARGET_MEAN_LOSS, abs=1e-2, rel=None)


def test_inference_loss_10m_released_checkpoint_wrong_activation(geneformer_config: GeneformerConfig, seed: int = 42):
"""Test that when we use the wrong activation we get worse loss out of the same function we test for a positive
signal. This acts as the negative control.
Expand Down

0 comments on commit fb33522

Please sign in to comment.