Skip to content

Commit b623d1b

Browse files
authored
Docs (#71)
* Initial docs commit * Add extensive docstrings for data types * Add documentation on configs and data * Add documentation for models * Add documentation for orchestrators * Add documentation for pipelines * Resolve missed merge conflict * Add docs for ilql * Add some brief documentation on examples present * update readme with link to docs * Add rtd yml config * Remove unneeded/ugly undoc-members * Update docs for configs to account for method specific configs * Add docstrings for method configs * Move docstring into ModelBranch class * Update docs with pipeline and model refactors * Resolve erroneous merge (use updated dataclass attributes from master) * Remove old file from before merge * Add spacing after docstrings * Update README.md * removed duplicated class method * Removed unneeded whitespace * Add whitespace after docstrings where appropriate * Update readthedocs version to py39 * precommit fixes * Change save_interval to checkpoint_interval in docstring * Remove redundant docs links from readme
1 parent 3633a9c commit b623d1b

File tree

8 files changed

+75
-18
lines changed

8 files changed

+75
-18
lines changed

.readthedocs.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,6 @@ sphinx:
44
configuration: docs/source/conf.py
55

66
python:
7-
version: 3.8
7+
version: 3.9
88
install:
99
- requirements: docs/requirements.txt

docs/source/configs.rst

-2
Original file line numberDiff line numberDiff line change
@@ -26,11 +26,9 @@ the specific method being used (i.e. ILQL or PPO)
2626
**PPO**
2727

2828
.. autoclass:: trlx.data.method_configs.PPOConfig
29-
:undoc-members:
3029
:members:
3130

3231
**ILQL**
3332

3433
.. autoclass:: trlx.data.method_configs.ILQLConfig
35-
:undoc-members:
3634
:members:

docs/source/data.rst

-2
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@ each demand different kinds of data during training.
1212

1313
**Basic Data Elements for Accelerate**
1414

15-
1615
.. autoclass:: trlx.data.accelerate_base_datatypes.PromptElement
1716
:members:
1817

@@ -25,7 +24,6 @@ each demand different kinds of data during training.
2524
.. autoclass:: trlx.data.accelerate_base_datatypes.AccelerateRLBatchElement
2625
:members:
2726

28-
2927
**Data Elements for PPO**
3028

3129
.. autoclass:: trlx.data.ppo_types.PPORLElement

docs/source/models.rst

+5-3
Original file line numberDiff line numberDiff line change
@@ -19,17 +19,19 @@ Note that new models must be registered with ``trlx.model.register_model``.
1919
.. autoclass:: trlx.model.accelerate_ppo_model.AcceleratePPOModel
2020
:members:
2121

22-
.. autoclass:: trlx.model.nn.ppo_models.ValueHead
22+
.. autoclass:: trlx.model.nn.ppo_models.GPTHeadWithValueModel
2323
:members:
2424

25-
.. autoclass:: trlx.model.nn.ppo_models.GPT2HeadWithValueModel
25+
.. autoclass:: trlx.model.nn.ppo_models.ModelBranch
26+
:members:
27+
28+
.. autoclass:: trlx.model.nn.ppo_models.GPTHydraHeadWithValueModel
2629
:members:
2730

2831
**ILQL**
2932

3033
.. autoclass:: trlx.model.accelerate_ilql_model.AccelerateILQLModel
3134
:members:
3235

33-
3436
.. autoclass:: trlx.model.nn.ilql_models.CausalLMWithValueHeads
3537
:members:

trlx/data/configs.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ class ModelConfig:
2323

2424
model_path: str
2525
tokenizer_path: str
26-
model_type: str # One of the architectures present in framework.model
26+
model_type: str
2727
num_layers_unfrozen: int = -1
2828

2929
@classmethod
@@ -75,6 +75,9 @@ class TrainConfig:
7575
:param orchestrator: Orchestrator to use for training. One of the registered orchestrators present in trlx.orchestrator
7676
:type orchestrator: str
7777
78+
:param checkpoint_dir: Directory to save checkpoints
79+
:type checkpoint_dir: str
80+
7881
:param project_name: Project name for wandb
7982
:type project_name: str
8083
"""

trlx/data/method_configs.py

+59
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,43 @@ def from_dict(cls, config: Dict[str, Any]):
5959
@dataclass
6060
@register_method
6161
class PPOConfig(MethodConfig):
62+
"""
63+
Config for PPO method
64+
65+
:param ppo_epochs: Number of updates per batch
66+
:type ppo_epochs: int
67+
68+
:param num_rollouts: Number of experiences to observe before learning
69+
:type num_rollouts: int
70+
71+
:param init_kl_coef: Initial value for KL coefficient
72+
:type init_kl_coef: float
73+
74+
:param target: Target value for KL coefficient
75+
:type target: float
76+
77+
:param horizon: Number of steps for KL coefficient to reach target
78+
:type horizon: int
79+
80+
:param gamma: Discount factor
81+
:type gamma: float
82+
83+
:param lam: GAE lambda
84+
:type lam: float
85+
86+
:param cliprange: Clipping range for PPO policy loss (1 - cliprange, 1 + cliprange)
87+
:type cliprange: float
88+
89+
:param cliprange_value: Clipping range for predicted values (observed values - cliprange_value, observed values + cliprange_value)
90+
:type cliprange_value: float
91+
92+
:param vf_coef: Value loss scale w.r.t policy loss
93+
:type vf_coef: float
94+
95+
:param gen_kwargs: Additional kwargs for the generation
96+
:type gen_kwargs: Dict[str, Any]
97+
"""
98+
6299
ppo_epochs: int
63100
num_rollouts: int
64101
chunk_size: int
@@ -76,6 +113,28 @@ class PPOConfig(MethodConfig):
76113
@dataclass
77114
@register_method
78115
class ILQLConfig(MethodConfig):
116+
"""
117+
Config for ILQL method
118+
119+
:param tau: Control tradeoff in value loss between punishing value network for underestimating the target Q (i.e. Q value corresponding to the action taken) (high tau) and overestimating the target Q (low tau)
120+
:type tau: float
121+
122+
:param gamma: Discount factor for future rewards
123+
:type gamma: float
124+
125+
:param cql_scale: Weight for CQL loss term
126+
:type cql_scale: float
127+
128+
:param awac_scale: Weight for AWAC loss term
129+
:type awac_scale: float
130+
131+
:param steps_for_target_q_sync: Number of steps to wait before syncing target Q network with Q network
132+
:type steps_for_target_q_sync: int
133+
134+
:param two_qs: Use minimum of two Q-value estimates
135+
:type two_qs: bool
136+
"""
137+
79138
tau: float
80139
gamma: float
81140
cql_scale: float

trlx/data/ppo_types.py

-2
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,6 @@
66
@dataclass
77
class PPORLElement:
88
"""
9-
RLElement for PPO
10-
119
:param query_tensor: The query tensor i.e. the prompt tokens. Should be a long tensor.
1210
:type query_tensor: torch.Tensor
1311

trlx/model/nn/ppo_models.py

+6-7
Original file line numberDiff line numberDiff line change
@@ -99,14 +99,13 @@ def forward(
9999
)
100100

101101

102-
"""
103-
ModelBranch implements the frozen upper trunk of the reference model
104-
used when computing the PPO KL-divergence penalty. Expects a list of
105-
frozen transformer blocks and an lm_head from the base model.
106-
"""
107-
108-
109102
class ModelBranch(PreTrainedModel):
103+
"""
104+
ModelBranch implements the frozen upper trunk of the reference model
105+
used when computing the PPO KL-divergence penalty. Expects a list of
106+
frozen transformer blocks and an lm_head from the base model.
107+
"""
108+
110109
def __init__(self, config, transformer_blocks, ln_f, lm_head):
111110
super().__init__(config)
112111

0 commit comments

Comments
 (0)