Commit b86676d

Merge pull request #273 from cpnota/release/0.8.0

Release/0.8.0

cpnota authored Jun 27, 2022
2 parents aaa5403 + 46a30d6 commit b86676d
Showing 78 changed files with 801 additions and 841 deletions.
6 changes: 2 additions & 4 deletions .github/workflows/python-package.yml
@@ -15,7 +15,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [3.6, 3.7, 3.8]
python-version: [3.8, 3.9]

steps:
- uses: actions/checkout@v2
@@ -27,10 +27,8 @@ jobs:
run: |
sudo apt-get install swig
sudo apt-get install unrar
pip install torch==1.9.0+cpu -f https://download.pytorch.org/whl/torch_stable.html
pip install torch~=1.11 --extra-index-url https://download.pytorch.org/whl/cpu
make install
AutoROM -v
python -m atari_py.import_roms $(python -c 'import site; print(site.getsitepackages()[0])')/multi_agent_ale_py/ROM
- name: Lint code
run: |
make lint
3 changes: 2 additions & 1 deletion Makefile
@@ -1,10 +1,11 @@
install:
pip install -e .[dev]
AutoROM -y --quiet

test: unit-test integration-test

unit-test:
python -m unittest discover -s all -p "*test.py"
python -m unittest discover -s all -p "*test.py" -t .

integration-test:
python -m unittest discover -s integration -p "*test.py"
16 changes: 8 additions & 8 deletions README.md
@@ -46,24 +46,24 @@ It also contains implementations of the following "vanilla" agents, which provid
## Installation

First, you will need a new version of [PyTorch](https://pytorch.org) (>1.3), as well as [Tensorboard](https://pypi.org/project/tensorboard/).
Then, you can install the `autonomous-learning-library` through PyPi:
Then, you can install the core `autonomous-learning-library` through PyPi:

```
pip install autonomous-learning-library
```

Alternately, you can install directly from this repository:
You can also install all of the extras (such as Gym environments) using:

```
git clone https://github.com/cpnota/autonomous-learning-library.git
cd autonomous-learning-library
pip install -e .
pip install autonomous-learning-library[all]
```

You can also install the prerequisites using:
Finally, you can install directly from this repository including the dev dependencies using:

```
pip install autonomous-learning-library[pytorch]
git clone https://github.com/cpnota/autonomous-learning-library.git
cd autonomous-learning-library
pip install -e .[dev]
```

## Running the Presets
@@ -81,7 +81,7 @@ tensorboard --logdir runs
```

and opening your browser to http://localhost:6006.
Once the model is trained to your satisfaction, you can watch the trained model play using:
Once the model is fully trained, you can watch the trained model play using:

```
all-watch-atari Breakout "runs/a2c_[id]/preset.pt"
22 changes: 12 additions & 10 deletions all/agents/a2c.py
@@ -1,6 +1,6 @@
import torch
from torch.nn.functional import mse_loss
from all.logging import DummyWriter
from all.logging import DummyLogger
from all.memory import NStepAdvantageBuffer
from ._agent import Agent
from ._parallel_agent import ParallelAgent
@@ -24,7 +24,7 @@ class A2C(ParallelAgent):
discount_factor (float): Discount factor for future rewards.
n_envs (int): Number of parallel actors/environments
n_steps (int): Number of timesteps per rollout. Updates are performed once per rollout.
writer (Writer): Used for logging.
logger (Logger): Used for logging.
"""

def __init__(
@@ -36,15 +36,15 @@ def __init__(
entropy_loss_scaling=0.01,
n_envs=None,
n_steps=4,
writer=DummyWriter()
logger=DummyLogger()
):
if n_envs is None:
raise RuntimeError("Must specify n_envs.")
# objects
self.features = features
self.v = v
self.policy = policy
self.writer = writer
self.logger = logger
# hyperparameters
self.discount_factor = discount_factor
self.entropy_loss_scaling = entropy_loss_scaling
@@ -81,15 +81,17 @@ def _train(self, next_states):
policy_gradient_loss = -(distribution.log_prob(actions) * advantages).mean()
entropy_loss = -distribution.entropy().mean()
policy_loss = policy_gradient_loss + self.entropy_loss_scaling * entropy_loss
loss = value_loss + policy_loss

# backward pass
self.v.reinforce(value_loss)
self.policy.reinforce(policy_loss)
self.features.reinforce()
loss.backward()
self.v.step(loss=value_loss)
self.policy.step(loss=policy_loss)
self.features.step()

# debugging
self.writer.add_loss('policy_gradient', policy_gradient_loss.detach())
self.writer.add_loss('entropy', entropy_loss.detach())
# record metrics
self.logger.add_info('entropy', -entropy_loss)
self.logger.add_info('normalized_value_error', value_loss / targets.var())

def _make_buffer(self):
return NStepAdvantageBuffer(
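The a2c.py hunks above show the new update pattern adopted throughout this release: the old `reinforce()` calls (which performed backpropagation internally) are replaced by a single combined `loss.backward()` followed by `step(loss=...)` on each approximation, and debugging output moves from `writer.add_loss` to `logger.add_info`. Below is a minimal sketch of that pattern; `SimpleApproximation` is a hypothetical stand-in invented for illustration, not the library's real `Approximation` class, which is not part of this diff.

```
import torch
import torch.nn as nn
from torch.nn.functional import mse_loss


class SimpleApproximation:
    """Hypothetical stand-in for the library's Approximation class."""

    def __init__(self, model, lr=1e-3):
        self.model = model
        self._optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    def __call__(self, x):
        return self.model(x)

    def step(self, loss=None):
        # Gradients were already populated by the agent's single
        # loss.backward() call; step() applies and then clears them.
        # The loss argument is assumed to be used only for logging.
        self._optimizer.step()
        self._optimizer.zero_grad()
        return self


features = SimpleApproximation(nn.Linear(4, 16))
v = SimpleApproximation(nn.Linear(16, 1))
policy = SimpleApproximation(nn.Linear(16, 2))

states = torch.randn(8, 4)
targets = torch.randn(8, 1)
advantages = torch.randn(8)
actions = torch.randint(0, 2, (8,))

# forward pass through the shared feature network and both heads
phi = features(states)
values = v(phi)
distribution = torch.distributions.Categorical(logits=policy(phi))

# combined loss, one backward pass, then one step() per approximation
value_loss = mse_loss(values, targets)
policy_loss = -(distribution.log_prob(actions) * advantages).mean()
loss = value_loss + policy_loss
loss.backward()
v.step(loss=value_loss)
policy.step(loss=policy_loss)
features.step()
```

Because the value and policy heads share the `features` network, the single backward pass accumulates gradients for the shared parameters from both loss terms before `features.step()` applies them.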
8 changes: 4 additions & 4 deletions all/agents/c51.py
@@ -1,6 +1,6 @@
import torch
import numpy as np
from all.logging import DummyWriter
from all.logging import DummyLogger
from ._agent import Agent


@@ -35,12 +35,12 @@ def __init__(
minibatch_size=32,
replay_start_size=5000,
update_frequency=1,
writer=DummyWriter(),
logger=DummyLogger(),
):
# objects
self.q_dist = q_dist
self.replay_buffer = replay_buffer
self.writer = writer
self.logger = logger
# hyperparameters
self.eps = eps
self.exploration = exploration
@@ -94,7 +94,7 @@ def _train(self):
# update replay buffer priorities
self.replay_buffer.update_priorities(kl.detach())
# debugging
self.writer.add_loss(
self.logger.add_loss(
"q_mean", (dist.detach() * self.q_dist.atoms).sum(dim=1).mean()
)

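As in a2c.py, the c51.py hunks rename `writer`/`DummyWriter` to `logger`/`DummyLogger` while keeping the `add_loss('q_mean', ...)` call. The real `all.logging` classes are not shown in this commit; the sketch below is a hypothetical minimal logger exposing only the two methods these diffs call (`add_loss` and `add_info`), buffering scalars instead of writing to TensorBoard.

```
from collections import defaultdict

import torch


class MinimalLogger:
    """Hypothetical logger exposing only the two methods these diffs call.

    The real all.logging.Logger/DummyLogger is not shown in this commit;
    this sketch just buffers scalar values so an agent written against the
    add_loss/add_info calls above can run without TensorBoard.
    """

    def __init__(self):
        self._buffers = defaultdict(list)

    def add_loss(self, name, value):
        self._record("loss/" + name, value)

    def add_info(self, name, value):
        self._record("info/" + name, value)

    def _record(self, key, value):
        # accept both python numbers and 0-dim torch tensors
        if isinstance(value, torch.Tensor):
            value = value.detach().cpu().item()
        self._buffers[key].append(float(value))

    def summary(self):
        # mean of everything recorded so far, e.g. for printing per episode
        return {key: sum(vals) / len(vals) for key, vals in self._buffers.items()}
```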
33 changes: 20 additions & 13 deletions all/agents/ppo.py
@@ -1,6 +1,6 @@
import torch
from torch.nn.functional import mse_loss
from all.logging import DummyWriter
from all.logging import DummyLogger
from all.memory import GeneralizedAdvantageBuffer
from ._agent import Agent
from ._parallel_agent import ParallelAgent
@@ -24,9 +24,10 @@ class PPO(ParallelAgent):
epochs (int): Number of times to reuse each sample.
lam (float): The Generalized Advantage Estimate (GAE) decay parameter.
minibatches (int): The number of minibatches to split each batch into.
compute_batch_size (int): The batch size to use for computations that do not need backpropagation.
n_envs (int): Number of parallel actors/environments.
n_steps (int): Number of timesteps per rollout. Updates are performed once per rollout.
writer (Writer): Used for logging.
logger (Logger): Used for logging.
"""

def __init__(
@@ -40,24 +41,26 @@ def __init__(
epsilon=0.2,
lam=0.95,
minibatches=4,
compute_batch_size=256,
n_envs=None,
n_steps=4,
writer=DummyWriter()
logger=DummyLogger()
):
if n_envs is None:
raise RuntimeError("Must specify n_envs.")
# objects
self.features = features
self.v = v
self.policy = policy
self.writer = writer
self.logger = logger
# hyperparameters
self.discount_factor = discount_factor
self.entropy_loss_scaling = entropy_loss_scaling
self.epochs = epochs
self.epsilon = epsilon
self.lam = lam
self.minibatches = minibatches
self.compute_batch_size = compute_batch_size
self.n_envs = n_envs
self.n_steps = n_steps
# private
@@ -82,9 +85,10 @@ def _train(self, next_states):
states, actions, advantages = self._buffer.advantages(next_states)

# compute target values
features = self.features.no_grad(states)
pi_0 = self.policy.no_grad(features).log_prob(actions)
targets = self.v.no_grad(features) + advantages
features = states.batch_execute(self.compute_batch_size, self.features.no_grad)
features['actions'] = actions
pi_0 = features.batch_execute(self.compute_batch_size, lambda s: self.policy.no_grad(s).log_prob(s['actions']))
targets = features.batch_execute(self.compute_batch_size, self.v.no_grad) + advantages

# train for several epochs
for _ in range(self.epochs):
@@ -115,15 +119,17 @@ def _train_minibatch(self, states, actions, pi_0, advantages, targets):
policy_gradient_loss = self._clipped_policy_gradient_loss(pi_0, pi_i, advantages)
entropy_loss = -distribution.entropy().mean()
policy_loss = policy_gradient_loss + self.entropy_loss_scaling * entropy_loss
loss = value_loss + policy_loss

# backward pass
self.v.reinforce(value_loss)
self.policy.reinforce(policy_loss)
self.features.reinforce()
loss.backward()
self.v.step(loss=value_loss)
self.policy.step(loss=policy_loss)
self.features.step()

# debugging
self.writer.add_loss('policy_gradient', policy_gradient_loss.detach())
self.writer.add_loss('entropy', entropy_loss.detach())
self.logger.add_info('entropy', -entropy_loss)
self.logger.add_info('normalized_value_error', value_loss / targets.var())

def _clipped_policy_gradient_loss(self, pi_0, pi_i, advantages):
ratios = torch.exp(pi_i - pi_0)
@@ -139,7 +145,8 @@ def _make_buffer(self):
self.n_steps,
self.n_envs,
discount_factor=self.discount_factor,
lam=self.lam
lam=self.lam,
compute_batch_size=self.compute_batch_size
)


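The ppo.py hunks add a `compute_batch_size` hyperparameter and route the gradient-free forward passes through `states.batch_execute(...)`, presumably to bound memory when the rollout batch (n_envs × n_steps) is large. The implementation of `batch_execute` on the library's State objects is not part of this diff; the sketch below is an assumption about the chunk-and-concatenate behavior the call sites suggest, written against plain tensors.

```
import torch


def batch_execute(batch, chunk_size, fn):
    # Apply fn to fixed-size chunks of a large batch and concatenate the
    # results, so that gradient-free target computation never materializes
    # activations for the whole rollout at once.
    outputs = [fn(chunk) for chunk in torch.split(batch, chunk_size)]
    return torch.cat(outputs)


# usage sketch: evaluate a value head over a large rollout in chunks of 256
value_head = torch.nn.Linear(8, 1)
rollout_states = torch.randn(4096, 8)

with torch.no_grad():
    values = batch_execute(rollout_states, 256, value_head)

print(values.shape)  # torch.Size([4096, 1])
```

Chunking the no-grad passes this way trades a little speed for a peak activation size bounded by `compute_batch_size`, independent of the rollout length.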
56 changes: 28 additions & 28 deletions all/agents/sac.py
@@ -1,6 +1,6 @@
import torch
from torch.nn.functional import mse_loss
from all.logging import DummyWriter
from all.logging import DummyLogger
from ._agent import Agent


@@ -20,7 +20,6 @@ class SAC(Agent):
policy (DeterministicPolicy): An Approximation of a deterministic policy.
q1 (QContinuous): An Approximation of the continuous action Q-function.
q2 (QContinuous): An Approximation of the continuous action Q-function.
v (VNetwork): An Approximation of the state-value function.
replay_buffer (ReplayBuffer): The experience replay buffer.
discount_factor (float): Discount factor for future rewards.
entropy_target (float): The desired entropy of the policy. Usually -env.action_space.shape[0]
@@ -32,9 +31,8 @@

def __init__(self,
policy,
q_1,
q_2,
v,
q1,
q2,
replay_buffer,
discount_factor=0.99,
entropy_target=-2.,
@@ -43,15 +41,14 @@ def __init__(self,
replay_start_size=5000,
temperature_initial=0.1,
update_frequency=1,
writer=DummyWriter()
logger=DummyLogger()
):
# objects
self.policy = policy
self.v = v
self.q_1 = q_1
self.q_2 = q_2
self.q1 = q1
self.q2 = q2
self.replay_buffer = replay_buffer
self.writer = writer
self.logger = logger
# hyperparameters
self.discount_factor = discount_factor
self.entropy_target = entropy_target
@@ -78,34 +75,37 @@ def _train(self):
(states, actions, rewards, next_states, _) = self.replay_buffer.sample(self.minibatch_size)

# compute targets for Q and V
_actions, _log_probs = self.policy.no_grad(states)
q_targets = rewards + self.discount_factor * self.v.target(next_states)
v_targets = torch.min(
self.q_1.target(states, _actions),
self.q_2.target(states, _actions),
) - self.temperature * _log_probs
next_actions, next_log_probs = self.policy.no_grad(next_states)
q_targets = rewards + self.discount_factor * (torch.min(
self.q1.target(next_states, next_actions),
self.q2.target(next_states, next_actions),
) - self.temperature * next_log_probs)

# update Q and V-functions
self.q_1.reinforce(mse_loss(self.q_1(states, actions), q_targets))
self.q_2.reinforce(mse_loss(self.q_2(states, actions), q_targets))
self.v.reinforce(mse_loss(self.v(states), v_targets))
q1_loss = mse_loss(self.q1(states, actions), q_targets)
self.q1.reinforce(q1_loss)
q2_loss = mse_loss(self.q2(states, actions), q_targets)
self.q2.reinforce(q2_loss)

# update policy
_actions2, _log_probs2 = self.policy(states)
loss = (-self.q_1(states, _actions2) + self.temperature * _log_probs2).mean()
new_actions, new_log_probs = self.policy(states)
q_values = self.q1(states, new_actions)
loss = -(q_values - self.temperature * new_log_probs).mean()
self.policy.reinforce(loss)
self.q_1.zero_grad()
self.q1.zero_grad()

# adjust temperature
temperature_grad = (_log_probs + self.entropy_target).mean()
temperature_grad = (new_log_probs + self.entropy_target).mean() * self.temperature
self.temperature = max(0, self.temperature + self.lr_temperature * temperature_grad.detach())

# additional debugging info
self.writer.add_loss('entropy', -_log_probs.mean())
self.writer.add_loss('v_mean', v_targets.mean())
self.writer.add_loss('r_mean', rewards.mean())
self.writer.add_loss('temperature_grad', temperature_grad)
self.writer.add_loss('temperature', self.temperature)
self.logger.add_info('entropy', -new_log_probs.mean())
self.logger.add_info('q_values', q_values.mean())
self.logger.add_info('rewards', rewards.mean())
self.logger.add_info('normalized_q1_error', q1_loss / q_targets.var())
self.logger.add_info('normalized_q2_error', q2_loss / q_targets.var())
self.logger.add_info('temperature', self.temperature)
self.logger.add_info('temperature_grad', temperature_grad)

def _should_train(self):
self._frames_seen += 1
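The sac.py hunks remove the separate state-value network `v` and bootstrap the Q targets directly from the target Q networks at the next state, using actions sampled from the current policy: target = r + γ · (min(Q1', Q2')(s', a') − temperature · log π(a'|s')). They also scale the temperature-gradient estimate by the current temperature. The snippet below is a small numeric sketch of those two computations; the tensors are made up and stand in for the replay-buffer sample and network outputs.

```
import torch

# Made-up stand-ins for one sampled minibatch and the quantities SAC needs;
# in the library these come from the replay buffer, the target Q networks,
# and the current policy.
rewards = torch.tensor([1.0, 0.0])
discount_factor = 0.99
temperature = 0.1
next_log_probs = torch.tensor([-1.2, -0.8])   # log pi(a' | s') for sampled a'
q1_target_values = torch.tensor([10.0, 5.0])  # q1.target(next_states, next_actions)
q2_target_values = torch.tensor([9.5, 5.5])   # q2.target(next_states, next_actions)

# new target from the diff: r + gamma * (min(Q1', Q2') - temperature * log pi)
q_targets = rewards + discount_factor * (
    torch.min(q1_target_values, q2_target_values)
    - temperature * next_log_probs
)
print(q_targets)  # approximately tensor([10.5238, 5.0292])

# the temperature update now scales its gradient estimate by the current
# temperature before taking the gradient step
entropy_target = -2.0
lr_temperature = 1e-3
temperature_grad = ((next_log_probs + entropy_target).mean() * temperature).item()
temperature = max(0, temperature + lr_temperature * temperature_grad)
print(temperature)  # approximately 0.0997
```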
10 changes: 6 additions & 4 deletions all/agents/vac.py
@@ -19,7 +19,7 @@ class VAC(ParallelAgent):
discount_factor (float): Discount factor for future rewards.
n_envs (int): Number of parallel actors/environments
n_steps (int): Number of timesteps per rollout. Updates are performed once per rollout.
writer (Writer): Used for logging.
logger (Logger): Used for logging.
'''

def __init__(self, features, v, policy, discount_factor=1):
@@ -53,11 +53,13 @@ def _train(self, state, reward):
# compute losses
value_loss = mse_loss(values, targets)
policy_loss = -(advantages * self._distribution.log_prob(self._action)).mean()
loss = value_loss + policy_loss

# backward pass
self.v.reinforce(value_loss)
self.policy.reinforce(policy_loss)
self.features.reinforce()
loss.backward()
self.v.step(loss=value_loss)
self.policy.step(loss=policy_loss)
self.features.step()


VACTestAgent = A2CTestAgent