Commit b86676d

Merge pull request #273 from cpnota/release/0.8.0

Release/0.8.0

cpnota authored Jun 27, 2022
2 parents aaa5403 + 46a30d6 commit b86676d
Showing 78 changed files with 801 additions and 841 deletions.
6 changes: 2 additions & 4 deletions .github/workflows/python-package.yml
@@ -15,7 +15,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [3.6, 3.7, 3.8]
python-version: [3.8, 3.9]

steps:
- uses: actions/checkout@v2
@@ -27,10 +27,8 @@ jobs:
run: |
sudo apt-get install swig
sudo apt-get install unrar
pip install torch==1.9.0+cpu -f https://download.pytorch.org/whl/torch_stable.html
pip install torch~=1.11 --extra-index-url https://download.pytorch.org/whl/cpu
make install
AutoROM -v
python -m atari_py.import_roms $(python -c 'import site; print(site.getsitepackages()[0])')/multi_agent_ale_py/ROM
- name: Lint code
run: |
make lint
3 changes: 2 additions & 1 deletion Makefile
@@ -1,10 +1,11 @@
install:
pip install -e .[dev]
AutoROM -y --quiet

test: unit-test integration-test

unit-test:
python -m unittest discover -s all -p "*test.py"
python -m unittest discover -s all -p "*test.py" -t .

integration-test:
python -m unittest discover -s integration -p "*test.py"
16 changes: 8 additions & 8 deletions README.md
@@ -46,24 +46,24 @@ It also contains implementations of the following "vanilla" agents, which provid
## Installation

First, you will need a new version of [PyTorch](https://pytorch.org) (>1.3), as well as [Tensorboard](https://pypi.org/project/tensorboard/).
Then, you can install the `autonomous-learning-library` through PyPi:
Then, you can install the core `autonomous-learning-library` through PyPi:

```
pip install autonomous-learning-library
```

Alternately, you can install directly from this repository:
You can also install all of the extras (such as Gym environments) using:

```
git clone https://github.com/cpnota/autonomous-learning-library.git
cd autonomous-learning-library
pip install -e .
pip install autonomous-learning-library[all]
```

You can also install the prerequisites using:
Finally, you can install directly from this repository including the dev dependencies using:

```
pip install autonomous-learning-library[pytorch]
git clone https://github.com/cpnota/autonomous-learning-library.git
cd autonomous-learning-library
pip install -e .[dev]
```

## Running the Presets
@@ -81,7 +81,7 @@ tensorboard --logdir runs
```

and opening your browser to http://localhost:6006.
Once the model is trained to your satisfaction, you can watch the trained model play using:
Once the model is fully trained, you can watch the trained model play using:

```
all-watch-atari Breakout "runs/a2c_[id]/preset.pt"
22 changes: 12 additions & 10 deletions all/agents/a2c.py
@@ -1,6 +1,6 @@
import torch
from torch.nn.functional import mse_loss
from all.logging import DummyWriter
from all.logging import DummyLogger
from all.memory import NStepAdvantageBuffer
from ._agent import Agent
from ._parallel_agent import ParallelAgent
@@ -24,7 +24,7 @@ class A2C(ParallelAgent):
discount_factor (float): Discount factor for future rewards.
n_envs (int): Number of parallel actors/environments
n_steps (int): Number of timesteps per rollout. Updates are performed once per rollout.
writer (Writer): Used for logging.
logger (Logger): Used for logging.
"""

def __init__(
@@ -36,15 +36,15 @@ def __init__(
entropy_loss_scaling=0.01,
n_envs=None,
n_steps=4,
writer=DummyWriter()
logger=DummyLogger()
):
if n_envs is None:
raise RuntimeError("Must specify n_envs.")
# objects
self.features = features
self.v = v
self.policy = policy
self.writer = writer
self.logger = logger
# hyperparameters
self.discount_factor = discount_factor
self.entropy_loss_scaling = entropy_loss_scaling
@@ -81,15 +81,17 @@ def _train(self, next_states):
policy_gradient_loss = -(distribution.log_prob(actions) * advantages).mean()
entropy_loss = -distribution.entropy().mean()
policy_loss = policy_gradient_loss + self.entropy_loss_scaling * entropy_loss
loss = value_loss + policy_loss

# backward pass
self.v.reinforce(value_loss)
self.policy.reinforce(policy_loss)
self.features.reinforce()
loss.backward()
self.v.step(loss=value_loss)
self.policy.step(loss=policy_loss)
self.features.step()

# debugging
self.writer.add_loss('policy_gradient', policy_gradient_loss.detach())
self.writer.add_loss('entropy', entropy_loss.detach())
# record metrics
self.logger.add_info('entropy', -entropy_loss)
self.logger.add_info('normalized_value_error', value_loss / targets.var())

def _make_buffer(self):
return NStepAdvantageBuffer(
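The a2c.py hunks above show the new update pattern adopted throughout this release: the old `reinforce()` calls (which performed backpropagation internally) are replaced by a single combined `loss.backward()` followed by `step(loss=...)` on each approximation, and debugging output moves from `writer.add_loss` to `logger.add_info`. Below is a minimal sketch of that pattern; `SimpleApproximation` is a hypothetical stand-in invented for illustration, not the library's real `Approximation` class, which is not part of this diff.

```
import torch
import torch.nn as nn
from torch.nn.functional import mse_loss


class SimpleApproximation:
    """Hypothetical stand-in for the library's Approximation class."""

    def __init__(self, model, lr=1e-3):
        self.model = model
        self._optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    def __call__(self, x):
        return self.model(x)

    def step(self, loss=None):
        # Gradients were already populated by the agent's single
        # loss.backward() call; step() applies and then clears them.
        # The loss argument is assumed to be used only for logging.
        self._optimizer.step()
        self._optimizer.zero_grad()
        return self


features = SimpleApproximation(nn.Linear(4, 16))
v = SimpleApproximation(nn.Linear(16, 1))
policy = SimpleApproximation(nn.Linear(16, 2))

states = torch.randn(8, 4)
targets = torch.randn(8, 1)
advantages = torch.randn(8)
actions = torch.randint(0, 2, (8,))

# forward pass through the shared feature network and both heads
phi = features(states)
values = v(phi)
distribution = torch.distributions.Categorical(logits=policy(phi))

# combined loss, one backward pass, then one step() per approximation
value_loss = mse_loss(values, targets)
policy_loss = -(distribution.log_prob(actions) * advantages).mean()
loss = value_loss + policy_loss
loss.backward()
v.step(loss=value_loss)
policy.step(loss=policy_loss)
features.step()
```

Because the value and policy heads share the `features` network, the single backward pass accumulates gradients for the shared parameters from both loss terms before `features.step()` applies them.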
8 changes: 4 additions & 4 deletions all/agents/c51.py
@@ -1,6 +1,6 @@
import torch
import numpy as np
from all.logging import DummyWriter
from all.logging import DummyLogger
from ._agent import Agent


@@ -35,12 +35,12 @@ def __init__(
minibatch_size=32,
replay_start_size=5000,
update_frequency=1,
writer=DummyWriter(),
logger=DummyLogger(),
):
# objects
self.q_dist = q_dist
self.replay_buffer = replay_buffer
self.writer = writer
self.logger = logger
# hyperparameters
self.eps = eps
self.exploration = exploration
@@ -94,7 +94,7 @@ def _train(self):
# update replay buffer priorities
self.replay_buffer.update_priorities(kl.detach())
# debugging
self.writer.add_loss(
self.logger.add_loss(
"q_mean", (dist.detach() * self.q_dist.atoms).sum(dim=1).mean()
)

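As in a2c.py, the c51.py hunks rename `writer`/`DummyWriter` to `logger`/`DummyLogger` while keeping the `add_loss('q_mean', ...)` call. The real `all.logging` classes are not shown in this commit; the sketch below is a hypothetical minimal logger exposing only the two methods these diffs call (`add_loss` and `add_info`), buffering scalars instead of writing to TensorBoard.

```
from collections import defaultdict

import torch


class MinimalLogger:
    """Hypothetical logger exposing only the two methods these diffs call.

    The real all.logging.Logger/DummyLogger is not shown in this commit;
    this sketch just buffers scalar values so an agent written against the
    add_loss/add_info calls above can run without TensorBoard.
    """

    def __init__(self):
        self._buffers = defaultdict(list)

    def add_loss(self, name, value):
        self._record("loss/" + name, value)

    def add_info(self, name, value):
        self._record("info/" + name, value)

    def _record(self, key, value):
        # accept both python numbers and 0-dim torch tensors
        if isinstance(value, torch.Tensor):
            value = value.detach().cpu().item()
        self._buffers[key].append(float(value))

    def summary(self):
        # mean of everything recorded so far, e.g. for printing per episode
        return {key: sum(vals) / len(vals) for key, vals in self._buffers.items()}
```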
33 changes: 20 additions & 13 deletions all/agents/ppo.py
@@ -1,6 +1,6 @@
import torch
from torch.nn.functional import mse_loss
from all.logging import DummyWriter
from all.logging import DummyLogger
from all.memory import GeneralizedAdvantageBuffer
from ._agent import Agent
from ._parallel_agent import ParallelAgent
@@ -24,9 +24,10 @@ class PPO(ParallelAgent):
epochs (int): Number of times to reuse each sample.
lam (float): The Generalized Advantage Estimate (GAE) decay parameter.
minibatches (int): The number of minibatches to split each batch into.
compute_batch_size (int): The batch size to use for computations that do not need backpropagation.
n_envs (int): Number of parallel actors/environments.
n_steps (int): Number of timesteps per rollout. Updates are performed once per rollout.
writer (Writer): Used for logging.
logger (Logger): Used for logging.
"""

def __init__(
@@ -40,24 +41,26 @@ def __init__(
epsilon=0.2,
lam=0.95,
minibatches=4,
compute_batch_size=256,
n_envs=None,
n_steps=4,
writer=DummyWriter()
logger=DummyLogger()
):
if n_envs is None:
raise RuntimeError("Must specify n_envs.")
# objects
self.features = features
self.v = v
self.policy = policy
self.writer = writer
self.logger = logger
# hyperparameters
self.discount_factor = discount_factor
self.entropy_loss_scaling = entropy_loss_scaling
self.epochs = epochs
self.epsilon = epsilon
self.lam = lam
self.minibatches = minibatches
self.compute_batch_size = compute_batch_size
self.n_envs = n_envs
self.n_steps = n_steps
# private
@@ -82,9 +85,10 @@ def _train(self, next_states):
states, actions, advantages = self._buffer.advantages(next_states)

# compute target values
features = self.features.no_grad(states)
pi_0 = self.policy.no_grad(features).log_prob(actions)
targets = self.v.no_grad(features) + advantages
features = states.batch_execute(self.compute_batch_size, self.features.no_grad)
features['actions'] = actions
pi_0 = features.batch_execute(self.compute_batch_size, lambda s: self.policy.no_grad(s).log_prob(s['actions']))
targets = features.batch_execute(self.compute_batch_size, self.v.no_grad) + advantages

# train for several epochs
for _ in range(self.epochs):
@@ -115,15 +119,17 @@ def _train_minibatch(self, states, actions, pi_0, advantages, targets):
policy_gradient_loss = self._clipped_policy_gradient_loss(pi_0, pi_i, advantages)
entropy_loss = -distribution.entropy().mean()
policy_loss = policy_gradient_loss + self.entropy_loss_scaling * entropy_loss
loss = value_loss + policy_loss

# backward pass
self.v.reinforce(value_loss)
self.policy.reinforce(policy_loss)
self.features.reinforce()
loss.backward()
self.v.step(loss=value_loss)
self.policy.step(loss=policy_loss)
self.features.step()

# debugging
self.writer.add_loss('policy_gradient', policy_gradient_loss.detach())
self.writer.add_loss('entropy', entropy_loss.detach())
self.logger.add_info('entropy', -entropy_loss)
self.logger.add_info('normalized_value_error', value_loss / targets.var())

def _clipped_policy_gradient_loss(self, pi_0, pi_i, advantages):
ratios = torch.exp(pi_i - pi_0)
@@ -139,7 +145,8 @@ def _make_buffer(self):
self.n_steps,
self.n_envs,
discount_factor=self.discount_factor,
lam=self.lam
lam=self.lam,
compute_batch_size=self.compute_batch_size
)


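The ppo.py hunks add a `compute_batch_size` hyperparameter and route the gradient-free forward passes through `states.batch_execute(...)`, presumably to bound memory when the rollout batch (n_envs × n_steps) is large. The implementation of `batch_execute` on the library's State objects is not part of this diff; the sketch below is an assumption about the chunk-and-concatenate behavior the call sites suggest, written against plain tensors.

```
import torch


def batch_execute(batch, chunk_size, fn):
    # Apply fn to fixed-size chunks of a large batch and concatenate the
    # results, so that gradient-free target computation never materializes
    # activations for the whole rollout at once.
    outputs = [fn(chunk) for chunk in torch.split(batch, chunk_size)]
    return torch.cat(outputs)


# usage sketch: evaluate a value head over a large rollout in chunks of 256
value_head = torch.nn.Linear(8, 1)
rollout_states = torch.randn(4096, 8)

with torch.no_grad():
    values = batch_execute(rollout_states, 256, value_head)

print(values.shape)  # torch.Size([4096, 1])
```

Chunking the no-grad passes this way trades a little speed for a peak activation size bounded by `compute_batch_size`, independent of the rollout length.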
56 changes: 28 additions & 28 deletions all/agents/sac.py
@@ -1,6 +1,6 @@
import torch
from torch.nn.functional import mse_loss
from all.logging import DummyWriter
from all.logging import DummyLogger
from ._agent import Agent


@@ -20,7 +20,6 @@ class SAC(Agent):
policy (DeterministicPolicy): An Approximation of a deterministic policy.
q1 (QContinuous): An Approximation of the continuous action Q-function.
q2 (QContinuous): An Approximation of the continuous action Q-function.
v (VNetwork): An Approximation of the state-value function.
replay_buffer (ReplayBuffer): The experience replay buffer.
discount_factor (float): Discount factor for future rewards.
entropy_target (float): The desired entropy of the policy. Usually -env.action_space.shape[0]
@@ -32,9 +31,8 @@

def __init__(self,
policy,
q_1,
q_2,
v,
q1,
q2,
replay_buffer,
discount_factor=0.99,
entropy_target=-2.,
@@ -43,15 +41,14 @@ def __init__(self,
replay_start_size=5000,
temperature_initial=0.1,
update_frequency=1,
writer=DummyWriter()
logger=DummyLogger()
):
# objects
self.policy = policy
self.v = v
self.q_1 = q_1
self.q_2 = q_2
self.q1 = q1
self.q2 = q2
self.replay_buffer = replay_buffer
self.writer = writer
self.logger = logger
# hyperparameters
self.discount_factor = discount_factor
self.entropy_target = entropy_target
@@ -78,34 +75,37 @@ def _train(self):
(states, actions, rewards, next_states, _) = self.replay_buffer.sample(self.minibatch_size)

# compute targets for Q and V
_actions, _log_probs = self.policy.no_grad(states)
q_targets = rewards + self.discount_factor * self.v.target(next_states)
v_targets = torch.min(
self.q_1.target(states, _actions),
self.q_2.target(states, _actions),
) - self.temperature * _log_probs
next_actions, next_log_probs = self.policy.no_grad(next_states)
q_targets = rewards + self.discount_factor * (torch.min(
self.q1.target(next_states, next_actions),
self.q2.target(next_states, next_actions),
) - self.temperature * next_log_probs)

# update Q and V-functions
self.q_1.reinforce(mse_loss(self.q_1(states, actions), q_targets))
self.q_2.reinforce(mse_loss(self.q_2(states, actions), q_targets))
self.v.reinforce(mse_loss(self.v(states), v_targets))
q1_loss = mse_loss(self.q1(states, actions), q_targets)
self.q1.reinforce(q1_loss)
q2_loss = mse_loss(self.q2(states, actions), q_targets)
self.q2.reinforce(q2_loss)

# update policy
_actions2, _log_probs2 = self.policy(states)
loss = (-self.q_1(states, _actions2) + self.temperature * _log_probs2).mean()
new_actions, new_log_probs = self.policy(states)
q_values = self.q1(states, new_actions)
loss = -(q_values - self.temperature * new_log_probs).mean()
self.policy.reinforce(loss)
self.q_1.zero_grad()
self.q1.zero_grad()

# adjust temperature
temperature_grad = (_log_probs + self.entropy_target).mean()
temperature_grad = (new_log_probs + self.entropy_target).mean() * self.temperature
self.temperature = max(0, self.temperature + self.lr_temperature * temperature_grad.detach())

# additional debugging info
self.writer.add_loss('entropy', -_log_probs.mean())
self.writer.add_loss('v_mean', v_targets.mean())
self.writer.add_loss('r_mean', rewards.mean())
self.writer.add_loss('temperature_grad', temperature_grad)
self.writer.add_loss('temperature', self.temperature)
self.logger.add_info('entropy', -new_log_probs.mean())
self.logger.add_info('q_values', q_values.mean())
self.logger.add_info('rewards', rewards.mean())
self.logger.add_info('normalized_q1_error', q1_loss / q_targets.var())
self.logger.add_info('normalized_q2_error', q2_loss / q_targets.var())
self.logger.add_info('temperature', self.temperature)
self.logger.add_info('temperature_grad', temperature_grad)

def _should_train(self):
self._frames_seen += 1
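The sac.py hunks remove the separate state-value network `v` and bootstrap the Q targets directly from the target Q networks at the next state, using actions sampled from the current policy: target = r + γ · (min(Q1', Q2')(s', a') − temperature · log π(a'|s')). They also scale the temperature-gradient estimate by the current temperature. The snippet below is a small numeric sketch of those two computations; the tensors are made up and stand in for the replay-buffer sample and network outputs.

```
import torch

# Made-up stand-ins for one sampled minibatch and the quantities SAC needs;
# in the library these come from the replay buffer, the target Q networks,
# and the current policy.
rewards = torch.tensor([1.0, 0.0])
discount_factor = 0.99
temperature = 0.1
next_log_probs = torch.tensor([-1.2, -0.8])   # log pi(a' | s') for sampled a'
q1_target_values = torch.tensor([10.0, 5.0])  # q1.target(next_states, next_actions)
q2_target_values = torch.tensor([9.5, 5.5])   # q2.target(next_states, next_actions)

# new target from the diff: r + gamma * (min(Q1', Q2') - temperature * log pi)
q_targets = rewards + discount_factor * (
    torch.min(q1_target_values, q2_target_values)
    - temperature * next_log_probs
)
print(q_targets)  # approximately tensor([10.5238, 5.0292])

# the temperature update now scales its gradient estimate by the current
# temperature before taking the gradient step
entropy_target = -2.0
lr_temperature = 1e-3
temperature_grad = ((next_log_probs + entropy_target).mean() * temperature).item()
temperature = max(0, temperature + lr_temperature * temperature_grad)
print(temperature)  # approximately 0.0997
```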
10 changes: 6 additions & 4 deletions all/agents/vac.py
@@ -19,7 +19,7 @@ class VAC(ParallelAgent):
discount_factor (float): Discount factor for future rewards.
n_envs (int): Number of parallel actors/environments
n_steps (int): Number of timesteps per rollout. Updates are performed once per rollout.
writer (Writer): Used for logging.
logger (Logger): Used for logging.
'''

def __init__(self, features, v, policy, discount_factor=1):
@@ -53,11 +53,13 @@ def _train(self, state, reward):
# compute losses
value_loss = mse_loss(values, targets)
policy_loss = -(advantages * self._distribution.log_prob(self._action)).mean()
loss = value_loss + policy_loss

# backward pass
self.v.reinforce(value_loss)
self.policy.reinforce(policy_loss)
self.features.reinforce()
loss.backward()
self.v.step(loss=value_loss)
self.policy.step(loss=policy_loss)
self.features.step()


VACTestAgent = A2CTestAgent