diff --git a/README.md b/README.md index e5e708b39..c036b5fe2 100644 --- a/README.md +++ b/README.md @@ -35,6 +35,7 @@ Read the ALF documentation [here](https://alf.readthedocs.io/). |[MuZero](alf/algorithms/muzero_algorithm.py)|Model-based RL|Schrittwieser et al. "Mastering Atari, Go, Chess and Shogi by Planning with a Learned Model" [arXiv:1911.08265](https://arxiv.org/abs/1911.08265)| |[BC](alf/algorithms/bc_algorithm.py)|Offline RL|Pomerleau "ALVINN: An Autonomous Land Vehicle in a Neural Network" [NeurIPS 1988](https://papers.nips.cc/paper/1988/hash/812b4ba287f5ee0bc9d43bbf5bbe87fb-Abstract.html)
Bain et al. "A framework for behavioural cloning" [Machine Intelligence 1999](http://www.cse.unsw.edu.au/~claude/papers/MI15.pdf)| |[Causal BC](alf/algorithms/causal_bc_algorithm.py)|Offline RL|Swamy et al. "Causal Imitation Learning under Temporally Correlated Noise" [ICML2022](https://proceedings.mlr.press/v162/swamy22a/swamy22a.pdf)| +|[SMODICE](alf/algorithms/smodice_algorithm.py)|Offline RL|Ma et al. "Versatile Offline Imitation Learning via State Occupancy Matching" [ICML2022](https://arxiv.org/abs/2202.02433)| |[IQL](alf/algorithms/iql_algorithm.py)|Offline RL|Kostrikov, et al. "Offline Reinforcement Learning with Implicit Q-Learning" [arXiv:2110.06169](https://arxiv.org/abs/2110.06169)| |[SEditor](alf/algorithms/seditor_algorithm.py)|Offline/Safe RL|Yu et al. "Towards Safe Reinforcement Learning with a Safety Editor Policy" [NeurIPS 2022](https://proceedings.neurips.cc/paper_files/paper/2022/file/11afefdd848d1bc9ac9f1604d9f45817-Paper-Conference.pdf)| |[MERLIN](alf/algorithms/merlin_algorithm.py)|Unsupervised learning|Wayne et al. "Unsupervised Predictive Memory in a Goal-Directed Agent"[arXiv:1803.10760](https://arxiv.org/abs/1803.10760)| diff --git a/alf/algorithms/algorithm.py b/alf/algorithms/algorithm.py index b613b8b0f..6c478fc5c 100644 --- a/alf/algorithms/algorithm.py +++ b/alf/algorithms/algorithm.py @@ -2132,7 +2132,10 @@ def _hybrid_update(self, experience, batch_info, offline_experience, else: loss_info = offline_loss_info - params = self._backward_and_gradient_update(loss_info.loss * weight) + params, gns = self._backward_and_gradient_update( + loss_info.loss * weight) + + loss_info = loss_info._replace(gns=gns) if self._RL_train: # for now, there is no need to do a hybrid after update diff --git a/alf/algorithms/smodice_algorithm.py b/alf/algorithms/smodice_algorithm.py new file mode 100644 index 000000000..6e0afbfd1 --- /dev/null +++ b/alf/algorithms/smodice_algorithm.py @@ -0,0 +1,418 @@ +# Copyright (c) 2023 Horizon Robotics and ALF Contributors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import torch
+import torch.nn.functional as F
+
+import alf
+from alf.algorithms.config import TrainerConfig
+from alf.algorithms.off_policy_algorithm import OffPolicyAlgorithm
+from alf.data_structures import TimeStep, LossInfo, namedtuple
+from alf.data_structures import AlgStep
+from alf.networks import EncodingNetwork, ActorNetwork, ValueNetwork, CriticNetwork, Network
+from alf.tensor_specs import TensorSpec, BoundedTensorSpec
+from alf.utils import dist_utils
+from alf.data_structures import AlgStep, StepType
+from alf.utils import losses, tensor_utils
+from alf.algorithms.algorithm import Algorithm
+
+SmoState = namedtuple("SmoState", ["actor"], default_value=())
+
+SmoInfo = namedtuple(
+    "SmoInfo",
+    [
+        "actor",
+        "value",
+        "discriminator_loss",
+        "reward",
+        "discount",
+        # online
+        "action",
+        "action_distribution"
+    ],
+    default_value=())
+
+SmoCriticInfo = namedtuple("SmoCriticInfo",
+                           ["values", "initial_v_values", "is_first"])
+
+SmoLossInfo = namedtuple(
+    "SmoLossInfo", ["actor", "grad_penalty"], default_value=())
+
+
+@alf.configurable
+class SmodiceAlgorithm(OffPolicyAlgorithm):
+    r"""SMODICE algorithm.
+
+    SMODICE is an offline imitation learning approach that learns a policy
+    :math:`\pi_{\theta}(a|s)`, a function that maps an input observation
+    :math:`s` to an action :math:`a`. The parameters :math:`\theta` of this
+    policy are learned by maximizing the weighted probability of the dataset
+    actions on the training data :math:`D`:
+    :math:`\max_{\theta} E_{(s,a)\sim D}\left[w(s, s') \log \pi_{\theta}(a|s)\right]`,
+    where :math:`(s, s')` is a state transition pair and :math:`w(s, s')` is
+    the importance weight derived from state occupancy matching.
+
+    Reference:
+    ::
+        Ma et al. "Versatile Offline Imitation Learning via State Occupancy
+        Matching", ICML 2022.
+    """
+
+    def __init__(self,
+                 observation_spec,
+                 action_spec: BoundedTensorSpec,
+                 reward_spec=TensorSpec(()),
+                 actor_network_cls=ActorNetwork,
+                 v_network_cls=ValueNetwork,
+                 discriminator_network_cls=None,
+                 actor_optimizer=None,
+                 value_optimizer=None,
+                 discriminator_optimizer=None,
+                 gamma: float = 0.99,
+                 f: str = "chi",
+                 gradient_penalty_weight: float = 1,
+                 env=None,
+                 config: TrainerConfig = None,
+                 checkpoint=None,
+                 debug_summaries=False,
+                 epsilon_greedy=None,
+                 name="SmodiceAlgorithm"):
+        """
+        Args:
+            observation_spec (nested TensorSpec): representing the observations.
+            action_spec (nested BoundedTensorSpec): representing the actions; can
+                be a mixture of discrete and continuous actions. The number of
+                continuous actions can be arbitrary while only one discrete
+                action is allowed currently. If it's a mixture, then it must be
+                a tuple/list ``(discrete_action_spec, continuous_action_spec)``.
+            reward_spec (TensorSpec): a rank-1 or rank-0 tensor spec representing
+                the reward(s). For interface compatibility purposes; not actually
+                used in SmodiceAlgorithm.
+            actor_network_cls (Callable): is used to construct the actor network.
+                The constructed actor network outputs an action distribution and
+                is used to generate continuous actions.
+            v_network_cls (Callable): is used to construct the value network.
+            discriminator_network_cls (Callable): is used to construct the
+                discriminator.
+            actor_optimizer (torch.optim.Optimizer): the optimizer for the actor
+                network.
+            value_optimizer (torch.optim.Optimizer): the optimizer for the value
+                network.
+            discriminator_optimizer (torch.optim.Optimizer): the optimizer for
+                the discriminator.
+            gamma (float): the discount factor.
+            f (str): the function form for the f-divergence. Currently supports
+                "chi" and "kl".
+            gradient_penalty_weight (float): the weight for the discriminator
+                gradient penalty.
+            env (Environment): The environment to interact with. ``env`` is a
+                batched environment, which means that it runs multiple
+                simulations simultaneously. ``env`` only needs to be provided to
+                the root algorithm.
+            config (TrainerConfig): config for training. It only needs to be
+                provided to the algorithm which performs ``train_iter()`` by
+                itself.
+            checkpoint (None|str): a string in the format of "prefix@path",
+                where the "prefix" is the multi-step path to the contents in the
+                checkpoint to be loaded. "path" is the full path to the checkpoint
+                file saved by ALF. Refer to ``Algorithm`` for more details.
+            debug_summaries (bool): True if debug summaries should be created.
+            epsilon_greedy (float): a floating value in [0,1], representing the
+                chance of action sampling instead of taking argmax. This can
+                help prevent a dead loop in some deterministic environment like
+                Breakout. Only used for evaluation. If None, its value is taken
+                from ``config.epsilon_greedy`` and then
+                ``alf.get_config_value(TrainerConfig.epsilon_greedy)``.
+            name (str): The name of this algorithm.
+        """
+
+        if epsilon_greedy is None:
+            epsilon_greedy = alf.utils.common.get_epsilon_greedy(config)
+        self._epsilon_greedy = epsilon_greedy
+
+        actor_network = actor_network_cls(
+            input_tensor_spec=observation_spec, action_spec=action_spec)
+
+        value_network = v_network_cls(input_tensor_spec=observation_spec)
+
+        discriminator_net = discriminator_network_cls(
+            input_tensor_spec=(observation_spec, action_spec))
+
+        action_state_spec = actor_network.state_spec
+        super().__init__(
+            observation_spec=observation_spec,
+            action_spec=action_spec,
+            reward_spec=reward_spec,
+            train_state_spec=SmoState(actor=action_state_spec),
+            predict_state_spec=SmoState(actor=action_state_spec),
+            reward_weights=None,
+            env=env,
+            config=config,
+            checkpoint=checkpoint,
+            debug_summaries=debug_summaries,
+            name=name)
+
+        self._actor_network = actor_network
+        self._value_network = value_network
+        self._discriminator_net = discriminator_net
+        self._gradient_penalty_weight = gradient_penalty_weight
+
+        assert actor_optimizer is not None
+        if actor_optimizer is not None and actor_network is not None:
+            self.add_optimizer(actor_optimizer, [actor_network])
+
+        assert value_optimizer is not None
+        if value_optimizer is not None and value_network is not None:
+            self.add_optimizer(value_optimizer, [value_network])
+
+        assert discriminator_optimizer is not None
+        if discriminator_optimizer is not None and discriminator_net is not None:
+            self.add_optimizer(discriminator_optimizer, [discriminator_net])
+
+        self._gamma = gamma
+        self._f = f
+        assert f in ["chi", "kl"], "only supports the 'chi' or 'kl' form"
+
+        # f-divergence functions
+        if self._f == 'chi':
+            self._f_fn = lambda x: 0.5 * (x - 1)**2
+            self._f_star_prime = lambda x: torch.relu(x + 1)
+            self._f_star = lambda x: 0.5 * x**2 + x
+        elif self._f == 'kl':
+            self._f_fn = lambda x: x * torch.log(x + 1e-10)
+            self._f_star_prime = lambda x: torch.exp(x - 1)
+
+    def _predict_action(self, observation, state):
+        action_dist, actor_network_state = self._actor_network(
+            observation, state=state)
+
+        return action_dist, actor_network_state
+
+    def rollout_step(self, inputs: TimeStep, state: SmoState):
+        action_dist, new_state = self._predict_action(
+            inputs.observation, state=state.actor)
+        action = dist_utils.epsilon_greedy_sample(action_dist,
+                                                  self._epsilon_greedy)
+
+        info = SmoInfo()
+        return AlgStep(
+            output=action,
+            state=SmoState(actor=new_state),
+            info=info._replace(action=action, action_distribution=action_dist))
+
+    def predict_step(self, inputs: TimeStep, state: SmoState):
+        action_dist, new_state = self._predict_action(
+            inputs.observation, state=state.actor)
+        action = dist_utils.epsilon_greedy_sample(action_dist,
+                                                  self._epsilon_greedy)
+
+        return AlgStep(output=action, state=SmoState(actor=new_state))
+
+    def _actor_train_step_imitation(self, inputs: TimeStep, rollout_info,
+                                    action_dist):
+
+        exp_action = rollout_info.action
+        im_loss = -action_dist.log_prob(exp_action)
+
+        actor_info = LossInfo(loss=im_loss, extra=SmoLossInfo(actor=im_loss))
+
+        return actor_info
+
+    def predict_reward(self, inputs, rollout_info, state=()):
+        with torch.no_grad():
+            observation = inputs.observation
+            action = rollout_info.action
+            expert_logits, _ = self._discriminator_net((observation, action),
+                                                       state)
+            # self._discriminator_net.eval()
+            s = torch.sigmoid(expert_logits)
+            # log(d^E/d^O)
+            # reward = - (1/s-1).log()
+            reward = s.log() - (1 - s).log()
+            return reward
+
+    def _discriminator_train_step(self, inputs: TimeStep, state, rollout_info,
+                                  is_expert):
+        """Train the discriminator.
+
+        Expert (offline) samples are labeled 1 and regularized with a gradient
+        penalty; non-expert (online) samples are labeled 0.
+        """
+        observation = inputs.observation
+        action = rollout_info.action
+
+        discriminator_inputs = (observation, action)
+
+        if is_expert:
+            # turn on input gradient for gradient penalty in the case of expert data
+            for e in discriminator_inputs:
+                e.requires_grad = True
+
+        expert_logits, _ = self._discriminator_net(discriminator_inputs, state)
+
+        if is_expert:
+            grads = torch.autograd.grad(
+                outputs=expert_logits,
+                inputs=discriminator_inputs,
+                grad_outputs=torch.ones_like(expert_logits),
+                create_graph=True,
+                retain_graph=True,
+                only_inputs=True)
+
+            grad_pen = 0
+            for g in grads:
+                grad_pen += self._gradient_penalty_weight * (
+                    g.norm(2, dim=1) - 1).pow(2)
+
+            label = torch.ones(expert_logits.size())
+        else:
+            label = torch.zeros(expert_logits.size())
+            grad_pen = ()
+
+        expert_loss = F.binary_cross_entropy_with_logits(
+            expert_logits, label, reduction='none')
+
+        return LossInfo(
+            loss=expert_loss if grad_pen == () else expert_loss + grad_pen,
+            extra=SmoLossInfo(actor=expert_loss, grad_penalty=grad_pen))
+
+    def value_train_step(self, inputs: TimeStep, state, rollout_info):
+        observation = inputs.observation
+        initial_observation = observation
+        initial_v_values, _ = self._value_network(initial_observation)
+
+        # mini-batch len
+        v_values, _ = self._value_network(observation)
+        info = SmoCriticInfo(
+            initial_v_values=initial_v_values,
+            values=v_values,
+            is_first=inputs.is_first())
+        return info
+
+    def train_step(self,
+                   inputs: TimeStep,
+                   state,
+                   rollout_info,
+                   pre_train=False):
+
+        action_dist, new_state = self._predict_action(
+            inputs.observation, state=state.actor)
+
+        actor_loss = self._actor_train_step_imitation(inputs, rollout_info,
+                                                      action_dist)
+
+        value_info = self.value_train_step(
+            inputs, state=(), rollout_info=rollout_info)
+
+        expert_disc_loss = self._discriminator_train_step(
+            inputs, state, rollout_info, is_expert=False)
+
+        if self._debug_summaries and alf.summary.should_record_summaries():
+            with alf.summary.scope(self._name):
+                alf.summary.scalar("imitation_loss_online",
+                                   actor_loss.loss.mean())
+                alf.summary.scalar("discriminator_loss_online",
+                                   expert_disc_loss.extra.actor.mean())
+
+        # use
predicted reward + reward = self.predict_reward(inputs, rollout_info) + + info = SmoInfo( + actor=actor_loss, + value=value_info, + discriminator_loss=expert_disc_loss, + reward=reward, + discount=inputs.discount) + + return AlgStep( + rollout_info.action, state=SmoState(actor=new_state), info=info) + + def train_step_offline(self, + inputs: TimeStep, + state, + rollout_info, + pre_train=False): + action_dist, new_state = self._predict_action( + inputs.observation, state=state.actor) + + actor_loss = self._actor_train_step_imitation(inputs, rollout_info, + action_dist) + + value_info = self.value_train_step( + inputs, state=(), rollout_info=rollout_info) + + expert_disc_loss = self._discriminator_train_step( + inputs, state, rollout_info, is_expert=True) + + if self._debug_summaries and alf.summary.should_record_summaries(): + with alf.summary.scope(self._name): + alf.summary.scalar("imitation_loss_offline", + actor_loss.loss.mean()) + alf.summary.scalar("discriminator_loss_offline", + expert_disc_loss.loss.mean()) + alf.summary.scalar("grad_penalty", + expert_disc_loss.extra.grad_penalty.mean()) + # use predicted reward + reward = self.predict_reward(inputs, rollout_info) + + info = SmoInfo( + actor=actor_loss, + value=value_info, + discriminator_loss=expert_disc_loss, + reward=reward, + discount=inputs.discount) + + return AlgStep( + rollout_info.action, state=SmoState(actor=new_state), info=info) + + def calc_loss( + self, + info, + ): + + # [mini_batch_len, batch_size] + values = info.value.values + initial_v_values = info.value.initial_v_values + is_first = info.value.is_first + + reward = info.reward[1:] + v = values[:-1] + v_next = values[1:] + discount = info.discount[1:] + + e_v = reward + (1 - discount) * self._gamma * v_next - v + + v_loss0 = (1 - self._gamma) * initial_v_values * is_first + + if self._f == 'kl': + v_loss1 = torch.log(torch.mean( + torch.exp(e_v))).unsqueeze(-1).expand(e_v.shape[0], 1) + else: + v_loss1 = (self._f_star(e_v)) + + v_loss1 = tensor_utils.tensor_extend_zero(v_loss1) + + v_loss = v_loss0 + v_loss1 + + # weighted policy loss + # # extracting importance weight (Equation 21 in the paper) + if self._f == 'kl': + w_e = torch.exp(e_v) + else: + w_e = self._f_star_prime(e_v) + + actor_loss_tensor = info.actor.loss + + # [T, B] + discriminator_loss = info.discriminator_loss.loss + + # detach the weight from policy loss + w_a_loss = actor_loss_tensor[:-1] * w_e.detach() + w_a_loss = tensor_utils.tensor_extend_zero(w_a_loss) + + return LossInfo( + loss=w_a_loss + v_loss + discriminator_loss, + extra=SmoLossInfo(actor=info.actor.extra)) diff --git a/alf/bin/train_play_test.py b/alf/bin/train_play_test.py index a0d2e4208..a02a9c86f 100644 --- a/alf/bin/train_play_test.py +++ b/alf/bin/train_play_test.py @@ -697,6 +697,11 @@ def test_causal_bc_pendulum(self): conf_file='./hybrid_rl/causal_bc_pendulum_conf.py', extra_train_params=OFF_POLICY_TRAIN_PARAMS) + def test_smodice_pendulum(self): + self._test( + conf_file='./smodice_pendulum_conf.py', + extra_train_params=OFF_POLICY_TRAIN_PARAMS) + def test_iql_pendulum(self): self._test( conf_file='./hybrid_rl/iql_pendulum_conf.py', diff --git a/alf/environments/make_penv.py b/alf/environments/make_penv.py index ddef5b805..136ce0236 100644 --- a/alf/environments/make_penv.py +++ b/alf/environments/make_penv.py @@ -30,6 +30,7 @@ def gen_penv(): cmd = (f"g++ -O3 -Wall -shared -std=c++17 -fPIC -fvisibility=hidden " f"`{python} -m pybind11 --includes` parallel_environment.cpp " f"-o _penv`{python}-config --extension-suffix` 
-lrt") + ret = os.system(cmd) assert ret == 0, "Fail to execute " + cmd diff --git a/alf/examples/data_collection_carla_conf.py b/alf/examples/data_collection_carla_conf.py index 64205b07d..8a4f0f5e2 100644 --- a/alf/examples/data_collection_carla_conf.py +++ b/alf/examples/data_collection_carla_conf.py @@ -27,7 +27,7 @@ # This is an example config file for data collection in CARLA. # the desired replay buffer size for collection -# 100 is just an example. Should set it to he actual desired size. +# 100 is just an example. Should set it to the actual desired size. replay_buffer_length = 100 # the desired environment for data collection diff --git a/alf/examples/smodice_bipedal_walker_conf.py b/alf/examples/smodice_bipedal_walker_conf.py new file mode 100644 index 000000000..86a07bac8 --- /dev/null +++ b/alf/examples/smodice_bipedal_walker_conf.py @@ -0,0 +1,77 @@ +# Copyright (c) 2022 Horizon Robotics and ALF Contributors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import partial +import torch + +import alf +from alf.algorithms.agent import Agent +from alf.algorithms.smodice_algorithm import SmodiceAlgorithm +from alf.utils import math_ops + +from alf.examples import sac_bipedal_walker_conf + +# default params +lr = 1e-4 +encoding_dim = 256 +fc_layers_params = (encoding_dim, ) * 2 +activation = torch.relu_ + +offline_buffer_length = None +offline_buffer_dir = [ + "/home/haichaozhang/data/DATA/sac_bipedal_baseline/train/algorithm/ckpt-80000-replay_buffer" +] + +alf.config('Agent', rl_algorithm_cls=SmodiceAlgorithm, optimizer=None) + +proj_net = partial( + alf.networks.StableNormalProjectionNetwork, + state_dependent_std=True, + squash_mean=False, + scale_distribution=True, + min_std=1e-3, + max_std=10) + +actor_network_cls = partial( + alf.networks.ActorDistributionNetwork, + fc_layer_params=fc_layers_params, + activation=activation, + continuous_projection_net_ctor=proj_net) + +v_network_cls = partial( + alf.networks.ValueNetwork, + fc_layer_params=fc_layers_params, + activation=activation) + +action_spec = alf.get_action_spec() +discriminator_network_cls = partial( + alf.networks.CriticNetwork, joint_fc_layer_params=fc_layers_params) + +alf.config( + 'SmodiceAlgorithm', + actor_network_cls=actor_network_cls, + v_network_cls=v_network_cls, + discriminator_network_cls=discriminator_network_cls, + actor_optimizer=alf.optimizers.Adam(lr=lr), + # add weight decay to the v_net following smodice paper + value_optimizer=alf.optimizers.Adam(lr=lr, weight_decay=1e-4), + discriminator_optimizer=alf.optimizers.Adam(lr=lr), + gradient_penalty_weight=0.1, +) + +# training config +alf.config( + "TrainerConfig", + offline_buffer_dir=offline_buffer_dir, + offline_buffer_length=offline_buffer_length) diff --git a/alf/examples/smodice_pendulum_conf.py b/alf/examples/smodice_pendulum_conf.py new file mode 100644 index 000000000..0ca91fd97 --- /dev/null +++ b/alf/examples/smodice_pendulum_conf.py @@ -0,0 +1,103 @@ +# Copyright (c) 2022 Horizon Robotics and ALF 
Contributors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from functools import partial
+import torch
+
+import alf
+from alf.algorithms.agent import Agent
+from alf.algorithms.smodice_algorithm import SmodiceAlgorithm
+from alf.utils import math_ops
+
+# default params
+lr = 1e-4
+encoding_dim = 256
+fc_layers_params = (encoding_dim, ) * 2
+activation = torch.relu_
+
+offline_buffer_length = None
+offline_buffer_dir = [
+    "./hybrid_rl/replay_buffer_data/pendulum_replay_buffer_from_sac_10k"
+]
+
+env_name = "Pendulum-v0"
+
+alf.config(
+    "create_environment", env_name=env_name, num_parallel_environments=1)
+
+alf.config('Agent', rl_algorithm_cls=SmodiceAlgorithm)
+
+alf.config(
+    'TrainerConfig',
+    algorithm_ctor=Agent,
+    whole_replay_buffer_training=False,
+    clear_replay_buffer=False)
+
+proj_net = partial(
+    alf.networks.StableNormalProjectionNetwork,
+    state_dependent_std=True,
+    squash_mean=False,
+    scale_distribution=True,
+    min_std=1e-3,
+    max_std=10)
+
+actor_network_cls = partial(
+    alf.networks.ActorDistributionNetwork,
+    fc_layer_params=fc_layers_params,
+    activation=activation,
+    continuous_projection_net_ctor=proj_net)
+
+v_network_cls = partial(
+    alf.networks.ValueNetwork,
+    fc_layer_params=fc_layers_params,
+    activation=activation)
+
+action_spec = alf.get_action_spec()
+discriminator_network_cls = partial(
+    alf.networks.CriticNetwork, joint_fc_layer_params=fc_layers_params)
+
+alf.config(
+    'SmodiceAlgorithm',
+    actor_network_cls=actor_network_cls,
+    v_network_cls=v_network_cls,
+    discriminator_network_cls=discriminator_network_cls,
+    actor_optimizer=alf.optimizers.Adam(lr=lr),
+    # add weight decay to the value network, following the SMODICE paper
+    value_optimizer=alf.optimizers.Adam(lr=lr, weight_decay=1e-4),
+    discriminator_optimizer=alf.optimizers.Adam(lr=lr),
+    gradient_penalty_weight=0.1,
+)
+
+num_iterations = 1000000
+
+# training config
+alf.config(
+    "TrainerConfig",
+    initial_collect_steps=1000,
+    num_updates_per_train_iter=1,
+    num_iterations=num_iterations,
+    # setting rl_train_after_update_steps to a value larger than
+    # num_iterations would disable online RL training; 0 enables joint
+    # online/offline training from the start
+    rl_train_after_update_steps=0,
+    mini_batch_size=256,
+    mini_batch_length=2,
+    unroll_length=1,
+    offline_buffer_dir=offline_buffer_dir,
+    offline_buffer_length=offline_buffer_length,
+    num_checkpoints=1,
+    debug_summaries=True,
+    evaluate=True,
+    eval_interval=1000,
+    num_eval_episodes=3,
+)
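
A minimal sketch of the SMODICE objective that `SmodiceAlgorithm.calc_loss` assembles, written in plain PyTorch for the chi^2 divergence and following the paper's formulation. The tensor names, shapes, the `not_done`/`is_first` masks, and the `.mean()` reductions below are illustrative assumptions, not the exact reductions used in the code above.

```python
# Self-contained illustration of the SMODICE losses (chi^2 form), assuming
# random placeholder tensors in place of network outputs.
import torch

gamma = 0.99
T, B = 2, 256  # mini_batch_length, mini_batch_size (placeholders)

# Quantities the algorithm obtains from its networks:
log_pi = torch.randn(T, B)       # log pi(a_t | s_t) of dataset actions (BC term)
v = torch.randn(T, B)            # V(s_t) from the value network
reward = torch.randn(T - 1, B)   # log d_E(s, a) - log d_O(s, a) from the discriminator
not_done = torch.ones(T - 1, B)  # 1 for non-terminal transitions (placeholder mask)
initial_v = torch.randn(B)       # V(s_0) for initial states
is_first = torch.ones(B)         # mask selecting true initial states (placeholder)

# chi^2 conjugate and its derivative, as defined in __init__ above.
f_star = lambda x: 0.5 * x ** 2 + x
f_star_prime = lambda x: torch.relu(x + 1)

# Bellman residual e_V = r + gamma * (1 - done) * V(s') - V(s).
e_v = reward + gamma * not_done * v[1:] - v[:-1]

# Value objective: (1 - gamma) * E[V(s_0)] + E[f*(e_V)].
v_loss = (1 - gamma) * (initial_v * is_first).mean() + f_star(e_v).mean()

# Importance weights (Eq. 21 in the paper), detached so they only reweight
# the behavior-cloning term instead of back-propagating into V.
w = f_star_prime(e_v).detach()
policy_loss = -(w * log_pi[:-1]).mean()

total_loss = v_loss + policy_loss
print(total_loss.item())
```

The `reward` term corresponds to `predict_reward` (log d^E - log d^O from the discriminator), and detaching the weights mirrors `w_e.detach()` in `calc_loss`, so the weights only rescale the imitation loss.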