add SPiRL closed-loop model for block stacking, update READMEs

kpertsch · kpertsch · commit da1460c8a020 · 2021-04-28T21:36:22.000+02:00
diff --git a/README.md b/README.md
@@ -16,6 +16,8 @@ This is the official PyTorch implementation of the paper "**Accelerating Reinfor
 (CoRL 2020).
 
 ## Updates
+- **[Apr 2021]**: extended the improved (closed-loop) SPiRL version to support image-based observations
+(see [example commands](spirl/configs/skill_prior_learning/block_stacking/hierarchical_cl/README.md))
 - **[Mar 2021]**: added an improved version of SPiRL with closed-loop skill decoder 
 (see [example commands](spirl/configs/skill_prior_learning/kitchen/hierarchical_cl/README.md))
 
diff --git a/docs/resources/block_stacking_sparse_results.png b/docs/resources/block_stacking_sparse_results.png
diff --git a/spirl/configs/hrl/block_stacking/spirl_cl/conf.py b/spirl/configs/hrl/block_stacking/spirl_cl/conf.py
@@ -0,0 +1,157 @@
+import os
+import copy
+
+from spirl.utils.general_utils import AttrDict
+from spirl.rl.components.agent import FixedIntervalHierarchicalAgent
+from spirl.rl.components.critic import SplitObsMLPCritic
+from spirl.rl.components.sampler import ACMultiImageAugmentedHierarchicalSampler
+from spirl.rl.components.replay_buffer import UniformReplayBuffer
+from spirl.rl.policies.prior_policies import ACLearnedPriorAugmentedPIPolicy
+from spirl.rl.envs.block_stacking import HighStack11StackEnvV0, SparseHighStack11StackEnvV0
+from spirl.rl.agents.ac_agent import SACAgent
+from spirl.rl.agents.prior_sac_agent import ActionPriorSACAgent
+from spirl.rl.policies.cl_model_policies import ACClModelPolicy
+from spirl.models.closed_loop_spirl_mdl import ImageClSPiRLMdl
+from spirl.configs.default_data_configs.block_stacking import data_spec
+
+
+current_dir = os.path.dirname(os.path.realpath(__file__))  # directory that contains this config file
+
+notes = 'used to test the RL implementation'  # NOTE(review): generic note text — confirm it still describes this experiment
+
+configuration = {
+    'seed': 42,
+    'agent': FixedIntervalHierarchicalAgent,
+    'environment': SparseHighStack11StackEnvV0,  # sparse-reward block stacking env imported above
+    'sampler': ACMultiImageAugmentedHierarchicalSampler,
+    'data_dir': '.',
+    'num_epochs': 100,
+    'max_rollout_len': 1000,
+    'n_steps_per_epoch': 1e5,  # float literal (100000.0); NOTE(review): confirm the consumer accepts a float
+    'n_warmup_steps': 5e3,  # float literal (5000.0)
+}
+configuration = AttrDict(configuration)  # re-bind the plain dict as the project's AttrDict
+
+
+# Replay Buffer
+replay_params = AttrDict(
+    capacity=1e5,  # float literal (100000.0); NOTE(review): confirm the buffer accepts a float capacity
+    dump_replay=False,
+)
+
+# Observation Normalization
+obs_norm_params = AttrDict(
+)
+
+sampler_config = AttrDict(
+    n_frames=2,  # same value as ll_model_params.n_input_frames below — presumably must stay in sync (TODO confirm)
+)
+
+base_agent_params = AttrDict(  # shared settings; deep-copied into both the LL and HL agent configs below
+    batch_size=256,
+    replay=UniformReplayBuffer,
+    replay_params=replay_params,
+    clip_q_target=False,
+)
+
+
+###### Low-Level ######
+# LL Policy Model
+ll_model_params = AttrDict(
+    state_dim=data_spec.state_dim,
+    action_dim=data_spec.n_actions,
+    n_rollout_steps=10,  # also used as the HL decision interval (agent_config.hl_interval below)
+    kl_div_weight=1e-2,  # NOTE(review): skill_prior_learning/block_stacking/hierarchical_cl/conf.py uses 1e-3 — confirm the difference is intended
+    prior_input_res=data_spec.res,
+    n_input_frames=2,
+    cond_decode=True,
+)
+
+# LL Policy
+ll_policy_params = AttrDict(
+    policy_model=ImageClSPiRLMdl,
+    policy_model_params=ll_model_params,
+    policy_model_checkpoint=os.path.join(os.environ["EXP_DIR"], "skill_learning/block_stacking/hierarchical_cl"),  # raises KeyError at import time if EXP_DIR is unset
+    initial_log_sigma=-50.,  # presumably makes the LL policy output near-deterministic — TODO confirm
+)
+ll_policy_params.update(ll_model_params)  # also expose the model params as top-level keys of the policy params
+
+# LL Critic
+ll_critic_params = AttrDict(
+    action_dim=data_spec.n_actions,
+    input_dim=data_spec.state_dim,
+    output_dim=1,
+    action_input=True,
+    unused_obs_size=10,     # ignore HL policy z output in observation for LL critic; hard-coded, same value as hl_policy_params.action_dim below
+)
+
+# LL Agent
+ll_agent_config = copy.deepcopy(base_agent_params)  # deep copy so the updates below do not alter base_agent_params
+ll_agent_config.update(AttrDict(
+    policy=ACClModelPolicy,
+    policy_params=ll_policy_params,
+    critic=SplitObsMLPCritic,
+    critic_params=ll_critic_params,
+))
+
+
+###### High-Level ########
+# HL Policy
+hl_policy_params = AttrDict(
+    action_dim=10,       # z-dimension of the skill VAE
+    input_dim=data_spec.state_dim,
+    max_action_range=2.,        # prior is Gaussian with unit variance
+    prior_model=ll_policy_params.policy_model,  # the prior reuses the LL policy's model class ...
+    prior_model_params=ll_policy_params.policy_model_params,  # ... the same model params object ...
+    prior_model_checkpoint=ll_policy_params.policy_model_checkpoint,  # ... and the same checkpoint path
+)
+
+# HL Critic
+hl_critic_params = AttrDict(
+    action_dim=hl_policy_params.action_dim,
+    input_dim=hl_policy_params.input_dim,
+    output_dim=1,
+    n_layers=2,  # number of policy network layers (NOTE(review): this dict configures the HL critic — confirm wording)
+    nz_mid=256,
+    action_input=True,
+    unused_obs_size=ll_model_params.prior_input_res **2 * 3 * ll_model_params.n_input_frames,  # res * res * 3 * n_input_frames; presumably the flattened image part of the observation with 3 color channels — TODO confirm
+)
+
+# HL Agent
+hl_agent_config = copy.deepcopy(base_agent_params)  # deep copy so the updates below do not alter base_agent_params
+hl_agent_config.update(AttrDict(
+    policy=ACLearnedPriorAugmentedPIPolicy,
+    policy_params=hl_policy_params,
+    critic=SplitObsMLPCritic,
+    critic_params=hl_critic_params,
+    td_schedule_params=AttrDict(p=5.),
+))
+
+
+##### Joint Agent #######
+agent_config = AttrDict(
+    hl_agent=ActionPriorSACAgent,
+    hl_agent_params=hl_agent_config,
+    ll_agent=SACAgent,
+    ll_agent_params=ll_agent_config,
+    hl_interval=ll_model_params.n_rollout_steps,  # HL interval equals the skill length configured for the LL model (10)
+    log_videos=True,
+    update_hl=True,  # as the flag names state: HL agent is updated ...
+    update_ll=False,  # ... while LL agent updates are switched off
+)
+
+# Dataset - Random data
+data_config = AttrDict()
+data_config.dataset_spec = data_spec  # stores the imported data_spec object itself (no copy is made here)
+
+# Environment
+env_config = AttrDict(
+    name="block_stacking",
+    reward_norm=1.,
+    screen_width=data_spec.res,  # render size is taken from the dataset resolution ...
+    screen_height=data_spec.res,
+    env_config=AttrDict(camera_name='agentview',  # ... and the same resolution is repeated in the nested env_config
+                        screen_width=data_spec.res,
+                        screen_height=data_spec.res,)
+)
+
diff --git a/spirl/configs/skill_prior_learning/block_stacking/hierarchical_cl/README.md b/spirl/configs/skill_prior_learning/block_stacking/hierarchical_cl/README.md
@@ -0,0 +1,34 @@
+# Image-based SPiRL w/ Closed-Loop Skill Decoder
+
+This version of the SPiRL model uses a [closed-loop action decoder](../../../../models/closed_loop_spirl_mdl.py#L55):
+in contrast to the original SPiRL model, it takes the current environment observation as input at every skill decoding step.
+
+This image-based model is a direct extension of the
+[state-based SPiRL model with closed-loop skill decoder](../../kitchen/hierarchical_cl/README.md).
+Similar to the state-based model, we find that the image-based closed-loop model improves performance over the original
+image-based SPiRL model, particularly in tasks that require precise control.
+We evaluate it on a more challenging, sparse-reward version of the block stacking environment,
+where the agent is rewarded for the height of the tower it has built but does not receive any reward for picking up or lifting
+blocks. We find that in this challenging environment, the closed-loop skill decoder ("SPiRLv2") outperforms the original
+SPiRL model with open-loop skill decoder ("SPiRLv1").
+
+<p align="center">
+<img src="../../../../../docs/resources/block_stacking_sparse_results.png" width="400">
+</p>
+
+
+We also tried the closed-loop model on the image-based maze navigation task, but did not find it to improve performance,
+which we attribute to the easier control task that does not require closed-loop control.
+
+## Example Commands
+
+To train the image-based SPiRL model with closed-loop action decoder on the block stacking environment, run the following command:
+```
+python3 spirl/train.py --path=spirl/configs/skill_prior_learning/block_stacking/hierarchical_cl --val_data_size=160
+```
+
+To train a downstream task policy with RL using the closed-loop image-based SPiRL model
+on the sparse reward block stacking environment, run the following command:
+```
+python3 spirl/rl/train.py --path=spirl/configs/hrl/block_stacking/spirl_cl --seed=0 --prefix=SPIRLv2_block_stacking_seed0
+```
diff --git a/spirl/configs/skill_prior_learning/block_stacking/hierarchical_cl/conf.py b/spirl/configs/skill_prior_learning/block_stacking/hierarchical_cl/conf.py
@@ -0,0 +1,37 @@
+import os
+
+from spirl.models.skill_prior_mdl import SkillSpaceLogger
+from spirl.models.closed_loop_spirl_mdl import ImageClSPiRLMdl
+from spirl.utils.general_utils import AttrDict
+from spirl.configs.default_data_configs.block_stacking import data_spec
+from spirl.components.evaluator import TopOfNSequenceEvaluator
+
+
+current_dir = os.path.dirname(os.path.realpath(__file__))  # directory that contains this config file
+
+
+configuration = {
+    'model': ImageClSPiRLMdl,
+    'logger': SkillSpaceLogger,
+    'data_dir': os.path.join(os.environ['DATA_DIR'], 'block_stacking'),  # raises KeyError at import time if DATA_DIR is unset
+    'epoch_cycles_train': 10,
+    'evaluator': TopOfNSequenceEvaluator,
+    'top_of_n_eval': 100,
+    'top_comp_metric': 'mse',
+}
+configuration = AttrDict(configuration)  # re-bind the plain dict as the project's AttrDict
+
+model_config = AttrDict(
+    state_dim=data_spec.state_dim,
+    action_dim=data_spec.n_actions,
+    n_rollout_steps=10,  # contributes to the dataset subsequence length set at the bottom of this file
+    kl_div_weight=1e-3,  # NOTE(review): hrl/block_stacking/spirl_cl/conf.py sets 1e-2 for the same model — confirm the difference is intended
+    prior_input_res=data_spec.res,
+    n_input_frames=2,
+    cond_decode=True,
+)
+
+# Dataset
+data_config = AttrDict()
+data_config.dataset_spec = data_spec  # stores the imported data_spec object itself (no copy is made here)
+data_config.dataset_spec.subseq_len = model_config.n_rollout_steps + model_config.n_input_frames  # 10 + 2 = 12; presumably also modifies the shared imported data_spec — TODO confirm AttrDict semantics
diff --git a/spirl/configs/skill_prior_learning/kitchen/hierarchical_cl/README.md b/spirl/configs/skill_prior_learning/kitchen/hierarchical_cl/README.md
@@ -11,15 +11,15 @@ SPiRL model, particularly on tasks that require precise control, like in the kit
 </p>
 </img>
 
+For an implementation of the closed-loop SPiRL model that supports image observations, 
+see [here](../../block_stacking/hierarchical_cl/README.md).
+
 ## Example Commands
 
 To train the SPiRL model with closed-loop action decoder on the kitchen environment, run the following command:
 ```
 python3 spirl/train.py --path=spirl/configs/skill_prior_learning/kitchen/hierarchical_cl --val_data_size=160
 ```
-Our current implementation of the closed-loop SPiRL model only supports state-based inputs, but an extension to
-image observations is straightforward analogous to how we adapted the 
-original SPiRL model for [image inputs](../../../../models/skill_prior_mdl.py#L321).
 
 To train a downstream task policy with RL using the closed-loop SPiRL model we just trained, run the following command:
 ```
diff --git a/spirl/data/block_stacking/src/block_stacking_env.py b/spirl/data/block_stacking/src/block_stacking_env.py
@@ -963,3 +963,65 @@ def _get_reward(self):
     def _has_support(self, block, others):
         return not block.lifted or any([block.stacked_on_loose(b) and self._has_support(b, [bb for bb in others if b.name != bb.name])
                     for b in others])
+
+
+class SparseHighStackBlockStackEnv(NoOrderBlockStackEnv):  # overrides reset, episode info, reward and support check of the parent
+    """Simple reward function that just rewards the highest stacked tower."""
+    REWARD_SCALE = 1.0  # multiplier applied to max_height when computing rew_total in _get_reward
+
+    def _reset_internal(self, keep_sim_object=False):
+        super()._reset_internal(keep_sim_object)
+        self._final_height = 0.  # reset the tower height that get_episode_info reports
+
+    def get_episode_info(self):
+        ep_info = super().get_episode_info()
+        ep_info.final_height = self._final_height  # value last written by _get_reward, or 0. right after a reset
+        return ep_info
+
+    def _get_reward(self):
+        """Compute reward for stacking blocks without order."""  # NOTE(review): wording looks inherited; the code below rewards max supported height * REWARD_SCALE
+        rew_dict = AttrDict()
+
+        max_height = 0.  # largest dist_lifted among blocks that pass the checks in the loop below
+        heights, supported_heights = np.zeros(len(self._blocks)), np.zeros(len(self._blocks))  # per-block values: all blocks / only blocks passing the support check
+        for i, block in enumerate(self._blocks):
+            height = block.dist_lifted
+            heights[i] = height
+
+            # set flags (latched: each flag is only evaluated while it is still falsy and is never cleared here)
+            if not self._grasped_flag[i]:
+                self._grasped_flag[i] = block.grasped(self.gripper_pos, self.gripper_finger_dist,
+                                                      self.gripper_finger_poses)
+            if not self._lifted_flag[i]:
+                self._lifted_flag[i] = (not self._hp.restrict_grasped or self._grasped_flag[i]) and \
+                                       (not self._hp.restrict_upright or block.upright) and block.lifted
+            if not self._delivered_flag[i]:
+                self._delivered_flag[i] = (not self._hp.restrict_grasped or self._grasped_flag[i]) \
+                                          and (not self._hp.restrict_upright or block.upright) \
+                                          and any([block.above(b) for b in self._blocks if b.name != block.name])
+
+            # compute reward: a block counts only if it passes the optional grasp/upright restrictions and the support check
+            if (not self._hp.restrict_grasped or self._grasped_flag[i]) and \
+                    (not self._hp.restrict_upright or block.upright) and \
+                    self._has_support(block, [b for b in self._blocks if block.name != b.name]):
+                self._stacked_flag[i] = True  # note: _has_support is also True for blocks that are not lifted
+                supported_heights[i] = height
+                if height > max_height:
+                    max_height = height
+        self._final_height = max_height / (2 * self._hp.block_size)  # presumably height in units of one block, if block_size is a half-extent — TODO confirm
+
+        total_rew = max_height * self.REWARD_SCALE  # reward uses the raw max height, not the normalized _final_height
+
+        rew_dict["heights"] = heights.round(3)
+        rew_dict["sup_heights"] = supported_heights.round(3)
+        rew_dict["rew_total"] = np.array(total_rew).round(3)
+        rew_dict["max_height"] = np.array(self._final_height).round(3)  # despite the key name this is the normalized _final_height
+
+        self._prev_block_pos = [copy.deepcopy(b.pos) for b in self._blocks]  # update for next round of reward comp
+        self._prev_gripper_pos = copy.deepcopy(self.gripper_pos)
+
+        return rew_dict
+
+    def _has_support(self, block, others):  # True if block is not lifted, or it is stacked_on_loose some b in others that is itself supported
+        return not block.lifted or any([block.stacked_on_loose(b) and self._has_support(b, [bb for bb in others if b.name != bb.name])
+                    for b in others])
diff --git a/spirl/rl/envs/block_stacking.py b/spirl/rl/envs/block_stacking.py
@@ -3,7 +3,7 @@
 from spirl.rl.components.environment import GymEnv
 from spirl.utils.general_utils import AttrDict, ParamDict
 from spirl.data.block_stacking.src.block_stacking_env import BlockStackEnv as UnwrappedBlockStackEnv
-from spirl.data.block_stacking.src.block_stacking_env import NoOrderBlockStackEnv, HighStackBlockStackEnv
+from spirl.data.block_stacking.src.block_stacking_env import HighStackBlockStackEnv, SparseHighStackBlockStackEnv
 from spirl.data.block_stacking.src.block_task_generator import FixedSizeSingleTowerBlockTaskGenerator
 
 
@@ -92,3 +92,12 @@ def _get_default_env_config(self):
         default_env_config.table_size = (1.2, 2.4, 0.8)
         default_env_config.n_blocks = 11
         return default_env_config
+
+
+class SparseHighStack11StackEnvV0(HighStack11StackEnvV0):  # variant of HighStack11StackEnvV0 that builds a SparseHighStackBlockStackEnv
+    def _make_env(self, name):  # `name` is not used in this override
+        default_env_config = self._get_default_env_config()
+        if self._hp.env_config is not None:
+            default_env_config.update(self._hp.env_config)  # entries from the hyperparameter env_config override the defaults
+
+        return SparseHighStackBlockStackEnv(default_env_config)  # construct the sparse-reward env with the merged config