
Commit 2919a91

Abdelrahman Ogail committed
Solution to problem berkeleydeeprlcourse#4
1 parent 0f99079 commit 2919a91

File tree: 5 files changed, +37 -10 lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
@@ -91,3 +91,5 @@ ENV/
 *2019*
 *log.txt*
 .DS_Store
+
+hw2/logs/

hw2/README.md

Lines changed: 23 additions & 0 deletions
@@ -14,3 +14,26 @@ Before doing anything, first replace `gym/envs/box2d/lunar_lander.py` with the p
 The only file that you need to look at is `train_pg_f18.py`, which you will implement.
 
 See the [HW2 PDF](http://rail.eecs.berkeley.edu/deeprlcourse/static/homeworks/hw2.pdf) for further instructions.
+
+# Answers to Homework Experiments
+## Problem 4 (CartPole)
+### Summary
+The benchmark ran multiple experiments sweeping three settings: [reward-to-go, Monte Carlo rewards], [advantage normalization, no advantage normalization], and [large batch size, small batch size]. Each experiment ran for 100 iterations, and each configuration was repeated 3 times to assess variance. General observations:
+- Convergence: using reward-to-go resulted in faster convergence than the Monte Carlo (full-trajectory) reward.
+- Variance: increasing the batch size and normalizing the advantages both helped reduce the variance.
+
+### Plots
+
+![](fig/lb_CartPole-v0.png)
+
+![](fig/sb_CartPole-v0.png)
+
+### Answers
+Q1- Which gradient estimator has better performance without advantage-centering: the trajectory-centric one, or the one using reward-to-go?
+> Reward-to-go is better because it has lower variance.
+
+Q2- Did advantage centering help?
+> Yes, it reduced the variance and sped up convergence a bit.
+
+Q3- Did the batch size make an impact?
+> Yes, larger batch sizes resulted in lower variance while keeping the bias low.
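
For reference, a minimal NumPy sketch of the two return estimators compared above; the function names are illustrative and not from `train_pg_f18.py`:

```python
import numpy as np

def monte_carlo_returns(rewards, gamma):
    """Trajectory-centric estimator: every timestep is weighted by the
    discounted return of the whole trajectory."""
    total = sum(gamma**t * r for t, r in enumerate(rewards))
    return np.full(len(rewards), total)

def reward_to_go(rewards, gamma):
    """Reward-to-go estimator: timestep t is weighted only by rewards
    from t onward, which lowers the variance of the gradient estimate."""
    return np.array([sum(gamma**k * r for k, r in enumerate(rewards[t:]))
                     for t in range(len(rewards))])

rs = [1.0, 1.0, 1.0]
print(monte_carlo_returns(rs, 1.0))  # [3. 3. 3.]
print(reward_to_go(rs, 1.0))         # [3. 2. 1.]
```

With reward-to-go, a timestep is not credited for rewards that came before it, which is the variance reduction referred to in Q1.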

hw2/fig/lb_CartPole-v0.png

74.2 KB

hw2/fig/sb_CartPole-v0.png

116 KB

hw2/train_pg_f18.py

Lines changed: 12 additions & 10 deletions
@@ -202,12 +202,14 @@ def sample_action(self, policy_parameters):
         if self.discrete:
             sy_logits_na = policy_parameters
             # YOUR_CODE_HERE
-            _, sy_sampled_ac = tf.nn.top_k(sy_logits_na)
+            sy_sampled_ac = tf.squeeze(tf.multinomial(sy_logits_na, 1), axis=[1])
+            assert sy_sampled_ac.shape.as_list() == [sy_logits_na.shape.as_list()[0]]
         else:
             sy_mean, sy_logstd = policy_parameters
             # YOUR_CODE_HERE
             sy_sampled_ac = sy_mean + tf.multiply(tf.math.exp(sy_logstd),
                                                   tf.random_normal(shape=sy_mean.shape))
+            assert sy_sampled_ac.shape.as_list() == sy_mean.shape.as_list()
         return sy_sampled_ac

     #========================================================================================#
@@ -241,13 +243,16 @@ def get_log_prob(self, policy_parameters, sy_ac_na):
             # YOUR_CODE_HERE
             sy_logprob_n = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=sy_ac_na,
                                                                           logits=sy_logits_na)
+            assert sy_logprob_n.shape.as_list() == [sy_logits_na.shape.as_list()[0]]
         else:
             sy_mean, sy_logstd = policy_parameters
             # YOUR_CODE_HERE
             # initialize a single self.ac_dim-variate Gaussian.
-            mvn = tf.contrib.distributions.MultivariateNormalDiag(
-                loc=sy_mean, scale_diag=tf.math.exp(sy_logstd))
+            mvn = tf.contrib.distributions.MultivariateNormalDiag(loc=sy_mean,
+                                                                  scale_diag=tf.math.exp(sy_logstd))
             sy_logprob_n = -mvn.log_prob(sy_ac_na)  # negative log prob (matches discrete branch)
+
+            assert sy_logprob_n.shape.as_list() == sy_mean.shape.as_list()[:1]
         return sy_logprob_n

     def build_computation_graph(self):
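
A small NumPy check of the convention used above: `tf.nn.sparse_softmax_cross_entropy_with_logits` returns the negative log-probability of the labelled action, so `sy_logprob_n` holds -log pi(a|s) in both branches (a sketch, not part of the commit; names are illustrative):

```python
import numpy as np

def neg_log_prob_from_logits(logits, action):
    """Same quantity as sparse softmax cross entropy for a single example:
    -log softmax(logits)[action], computed with a stable log-sum-exp."""
    z = logits - logits.max()
    log_softmax = z - np.log(np.exp(z).sum())
    return -log_softmax[action]

logits = np.array([2.0, 1.0, 0.1])
print(neg_log_prob_from_logits(logits, 0))  # ~0.417, i.e. -log(0.66)
```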
@@ -290,7 +295,7 @@ def build_computation_graph(self):
         #========================================================================================#
         # YOUR CODE HERE
         # EXPERIMENT use * instead of tf.multiply operator
-        self.loss = tf.reduce_mean(tf.multiply(self.sy_logprob_n, self.sy_adv_n))
+        self.loss = tf.reduce_mean(self.sy_logprob_n * self.sy_adv_n)
         self.update_op = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

         # create tf summaries
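
In equation form, the surrogate loss above (with `sy_logprob_n` holding the negative log-probability) is minimized, which performs gradient ascent on the usual policy-gradient objective; a sketch of the correspondence:

```latex
L(\theta) = \frac{1}{N}\sum_{i=1}^{N} \bigl(-\log \pi_\theta(a_i \mid s_i)\bigr) A_i,
\qquad
-\nabla_\theta L(\theta) = \frac{1}{N}\sum_{i=1}^{N} \nabla_\theta \log \pi_\theta(a_i \mid s_i)\, A_i \approx \nabla_\theta J(\theta).
```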
@@ -433,12 +438,7 @@ def sum_of_rewards(self, re_n):
         if self.reward_to_go:
             for traj_re in re_n:
                 for t in range(len(traj_re)):
-                    # rtg = 0
-                    # for t_bar, r in enumerate(traj_re):
-                    #     rtg += self.gamma**(t_bar-t) * r
-                    # q_n.append(rtg)
-                    q_n.append(
-                        sum([self.gamma**(t_bar - t) * r for t_bar, r in enumerate(traj_re)]))
+                    q_n.append(sum([self.gamma**(t_ - t) * r for t_, r in enumerate(traj_re[t:], start=t)]))
         else:
             for traj_re in re_n:
                 q_n.extend([sum([self.gamma**t * r for t, r in enumerate(traj_re)])] * len(traj_re))
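
The list comprehension above is O(T^2) per trajectory; an equivalent O(T) reverse scan is sketched below (not part of the commit; names are illustrative):

```python
import numpy as np

def reward_to_go_scan(traj_re, gamma):
    """Reverse accumulation: rtg[t] = traj_re[t] + gamma * rtg[t + 1]."""
    rtg = np.zeros(len(traj_re))
    running = 0.0
    for t in reversed(range(len(traj_re))):
        running = traj_re[t] + gamma * running
        rtg[t] = running
    return rtg

# agrees with the comprehension used in sum_of_rewards
traj_re, gamma = [1.0, 2.0, 3.0], 0.9
expected = [sum(gamma**(t_ - t) * r for t_, r in enumerate(traj_re[t:], start=t))
            for t in range(len(traj_re))]
print(np.allclose(reward_to_go_scan(traj_re, gamma), expected))  # True
```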
@@ -478,6 +478,8 @@ def compute_advantage(self, ob_no, q_n):
             adv_n = q_n - b_n
         else:
             adv_n = q_n.copy()
+
+        assert len(adv_n) == len(q_n)
         return adv_n

     def estimate_return(self, ob_no, re_n):
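
For the advantage-normalization setting discussed in the README answers, a minimal sketch of the re-centering step; the actual toggle is assumed to live elsewhere in `train_pg_f18.py` and is not shown in this diff:

```python
import numpy as np

def normalize_advantages(adv_n, eps=1e-8):
    """Rescale advantages to zero mean and unit standard deviation,
    the centering referred to in Q2 of the README above."""
    adv_n = np.asarray(adv_n, dtype=np.float64)
    return (adv_n - adv_n.mean()) / (adv_n.std() + eps)

print(normalize_advantages([1.0, 2.0, 3.0]))  # approx. [-1.2247  0.  1.2247]
```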
