DDPG_main.py
from unityagents import UnityEnvironment
import numpy as np
import os
import platform
import matplotlib.pyplot as plt
from DDPG_agent import Agent
import time
import datetime
import torch
from utils import log_me, Logger
import torch.multiprocessing as mp
from DDPG_Model import Critic, Actor
import sys
sys.stdout = Logger('ddpg_log.log')
print('************************************************************************************')
env_dir = os.path.join(os.getcwd(), "Reacher_Windows_x86_64")  # folder holding the Windows build of the Reacher environment
env = UnityEnvironment(file_name=os.path.join(env_dir, "Reacher.exe"))
# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
# reset the environment
env_info = env.reset(train_mode=True)[brain_name]
# number of agents
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)
# size of each action
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)
# examine the state space
states = env_info.vector_observations
state_size = states.shape[1]
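# Optional sanity print. For the 20-agent Reacher build this typically reports a
# 33-dimensional state per agent and 4 continuous actions; those numbers come from
# the environment itself, not from anything guaranteed by this script.
print('Each agent observes a state of length:', state_size)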
def DDPG(env, kwargs):
    agent = Agent(state_size=state_size, action_size=action_size, random_seed=0, kwargs=kwargs)
    max_episodes = kwargs['max_episodes']
    epsilon_decay = kwargs['epsilon_decay']
    # TODO: refactor kwargs. It is really a dictionary and won't work as a ** variable
    # (one possible refactor is sketched in ddpg_from_kwargs, just below ddpg_args).
    max_t = 1000
    scores_hist = []
    epsilon = 1
    epsilon_min = .05
    print('time: ' + str(datetime.datetime.now()))
    print('main file: ' + os.path.basename(__file__))
    print('platform = ' + platform.system())
    print('device = ' + str(agent.device))
    print(kwargs)
    line, = plt.plot([], [])
    axes = plt.gca()
    plt.ion()
    plt.xlabel('Episode')      # xlabel/ylabel are functions; calling them is what places the labels on the figure
    plt.ylabel('Mean score')
    for episode in range(max_episodes):
        start_t = time.time()  # capture the episode start time  TODO: it would be nice to capture start of train time and report clock duration of solve.
        env_info = env.reset(train_mode=True)[brain_name]   # reset the environment
        agent.reset()
        scores = np.zeros(num_agents)                       # reset the episode scores tally
        states = env_info.vector_observations               # get the current state (for each agent)
        for t in range(max_t):
            actions = agent.act(states, epsilon)            # get actions (20 in this case)
            env_info = env.step(actions)[brain_name]        # get return from environment
            next_states = env_info.vector_observations      # get next state (for each agent)
            rewards = env_info.rewards                      # get reward (for each agent)
            dones = env_info.local_done                     # see if episode finished
            scores += env_info.rewards                      # update the score (for each agent)
            agent.step(states, actions, rewards, next_states, dones)  # save SARSD to buffer and learn from buffered SARSD
            states = next_states                            # prep for next pass
            epsilon *= epsilon_decay                        # explore a wee bit less
            epsilon = max(epsilon, epsilon_min)             # always explore a little
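            # Rough feel for this schedule (an estimate, not a measurement): decaying by
            # 0.99995 every time step, epsilon falls from 1 to the 0.05 floor after about
            # ln(0.05)/ln(0.99995) ~= 60,000 steps, i.e. roughly 60 full 1000-step episodes.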
            if np.any(dones):                               # exit loop if episode finished
                break
        episode_t = time.time() - start_t                   # wow, that took a long time
        scores_hist.append(np.mean(scores))                 # keep track of score history
        time_remain = episode_t * (max_episodes - episode)  # are we there yet? (crude estimate from the latest episode time)
        if episode % 1 == 0:
            print('Score ([min, mean, max] over agents) for ep. {}: [{:0.2f},{:0.2f},{:0.1f}] \tT: {:00.0f}:{:02.0f}(m:s)\tEst remain: {:00.0f}:{:02.0f}(h:m)'
                  .format(episode, np.min(scores), np.mean(scores), np.max(scores),
                          episode_t // 60, episode_t % 60, time_remain // 3600, time_remain % 3600 / 60))
            line.set_xdata(np.arange(0, len(scores_hist)))
            line.set_ydata(scores_hist)
            axes.set_xlim(0, len(scores_hist))
            axes.set_ylim(0, np.max(scores_hist) * 1.05)
            plt.draw()
            plt.pause(.1)
        if episode % 10 == 0 and episode > 20:              # let's occasionally save the weights, just in case
            torch.save(agent.critic_target.state_dict(), 'critic_target.pth')
            torch.save(agent.critic_local.state_dict(), 'critic_local.pth')
            torch.save(agent.actor_local.state_dict(), 'actor_local.pth')
            torch.save(agent.actor_target.state_dict(), 'actor_target.pth')
        if len(scores_hist) > 100 and np.min(scores_hist[-100:]) >= 30:  # yippee!
            print('Met project requirement in {} episodes'.format(episode + 1))  # TODO: we could probably stop the training here, or only send this message once.
    return scores_hist
# The args are passed this way as a convenient method of recording
# the hyperparameters associated with each run.
ddpg_args = {"buffer_size": int(1e5),   # replay buffer size
             "batch_size": 128 * 20,    # minibatch size
             "gamma": 0.99,             # discount factor
             "tau": 1e-3,               # for soft update of target parameters
             "LR_actor": 1e-3,          # learning rate of the actor
             "LR_critic": 1e-3,         # learning rate of the critic
             "weight_decay": .00,       # L2 weight decay
             "max_episodes": 250,
             "epsilon_decay": .99995,
             "fc1_units": 400,
             "fc2_units": 300,
             "sigma": 0.1
             }
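# A minimal sketch of the refactor mentioned in the TODO inside DDPG(): give the
# training entry point explicit keyword parameters so the dictionary can simply be
# splatted with **. The wrapper below is hypothetical and is not called anywhere in
# this script; it just forwards to the existing DDPG()/Agent interface.
def ddpg_from_kwargs(env, *, max_episodes=250, epsilon_decay=.99995, **agent_kwargs):
    """Illustrative only: would be invoked as ddpg_from_kwargs(env, **ddpg_args)."""
    return DDPG(env, kwargs={'max_episodes': max_episodes,
                             'epsilon_decay': epsilon_decay,
                             **agent_kwargs})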
done = False
DDPG(env, kwargs=ddpg_args)
plt.show()
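# Sketch only (commented out): how the checkpoints written during training could be
# restored for evaluation. The Actor constructor arguments shown are assumptions
# based on the fc1_units/fc2_units entries in ddpg_args, not a confirmed signature
# from DDPG_Model.py.
# actor = Actor(state_size, action_size, seed=0, fc1_units=400, fc2_units=300)
# actor.load_state_dict(torch.load('actor_local.pth'))
# actor.eval()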
## The code below was used for an unattended grid search. At one point it was a loop with the
## hyperparameters selected randomly; a rough sketch of that loop follows at the end of the file.
# ddpg_args["max_episodes"] = 50
# ddpg_args['fc1_units'] = 128
# ddpg_args['fc2_units'] = 128
# DDPG(env, kwargs=ddpg_args)
#
# ddpg_args['fc1_units'] = 400
# ddpg_args['fc2_units'] = 300
# ddpg_args['LR_actor'] = (1e-3)*2
# DDPG(env, kwargs=ddpg_args)
#
# ddpg_args['LR_actor'] = (1e-3)/2
# DDPG(env, kwargs=ddpg_args)
#
# ddpg_args['LR_actor'] = 1e-3
# ddpg_args['LR_critic'] = (1e-3)*2
# DDPG(env, kwargs=ddpg_args)
#
# ddpg_args['LR_critic'] = (1e-3)/2
# DDPG(env, kwargs=ddpg_args)
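#
# A rough sketch of what that random-search loop might have looked like; the sampled
# ranges below are illustrative assumptions, not the values actually used.
# import random
# for trial in range(10):
#     ddpg_args['max_episodes'] = 50
#     ddpg_args['LR_actor'] = random.choice([1e-4, 5e-4, 1e-3, 2e-3])
#     ddpg_args['LR_critic'] = random.choice([1e-4, 5e-4, 1e-3, 2e-3])
#     ddpg_args['fc1_units'] = random.choice([128, 256, 400])
#     ddpg_args['fc2_units'] = random.choice([128, 256, 300])
#     DDPG(env, kwargs=ddpg_args)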