gfootball_ppo_load.py
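"""Load a PPO agent trained with PFRL and run it on the Google Research
Football "academy_empty_goal_close" scenario, printing episode returns and
agent statistics as it goes. The checkpoint is expected in the 'FBPPO'
directory (see agent.load below).
"""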
import gfootball.env as football_env
import numpy as np
import torch
import torch.nn as nn

import pfrl
from pfrl.agents import PPO
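# Create the Google Research Football "academy_empty_goal_close" scenario.
# stacked=False uses a single (unstacked) observation frame and render=True
# opens the game window so the loaded agent can be watched.
# CastObservationToFloat32 converts observations to float32 for PyTorch.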
env = football_env.create_environment(
    env_name="academy_empty_goal_close",
    stacked=False,
    logdir='/tmp/football',
    write_goal_dumps=False,
    write_full_episode_dumps=False,
    render=True,
)
env = pfrl.wrappers.CastObservationToFloat32(env)
obs_space = env.observation_space
action_space = env.action_space
print(obs_space, action_space)
obs_size = obs_space.low.size
# Normalize observations based on their empirical mean and variance
obs_normalizer = pfrl.nn.EmpiricalNormalization(
    obs_size, clip_threshold=5
)
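# The policy and value function are small MLPs with two hidden layers of 64
# tanh units each. The policy head outputs a softmax distribution over the
# 19 discrete GFootball actions; the value head outputs a single state-value
# estimate.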
policy = torch.nn.Sequential(
    nn.Linear(obs_size, 64),
    nn.Tanh(),
    nn.Linear(64, 64),
    nn.Tanh(),
    nn.Linear(64, 19),
    pfrl.policies.SoftmaxCategoricalHead(),
)
vf = torch.nn.Sequential(
    nn.Linear(obs_size, 64),
    nn.Tanh(),
    nn.Linear(64, 64),
    nn.Tanh(),
    nn.Linear(64, 1),
)
# While the original paper initialized weights by normal distribution,
# we use orthogonal initialization as the latest openai/baselines does.
def ortho_init(layer, gain):
    nn.init.orthogonal_(layer.weight, gain=gain)
    nn.init.zeros_(layer.bias)

ortho_init(policy[0], gain=1)
ortho_init(policy[2], gain=1)
ortho_init(policy[4], gain=1e-2)
ortho_init(vf[0], gain=1)
ortho_init(vf[2], gain=1)
ortho_init(vf[4], gain=1)
# Combine a policy and a value function into a single model
model = pfrl.nn.Branched(policy, vf)
opt = torch.optim.Adam(model.parameters(), lr=3e-4, eps=1e-5)
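# The model architecture must match the one used for training so the saved
# parameters can be loaded. gpu=0 places the model on the first CUDA device
# (use gpu=-1 to run on CPU); gamma is the discount factor and lambd the GAE
# parameter; clip_eps_vf=None disables value-function clipping.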
agent = PPO(
    model,
    opt,
    obs_normalizer=obs_normalizer,
    gpu=0,
    update_interval=2048,
    minibatch_size=64,
    epochs=10,
    clip_eps_vf=None,
    entropy_coef=0,
    standardize_advantages=True,
    gamma=0.995,
    lambd=0.97,
)
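# Load pretrained parameters from the 'FBPPO' directory, presumably written
# by agent.save('FBPPO') in the training script.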
agent.load('FBPPO')
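# Roll out the loaded agent for up to 10000 episodes of at most 200 steps
# each. Observations are flattened to a 27648-dimensional vector to match the
# MLP input size. Note that agent.act()/agent.observe() are called outside
# agent.eval_mode(), so the agent keeps collecting transitions and updating;
# wrap the loop in `with agent.eval_mode():` for pure evaluation.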
n_episodes = 10000
max_episode_len = 200
for i in range(1, n_episodes + 1):
    obs = env.reset()
    obs = np.reshape(obs, (27648,))
    R = 0  # return (sum of rewards)
    t = 0  # time step
    while True:
        # Uncomment to watch the behavior in a GUI window
        # env.render()
        action = agent.act(obs)
        # action = action[0]
        obs, reward, done, _ = env.step(action)
        R += reward
        t += 1
        reset = t == max_episode_len
        obs = np.reshape(obs, (27648,))
        agent.observe(obs, reward, done, reset)
        if done or reset:
            break
    if i % 10 == 0:
        print('episode:', i, 'R:', R)
    if i % 50 == 0:
        print('statistics:', agent.get_statistics())
print('Finished.')