Commit c12a2c8: sync
abaybektursun committed May 27, 2018
1 parent 812b0b3
Showing 4 changed files with 203 additions and 174 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -1,6 +1,7 @@
*.png
*.jpg
free4all/
*.tar

# Byte-compiled / optimized / DLL files
__pycache__/
203 changes: 29 additions & 174 deletions fc_agent.py
@@ -4,9 +4,13 @@
from pommerman import constants
from pommerman import utility

import os
import math
import random

import matplotlib
import matplotlib.pyplot as plt

from collections import namedtuple
from itertools import count

@@ -18,119 +22,20 @@
import torch.nn.functional as F
import torchvision.transforms as T

from utils import _utils


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))
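
As context for the `_train` method further down, which does `Transition(*zip(*transitions))`: the `zip(*...)` idiom transposes a list of `Transition`s into a single `Transition` of per-field batches. A minimal standalone sketch:

```python
from collections import namedtuple

Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))

# Two toy transitions; plain ints stand in for tensors.
transitions = [Transition(1, 0, 2, 0.0), Transition(2, 1, None, 1.0)]

# zip(*transitions) yields one tuple per field, so the result is a
# Transition whose fields are batches: batch.state == (1, 2), etc.
batch = Transition(*zip(*transitions))
assert batch.state == (1, 2)
assert batch.next_state == (2, None)
```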

class _utils:
def __init__(self, board_h, board_w):
self.num_actions = 6
self.board_area = board_h * board_w

self.int2vec = {
1 : np.zeros((self.board_area,)),
2 : np.zeros((self.board_area,)),
4 : np.zeros((self.board_area,)),
6 : np.zeros((self.board_area,)),
7 : np.zeros((self.board_area,)),
8 : np.zeros((self.board_area,)),
10 : np.zeros((self.board_area,)),
11 : np.zeros((self.board_area,)),
12 : np.zeros((self.board_area,)),
13 : np.zeros((self.board_area,))
}
self.blast_strength_vec = np.zeros((max(board_h, board_w)+1,))

self.max_ammo = 4
self.ammo = np.zeros((self.max_ammo,))

self.this_agent = np.zeros((5,))
self.friend = np.zeros((5,))
self.enemy1 = np.zeros((5,))
self.enemy2 = np.zeros((5,))
self.enemy3 = np.zeros((5,))

# Different symbolic objects
self.input_size = self.board_area*len(self.int2vec) + \
max(board_h, board_w)+1 + \
self.max_ammo + \
5*5 + \
self.board_area + \
self.board_area
# Action and reward
#self.input_size += (6 + 1)



def input(self, obs):
blast_strength = int(obs['blast_strength'])
ammo = int(obs['ammo'])
my_position = tuple(obs['position'])
teammate = int(obs['teammate'].value) - 9
enemies = np.array([e.value for e in obs['enemies']]) - 9
board = np.array(obs['board'])
bombs = np.array(obs['bomb_blast_strength'])/2.0
bombs_life = np.array(obs['bomb_life'])/9.0

# Symbolic objects to vector of boards
for idx, cell in enumerate(board.flatten().tolist()):
if cell in self.int2vec:
self.int2vec[cell][idx] = 1.0

# !TODO Test this assumption
self.blast_strength_vec[blast_strength] = 1.0

# Clamp ammo to max_ammo and encode it as a one-hot vector
self.ammo[min(self.max_ammo,ammo)-1] = 1.0

agent_ids = [0,1,2,3,4]
# Agents
for an_enemy_id, an_enemy_vec in zip(enemies, [self.enemy1, self.enemy2, self.enemy3]):
an_enemy_vec[an_enemy_id] = 1.0
agent_ids.remove(an_enemy_id)
self.friend[teammate] = 1.0
agent_ids.remove(teammate)
# DEBUG
if len(agent_ids) != 1: raise ValueError('Error! agent_ids has more/less than one id left!')
# DEBUG
self.this_agent[agent_ids[0]] = 1.0


# !TODO Concatenate all the vectors
input_data = np.array([])
for idx in self.int2vec:
input_data = np.concatenate((input_data, self.int2vec[idx]))

input_data = np.concatenate((input_data, self.blast_strength_vec))
input_data = np.concatenate((input_data, self.ammo))
input_data = np.concatenate((input_data, self.this_agent))
input_data = np.concatenate((input_data, self.friend))
input_data = np.concatenate((input_data, self.enemy1))
input_data = np.concatenate((input_data, self.enemy2))
input_data = np.concatenate((input_data, self.enemy3))
input_data = np.concatenate((input_data, bombs.flatten()))
input_data = np.concatenate((input_data, bombs_life.flatten()))

#print("Data vector: {} v.s. input_size: {}".format(input_data.shape, self.input_size))

# torch.tensor (lowercase) accepts device=; the legacy torch.Tensor constructor errors on CUDA devices
return torch.tensor(input_data.flatten(), dtype=torch.float32, device=device)

def action_onehot(self, action):
action_vec = [0]*self.num_actions
action_vec[action] = 1
return torch.tensor(action_vec, device=device, dtype=torch.long)
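
The `_utils.input` encoder shown here (removed from this file and now imported from `utils`) turns the board into one flattened binary plane per tracked object id. A toy sketch of that scheme, assuming the moved code keeps the same logic:

```python
import numpy as np

board = np.array([[0, 1], [2, 1]])  # toy 2x2 board; ids 1 and 2 are tracked
int2vec = {1: np.zeros(board.size), 2: np.zeros(board.size)}

for idx, cell in enumerate(board.flatten().tolist()):
    if cell in int2vec:
        int2vec[cell][idx] = 1.0  # mark where object `cell` sits

# Plane for id 1 marks cells 1 and 3; plane for id 2 marks cell 2.
assert int2vec[1].tolist() == [0.0, 1.0, 0.0, 1.0]
assert int2vec[2].tolist() == [0.0, 0.0, 1.0, 0.0]
```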



class _ReplayMemory(object):
def __init__(self, capacity):
self.capacity = capacity
self.memory = []
self.position = 0
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 200
def push(self, *args):
"""Saves a transition."""
if len(self.memory) < self.capacity:
@@ -147,10 +52,10 @@ def __len__(self):
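
The bodies of `push` and `sample` are collapsed in this view. The class appears to follow the cyclic-buffer pattern from the PyTorch DQN tutorial; below is a sketch of the likely elided logic (an assumption, not the commit's exact code), reusing the `Transition` namedtuple above:

```python
import random

class ReplayMemory(object):
    """Fixed-size cyclic buffer of Transitions (sketch of the elided logic)."""
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        if len(self.memory) < self.capacity:
            self.memory.append(None)  # grow the list until capacity is reached
        self.memory[self.position] = Transition(*args)
        self.position = (self.position + 1) % self.capacity  # overwrite oldest

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)
```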

class FCAgent(BaseAgent):
def __init__(self, board_h=11, board_w=11, *args, **kwargs):

self.name = 'FC Agent'
super(FCAgent, self).__init__(*args, **kwargs)
# Common functionalities among learning agents
self.utils = _utils(board_h, board_w)
self.utils = _utils(board_h, board_w, 'fc_agent/save.tar')
self.input_size = self.utils.input_size
self.prev_x_np = None

@@ -186,9 +91,14 @@ def __init__(self, board_h=11, board_w=11, *args, **kwargs):
self.optimizer = optim.RMSprop(self.policy_net.parameters())
self.memory = _ReplayMemory(10000)


self.episode_durations = []

if os.path.isfile(self.utils.save_file):
checkpoint = torch.load(self.utils.save_file)
self.policy_net.load_state_dict(checkpoint['policy_net'])
self.target_net.load_state_dict(checkpoint['target_net'])
print("=> loaded checkpoint '{}'".format(checkpoint['iter']))

#def optimize_model():
def _train(self):
if len(self.memory) < self.BATCH_SIZE:
@@ -198,33 +108,31 @@ def _train(self):
# detailed explanation).
batch = Transition(*zip(*transitions))

# Compute a mask of non-final states and concatenate the batch elements
non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
batch.next_state)), device=device, dtype=torch.uint8)
# This mask indicates whether the state at the corresponding index is non-final
non_final_mask = torch.tensor(
tuple(map(
lambda s: s is not None,
batch.next_state
)), device=device, dtype=torch.uint8)

non_final_next_states = torch.stack([s for s in batch.next_state
if s is not None]).to(device)
print("non_final_mask", non_final_mask.shape)
print("non_final_next_states", non_final_next_states.shape)
state_batch = torch.stack(batch.state).to(device)
action_batch = torch.stack(batch.action).to(device)
reward_batch = torch.stack(batch.reward).to(device)

# Compute Q(s_t, a) - the model computes Q(s_t), then we select the
# columns of actions taken
print("state_batch shape: ", state_batch.shape)
state_action_values = self.policy_net(state_batch).gather(0, action_batch)
state_action_values = self.policy_net(state_batch).gather(1, action_batch.view(-1, 1))
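
The change from `gather(0, action_batch)` to `gather(1, action_batch.view(-1, 1))` matters because the network outputs a `[batch, num_actions]` matrix, so the Q-value of the taken action must be selected along dimension 1, one column index per row. A small standalone check:

```python
import torch

q = torch.tensor([[0.1, 0.9, 0.0],
                  [0.5, 0.2, 0.3]])  # [batch=2, num_actions=3]
actions = torch.tensor([1, 0])       # action index chosen in each state

# gather along dim 1 picks q[i, actions[i]] for every row i
chosen = q.gather(1, actions.view(-1, 1))
assert chosen.squeeze(1).tolist() == [0.9, 0.5]
```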

# Compute V(s_{t+1}) for all next states.
next_state_values = torch.zeros(self.BATCH_SIZE, device=device)
next_state_values[non_final_mask] = self.target_net(non_final_next_states).max(1)[0].detach()
next_state_values[non_final_mask] = self.target_net(non_final_next_states).detach().max(1)[0]
# Compute the expected Q values
reward_batch = reward_batch.view(-1)
expected_state_action_values = (next_state_values * self.GAMMA) + reward_batch
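
Together with the mask above, these lines build the standard DQN target: the reward alone for final states, and reward plus `GAMMA` times the target network's best next-state value otherwise. A toy illustration (the `GAMMA` value here is assumed, since the constant is defined outside this view, and a modern `torch.bool` mask stands in for `uint8`):

```python
import torch

GAMMA = 0.999  # assumed; defined elsewhere in the class
reward_batch = torch.tensor([0.0, 1.0, 0.0])
non_final_mask = torch.tensor([True, False, True])  # state 1 was terminal
max_next_q = torch.tensor([2.0, 3.0])               # target-net maxima, non-final states only

next_state_values = torch.zeros(3)                  # stays 0 for terminal states
next_state_values[non_final_mask] = max_next_q
targets = reward_batch + GAMMA * next_state_values
# -> [1.998, 1.0 (terminal: reward only), 2.997]
```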

# Compute Huber loss
print("State action: ", state_action_values.shape)
print("Ex. State action: ", expected_state_action_values.shape)
print("Ex. State action (unsq): ", expected_state_action_values.unsqueeze(0).shape)
loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(0))
loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1))

# Optimize the model
self.optimizer.zero_grad()
Expand All @@ -247,59 +155,6 @@ def _select_action(self, state):

def act(self, obs, action_space):
x_torch = self.utils.input(obs)
# Select an action for the current observation
action = self._select_action(x_torch)
return action.cpu().numpy()[0][0]


def episode_end(self, reward):
pass

if __name__ == '__main__':
# Training
import pommerman
from pommerman import agents

# Hyperparams
EPISODES = 2

fc_agent = FCAgent()
agent_list = [fc_agent, agents.SimpleAgent(), agents.RandomAgent(), agents.SimpleAgent()]
env = pommerman.make('PommeFFACompetition-v0', agent_list)

target_update = 10
for an_episode in range(EPISODES):
state = env.reset()

current_x = fc_agent.utils.input(state[0])
last_x = fc_agent.utils.input(state[0])

#-------------------------------------------------------------------
done = False
while not done:
#env.render()
actions = env.act(state)
state, reward, done, info = env.step(actions)

fca_reward = torch.tensor([float(reward[0])], device=device)
fca_action = fc_agent.utils.action_onehot(actions[0])
# Observe new state
last_x = current_x
current_x = fc_agent.utils.input(state[0])

# Store the transition in memory
fc_agent.memory.push(last_x, fca_action, current_x, fca_reward)

# Perform one step of the optimization (on the target network)
fc_agent._train()
#-------------------------------------------------------------------

#for agent in agent_list:
# agent.episode_end(reward[agent.agent_id], obs[agent.agent_id])

env.close()
print(info)

# Update the target network
if an_episode % target_update == 0:
fc_agent.target_net.load_state_dict(fc_agent.policy_net.state_dict())
return action

68 changes: 68 additions & 0 deletions train.py
@@ -0,0 +1,68 @@
# Training
import pommerman
import torch

from pommerman import agents

from fc_agent import FCAgent

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hyperparams
EPISODES = 300


fc_agent = FCAgent()
agent_list = [fc_agent, agents.SimpleAgent(), agents.RandomAgent(), agents.SimpleAgent()]
env = pommerman.make('PommeFFACompetition-v0', agent_list)

wins = {}; iter_num = 0
target_update = 10
for an_episode in range(EPISODES):
state = env.reset()

current_x = fc_agent.utils.input(state[0])
last_x = fc_agent.utils.input(state[0])

#-------------------------------------------------------------------
done = False
memory_stop = False
while not done:
#env.render()
actions = env.act(state)
if fc_agent.is_alive: actions[0] = actions[0].item()
state, reward, done, info = env.step(actions)

fca_reward = torch.tensor([float(reward[0])], device=device)
fca_action = torch.tensor(actions[0], device=device)
# Observe new state
last_x = current_x
current_x = fc_agent.utils.input(state[0])

# Store the transition in memory
# Game over
if done or (not fc_agent.is_alive and not memory_stop):
fc_agent.memory.push(last_x, fca_action, None, fca_reward)
memory_stop = True
# Game on
else:
fc_agent.memory.push(last_x, fca_action, current_x, fca_reward)

# Perform one step of the optimization (on the target network)
fc_agent._train()
iter_num += 1
#-------------------------------------------------------------------

#for agent in agent_list:
# agent.episode_end(reward[agent.agent_id], obs[agent.agent_id])

env.close()
print(info)
if 'winners' in info:
wins[info['winners'][0]] = wins.get(info['winners'][0], 0) + 1
print(wins)

# Update the target network
if an_episode % target_update == 0:
fc_agent.target_net.load_state_dict(fc_agent.policy_net.state_dict())

(The diff for the fourth changed file did not load.)
