from .env import TicTacToe
from .model import Policy, Transition, ReplayMemory

import torch
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
from typing import Tuple
import random
import logging
import io


def fit(
    n_steps: int = 500_000,
    batch_size: int = 128,
    gamma: float = 0.99,
    eps_start: float = 1.0,
    eps_end: float = 0.1,
    eps_steps: int = 200_000,
) -> bytes:
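    """Train a DQN policy to play TicTacToe against a random opponent.

    Arguments:
        n_steps {int} -- Number of environment steps to train for
        batch_size {int} -- Number of transitions per optimization batch
        gamma {float} -- Reward discount factor
        eps_start {float} -- Initial epsilon for epsilon-greedy exploration
        eps_end {float} -- Final epsilon after annealing
        eps_steps {int} -- Number of steps over which epsilon is annealed linearly

    Returns:
        bytes -- The trained policy's state_dict, serialized with torch.save
    """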
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    logging.info("Beginning training on: {}".format(device))

    # Sync the target network after every 1% of the total training steps.
    target_update = int((1e-2) * n_steps)
    policy = Policy(n_inputs=3 * 9, n_outputs=9).to(device)
    target = Policy(n_inputs=3 * 9, n_outputs=9).to(device)
    target.load_state_dict(policy.state_dict())
    target.eval()

    optimizer = optim.Adam(policy.parameters(), lr=1e-3)
    memory = ReplayMemory(50_000)

    env = TicTacToe()
    state = torch.tensor([env.reset()], dtype=torch.float).to(device)
    old_summary = {
        "total games": 0,
        "ties": 0,
        "illegal moves": 0,
        "player 0 wins": 0,
        "player 1 wins": 0,
    }
    _randoms = 0
    summaries = []

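    # Training loop: the policy network controls player 0, while a random
    # "dummy" opponent plays player 1. The agent's move and the opponent's
    # reply are folded into a single transition in replay memory.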
    for step in range(n_steps):
        # Linearly anneal epsilon from eps_start to eps_end over eps_steps steps.
        t = np.clip(step / eps_steps, 0, 1)
        eps = (1 - t) * eps_start + t * eps_end

        action, was_random = select_model_action(device, policy, state, eps)
        if was_random:
            _randoms += 1
        next_state, reward, done, _ = env.step(action.item())

        # Player 1 (the random opponent) moves if the game is not over.
        if not done:
            next_state, _, done, _ = env.step(select_dummy_action(next_state))
            next_state = torch.tensor([next_state], dtype=torch.float).to(device)
        if done:
            next_state = None

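        # Store the transition; next_state is None for terminal states, so the
        # bootstrapped Q-target in optimize_model reduces to the immediate reward.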
        memory.push(state, action, next_state, torch.tensor([reward], device=device))

        state = next_state
        optimize_model(
            device=device,
            optimizer=optimizer,
            policy=policy,
            target=target,
            memory=memory,
            batch_size=batch_size,
            gamma=gamma,
        )
        if done:
            state = torch.tensor([env.reset()], dtype=torch.float).to(device)
        # Periodically copy the policy weights into the target network.
        if step % target_update == 0:
            target.load_state_dict(policy.state_dict())
        # Every 5000 steps, log how game outcomes changed since the last report.
        if step % 5000 == 0:
            delta_summary = {k: env.summary[k] - old_summary[k] for k in env.summary}
            delta_summary["random actions"] = _randoms
            old_summary = {k: env.summary[k] for k in env.summary}
            logging.info("{} : {}".format(step, delta_summary))
            summaries.append(delta_summary)
            _randoms = 0

    logging.info("Complete")

    # Serialize the trained policy weights to an in-memory buffer and return
    # the raw bytes (getvalue() matches the declared `bytes` return type).
    res = io.BytesIO()
    torch.save(policy.state_dict(), res)

    return res.getvalue()


def optimize_model(
    device: torch.device,
    optimizer: optim.Optimizer,
    policy: Policy,
    target: Policy,
    memory: ReplayMemory,
    batch_size: int,
    gamma: float,
):
    """Model optimization step, copied verbatim from the Torch DQN tutorial.

    Arguments:
        device {torch.device} -- Device
        optimizer {torch.optim.Optimizer} -- Optimizer
        policy {Policy} -- Policy module
        target {Policy} -- Target module
        memory {ReplayMemory} -- Replay memory
        batch_size {int} -- Number of observations to use per batch step
        gamma {float} -- Reward discount factor
    """
    if len(memory) < batch_size:
        return
    transitions = memory.sample(batch_size)
    # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
    # detailed explanation). This converts batch-array of Transitions
    # to Transition of batch-arrays.
    batch = Transition(*zip(*transitions))

    # Compute a mask of non-final states and concatenate the batch elements
    # (a final state would've been the one after which simulation ended)
    non_final_mask = torch.tensor(
        tuple(map(lambda s: s is not None, batch.next_state)),
        device=device,
        dtype=torch.bool,
    )
    non_final_next_states = torch.cat([s for s in batch.next_state if s is not None])
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
    # columns of actions taken. These are the actions which would've been taken
    # for each batch state according to policy_net
    state_action_values = policy(state_batch).gather(1, action_batch)

    # Compute V(s_{t+1}) for all next states.
    # Expected values of actions for non_final_next_states are computed based
    # on the "older" target_net; selecting their best reward with max(1)[0].
    # This is merged based on the mask, such that we'll have either the expected
    # state value or 0 in case the state was final.
    next_state_values = torch.zeros(batch_size, device=device)
    next_state_values[non_final_mask] = target(non_final_next_states).max(1)[0].detach()
    # Compute the expected Q values
    expected_state_action_values = (next_state_values * gamma) + reward_batch

    # Compute Huber loss
    loss = F.smooth_l1_loss(
        state_action_values, expected_state_action_values.unsqueeze(1)
    )

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    for param in policy.parameters():
        param.grad.data.clamp_(-1, 1)
    optimizer.step()


def select_dummy_action(state: np.ndarray) -> int:
    """Select a random (valid) move, given a board state.

    Arguments:
        state {np.ndarray} -- Board state observation

    Returns:
        int -- Move to make.
    """
    # The flat observation reshapes to (3, 3, 3); the plane state[:, :, 0]
    # flags open squares, so sample uniformly among them.
    state = state.reshape(3, 3, 3)
    open_spots = state[:, :, 0].reshape(-1)
    p = open_spots / open_spots.sum()
    return int(np.random.choice(np.arange(9), p=p))


def select_model_action(
    device: torch.device, model: Policy, state: torch.Tensor, eps: float
) -> Tuple[torch.Tensor, bool]:
    """Select an action for the model: either using the policy, or by
    choosing a random valid action (as controlled by `eps`).

    Arguments:
        device {torch.device} -- Device
        model {Policy} -- Policy module
        state {torch.Tensor} -- Current board state, as a torch tensor
        eps {float} -- Probability of choosing a random action

    Returns:
        Tuple[torch.Tensor, bool] -- The action, and a bool indicating
            whether the action was chosen at random.
    """
    # Epsilon-greedy: exploit the policy with probability (1 - eps),
    # otherwise fall back to a random valid move.
    sample = random.random()
    if sample > eps:
        return model.act(state), False
    else:
        return (
            torch.tensor(
                [[select_dummy_action(state.cpu().numpy())]],
                device=device,
                dtype=torch.long,
            ),
            True,
        )

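# A minimal usage sketch (assumes this module lives in a package, e.g. as
# `ttt/train.py` next to `env.py` and `model.py`; the package name is hypothetical):
#
#     from ttt.train import fit
#
#     weights = fit(n_steps=10_000)
#     with open("policy.pt", "wb") as f:
#         f.write(weights)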