Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions src/04_algorithms/01_algorithm.jl
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,13 @@ function learn!(
return
end

@doc """
Update the probability of choosing each arm. This is the abstract
fallback: every concrete Algorithm subtype must provide its own method,
so reaching this definition is always an error.
""" ->
function update_policy!(algorithm::Algorithm, context::Context)
    error("update_policy!(algorithm, context) is not implemented abstractly")
end

@doc """
Choose one of K arms given the current context.
""" ->
Expand Down
12 changes: 12 additions & 0 deletions src/04_algorithms/02_baseline/01_random_choice.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,18 @@ uniformly at random.
""" ->
immutable RandomChoice{T <: Learner} <: Algorithm
    learner::T                # reward-tracking learner for the K arms
    policy::Vector{Float64}   # per-arm selection probabilities, set by update_policy!
end

@doc """
Assign equal probability mass 1/K to each of the K arms, so the next
arm is chosen uniformly at random.
""" ->
function update_policy!(algorithm::RandomChoice, context::Context)
    uniform_mass = 1 / context.K
    for arm in 1:context.K
        algorithm.policy[arm] = uniform_mass
    end
    return
end

@doc """
Expand Down
17 changes: 17 additions & 0 deletions src/04_algorithms/03_e-greedy/01_standard.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,23 @@ An EpsilonGreedy object represents the standard, constant-ε bandit algorithm.
immutable EpsilonGreedy{T <: Learner} <: Algorithm
    learner::T                # reward-tracking learner for the K arms
    ε::Float64                # constant exploration probability
    policy::Vector{Float64}   # per-arm selection probabilities, set by update_policy!
end

@doc """
Recompute the selection probabilities: the currently preferred arm gets
mass (1 - ε) + ε/K and every other arm gets the exploration share ε/K.
""" ->
function update_policy!(algorithm::EpsilonGreedy, context::Context)
    # NOTE(review): K here is the policy length; other methods use
    # context.K — confirm these always agree.
    K = length(algorithm.policy)
    explore = algorithm.ε / K
    best = preferred_arm(algorithm, context)
    for arm in 1:K
        if arm == best
            algorithm.policy[arm] = (1 - algorithm.ε) + explore
        else
            algorithm.policy[arm] = explore
        end
    end
    return
end

@doc """
Expand Down
17 changes: 17 additions & 0 deletions src/04_algorithms/03_e-greedy/02_annealing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,23 @@ algorithm in which ε decreases with a logarithmic annealing schedule.
""" ->
immutable AnnealingEpsilonGreedy{T <: Learner} <: Algorithm
    learner::T                # reward-tracking learner for the K arms
    policy::Vector{Float64}   # per-arm selection probabilities, set by update_policy!
end

@doc """
Recompute the selection probabilities with an annealed exploration rate
ε = 1 / log(e + t - 1), which shrinks as the round number t grows: the
preferred arm gets (1 - ε) + ε/K, every other arm gets ε/K.
""" ->
function update_policy!(algorithm::AnnealingEpsilonGreedy, context::Context)
    ε = 1 / log(e + context.t - 1)
    # NOTE(review): K here is the policy length; other methods use
    # context.K — confirm these always agree.
    K = length(algorithm.policy)
    explore = ε / K
    best = preferred_arm(algorithm, context)
    for arm in 1:K
        algorithm.policy[arm] = arm == best ? (1 - ε) + explore : explore
    end
    return
end

@doc """
Expand Down
18 changes: 18 additions & 0 deletions src/04_algorithms/03_e-greedy/03_decreasing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,24 @@ immutable DecreasingEpsilonGreedy{L <: Learner} <: Algorithm
learner::L
c::Float64
d::Float64
policy::Vector{Float64}
end

@doc """
Recompute the selection probabilities with the decreasing schedule
ε_t = min(1, c·K / (d²·t)): the preferred arm gets (1 - ε) + ε/K and
every other arm gets ε/K.
""" ->
function update_policy!(algorithm::DecreasingEpsilonGreedy, context::Context)
    ε = min(1.0, (algorithm.c * context.K) / (algorithm.d^2 * context.t))
    # NOTE(review): ε is computed from context.K but the loop runs over the
    # policy length — confirm these always agree.
    K = length(algorithm.policy)
    explore = ε / K
    best = preferred_arm(algorithm, context)
    for arm in 1:K
        algorithm.policy[arm] = arm == best ? (1 - ε) + explore : explore
    end
    return
end

@doc """
Expand Down
13 changes: 13 additions & 0 deletions src/04_algorithms/04_softmax/01_standard.jl
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,19 @@ function initialize!(algorithm::Softmax, K::Integer)
return
end

@doc """
Recompute the softmax selection probabilities from the empirical means
scaled by the constant temperature τ.
""" ->
function update_policy!(algorithm::Softmax, context::Context)
    estimates = means(algorithm.learner)
    for arm in 1:context.K
        algorithm.tmeans[arm] = estimates[arm] / algorithm.τ
    end
    softmax!(algorithm.probs, algorithm.tmeans)
    return
end

@doc """
Select an arm according to the softmax rule. Recompute temperature adjusted
means to make sure that the softmax selection probabilities are correct.
Expand Down
13 changes: 13 additions & 0 deletions src/04_algorithms/04_softmax/02_annealing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,19 @@ function initialize!(algorithm::AnnealingSoftmax, K::Integer)
return
end

@doc """
Recompute the softmax selection probabilities using an annealed
temperature τ = τ₀ / log(e + t - 1), which sharpens the policy as the
round number t grows.
""" ->
function update_policy!(algorithm::AnnealingSoftmax, context::Context)
    estimates = means(algorithm.learner)
    temperature = algorithm.τ₀ / log(e + context.t - 1)
    for arm in 1:context.K
        algorithm.tmeans[arm] = estimates[arm] / temperature
    end
    softmax!(algorithm.probs, algorithm.tmeans)
    return
end

@doc """
Select an arm according to the softmax rule. First, the current temperature
is computed. Then we recompute temperature adjusted means to make sure that the
Expand Down
42 changes: 42 additions & 0 deletions src/04_algorithms/05_ucb/01_ucb1.jl
Original file line number Diff line number Diff line change
@@ -1,12 +1,54 @@
immutable UCB1{T <: Learner} <: Algorithm
    learner::T                # tracks empirical means and pull counts per arm
    policy::Vector{Float64}   # per-arm selection probabilities, set by update_policy!
end

# Reset the wrapped learner's state for a K-armed problem.
# NOTE(review): the policy vector is not resized here — confirm it is
# allocated with length K elsewhere.
function initialize!(algorithm::UCB1, K::Integer)
    initialize!(algorithm.learner, K)
    return
end

@doc """
Update the policy to be a point mass on the arm UCB1 selects: any arm
that has never been pulled is chosen first; otherwise the arm with the
highest upper confidence bound, μ[a] + sqrt(2 log(t) / n[a]), is chosen.
""" ->
function update_policy!(algorithm::UCB1, context::Context)
    μs = means(algorithm.learner)
    ns = counts(algorithm.learner)

    # Any arm never pulled so far is chosen deterministically before the
    # confidence bounds are consulted (its bonus would be infinite).
    for a in 1:context.K
        if ns[a] == 0
            # BUG FIX: this loop previously read `1:K` with `K` undefined
            # in this scope, throwing UndefVarError on the first round.
            for i in 1:context.K
                algorithm.policy[i] = (i == a) ? 1.0 : 0.0
            end
            return
        end
    end

    # Otherwise pick the arm maximizing the UCB1 score.
    max_score, chosen_a = -Inf, 0
    for a in 1:context.K
        bonus = sqrt(2 * log(context.t) / ns[a])
        score = μs[a] + bonus
        if score > max_score
            max_score, chosen_a = score, a
        end
    end

    # UCB1 is deterministic: all probability mass on the chosen arm.
    for a in 1:context.K
        algorithm.policy[a] = (a == chosen_a) ? 1.0 : 0.0
    end

    return
end

function choose_arm(algorithm::UCB1, context::Context)
μs = means(algorithm.learner)
ns = counts(algorithm.learner)
Expand Down