diff --git a/src/04_algorithms/01_algorithm.jl b/src/04_algorithms/01_algorithm.jl index 16ba72f..ee91c48 100644 --- a/src/04_algorithms/01_algorithm.jl +++ b/src/04_algorithms/01_algorithm.jl @@ -30,6 +30,13 @@ function learn!( return end +@doc """ +Update the probability of choosing each arm. +""" -> +function update_policy!(algorithm::Algorithm, context::Context) + error("update_policy!(algorithm, context) is not implemented abstractly") +end + @doc """ Choose one of K arms given the current context. """ -> diff --git a/src/04_algorithms/02_baseline/01_random_choice.jl b/src/04_algorithms/02_baseline/01_random_choice.jl index 28b6acf..051388d 100644 --- a/src/04_algorithms/02_baseline/01_random_choice.jl +++ b/src/04_algorithms/02_baseline/01_random_choice.jl @@ -4,6 +4,18 @@ uniformly at random. """ -> immutable RandomChoice{L <: Learner} <: Algorithm learner::L + policy::Vector{Float64} +end + +@doc """ +Prepare to choose an arm uniformly at random. +""" -> +function update_policy!(algorithm::RandomChoice, context::Context) + K = context.K + for i in 1:K + algorithm.policy[i] = 1 / K + end + return end @doc """ diff --git a/src/04_algorithms/03_e-greedy/01_standard.jl b/src/04_algorithms/03_e-greedy/01_standard.jl index a49af8c..a5e43d3 100644 --- a/src/04_algorithms/03_e-greedy/01_standard.jl +++ b/src/04_algorithms/03_e-greedy/01_standard.jl @@ -4,6 +4,23 @@ An EpsilonGreedy object represents the standard, constant-ε bandit algorithm. immutable EpsilonGreedy{L <: Learner} <: Algorithm learner::L ε::Float64 + policy::Vector{Float64} +end + +@doc """ +Update policy based on belief about best arm. 
+""" -> +function update_policy!(algorithm::EpsilonGreedy, context::Context) + ε, K = algorithm.ε, length(algorithm.policy) + a_star = preferred_arm(algorithm, context) + for i in 1:K + if i != a_star + algorithm.policy[i] = ε / K + else + algorithm.policy[i] = (1 - ε) + ε / K + end + end + return end @doc """ diff --git a/src/04_algorithms/03_e-greedy/02_annealing.jl b/src/04_algorithms/03_e-greedy/02_annealing.jl index 4032578..717fcc7 100644 --- a/src/04_algorithms/03_e-greedy/02_annealing.jl +++ b/src/04_algorithms/03_e-greedy/02_annealing.jl @@ -4,6 +4,23 @@ algorithm in which ε decreases with a logarithmic annealing schedule. """ -> immutable AnnealingEpsilonGreedy{L <: Learner} <: Algorithm learner::L + policy::Vector{Float64} +end + +@doc """ +Update policy based on belief about best arm. +""" -> +function update_policy!(algorithm::AnnealingEpsilonGreedy, context::Context) + ε, K = 1 / log(e + context.t - 1), length(algorithm.policy) + a_star = preferred_arm(algorithm, context) + for i in 1:K + if i != a_star + algorithm.policy[i] = ε / K + else + algorithm.policy[i] = (1 - ε) + ε / K + end + end + return end @doc """ diff --git a/src/04_algorithms/03_e-greedy/03_decreasing.jl b/src/04_algorithms/03_e-greedy/03_decreasing.jl index c078ceb..96d46e8 100644 --- a/src/04_algorithms/03_e-greedy/03_decreasing.jl +++ b/src/04_algorithms/03_e-greedy/03_decreasing.jl @@ -6,6 +6,24 @@ immutable DecreasingEpsilonGreedy{L <: Learner} <: Algorithm learner::L c::Float64 d::Float64 + policy::Vector{Float64} +end + +@doc """ +Update policy based on belief about best arm. 
+""" -> +function update_policy!(algorithm::DecreasingEpsilonGreedy, context::Context) + ε = min(1.0, (algorithm.c * context.K) / (algorithm.d^2 * context.t)) + K = length(algorithm.policy) + a_star = preferred_arm(algorithm, context) + for i in 1:K + if i != a_star + algorithm.policy[i] = ε / K + else + algorithm.policy[i] = (1 - ε) + ε / K + end + end + return end @doc """ diff --git a/src/04_algorithms/04_softmax/01_standard.jl b/src/04_algorithms/04_softmax/01_standard.jl index 9172653..6d5db59 100644 --- a/src/04_algorithms/04_softmax/01_standard.jl +++ b/src/04_algorithms/04_softmax/01_standard.jl @@ -31,6 +31,19 @@ function initialize!(algorithm::Softmax, K::Integer) return end +@doc """ +Update policy based on empirical means and temperature. +""" -> +function update_policy!(algorithm::Softmax, context::Context) + μs = means(algorithm.learner) + τ = algorithm.τ + for i in 1:context.K + algorithm.tmeans[i] = μs[i] / τ + end + softmax!(algorithm.probs, algorithm.tmeans) + return +end + @doc """ Select an arm according to the softmax rule. Recompute temperature adjusted means to make sure that the softmax selection probabilities are correct. diff --git a/src/04_algorithms/04_softmax/02_annealing.jl b/src/04_algorithms/04_softmax/02_annealing.jl index 3c170b9..c7cdd87 100644 --- a/src/04_algorithms/04_softmax/02_annealing.jl +++ b/src/04_algorithms/04_softmax/02_annealing.jl @@ -33,6 +33,19 @@ function initialize!(algorithm::AnnealingSoftmax, K::Integer) return end +@doc """ +Update policy based on empirical means and temperature. +""" -> +function update_policy!(algorithm::AnnealingSoftmax, context::Context) + μs = means(algorithm.learner) + τ = algorithm.τ₀ / log(e + context.t - 1) + for i in 1:context.K + algorithm.tmeans[i] = μs[i] / τ + end + softmax!(algorithm.probs, algorithm.tmeans) + return +end + @doc """ Select an arm according to the softmax rule. First, the current temperature is computed. 
Then we recompute temperature adjusted means to make sure that the diff --git a/src/04_algorithms/05_ucb/01_ucb1.jl b/src/04_algorithms/05_ucb/01_ucb1.jl index 2886954..a0a5871 100644 --- a/src/04_algorithms/05_ucb/01_ucb1.jl +++ b/src/04_algorithms/05_ucb/01_ucb1.jl @@ -1,5 +1,6 @@ immutable UCB1{L <: Learner} <: Algorithm learner::L + policy::Vector{Float64} end function initialize!(algorithm::UCB1, K::Integer) @@ -7,6 +8,47 @@ function initialize!(algorithm::UCB1, K::Integer) return end +@doc """ +Update policy based on empirical means and upper confidence bounds. +""" -> +function update_policy!(algorithm::UCB1, context::Context) + μs = means(algorithm.learner) + ns = counts(algorithm.learner) + + for a in 1:context.K + if ns[a] == 0 + for i in 1:context.K + if a == i + algorithm.policy[i] = 1.0 + else + algorithm.policy[i] = 0.0 + end + end + return + end + end + + max_score, chosen_a = -Inf, 0 + for a in 1:context.K + bonus = sqrt(2 * log(context.t) / ns[a]) + score = μs[a] + bonus + if score > max_score + max_score = score + chosen_a = a + end + end + + for a in 1:context.K + if chosen_a == a + algorithm.policy[a] = 1.0 + else + algorithm.policy[a] = 0.0 + end + end + + return +end + function choose_arm(algorithm::UCB1, context::Context) μs = means(algorithm.learner) ns = counts(algorithm.learner)