# Sampling rules are organised in two levels:
# - sampling rule: a factory for sampling rule states
# - sampling rule state: keeps track of per-run information (e.g. tracking state)

import Distributions;

include("../regret.jl");
include("../tracking.jl");
include("../expfam.jl");
include("helpers.jl");
include("envelope.jl");

```
Uniform sampling
```

# Round-robin (uniform) sampling rule. It carries no data, so the same object
# is used both as the factory and as the per-run state.
struct RoundRobin end

# Long display name of the rule.
function long(sr::RoundRobin)
    return "Uniform"
end

# Short display name of the rule.
function abbrev(sr::RoundRobin)
    return "RR"
end

# The rule is stateless: the factory itself is handed back as the state.
start(sr::RoundRobin, N) = sr

# Cycle through the arms: the returned index is the total number of pulls so
# far modulo the number of arms, shifted to 1-based indexing. Only `N` is used.
function nextsample(sr::RoundRobin, pep, rsp, ahat, aalt, ξ, N, Zs, rng)
    pulls = sum(N)
    narms = length(N)
    return (pulls % narms) + 1
end

```
Oracle sampling
```

# Sampling rule that follows a fixed weight vector `w`.
# The object is used both as the factory and as the per-run state.
struct FixedWeights
    w
    function FixedWeights(w)
        # Weights must be non-negative and sum (approximately) to one
        @assert all(w .≥ 0) && sum(w) ≈ 1 "$w not in simplex"
        return new(w)
    end
end

# Long display name of the rule.
# NOTE(review): the label spelling is kept as-is; it may be used as an
# identifier in saved results or plots -- confirm before changing it.
function long(sr::FixedWeights)
    return "Oracle Weigths"
end

# Short display name of the rule.
function abbrev(sr::FixedWeights)
    return "opt"
end

# No per-run state is needed; the factory is returned unchanged.
start(sr::FixedWeights, N) = sr

# Pull the arm whose count lags furthest behind its target `sum(N) * w`.
function nextsample(sr::FixedWeights, pep, rsp, ahat, aalt, ξ, N, Zs, rng)
    target = sum(N) .* sr.w
    return argmin(N .- target)
end


```
TaS
```

# Track-and-Stop factory, parameterised by the tracking rule to build per run.
struct TaS
    TrackingRule
end

# Long display name: "TaS " followed by the tracking rule's abbreviation.
function long(sr::TaS)
    return "TaS " * abbrev(sr.TrackingRule)
end

# Short display name: "TaS-" followed by the tracking rule's abbreviation.
function abbrev(sr::TaS)
    return "TaS-" * abbrev(sr.TrackingRule)
end

# Per-run state: the tracking rule built from `N`, wrapped in ForcedExploration.
struct TaSState
    t
    function TaSState(TrackingRule, N)
        tracker = ForcedExploration(TrackingRule(N))
        return new(tracker)
    end
end

# Build a fresh state for a run with initial counts `N`.
start(sr::TaS, N) = TaSState(sr.TrackingRule, N)

# Query `oracle` at the current estimate ξ, keep its second output as the
# target weights, and let the tracker pick the arm to pull.
function nextsample(sr::TaSState, pep, rsp, ahat, aalt, ξ, N, Zs, rng)
    _, weights = oracle(pep, ξ)
    return track(sr.t, N, weights)
end


```
LUCB
```

# LUCB sampling rule; stateless, so factory and state are the same object.
struct LUCB end

function long(sr::LUCB)
    return "LUCB"
end

function abbrev(sr::LUCB)
    return "LUCB"
end

# Nothing to initialise: the factory doubles as the state.
start(sr::LUCB, N) = sr

# Returns `(astar, k, ucb)`: the empirical best arm, its best challenger (the
# other arm with the largest upper bound), and the challenger's upper bound
# minus the best arm's lower bound. Bounds come from `dup` / `ddn` with
# exploration level `β / N[a]`.
function nextsample(sr::LUCB, pep, ξ, N, β, rng)
    K = nanswers(pep, ξ)
    expfam = getexpfam(pep, 1)
    astar = argmax(ξ)

    # Lower bound of the empirical best arm
    LCB = ddn(expfam, ξ[astar], β / N[astar])

    # Upper bounds of the other arms; astar is excluded by giving it -Inf
    UCBs = zeros(K)
    for a in 1:K
        UCBs[a] = (a == astar) ? -Inf : dup(expfam, ξ[a], β / N[a])
    end

    # Best challenger to astar
    challenger = argmax(UCBs)
    @assert challenger != astar "Problem"

    return astar, challenger, UCBs[challenger] - LCB
end

```
LUCBhalf
```

# LUCBhalf sampling rule; stateless, so factory and state are the same object.
struct LUCBhalf end

function long(sr::LUCBhalf)
    return "LUCBhalf"
end

function abbrev(sr::LUCBhalf)
    return "LUCBhalf"
end

# Nothing to initialise: the factory doubles as the state.
start(sr::LUCBhalf, N) = sr

# Same indices as LUCB, but the arm to pull is chosen by a fair coin between
# the empirical best arm and its best challenger.
# Returns `(astar, k, ucb)` with `ucb` the challenger's upper bound minus the
# best arm's lower bound.
function nextsample(sr::LUCBhalf, pep, ξ, N, β, rng)
    K = nanswers(pep, ξ)
    expfam = getexpfam(pep, 1)
    astar = argmax(ξ)

    # Lower bound of the empirical best arm
    LCB = ddn(expfam, ξ[astar], β / N[astar])

    # Upper bounds of the other arms; astar is excluded by giving it -Inf
    UCBs = zeros(K)
    for a in 1:K
        UCBs[a] = (a == astar) ? -Inf : dup(expfam, ξ[a], β / N[a])
    end

    # Best challenger to astar
    challenger = argmax(UCBs)
    @assert challenger != astar "Problem"

    # Compute gap
    ucb = UCBs[challenger] - LCB

    # Fair coin: pull the best arm with probability 1/2, else the challenger
    k = rand(rng) <= 0.5 ? astar : challenger
    return astar, k, ucb
end


```
DKM
```

# DKM factory, parameterised by the tracking rule to build per run.
struct DKM
    TrackingRule
end

# Long display name: "DKM " followed by the tracking rule's abbreviation.
function long(sr::DKM)
    return "DKM " * abbrev(sr.TrackingRule)
end

# Short display name: "DKM-" followed by the tracking rule's abbreviation.
function abbrev(sr::DKM)
    return "DKM-" * abbrev(sr.TrackingRule)
end

# Per-run state of DKM.
struct DKMState
    h   # one online learner in total (an AdaHedge instance over the arms)
    t   # tracking rule instance built from the initial counts
    function DKMState(TrackingRule, N)
        learner = AdaHedge(length(N))
        tracker = TrackingRule(N)
        return new(learner, tracker)
    end
end

# Build a fresh state for a run with initial counts `N`.
start(sr::DKM, N) = DKMState(sr.TrackingRule, N)

# Optimistic gradient, one entry per index of `hμ`.
# For each arm k, the estimate hμ[k] is moved up (`dup`) and down (`ddn`) with
# level log(t)/N[k]; the entry is the largest of the divergences `d` from either
# shifted value to λs[k], floored at log(t)/N[k] itself.
function optimistic_gradient(expfam, hμ, t, N, λs)
    return map(eachindex(hμ)) do k
        radius = log(t) / N[k]
        upper = dup(expfam, hμ[k], radius)
        lower = ddn(expfam, hμ[k], radius)
        max(d(expfam, upper, λs[k]), d(expfam, lower, λs[k]), radius)
    end
end

# One round of DKM: ask the online learner for weights, compute the λ-player's
# best response through `glrt`, feed the learner the negated optimistic
# gradient, and let the tracker choose the arm from the queried weights.
function nextsample(sr::DKMState, pep, rsp, astar, aalt, ξ, N, Zs, rng)
    expfam = getexpfam(pep, 1)

    # Query the learner
    w = act(sr.h)

    # Best response of the λ-player to w (only the alternative means are needed)
    _, (_, λs), (_, _) = glrt(pep, w, ξ)

    # The learner incurs the negated optimistic gradient
    grad = optimistic_gradient(expfam, ξ, sum(N), N, λs)
    incur!(sr.h, -grad)

    # Tracking
    return track(sr.t, N, w)
end

```
AdaTopTwo
```

# Adaptive Top Two factory. All four fields are strings selecting a variant:
# the β choice, the leader, the challenger and the tracking mode ("" = sampling).
struct AdaTopTwo
    βtype
    leader
    challenger
    trackingtype
end

# Display name: "<βtype>-<leader>-<challenger>", plus "-T<trackingtype>" when
# a tracking mode is set.
function long(sr::AdaTopTwo)
    suffix = sr.trackingtype == "" ? "" : "-T$(sr.trackingtype)"
    return sr.βtype * "-" * sr.leader * "-" * sr.challenger * suffix
end

# The abbreviation uses the same format as the long name.
function abbrev(sr::AdaTopTwo)
    suffix = sr.trackingtype == "" ? "" : "-T$(sr.trackingtype)"
    return sr.βtype * "-" * sr.leader * "-" * sr.challenger * suffix
end

# Per-run state: the configuration plus two accumulators that are K×K
# matrices for tracking type "C" and length-K vectors otherwise.
struct AdaTopTwoState
    βtype
    leader
    challenger
    trackingtype
    counts      # pulling counts when leader
    βs          # cumulative sums of β depending on leader
    function AdaTopTwoState(βtype, leader, challenger, trackingtype, K)
        dims = trackingtype == "C" ? (K, K) : (K,)
        return new(βtype, leader, challenger, trackingtype, zeros(dims), zeros(dims))
    end
end

# Build a fresh state sized by the number of arms `length(N)`.
function start(sr::AdaTopTwo, N)
    return AdaTopTwoState(sr.βtype, sr.leader, sr.challenger, sr.trackingtype, length(N))
end

# Draw one sample per arm from the sampling distribution used by the "TS"
# leader and the "RS" challenger. Only Gaussian and Bernoulli families are
# supported; any other family raises an error.
function _adatoptwo_posterior_draw(expfam, ξ, N, K, rng)
    if typeof(expfam) == Gaussian
        return [rand(rng, Distributions.Normal(ξ[k], 1. / sqrt(N[k]))) for k in 1:K]
    elseif typeof(expfam) == Bernoulli
        return [rand(rng, Distributions.Beta(1 + ξ[k] * N[k], 1 + N[k] - ξ[k] * N[k])) for k in 1:K]
    else
        error("Undefined Exfam for TS")
    end
end

# Empirical transportation cost between leader `B` and every arm `a`: both arms
# are moved to their count-weighted average and the divergences `d` are weighted
# by the counts. Arms whose estimate is at least ξ[B] cost 0; `B` itself costs
# Inf so that it is never selected as its own challenger.
function _adatoptwo_transport_costs(expfam, ξ, N, B, K)
    return map(1:K) do a
        a == B && return Inf
        ξ[a] >= ξ[B] && return 0.0
        λ = (N[B] * ξ[B] + N[a] * ξ[a]) / (N[B] + N[a])
        return N[B] * d(expfam, ξ[B], λ) + N[a] * d(expfam, ξ[a], λ)
    end
end

# Penalised index minimised by the TCI family of challengers.
function _adatoptwo_penalized(challenger, costs, N)
    if challenger == "TCI"
        # plain log penalisation
        return costs .+ log.(N)
    elseif challenger == "TCIs"
        # log penalisation applied to the square-root transformation x → √2x
        return sqrt.(2 * costs) .+ log.(N)
    elseif challenger == "TCIs+"
        # poly-log penalisation (κ = 1.2) applied to x → √2x
        return sqrt.(2 * costs) .+ log.(N) .^ (1.2 / 2)
    elseif challenger == "TCIsp"
        # polynomial penalisation (α = 1.2) applied to x → √2x
        return sqrt.(2 * costs) .+ N .^ (1 / (2 * 1.2))
    else
        # "TCIsp+": polynomial penalisation (α = 2) applied to x → √2x
        return sqrt.(2 * costs) .+ N .^ (1 / (2 * 2))
    end
end

# One round of the adaptive Top Two rule: pick a leader `B`, a challenger `C`,
# a proportion β, then choose between `B` and `C` by tracking or by sampling.
#
# Arguments read here: `astar` / `aalt` / `Zs` (candidate answer, its
# alternative and the costs handed over by the caller), `ξ` (estimates),
# `N` (counts), `rng`. `rsp` is unused.
# Returns the index of the arm to pull.
# Throws an ErrorException for an unknown leader, challenger, β type, tracking
# mode, or an unsupported family for "TS"/"RS". (These used to be logged with
# `@error`, which does not stop execution and led to an UndefVarError later.)
function nextsample(ssr::AdaTopTwoState, pep, rsp, astar, aalt, ξ, N, Zs, rng)
    K = nanswers(pep, ξ)
    expfam = getexpfam(pep, 1)

    # Leader
    if ssr.leader == "EB"
        B = astar
    elseif ssr.leader == "TS"
        B = argmax(_adatoptwo_posterior_draw(expfam, ξ, N, K, rng))
    elseif ssr.leader == "UCB"
        t = sum(N)
        UCBs = [dup(expfam, ξ[k], 2 * 1.2 * (1 + 1.2) * log(t) / N[k]) for k in 1:K]
        B = argmax(UCBs)
    elseif ssr.leader == "UCBI"
        t = sum(N)
        UCBs = [dup(expfam, ξ[k], bonus_ucb(t, 1.2, 1.2) / N[k]) for k in 1:K]
        B = argmax(UCBs)
    elseif ssr.leader == "IMED"
        Kinfs = [N[k] * d(expfam, ξ[k], ξ[astar]) for k in 1:K]
        B = argmin(Kinfs .+ log.(N))
    else
        error("Undefined Adaptive Top Two leader")
    end

    # Challenger
    if ssr.challenger == "TC"
        if B != astar
            # Sample uniformly at random among the arms with a mean at least as
            # high as the leader's, since they have null transportation cost.
            # Most of the time this is a singleton, hence returning astar.
            ks = [a for a in 1:K if ξ[a] >= ξ[B] && a != B]
            if isempty(ks)
                # No arm ties or beats the leader: fall back to the smallest
                # transportation cost instead of sampling from an empty range.
                C = argmin(_adatoptwo_transport_costs(expfam, ξ, N, B, K))
            else
                C = ks[rand(rng, 1:length(ks))]
            end
        else
            # When B == astar, the computations from the stopping rule can be
            # used as they rely on the same transportation costs.
            C = aalt
        end
    elseif ssr.challenger in ("TCI", "TCIs", "TCIs+", "TCIsp", "TCIsp+")
        # When B != astar the costs must be recomputed for this leader: it is
        # not enough to sample among arms with higher mean, because
        # under-sampled arms with lower mean can have lower indices than
        # over-sampled arms with higher mean. When B == astar the GLR stopping
        # computations (Zs) are re-used.
        costs = (B != astar) ? _adatoptwo_transport_costs(expfam, ξ, N, B, K) : Zs
        C = argmin(_adatoptwo_penalized(ssr.challenger, costs, N))
    elseif ssr.challenger == "RS"
        # Re-sample until the maximiser of the draw differs from the leader
        ks = [B]
        attempts = 0
        while B in ks
            draws = _adatoptwo_posterior_draw(expfam, ξ, N, K, rng)
            best = maximum(draws)
            ks = [a for a in 1:K if draws[a] == best]
            attempts += 1

            if attempts > 1e7
                @warn "RS challenger is taking too much time, hence sample uniformly."
                return 1 + (sum(N) % length(N))
            end
        end
        C = ks[rand(rng, 1:length(ks))]
    else
        error("Undefined Adaptive Top Two challenger")
    end

    # β choice
    if ssr.βtype == "cst"
        β = 0.5
    elseif ssr.βtype == "ada"
        if ξ[B] > ξ[C]
            altλ = (N[B] * ξ[B] + N[C] * ξ[C]) / (N[B] + N[C])
            ratio = N[C] * d(expfam, ξ[C], altλ) / (N[B] * d(expfam, ξ[B], altλ))
            β = 1 / (1 + ratio)
        else
            β = 0.5
        end
    else
        error("Undefined Adaptive Top Two β choice")
    end

    # Selection between leader and challenger
    if ssr.trackingtype == "B"
        # One tracking procedure per leader
        ssr.βs[B] += β
        if ssr.counts[B] <= ssr.βs[B]
            k = B
            ssr.counts[B] += 1
        else
            k = C
        end
    elseif ssr.trackingtype == "C"
        # One tracking procedure per (leader, challenger) pair
        ssr.βs[B, C] += (1 - β)
        if ssr.counts[B, C] <= ssr.βs[B, C]
            k = C
            ssr.counts[B, C] += 1
        else
            k = B
        end
    elseif ssr.trackingtype == ""
        # Randomised selection: leader with probability β
        u = rand(rng)
        k = (u <= β) ? B : C
    else
        error("Undefined Tracking mode")
    end

    return k
end



```
Frank-Wolfe based Sampling
```

# Frank-Wolfe sampling factory, parameterised by the tracking rule to build per run.
struct FWSampling
    TrackingRule
end

# Long display name: "FW-Sampling " followed by the tracking rule's abbreviation.
function long(sr::FWSampling)
    return "FW-Sampling " * abbrev(sr.TrackingRule)
end

# Short display name: "FWS-" followed by the tracking rule's abbreviation.
function abbrev(sr::FWSampling)
    return "FWS-" * abbrev(sr.TrackingRule)
end

# Per-run state (mutable because `x` is replaced at every round).
mutable struct FWSamplingState
    x   # current iterate, initialised to the uniform vector over the arms
    t   # tracking rule instance built from the initial counts
    function FWSamplingState(TrackingRule, N)
        narms = length(N)
        uniform = ones(narms) / narms
        return new(uniform, TrackingRule(N))
    end
end

# Build a fresh state for a run with initial counts `N`.
start(sr::FWSampling, N) = FWSamplingState(sr.TrackingRule, N)

# Computing f and ∇f for FWSampling.
# Returns `(f, ∇f, fidx)`:
# - `∇f[k]` is a length-K vector, non-zero only at positions `astar` and `k`;
# - `f` holds `hw' * ∇f[k]` for every arm k other than `astar`, in increasing k;
# - `fidx` lists the arms retained given `r`: those whose f is within `r` of the
#   minimum when r > eps(), the single minimiser when |r| < eps(), and all
#   arms other than `astar` otherwise.
function compute_f_∇f_bai(expfam, hw, ξ, astar, r, K)
    # Alternative parameters: hw-weighted average of the means of astar and k
    λs = [(ξ[astar] * hw[astar] + ξ[k] * hw[k]) / (hw[astar] + hw[k]) for k in 1:K]
    suboptimal = filter(k -> k != astar, 1:K)

    # Construct ∇f
    ∇f = [zeros(K) for _ in 1:K]
    for k in suboptimal
        ∇f[k][astar] = d(expfam, ξ[astar], λs[k])
        ∇f[k][k] = d(expfam, ξ[k], λs[k])
    end

    # Construct f
    f = [hw' * ∇f[k] for k in suboptimal]
    fmin = minimum(f)

    # Select the retained arms
    if r > eps()
        fidx = suboptimal[findall(v -> v < fmin + r, f)]
    elseif abs(r) < eps()
        fidx = [suboptimal[argmin(f)]]
    else
        fidx = suboptimal
    end
    return f, ∇f, fidx
end

# One round of Frank-Wolfe sampling: compute a direction `z`, average it into
# the iterate `ssr.x` with step 1/t, and let the tracker pick the arm.
function nextsample(ssr::FWSamplingState, pep, rsp, astar, aalt, ξ, N, Zs, rng)
    expfam = getexpfam(pep, 1)
    K = length(N)
    t = sum(N)
    r = t^(-9.0 / 10) / K

    # Uniform direction when `hμ_in_lambda` rejects ξ or when
    # `is_complete_square` accepts floor(t / K)
    uniform_step = !hμ_in_lambda(ξ, astar, K) || is_complete_square(floor(Int, t / K))

    if uniform_step
        z = ones(K) / K
    else
        _, ∇f, fidx = compute_f_∇f_bai(expfam, ssr.x, ξ, astar, r, K)
        if length(fidx) == 1
            # Best challenger: move towards the largest gradient coordinate
            best = argmax(∇f[fidx[1]])
            z = [(best == j) ? 1 : 0 for j in 1:K]
        else
            # Solve the LP of the zero-sum matrix game
            directions = [[(i == j) ? 1 : 0 for j in 1:K] - ssr.x for i in 1:K]
            payoff = [[directions[i]' * ∇f[j] for i in 1:K] for j in fidx]
            z = solveZeroSumGame(payoff, K, length(fidx))
        end
    end

    # Running average of the directions, then tracking of the iterate
    ssr.x = ssr.x * ((t - 1.0) / t) + z * 1.0 / t
    return track(ssr.t, N, ssr.x)
end



```
UGapEc
```

# UGapEc sampling rule; stateless, so factory and state are the same object.
struct UGapEc end

function long(sr::UGapEc)
    return "UGapEc"
end

function abbrev(sr::UGapEc)
    return "UGapEc"
end

# Nothing to initialise: the factory doubles as the state.
start(sr::UGapEc, N) = sr

# One round of UGapEc.
# Returns `(astar, k, gap)`: the empirical best arm, the arm to pull, and the
# smallest gap index (largest upper bound among the other arms minus the arm's
# own lower bound). The arm to pull is the less-sampled of the leader (arm with
# the smallest gap index) and its challenger (other arm with the largest upper
# bound); ties in the counts go to the challenger.
function nextsample(sr::UGapEc, pep, ξ, N, β, rng)
    K = nanswers(pep, ξ)
    expfam = getexpfam(pep, 1)
    astar = argmax(ξ)

    # UCB and LCB
    UCBs = zeros(K)
    LCBs = zeros(K)
    for a in 1:K
        UCBs[a] = dup(expfam, ξ[a], β / N[a])
        LCBs[a] = ddn(expfam, ξ[a], β / N[a])
    end

    # Compute gaps
    gaps = zeros(K)
    for a in 1:K
        idx = [k for k in 1:K if k != a]
        gaps[a] = maximum(UCBs[idx]) - LCBs[a]
    end

    # Compute leader
    gap = minimum(gaps)
    B = argmin(gaps)

    # Compute challenger. The leader is excluded with -Inf (not 0): with 0 the
    # leader would be returned as its own challenger whenever all the other
    # upper bounds are negative.
    UCBs[B] = -Inf
    C = argmax(UCBs)

    # Choose arm to sample from
    k = (N[B] < N[C]) ? B : C
    return astar, k, gap
end


```
BAITopTwo
```

# Top Two factory for BAI.
struct BAITopTwo
    is_cst          # Choice of proportions, true means fixed, false means adaptive
    is_sampling     # Type of selection, true means sampling, otherwise tracking
    leader          # Type of leader
    challenger      # Type of challenger
end

# Display name: optional "cst-" prefix, then "<leader>-B<challenger>", plus
# "-T" when the selection is not done by sampling.
function long(sr::BAITopTwo)
    prefix = sr.is_cst ? "cst-" : ""
    suffix = sr.is_sampling ? "" : "-T"
    return prefix * sr.leader * "-B" * sr.challenger * suffix
end

# The abbreviation uses the same format as the long name.
function abbrev(sr::BAITopTwo)
    prefix = sr.is_cst ? "cst-" : ""
    suffix = sr.is_sampling ? "" : "-T"
    return prefix * sr.leader * "-B" * sr.challenger * suffix
end

# Per-run state: the configuration plus two length-K accumulators.
struct BAITopTwoState
    is_cst
    is_sampling
    leader
    challenger
    counts      # pulling counts when leader
    βs          # cumulative sums of β depending on leader
    function BAITopTwoState(is_cst, is_sampling, leader, challenger, K)
        return new(is_cst, is_sampling, leader, challenger, zeros(K), zeros(K))
    end
end

# Build a fresh state sized by the number of arms `length(N)`.
function start(sr::BAITopTwo, N)
    return BAITopTwoState(sr.is_cst, sr.is_sampling, sr.leader, sr.challenger, length(N))
end

# Closed-form transportation cost between leader `B` and every arm `a`:
# squared gap of the estimates divided by 1/N[B] + 1/N[a]. Arms whose estimate
# is at least ξ[B] cost 0; `B` itself costs Inf so it is never its own challenger.
function _baitoptwo_transport_costs(ξ, N, B, K)
    return map(1:K) do a
        a == B && return Inf
        ξ[a] >= ξ[B] && return 0.0
        return (ξ[B] - ξ[a])^2 / (1 / N[B] + 1 / N[a])
    end
end

# One round of the BAI Top Two rule: pick a leader `B`, a challenger `C`, a
# proportion β (0.5 if `is_cst`, else N[C] / (N[C] + N[B])), then choose
# between `B` and `C` by sampling or by tracking.
#
# Returns the index of the arm to pull.
# Throws an ErrorException for an unknown leader or challenger, or for a
# non-Gaussian family with the "TS" leader. (These used to be logged with
# `@error`, which does not stop execution and led to an UndefVarError later.)
function nextsample(ssr::BAITopTwoState, pep, rsp, astar, aalt, ξ, N, Zs, rng)
    K = nanswers(pep, ξ)
    expfam = getexpfam(pep, 1)

    # Leader
    if ssr.leader == "EB"
        B = astar
    elseif ssr.leader == "TS"
        if typeof(expfam) == Gaussian
            hξ = [rand(rng, Distributions.Normal(ξ[k], 1. / sqrt(N[k]))) for k in 1:K]
        else
            error("Undefined Exfam for TS")
        end
        B = argmax(hξ)
    elseif ssr.leader == "UCB"
        t = sum(N)
        UCBs = [dup(expfam, ξ[k], bonus_ucb(t, 1.2, 1.2) / N[k]) for k in 1:K]
        B = argmax(UCBs)
    else
        error("Undefined Adaptive Top Two leader")
    end

    # Challenger
    if ssr.challenger == "TC"
        ks = [a for a in 1:K if ξ[a] >= ξ[B] && a != B]
        if length(ks) >= 1
            # Those arms have null empirical TC, hence we sample among them uniformly at random
            C = ks[rand(rng, 1:length(ks))]
        else
            # All arms have strictly positive empirical TC, and we need to compute them
            C = argmin(_baitoptwo_transport_costs(ξ, N, B, K))
        end
    elseif ssr.challenger == "TCI"
        # Transportation costs penalised by the log of the counts
        C = argmin(_baitoptwo_transport_costs(ξ, N, B, K) .+ log.(N))
    elseif ssr.challenger == "RS"
        # Re-sample Gaussian draws until the maximiser differs from the leader
        ks = [B]
        attempts = 0
        while B in ks
            draws = [rand(rng, Distributions.Normal(ξ[a], 1. / sqrt(N[a]))) for a in 1:K]
            best = maximum(draws)
            ks = [a for a in 1:K if draws[a] == best]
            attempts += 1

            if attempts > 1e7
                @warn "RS challenger is taking too much time, hence sample uniformly."
                return 1 + (sum(N) % length(N))
            end
        end
        C = ks[rand(rng, 1:length(ks))]
    else
        error("Undefined Adaptive Top Two challenger")
    end

    # β choice
    β = ssr.is_cst ? 0.5 : N[C] / (N[C] + N[B])

    if ssr.is_sampling
        # Randomised selection: leader with probability β
        u = rand(rng)
        k = (u <= β) ? B : C
    else
        # One tracking procedure per leader
        ssr.βs[B] += β
        if ssr.counts[B] <= ssr.βs[B]
            k = B
            ssr.counts[B] += 1
        else
            k = C
        end
    end

    return k
end


```
Some ϵ-BAI algorithms
```


```
AdaEpsTopTwo
```

# Adaptive Top Two factory for ϵ-BAI. All four fields are strings selecting a
# variant: the β choice, the leader, the challenger and the tracking mode
# ("" = sampling).
struct AdaEpsTopTwo
    βtype
    leader
    challenger
    trackingtype
end

# Display name: "<βtype>-<leader>-<challenger>", plus "-T<trackingtype>" when
# a tracking mode is set.
function long(sr::AdaEpsTopTwo)
    suffix = sr.trackingtype == "" ? "" : "-T$(sr.trackingtype)"
    return sr.βtype * "-" * sr.leader * "-" * sr.challenger * suffix
end

# The abbreviation uses the same format as the long name.
function abbrev(sr::AdaEpsTopTwo)
    suffix = sr.trackingtype == "" ? "" : "-T$(sr.trackingtype)"
    return sr.βtype * "-" * sr.leader * "-" * sr.challenger * suffix
end

# Per-run state: the configuration plus two accumulators that are K×K
# matrices for tracking type "C" and length-K vectors otherwise.
struct AdaEpsTopTwoState
    βtype
    leader
    challenger
    trackingtype
    counts      # pulling counts when leader
    βs          # cumulative sums of β depending on leader
    function AdaEpsTopTwoState(βtype, leader, challenger, trackingtype, K)
        dims = trackingtype == "C" ? (K, K) : (K,)
        return new(βtype, leader, challenger, trackingtype, zeros(dims), zeros(dims))
    end
end

# Build a fresh state sized by the number of arms `length(N)`.
function start(sr::AdaEpsTopTwo, N)
    return AdaEpsTopTwoState(sr.βtype, sr.leader, sr.challenger, sr.trackingtype, length(N))
end

# Closed-form ϵ-transportation cost between leader `B` and every arm `a`,
# in the multiplicative (`pep.opt == "mul"`) or additive slack model. Arms that
# already beat the leader by the slack cost 0; `B` itself costs Inf so that it
# is never selected as its own challenger.
function _adaeps_transport_costs(pep, ξ, N, B, K)
    if pep.opt == "mul"
        return [a != B ? (ξ[a] >= ξ[B] / (1 - pep.ϵ) ? 0.0 :
                (ξ[B] - (1 - pep.ϵ) * ξ[a])^2 / (1 / N[B] + (1 - pep.ϵ)^2 / N[a])) : Inf for a in 1:K]
    else
        return [a != B ? (ξ[a] >= ξ[B] + pep.ϵ ? 0.0 :
                (ξ[B] - ξ[a] + pep.ϵ)^2 / (1 / N[B] + 1 / N[a])) : Inf for a in 1:K]
    end
end

# One round of the adaptive Top Two rule for ϵ-BAI: pick a leader `B`, a
# challenger `C`, a proportion β, then choose between `B` and `C` by tracking
# or by sampling.
#
# Arguments read here: `rsp` (its type selects the "IF" leader behaviour),
# `ahat` / `aalt` / `Zs` (candidate answer, its alternative and the costs handed
# over by the caller), `ξ` (estimates), `N` (counts), `rng`.
# Returns the index of the arm to pull.
# Throws an ErrorException for an unknown leader, stopping rule, challenger,
# β type, tracking mode, or a non-Gaussian family with "TS"/"RS". (These used
# to be logged with `@error`, which does not stop execution and led to an
# UndefVarError later.)
function nextsample(ssr::AdaEpsTopTwoState, pep, rsp, ahat, aalt, ξ, N, Zs, rng)
    K = nanswers(pep, ξ)
    expfam = getexpfam(pep, 1)
    astar = argmax(ξ)

    # Only filled by the "IF" leader under a GLRT stopping rule
    IFZs = nothing
    IFaalt = nothing

    # Leader
    if ssr.leader == "EB"
        B = astar
    elseif ssr.leader == "IF"
        if typeof(rsp) == GLRT
            # The estimates are passed as `ξ` (the previous code referenced an
            # undefined `hμ`), and the last returned slot is discarded so that
            # the `ξ` argument is not overwritten.
            IFZs, (IFaalt, _), (aF, _) = higher_glrt(pep, N, ξ)
            B = aF
        elseif typeof(rsp) == IFGLRT
            B = ahat
        else
            error("Undefined RSP")
        end
    elseif ssr.leader == "TS"
        if typeof(expfam) == Gaussian
            hξ = [rand(rng, Distributions.Normal(ξ[k], 1. / sqrt(N[k]))) for k in 1:K]
        else
            error("Undefined Exfam for TS in AdaEpsTopTwo")
        end
        B = argmax(hξ)
    elseif ssr.leader == "UCB"
        t = sum(N)
        UCBs = [dup(expfam, ξ[k], bonus_ucb(t, 1.2, 1.2) / N[k]) for k in 1:K]
        B = argmax(UCBs)
    else
        error("Undefined Adaptive Top Two leader")
    end

    # Whether the quantities computed by higher_glrt above are available
    use_if = ssr.leader == "IF" && typeof(rsp) == GLRT

    # Challenger
    if ssr.challenger == "TC"
        if B == ahat
            # When B == ahat, the computations from the stopping rule can be
            # used as they rely on the same transportation costs
            C = aalt
        elseif use_if
            C = IFaalt
        else
            # Otherwise other transportation costs are needed
            threshold = (pep.opt == "mul") ? ξ[B] / (1 - pep.ϵ) : ξ[B] + pep.ϵ
            ks = [a for a in 1:K if ξ[a] >= threshold && a != B]
            if length(ks) >= 1
                # Those arms have null empirical TC, hence we sample among them uniformly at random
                C = ks[rand(rng, 1:length(ks))]
            else
                # All arms have strictly positive empirical TC, and we need to compute them
                C = argmin(_adaeps_transport_costs(pep, ξ, N, B, K))
            end
        end
    elseif ssr.challenger == "TCI"
        if B == ahat
            # When B == ahat, the computations from the stopping rule can be
            # used as they rely on the same transportation costs
            C = argmin(Zs .+ log.(N))
        elseif use_if
            C = argmin(IFZs .+ log.(N))
        else
            # Otherwise other transportation costs are needed
            C = argmin(_adaeps_transport_costs(pep, ξ, N, B, K) .+ log.(N))
        end
    elseif ssr.challenger == "RS"
        # Re-sample until the maximiser of the draw differs from the leader
        ks = [B]
        attempts = 0
        while B in ks
            if typeof(expfam) == Gaussian
                draws = [rand(rng, Distributions.Normal(ξ[a], 1. / sqrt(N[a]))) for a in 1:K]
            else
                error("Undefined Exfam for TS in AdaEpsTopTwo")
            end
            best = maximum(draws)
            ks = [a for a in 1:K if draws[a] == best]
            attempts += 1

            if attempts > 1e7
                @warn "RS challenger is taking too much time, hence sample uniformly."
                return 1 + (sum(N) % length(N))
            end
        end
        C = ks[rand(rng, 1:length(ks))]
    else
        error("Undefined Adaptive Top Two challenger")
    end

    # β choice
    if ssr.βtype == "cst"
        β = 0.5
    elseif ssr.βtype == "ada"
        if pep.opt == "mul"
            if ξ[B] > ξ[C] * (1 - pep.ϵ)
                β = N[C] / (N[C] + N[B] * (1 - pep.ϵ)^2)
            else
                β = 0.5
            end
        else
            if ξ[B] > ξ[C] - pep.ϵ
                β = N[C] / (N[C] + N[B])
            else
                β = 0.5
            end
        end
    else
        error("Undefined Adaptive Top Two β choice")
    end

    # Selection between leader and challenger
    if ssr.trackingtype == "B"
        # One tracking procedure per leader
        ssr.βs[B] += β
        if ssr.counts[B] <= ssr.βs[B]
            k = B
            ssr.counts[B] += 1
        else
            k = C
        end
    elseif ssr.trackingtype == "C"
        # One tracking procedure per (leader, challenger) pair
        ssr.βs[B, C] += (1 - β)
        if ssr.counts[B, C] <= ssr.βs[B, C]
            k = C
            ssr.counts[B, C] += 1
        else
            k = B
        end
    elseif ssr.trackingtype == ""
        # Randomised selection: leader with probability β
        u = rand(rng)
        k = (u <= β) ? B : C
    else
        error("Undefined Tracking mode")
    end

    return k
end


```
AnytimeTopTwo
```

# Anytime Top Two factory (EB leader, TC challenger with a fixed slack).
struct AnytimeTopTwo
    is_cst      # Choice of proportions, true means fixed, false means adaptive
    ϵ           # Slack parameter
end

# Display name: optional "cst-" prefix, then "EB-TC-e" followed by the part of
# `string(ϵ)` after the decimal point (e.g. 0.1 gives "EB-TC-e1").
function long(sr::AnytimeTopTwo)
    prefix = sr.is_cst ? "cst-" : ""
    decimals = split(string(sr.ϵ), ".")[2]
    return prefix * "EB-TC-e" * decimals
end

# The abbreviation uses the same format as the long name.
function abbrev(sr::AnytimeTopTwo)
    prefix = sr.is_cst ? "cst-" : ""
    decimals = split(string(sr.ϵ), ".")[2]
    return prefix * "EB-TC-e" * decimals
end

# Per-run state: the configuration plus two K×K accumulators indexed by
# (leader, challenger).
struct AnytimeTopTwoState
    is_cst
    ϵ
    counts      # pulling counts when leader
    βs          # cumulative sums of β depending on leader
    function AnytimeTopTwoState(is_cst, ϵ, K)
        return new(is_cst, ϵ, zeros((K, K)), zeros((K, K)))
    end
end

# Build a fresh state sized by the number of arms `length(N)`.
function start(sr::AnytimeTopTwo, N)
    return AnytimeTopTwoState(sr.is_cst, sr.ϵ, length(N))
end

# One round of the anytime Top Two rule with fixed slack `ssr.ϵ`: the leader is
# the arm with the largest estimate, the challenger minimises the slack-adjusted
# transportation cost, and the choice between the two is made by a tracking
# procedure indexed by the (leader, challenger) pair.
# Returns the index of the arm to pull.
function nextsample(ssr::AnytimeTopTwoState, pep, rsp, ahat, aalt, ξ, N, Zs, rng)
    K = nanswers(pep, ξ)
    expfam = getexpfam(pep, 1)
    ϵ = ssr.ϵ
    ismul = pep.opt == "mul"

    # EB Leader
    B = argmax(ξ)

    # TCa or TCm Challenger: arms at or above this threshold have null empirical TC
    threshold = ismul ? ξ[B] / (1 - ϵ) : ξ[B] + ϵ
    free = [a for a in 1:K if ξ[a] >= threshold && a != B]
    if !isempty(free)
        # Sample uniformly at random among the arms with null empirical TC
        C = free[rand(rng, 1:length(free))]
    else
        # All arms have strictly positive empirical TC, and we need to compute them
        TCs = map(1:K) do a
            if a == B
                Inf
            elseif ξ[a] >= threshold
                0
            elseif ismul
                (ξ[B] - (1 - ϵ) * ξ[a])^2 / (1 / N[B] + (1 - ϵ)^2 / N[a])
            else
                (ξ[B] - ξ[a] + ϵ)^2 / (1 / N[B] + 1 / N[a])
            end
        end
        C = argmin(TCs)
    end

    # Fixed or IDS proportions
    if ssr.is_cst
        β = 0.5
    elseif ismul
        β = N[C] / (N[C] + N[B] * (1 - ϵ)^2)
    else
        β = N[C] / (N[C] + N[B])
    end
    ssr.βs[B, C] += (1 - β)

    # Tracking procedure, K(K-1) of them based on the challenger
    if ssr.counts[B, C] <= ssr.βs[B, C]
        ssr.counts[B, C] += 1
        return C
    end
    return B
end


```
VSAnytimeTopTwo
```

# Anytime Top Two factory with a slack that varies over time.
struct VSAnytimeTopTwo
    is_cst      # Choice of proportions, true means fixed, false means adaptive
    α           # Varying slack parameter
    VS_type     # Type of slack decrease
end

# Display name: optional "cst-" prefix, then "EB-TC", then "-La" for the "log"
# type and "-Pa" otherwise, followed by the part of `string(α)` after the
# decimal point.
function long(sr::VSAnytimeTopTwo)
    prefix = sr.is_cst ? "cst-" : ""
    tag = sr.VS_type == "log" ? "-La" : "-Pa"
    decimals = split(string(sr.α), ".")[2]
    return prefix * "EB-TC" * tag * decimals
end

# The abbreviation uses the same format as the long name.
function abbrev(sr::VSAnytimeTopTwo)
    prefix = sr.is_cst ? "cst-" : ""
    tag = sr.VS_type == "log" ? "-La" : "-Pa"
    decimals = split(string(sr.α), ".")[2]
    return prefix * "EB-TC" * tag * decimals
end

# Per-run state: the configuration plus two K×K accumulators indexed by
# (leader, challenger).
struct VSAnytimeTopTwoState
    is_cst
    α
    VS_type
    counts      # pulling counts when leader
    βs          # cumulative sums of β depending on leader
    function VSAnytimeTopTwoState(is_cst, α, VS_type, K)
        return new(is_cst, α, VS_type, zeros((K, K)), zeros((K, K)))
    end
end

# Build a fresh state sized by the number of arms `length(N)`.
function start(sr::VSAnytimeTopTwo, N)
    return VSAnytimeTopTwoState(sr.is_cst, sr.α, sr.VS_type, length(N))
end

# One round of the anytime Top Two rule with a time-varying slack: the leader is
# the arm with the largest estimate, the challenger minimises the slack-adjusted
# transportation cost, and the choice between the two is made by a tracking
# procedure indexed by the (leader, challenger) pair.
# Returns the index of the arm to pull.
function nextsample(ssr::VSAnytimeTopTwoState, pep, rsp, ahat, aalt, ξ, N, Zs, rng)
    K = nanswers(pep, ξ)
    expfam = getexpfam(pep, 1)
    ismul = pep.opt == "mul"

    # Slack parameter, driven by the total number of pulls so far.
    # NOTE(review): the "log" type uses the polynomial rate n^(-α/2) while every
    # other type uses log(n)^(-α/2); this reads as possibly swapped -- confirm.
    n = sum(N)
    slack = ssr.VS_type == "log" ? n^(-ssr.α / 2) : log(n)^(-ssr.α / 2)

    # EB Leader
    B = argmax(ξ)

    # TCa or TCm Challenger: arms at or above this threshold have null empirical TC
    threshold = ismul ? ξ[B] / (1 - slack) : ξ[B] + slack
    free = [a for a in 1:K if ξ[a] >= threshold && a != B]
    if !isempty(free)
        # Sample uniformly at random among the arms with null empirical TC
        C = free[rand(rng, 1:length(free))]
    else
        # All arms have strictly positive empirical TC, and we need to compute them
        TCs = map(1:K) do a
            if a == B
                Inf
            elseif ξ[a] >= threshold
                0
            elseif ismul
                (ξ[B] - (1 - slack) * ξ[a])^2 / (1 / N[B] + (1 - slack)^2 / N[a])
            else
                (ξ[B] - ξ[a] + slack)^2 / (1 / N[B] + 1 / N[a])
            end
        end
        C = argmin(TCs)
    end

    # Fixed or IDS proportions
    if ssr.is_cst
        β = 0.5
    elseif ismul
        β = N[C] / (N[C] + N[B] * (1 - slack)^2)
    else
        β = N[C] / (N[C] + N[B])
    end
    ssr.βs[B, C] += (1 - β)

    # Tracking procedure, K(K-1) of them based on the challenger
    if ssr.counts[B, C] <= ssr.βs[B, C]
        ssr.counts[B, C] += 1
        return C
    end
    return B
end

```
EpsFWS
```

# Frank-Wolfe sampling factory for ϵ-BAI, parameterised by the tracking rule
# to build per run.
struct EpsFWSampling
    TrackingRule
end

# Long display name: "EpsFW-Sampling " followed by the tracking rule's abbreviation.
function long(sr::EpsFWSampling)
    return "EpsFW-Sampling " * abbrev(sr.TrackingRule)
end

# Short display name: "EpsFWS-" followed by the tracking rule's abbreviation.
function abbrev(sr::EpsFWSampling)
    return "EpsFWS-" * abbrev(sr.TrackingRule)
end

# Per-run state (mutable because `x` is replaced at every round).
mutable struct EpsFWSamplingState
    x   # current iterate, initialised to the uniform vector over the arms
    t   # tracking rule instance built from the initial counts
    function EpsFWSamplingState(TrackingRule, N)
        narms = length(N)
        uniform = ones(narms) / narms
        return new(uniform, TrackingRule(N))
    end
end

# Build a fresh state for a run with initial counts `N`.
start(sr::EpsFWSampling, N) = EpsFWSamplingState(sr.TrackingRule, N)

# Pair of alternative parameters `(λ for the first arm, λ for arm a)` under
# slack ϵ. For opt == "add", `alt_λ` is applied to the second mean shifted by
# -ϵ and the second output is λ + ϵ. For any other `opt` (multiplicative
# model), the second mean is scaled by (1 - ϵ), its weight divided by
# (1 - ϵ)^2, and the second output is λ / (1 - ϵ).
function alt_eps_λ(μ1, w1, μa, wa, ϵ, opt)
    if opt != "add"
        λmul = alt_λ(μ1, w1, μa * (1 - ϵ), wa / (1 - ϵ)^2)
        return (λmul, λmul / (1 - ϵ))
    end
    λadd = alt_λ(μ1, w1, μa - ϵ, wa)
    return (λadd, λadd + ϵ)
end

# Computing f and ∇f for EpsFWSampling.
# Returns `(f, ∇f, fidx)`:
# - `∇f[k]` is a length-K vector, non-zero only at positions `astar` and `k`;
# - `f` holds `hw' * ∇f[k]` for every arm k other than `astar`, in increasing k;
# - `fidx` lists the arms retained given `r`: those whose f is within `r` of the
#   minimum when r > eps(), the single minimiser when |r| < eps(), and all
#   arms other than `astar` otherwise.
function compute_f_∇f_epsbai(expfam, hw, ξ, astar, r, K, ϵ, opt)
    # Alternative parameters: one (λ for astar, λ for k) pair per arm
    λs = [alt_eps_λ(ξ[astar], hw[astar], ξ[k], hw[k], ϵ, opt) for k in 1:K]
    suboptimal = filter(k -> k != astar, 1:K)

    # Construct ∇f
    ∇f = [zeros(K) for _ in 1:K]
    for k in suboptimal
        λastar, λk = λs[k][1], λs[k][2]
        ∇f[k][astar] = d(expfam, ξ[astar], λastar)
        ∇f[k][k] = d(expfam, ξ[k], λk)
    end

    # Construct f
    f = [hw' * ∇f[k] for k in suboptimal]
    fmin = minimum(f)

    # Select the retained arms
    if r > eps()
        fidx = suboptimal[findall(v -> v < fmin + r, f)]
    elseif abs(r) < eps()
        fidx = [suboptimal[argmin(f)]]
    else
        fidx = suboptimal
    end
    return f, ∇f, fidx
end

# One round of Frank-Wolfe sampling for ϵ-BAI: compute a direction `z`,
# average it into the iterate `ssr.x` with step 1/t, and let the tracker pick
# the arm. The reference arm is the one with the largest estimate.
function nextsample(ssr::EpsFWSamplingState, pep, rsp, ahat, aalt, ξ, N, Zs, rng)
    expfam = getexpfam(pep, 1)
    K = length(N)
    t = sum(N)
    astar = argmax(ξ)
    r = t^(-9.0 / 10) / K

    # Uniform direction when `hμ_in_lambda` rejects ξ or when
    # `is_complete_square` accepts floor(t / K)
    uniform_step = !hμ_in_lambda(ξ, astar, K) || is_complete_square(floor(Int, t / K))

    if uniform_step
        z = ones(K) / K
    else
        _, ∇f, fidx = compute_f_∇f_epsbai(expfam, ssr.x, ξ, astar, r, K, pep.ϵ, pep.opt)
        if length(fidx) == 1
            # Best challenger: move towards the largest gradient coordinate
            best = argmax(∇f[fidx[1]])
            z = [(best == j) ? 1 : 0 for j in 1:K]
        else
            # Solve the LP of the zero-sum matrix game
            directions = [[(i == j) ? 1 : 0 for j in 1:K] - ssr.x for i in 1:K]
            payoff = [[directions[i]' * ∇f[j] for i in 1:K] for j in fidx]
            z = solveZeroSumGame(payoff, K, length(fidx))
        end
    end

    # Running average of the directions, then tracking of the iterate
    ssr.x = ssr.x * ((t - 1.0) / t) + z * 1.0 / t
    return track(ssr.t, N, ssr.x)
end



```
Doubling SR
```

struct DSR
end

long(sr::DSR) = "DSR";
abbrev(sr::DSR) = "DSR";

mutable struct DSRState
    substract_Sa;
    substract_Na;
    reco;
    active_arms;
    phase;
    past_horizon;
    current_horizon;
    episode;
    current_budget;
    logK;
    DSRState(K) = new(zeros(K), zeros(Int64, K), -1, [a for a in 1:K], 1, 0, K * ceil((2 * K * ceil(log2(K)) - K) / (K * get_logK(K))), 1, 2 * K * ceil(log2(K)), get_logK(K));
end

# Build a fresh DSRState with one slot per entry of `N` (one per arm).
function start(sr::DSR, N)
    narms = length(N);
    return DSRState(narms);
end

# Doubling SR: decide which arm(s) to sample next and which arm to recommend.
#
# Arguments
# - sr:  DSRState, updated in place.
# - pep: not used by this method.
# - S:   per-arm sums of observations.
# - N:   per-arm sample counts.
#
# Returns `(arms, reco)`: the vector of arm indices to sample and the current
# recommendation `sr.reco`.
#
# An SR instance only sees the samples collected since it started: means are
# computed from (S - substract_Sa) ./ (N - substract_Na). When the total number
# of samples reaches `sr.current_budget` (or a single arm is left), the instance
# is restarted with a doubled budget.
function nextsample(sr::DSRState, pep, S, N)
    K = length(sr.substract_Sa);

    if sum(N) >= sr.current_budget || length(sr.active_arms) == 1
        # Compute mean on this SR instance
        hμ = (S .- sr.substract_Sa) ./ (N .- sr.substract_Na);

        # The finished instance recommends its best remaining arm
        sr.reco = sr.active_arms[argmax(hμ[sr.active_arms])];

        # Move on to the next SR instance
        sr.episode += 1;

        # The next instance runs between sr.current_budget and 2*sr.current_budget,
        # so its phase lengths are based on a budget of sr.current_budget.
        # The first phase samples each of the K arms for the first phase size,
        # hence the factor K (as in the DSRState constructor and as for later
        # phases, which multiply by the number of active arms).
        sr.current_horizon = sr.current_budget + K * ceil((sr.current_budget - K) / (K * sr.logK));
        sr.active_arms = [a for a in 1:K];
        sr.phase = 1;

        # Define the new horizon after which we will change SR instance, i.e. double sr.current_budget
        sr.current_budget *= 2;

        # Snapshot counts and sums so the new instance forgets earlier samples
        sr.substract_Na = copy(N);
        sr.substract_Sa = copy(S);

        # Init by sampling all arms once; the recommendation of the finished instance is returned
        return [a for a in 1:K], sr.reco;
    elseif sum(N) >= sr.current_horizon
        # Compute mean for this SR instance
        hμ = (S .- sr.substract_Sa) ./ (N .- sr.substract_Na);

        # Remove exactly one arm with the worst empirical mean. On ties only the
        # first such arm is dropped, so the phase/active-arm invariant checked
        # below holds and the active set can never become empty.
        deleteat!(sr.active_arms, argmin(hμ[sr.active_arms]));

        # Phase sizes; the running instance was sized with a budget of sr.current_budget/2
        nk_old = get_phasesize(sr.phase, sr.current_budget/2, K, sr.logK);
        nk_new = get_phasesize(sr.phase + 1, sr.current_budget/2, K, sr.logK);
        sr.phase += 1;

        # New horizon: every remaining arm receives (nk_new - nk_old) more samples
        sr.past_horizon = sr.current_horizon;
        sr.current_horizon += (nk_new - nk_old) * length(sr.active_arms);

        # Sanity check
        @assert sr.phase + length(sr.active_arms) - 1 == K "Missmatch between the number of active arms and the phase";
    end

    # During the first episode no instance has finished yet, so recommend the
    # best active arm of the running instance.
    if sr.episode == 1
        hμ = (S .- sr.substract_Sa) ./ (N .- sr.substract_Na);
        sr.reco = sr.active_arms[argmax(hμ[sr.active_arms])];
    end

    # Round robin on the remaining arms (past_horizon may be a Float64, hence the Int64 conversion)
    idx_k = 1 + ((sum(N) - sr.past_horizon) % length(sr.active_arms));
    k = sr.active_arms[Int64(idx_k)];

    return [k], sr.reco;
end

# Return 1/2 + sum of 1/a for a = 2, ..., K (1/2 when K < 2).
function get_logK(K)
    inverses = [1 / a for a in 2:K];
    return 0.5 + sum(inverses);
end
# Size of phase `phase` for a given budget, K arms and normalisation constant
# `logK`: ceil((budget - K) / (logK * (K + 1 - phase))).
function get_phasesize(phase, budget, K, logK)
    arms_left = K + 1 - phase;
    return ceil((budget - K) / (logK * arms_left));
end


```
Doubling SH
```

# Factory for the "Doubling SH" sampling rule; carries no fields.
struct DSH end

# Long display name of the rule.
function long(sr::DSH)
    return "DSH";
end

# Short display name of the rule.
function abbrev(sr::DSH)
    return "DSH";
end

# Mutable state of the "Doubling SH" sampling rule for K arms.
mutable struct DSHState
    substract_Sa;     # per-arm sums recorded at the start of the current phase
    substract_Na;     # per-arm counts recorded at the start of the current phase
    reco;             # recommended arm (-1 until one has been computed)
    active_arms;      # arms not yet eliminated in the current SH instance
    phase;            # phase index inside the current SH instance
    past_horizon;     # total sample count at which the previous phase ended
    current_horizon;  # total sample count at which the current phase ends
    current_budget;   # total sample count at which the current SH instance is restarted
    episode;          # index of the current SH instance

    function DSHState(K)
        # Budget of the first SH instance.
        initial_budget = 2 * K * ceil(log2(K));
        # The first phase ends after 2 * K samples,
        # since floor(2 * K * ceil(log2(K)) / (K * ceil(log2(K)))) = 2.
        first_horizon = 2 * K;
        return new(zeros(K), zeros(Int64, K), -1, collect(1:K), 1, 0,
                   first_horizon, initial_budget, 1);
    end
end

# Build a fresh DSHState with one slot per entry of `N` (one per arm).
function start(sr::DSH, N)
    narms = length(N);
    return DSHState(narms);
end

# Doubling SH: decide which arm(s) to sample next and which arm to recommend.
#
# Arguments
# - sr:  DSHState, updated in place.
# - pep: not used by this method.
# - S:   per-arm sums of observations.
# - N:   per-arm sample counts.
#
# Returns `(arms, reco)`: the vector of arm indices to sample and the current
# recommendation `sr.reco`.
#
# Means are computed from (S - substract_Sa) ./ (N - substract_Na); the
# snapshots are refreshed at every phase change and at every restart, so each
# phase only uses its own samples. When the total number of samples reaches
# `sr.current_budget` (or a single arm is left), the SH instance is restarted
# with a doubled budget.
function nextsample(sr::DSHState, pep, S, N)
    K = length(sr.substract_Sa);

    if sum(N) >= sr.current_budget || length(sr.active_arms) == 1
        # Compute mean on this SH instance
        hμ = (S .- sr.substract_Sa) ./ (N .- sr.substract_Na);

        # The finished instance recommends its best remaining arm
        sr.reco = sr.active_arms[argmax(hμ[sr.active_arms])];

        # Move on to the next SH instance
        sr.episode += 1;

        # The next instance runs between sr.current_budget and 2*sr.current_budget,
        # so its phase lengths are based on a budget of sr.current_budget
        # (i.e. sr.current_budget/2 once the doubling below is done).
        sr.current_horizon = sr.current_budget + K * floor(sr.current_budget / (K * ceil(log2(K))));
        sr.active_arms = [a for a in 1:K];
        sr.phase = 1;

        # Define the new horizon after which we will change SH instance, i.e. double sr.current_budget
        sr.current_budget *= 2;

        # Snapshot counts and sums so the new instance forgets earlier samples
        sr.substract_Na = copy(N);
        sr.substract_Sa = copy(S);

        # Init by sampling all arms once; the recommendation of the finished instance is returned
        return [a for a in 1:K], sr.reco;
    elseif sum(N) >= sr.current_horizon
        # Compute mean for this phase of this SH instance
        hμ = (S .- sr.substract_Sa) ./ (N .- sr.substract_Na);

        # Remove exactly floor(n/2) arms, worst empirical mean first. One arm is
        # dropped per iteration, so ties at the minimum cannot remove more arms
        # than intended nor empty the active set.
        for _ in 1:(length(sr.active_arms) ÷ 2)
            deleteat!(sr.active_arms, argmin(hμ[sr.active_arms]));
        end

        # Update horizon; the running instance was sized with a budget of sr.current_budget/2
        sr.past_horizon = sr.current_horizon;
        sr.current_horizon += length(sr.active_arms) * floor(sr.current_budget / (2 * length(sr.active_arms) * ceil(log2(K))));

        # Snapshot counts and sums so the new phase forgets earlier samples
        sr.substract_Na = copy(N);
        sr.substract_Sa = copy(S);

        # Init by sampling all active arms once; the previous recommendation is kept
        return sr.active_arms, sr.reco;
    end

    # During the first episode no instance has finished yet, so recommend the
    # best active arm of the running phase (first one on ties).
    if sr.episode == 1
        hμ = (S .- sr.substract_Sa) ./ (N .- sr.substract_Na);
        sr.reco = sr.active_arms[argmax(hμ[sr.active_arms])];
    end

    # Round robin on the remaining arms (past_horizon may be a Float64, hence the Int64 conversion)
    idx_k = 1 + ((sum(N) - sr.past_horizon) % length(sr.active_arms));
    k = sr.active_arms[Int64(idx_k)];

    return [k], sr.reco;
end
