do a tutorial - RL Bandits

Ulf Hamster 3 min.
python reinforcement learning multi-arm bandits

Load Packages

import numpy as np
from typing import List
import matplotlib.pyplot as plt

np.random.seed(42)
%matplotlib inline

Model the Agent: Bandit class

class Bandit(object):
    """One slot-machine arm with Gaussian payouts N(mu, sigma^2).

    Keeps a running sample mean of the rewards observed from this arm,
    which the epsilon-greedy loop uses as its value estimate.
    """

    def __init__(self, mu: float, sigma: float = 1.0):
        # true (hidden) parameters of the reward distribution
        self.mu = mu
        self.sigma = sigma
        # statistics accumulated while playing this arm
        self.mean = 0  # running sample mean of observed rewards
        self.N = 0     # number of observations folded in via update()

    def pull(self):
        """Draw one reward from N(mu, sigma^2)."""
        return self.sigma * np.random.randn() + self.mu

    def update(self, x: float):
        """Fold a new observation *x* into the running sample mean."""
        self.N += 1
        # incremental mean: mean_N = mean_{N-1} + (x - mean_{N-1}) / N
        self.mean += (x - self.mean) / self.N

Simulation Loop

def run_experiment(mu: List[float], 
                   eps: float, 
                   n_steps: int = 100):
    """Run an epsilon-greedy simulation over one bandit arm per entry of *mu*.

    Args:
        mu: true mean payout of each bandit arm (one Bandit per entry;
            any number of arms is supported).
        eps: exploration probability — with probability `eps` a random
            arm is pulled, otherwise the arm with the best estimated mean.
        n_steps: number of plays.

    Returns:
        (bandits, history): the fitted Bandit objects and a list of
        `[t, arm_index, payout]` rows, one per time step.
    """
    # store game play history
    history = []
    # instantiate one bandit per true mean
    bandits = [Bandit(m) for m in mu]

    for t in range(n_steps):
        # epsilon greedy action selection
        p = np.random.random()
        # BUGFIX: the comparison was inverted (`eps < p`), which explored
        # with probability 1-eps instead of eps.
        if p < eps:
            # explore: pick an arm uniformly at random
            # BUGFIX: was hard-coded to 3 arms; use len(bandits)
            i = np.random.choice(len(bandits))
        else:
            # exploit: pick the arm with the best estimated mean
            i = np.argmax([b.mean for b in bandits])
        # play with the i-th bandit
        x = bandits[i].pull()
        # update the running mean of outcomes for the i-th bandit
        bandits[i].update(x)
        # store results
        history.append([t, i, x])
    
    # done
    return bandits, history

Example

mu = [1., 2., 3.]
eps = 0.1
n_steps = 10000
bandits, history = run_experiment(mu, eps, n_steps)
print([b.mean for b in bandits])
[1.0351217341529002, 1.977450250452284, 3.0137372064974697]

Post-process the `history` list of lists: the first column contains the time step, the second is the bandit index, and the last column contains the game play results.

data = np.array(history)
t = data[:, 0].astype(np.uint64)
b = data[:, 1].astype(np.uint8)
x = data[:, 2].astype(np.float32)

Plot the results

Plot the running mean of all game outcomes up to each time step t.

cummean = np.cumsum(x) / (1 + np.arange(n_steps))
plt.plot(t, cummean)
plt.xscale('log')
for m in mu:
    plt.plot(np.ones(n_steps) * m);

png

Plot how often each bandit was selected

for i, m in enumerate(mu):
    pctbandit = np.cumsum(b == i) / (1 + np.arange(n_steps))
    plt.plot(t, pctbandit, label=f"{i}-th bandit, mu={m}");
plt.xscale('log');
plt.legend();

png

Plot as moving average

window = 1000

cm1 = np.cumsum(x[:window]) / (1 + np.arange(window))
cm2 = np.convolve(x, np.ones(window), 'valid') / window

plt.plot(np.hstack([cm1, cm2]))
plt.xscale('log')
for m in mu:
    plt.plot(np.ones(n_steps) * m);

png

window = 1000
for i, m in enumerate(mu):
    bi = (b == i).astype(float)
    pct1 = np.cumsum(bi[:window]) / (1 + np.arange(window))
    pct2 = np.convolve(bi, np.ones(window), 'valid') / window
    plt.plot(np.hstack([pct1, pct2]), label=f"{i}-th bandit, mu={m}");
plt.xscale('log');
plt.legend();

png

Wrap into plot functions

def post_process(history: list):
    """Split the `[t, arm, payout]` history rows into three typed arrays.

    Returns (time steps as uint64, arm indices as uint8, payouts as float32).
    """
    arr = np.array(history)
    steps = arr[:, 0].astype(np.uint64)
    arms = arr[:, 1].astype(np.uint8)
    payouts = arr[:, 2].astype(np.float32)
    return steps, arms, payouts
def plot_cummean(x: np.array, n_steps=None, mu=None):
    """Plot the running mean of payouts on a log-scaled x-axis.

    Args:
        x: payout observed at each time step.
        n_steps: number of steps to plot (defaults to len(x)).
        mu: true arm means, drawn as horizontal reference lines.
            BUGFIX: the original read the module-level global `mu`
            implicitly; it is now an explicit parameter. For backward
            compatibility, `None` still falls back to the global.
    """
    if n_steps is None:
        n_steps = len(x)
    if mu is None:
        # fall back to the module-level `mu` used by existing callers
        mu = globals().get('mu', [])
    cummean = np.cumsum(x) / (1 + np.arange(n_steps))
    plt.plot(cummean)
    plt.xscale('log')
    for m in mu:
        plt.plot(np.ones(n_steps) * m)
    plt.show()
def plot_pctbandits(mu: list, b: np.array, window: int = 1000):
    """Plot, per arm, the fraction of pulls that selected it over time.

    The first `window` steps use a cumulative fraction; later steps use
    a moving average of width `window`. The x-axis is log-scaled.
    """
    denom = 1 + np.arange(window)
    kernel = np.ones(window)
    for idx, true_mean in enumerate(mu):
        chosen = (b == idx).astype(float)
        head = np.cumsum(chosen[:window]) / denom
        tail = np.convolve(chosen, kernel, 'valid') / window
        plt.plot(np.hstack([head, tail]), label=f"{idx}-th bandit, mu={true_mean}")
    plt.xscale('log')
    plt.legend()
    plt.show()

More examples

eps=0.15

mu = [1., 2., 3.]
eps = 0.15
n_steps = 10000
bandits, history = run_experiment(mu, eps, n_steps)

print([b.mean for b in bandits])
t, b, x = post_process(history)
plot_cummean(x, n_steps);
plot_pctbandits(mu, b, int(n_steps)//10);
[0.9799013166914247, 1.9573426807689844, 3.0086857117007133]

png

png

eps=.075

mu = [1., 2., 3.]
eps = 0.075
n_steps = 10000
bandits, history = run_experiment(mu, eps, n_steps)

print([b.mean for b in bandits])
t, b, x = post_process(history)
plot_cummean(x, n_steps);
plot_pctbandits(mu, b, int(n_steps)//10);
[0.9725830628807496, 2.01569925430601, 2.967674083064589]

png

png

eps=0.025

mu = [1., 2., 3.]
eps = 0.025
n_steps = 10000
bandits, history = run_experiment(mu, eps, n_steps)

print([b.mean for b in bandits])
t, b, x = post_process(history)
plot_cummean(x, n_steps);
plot_pctbandits(mu, b, int(n_steps)//10);
[1.0082301063978205, 1.9718555365844332, 3.0385625833229595]

png

png

Links