# -*- coding: utf-8 -*-
"""Assignment3.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1rDTGiQpC4NrlCzreiQktEdl32mJOmgmR
"""

import numpy as np
import matplotlib.pyplot as plt

from numba import jit
from math import log, sqrt

@jit
def bernoulli(mu_arm):
  # draw a single Bernoulli(mu_arm) reward
  return np.random.binomial(1, mu_arm)
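
# Minimal sanity check of the reward sampler (the mean 0.7 is an illustrative
# value, not part of the assignment): the empirical average of many draws
# should be close to the true mean.
print('empirical mean ~', np.mean([bernoulli(0.7) for _ in range(10000)]))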

"""# UCB"""

@jit
def ucb(horizon, n_arms, true_mus):
  n = horizon
  m = n_arms
  assert(len(true_mus) == m)

  deltas = np.amax(true_mus)-true_mus 

  mus = np.zeros(m)
  pulls = np.zeros(m)

  # init by taking each arm once
  for t in range(m):
    pulls[t] += 1
    mus[t] = bernoulli(true_mus[t])

  # ucb loop
  for t in range(m, n):

    # compute upper-confidence-bound
    ucbs = mus + np.sqrt( log(1+t*log(t)**2)/(2*pulls) )
    # take action
    i = np.argmax(ucbs)
    # update statistics
    pulls[i] += 1
    mus[i] = ( (pulls[i]-1)*mus[i] + bernoulli(true_mus[i]) ) / pulls[i]

  # return regret
  return np.sum(pulls*deltas)
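
# Single-run sanity check of UCB (the horizon and arm means below are
# illustrative values only): the regret of one run on an easy two-armed
# instance should stay far below the horizon.
print('UCB regret on [0.3, 0.7]:', ucb(1000, 2, np.array([0.3, 0.7])))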

"""# KL-UCB"""

@jit
def kl(x, y, eps=1e-15):
  # KL divergence between Bernoulli(x) and Bernoulli(y),
  # with clipping to (eps, 1-eps) for numerical stability
  x = min(max(x, eps), 1 - eps)
  y = min(max(y, eps), 1 - eps)
  return x * log(x / y) + (1 - x) * log((1 - x) / (1 - y))
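
# Quick illustrative check of the Bernoulli KL divergence (values chosen only
# as examples): kl(x, x) is 0 and the divergence grows as the two means move
# apart, e.g. kl(0.5, 0.9) ≈ 0.511.
print('kl(0.5, 0.5) =', kl(0.5, 0.5), ' kl(0.5, 0.9) =', kl(0.5, 0.9))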
 
@jit
def solve_inner_max(mu, pull, t, eps=1e-4, max_iter=50):
  # bisection for the largest q in [mu, 1] such that kl(mu, q) <= d,
  # where d = log(1 + t*log(t)^2) / pull
  l, r = mu, 1.0
  d = log(1+t*log(t)**2)/pull
  mid = (l+r)/2.0
  val = kl(mu, mid) - d
  n_iter = 0
  while np.abs(val) > eps and n_iter < max_iter:
    if val > 0: r = mid
    else: l = mid
    mid = (l+r)/2.0
    val = kl(mu, mid) - d
    n_iter += 1
  return mid
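
# Sanity check of the bisection (mu, pull and t below are illustrative): the
# returned bound q should satisfy kl(mu, q) close to log(1 + t*log(t)^2)/pull.
_mu, _pull, _t = 0.3, 10, 100
_q = solve_inner_max(_mu, _pull, _t)
print('q =', _q, ' kl(mu, q) =', kl(_mu, _q),
      ' target =', log(1 + _t*log(_t)**2)/_pull)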

@jit
def klucb(horizon, n_arms, true_mus):
  n = horizon
  m = n_arms
  assert(len(true_mus) == m)

  deltas = np.amax(true_mus)-true_mus 

  mus = np.zeros(m)
  pulls = np.zeros(m)

  # init by taking each arm once
  for t in range(m):
    pulls[t] += 1
    mus[t] = bernoulli(true_mus[t])

  # ucb loop
  for t in range(m, n):
    # compute kullback-leibler-upper-confidence-bound
    ucbs = np.zeros(m)
    for j in range(m):
      ucbs[j] = solve_inner_max(mus[j], pulls[j], t)
    # take action
    i = np.argmax(ucbs)
    # update statistics
    pulls[i] += 1
    mus[i] = ( (pulls[i]-1)*mus[i] + bernoulli(true_mus[i]) ) / pulls[i]
  
  # return regret
  return np.sum(pulls*deltas)
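
# Single-run comparison on one example instance (the means 0.5 and 0.6 are
# arbitrary illustrative values): on Bernoulli arms KL-UCB is expected to
# accumulate less regret than UCB on average, although a single run is noisy.
_example_mus = np.array([0.5, 0.6])
print('UCB regret   :', ucb(2000, 2, _example_mus))
print('KL-UCB regret:', klucb(2000, 2, _example_mus))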

def expected_regret(method, mu1, delta, horizon=10000, n_arms=2, n_iter=100):
  # average regret over n_iter independent runs on a two-armed Bernoulli
  # bandit with means (mu1, mu1 + delta)
  regret = 0
  for _ in range(n_iter):
    if method == 'ucb': regret += ucb(horizon, n_arms, np.array([mu1, mu1+delta]))
    else: regret += klucb(horizon, n_arms, np.array([mu1, mu1+delta]))
  return regret / n_iter
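
# Quick sanity check of expected_regret with a smaller horizon and fewer
# Monte-Carlo iterations than the full sweeps below (parameters illustrative).
print('ucb   :', expected_regret('ucb', 0.5, 0.2, horizon=1000, n_iter=10))
print('klucb :', expected_regret('klucb', 0.5, 0.2, horizon=1000, n_iter=10))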

x = np.arange(-0.5,0.5, 0.01)
y_ucb = np.zeros(len(x))
y_klucb = np.zeros(len(x))
for i in range(len(x)):
  if i%10 == 0:
    print('computing step ', i, ' ...')
  y_ucb[i] = expected_regret('ucb', 0.5, x[i])
  y_klucb[i] = expected_regret('klucb', 0.5, x[i])

plt.plot(x, y_ucb, label='ucb')
plt.plot(x, y_klucb, label='kl-ucb')
plt.xlabel(r'$\Delta = \mu_2 - \mu_1$')
plt.ylabel('expected regret')
plt.legend()
plt.title(r'KL-UCB vs UCB for $\mu_1 = 0.5$')
plt.show()

x = np.arange(-0.1,0.9, 0.01)
y_ucb = np.zeros(len(x))
y_klucb = np.zeros(len(x))
for i in range(len(x)):
  if i%10 == 0:
    print('computing step ', i, ' ...')
  y_ucb[i] = expected_regret('ucb', 0.1, x[i])
  y_klucb[i] = expected_regret('klucb', 0.1, x[i])

plt.plot(x, y_ucb, label='ucb')
plt.plot(x, y_klucb, label='kl-ucb')
plt.xlabel(r'$\Delta = \mu_2 - \mu_1$')
plt.ylabel('expected regret')
plt.legend()
plt.title(r'KL-UCB vs UCB for $\mu_1 = 0.1$')
plt.show()

x = np.arange(-0.9,0.1, 0.01)
y_ucb = np.zeros(len(x))
y_klucb = np.zeros(len(x))
for i in range(len(x)):
  if i%10 == 0:
    print('computing step ', i, ' ...')
  y_ucb[i] = expected_regret('ucb', 0.9, x[i])
  y_klucb[i] = expected_regret('klucb', 0.9, x[i])

plt.plot(x, y_ucb, label='ucb')
plt.plot(x, y_klucb, label='kl-ucb')
plt.xlabel(r'$\Delta = \mu_2 - \mu_1$')
plt.ylabel('expected regret')
plt.legend()
plt.title(r'KL-UCB vs UCB for $\mu_1 = 0.9$')
plt.show()