import numpy as np
import pyspiel
from open_spiel.python import rl_agent


class Agent(rl_agent.AbstractAgent):
"""
Adaptive agent for Repeated Rock-Paper-Scissors (RRPS).
Combines:
- Variable-order Markov prediction (longest history matching)
- Iterative best-response reasoning
- Multi-armed bandit for online strategy selection
"""
class MarkovModel:
"""
Variable-order Markov model over joint action history.
Tracks opponent move frequencies conditioned on recent history.
"""
def __init__(self, max_order: int = 6):
self.max_order = max_order
self.counts = {} # history tuple -> [R, P, S] counts
self.history = [] # joint action history encoded as 0..8

        def update(self, my_move: int, opponent_move: int) -> None:
            """Update statistics using the most recent joint action."""
            code = my_move * 3 + opponent_move
            self.history.append(code)
            n = len(self.history)
            if n < 2:
                return
            # A context can contain at most the n - 1 joint actions preceding
            # the newest one, so cap the order at n - 1 (not n).
            limit = min(self.max_order, n - 1)
            for length in range(1, limit + 1):
                key = tuple(self.history[n - 1 - length : n - 1])
                if key not in self.counts:
                    self.counts[key] = np.zeros(3)
                self.counts[key][opponent_move] += 1.0

        def predict_distribution(self) -> np.ndarray:
            """Predict the opponent's move distribution using the longest matching history.

            Falls back to a uniform distribution if no match is found.
            """
            n = len(self.history)
            if n == 0:
                # No history yet: return a uniform distribution.
                return np.ones(3) / 3.0
            limit = min(self.max_order, n)
            for length in range(limit, 0, -1):
                key = tuple(self.history[n - length : n])
                if key in self.counts:
                    stats = self.counts[key]
                    total = stats.sum()
                    if total > 0:
                        return stats / total
            # No matching pattern: return a uniform distribution.
            return np.ones(3) / 3.0
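
    # Worked sketch of the model's bookkeeping (illustrative, assuming the
    # usual encoding 0=Rock, 1=Paper, 2=Scissors for both players):
    #   update(0, 1) -> history [1];       too short, nothing recorded
    #   update(1, 2) -> history [1, 5];    context (1,) followed by Scissors
    #   update(0, 1) -> history [1, 5, 1]; contexts (5,) and (1, 5) followed by Paper
    # predict_distribution() then tries the suffixes (1, 5, 1) and (5, 1),
    # finds no match, matches (1,), and returns [0, 0, 1], i.e. predicts Scissors.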

    def __init__(self, num_actions: int, name: str = "agent"):
        assert num_actions == 3, "RRPS requires exactly 3 actions."
        self._num_actions = num_actions
        self._name = name
        # Internal history.
        self.my_moves = []
        self.opponent_moves = []
        # Markov model over joint actions.
        self.markov = self.MarkovModel()
        # Iterative best-response depth.
        self.max_depth = 6
        # Strategy set: random + one strategy per depth level.
        self.num_strategies = 1 + self.max_depth
        # Bandit weights, one per strategy.
        self.weights = np.ones(self.num_strategies)
        # Last move proposed by each strategy.
        self.predictions = [0] * self.num_strategies
        # Bandit decay factor.
        self.decay = 0.8

    def restart(self):
        """Reset internal state between episodes."""
        self.my_moves.clear()
        self.opponent_moves.clear()
        self.markov = self.MarkovModel()
        self.weights = np.ones(self.num_strategies)
        self.predictions = [0] * self.num_strategies

    def _one_hot(self, action: int) -> np.ndarray:
        probs = np.zeros(self._num_actions)
        probs[action] = 1.0
        return probs

    def step(self, time_step, is_evaluation: bool = False):
        """Select an action for the current timestep."""
        if time_step.last():
            return
        _, state = pyspiel.deserialize_game_and_state(
            time_step.observations["serialized_state"]
        )
        history = state.history()
        # First move: uniform random.
        if not history:
            action = np.random.randint(0, 3)
            self.my_moves.append(action)
            return rl_agent.StepOutput(
                action=action,
                probs=self._one_hot(action),
            )
        # Observe the opponent's last move. This reads the last entry of the
        # full joint history, which assumes this agent is player 0 (so the
        # opponent's action is appended after ours each round).
        opponent_move = history[-1]
        self.opponent_moves.append(opponent_move)
        # Update the Markov model with the last joint action.
        self.markov.update(self.my_moves[-1], opponent_move)
        # Update bandit weights based on the last outcome.
        winning_move = (opponent_move + 1) % 3  # would have beaten the opponent
        losing_move = (opponent_move + 2) % 3   # would have lost to the opponent
        for i in range(self.num_strategies):
            pred = self.predictions[i]
            # Decay old evidence before scoring the latest round.
            self.weights[i] *= self.decay
            if pred == winning_move:
                self.weights[i] += 1.0  # reward
            elif pred == losing_move:
                self.weights[i] -= 1.0  # penalty
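        # With decay 0.8 this is an exponential recency-weighted score: a
        # strategy that keeps proposing the winning move tends toward +5
        # (the fixed point of w -> 0.8 * w + 1), one that keeps losing
        # tends toward -5, and stale evidence fades geometrically.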
        # Strategy 0: random (Nash fallback).
        self.predictions[0] = np.random.randint(0, 3)
        # Predict the opponent's next move via the Markov model.
        dist = self.markov.predict_distribution()
        # Add small noise to break ties randomly (avoids a fixed bias).
        predicted_opponent = np.argmax(dist + np.random.uniform(0, 0.01, 3))
        # Iterative best-response strategies.
        # Level 0: play the move that beats the predicted opponent move.
        current_move = (predicted_opponent + 1) % 3
        # Fill strategies 1..max_depth with successive best-response levels.
        for d in range(self.max_depth):
            self.predictions[d + 1] = current_move
            # Next level: best-respond to the opponent's best response
            # against the current level's move.
            current_move = (current_move + 2) % 3
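        # Example (illustrative): if the model predicts Rock, level 0 plays
        # Paper; level 1 assumes the opponent anticipates Paper and plays
        # Scissors, so it plays Rock; level 2 plays Scissors; deeper levels
        # repeat this cycle with period 3.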
        # Pick the strategy with the highest bandit weight.
        best_idx = np.argmax(self.weights)
        # If even the best weight is negative, every predictor is currently
        # being exploited, so fall back to the random strategy to defend.
        action = (
            self.predictions[0]
            if self.weights[best_idx] < 0
            else self.predictions[best_idx]
        )
        self.my_moves.append(action)
        return rl_agent.StepOutput(
            action=action,
            probs=self._one_hot(action),
        )
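

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative, not part of the agent): runs the agent
# against a uniform-random opponent through OpenSpiel's RL environment. It
# assumes a build where the repeated-game transform below is available and
# relies on include_full_state=True to expose "serialized_state", which
# Agent.step() deserializes. The game string and episode length are
# assumptions chosen for illustration only.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from open_spiel.python import rl_environment

    game = pyspiel.load_game(
        "repeated_game(stage_game=matrix_rps(),num_repetitions=1000)"
    )
    env = rl_environment.Environment(game, include_full_state=True)

    my_agent = Agent(num_actions=3)  # this agent plays as player 0
    total_reward = 0.0

    time_step = env.reset()
    while not time_step.last():
        agent_output = my_agent.step(time_step)
        opponent_action = np.random.randint(0, 3)  # uniform-random opponent
        time_step = env.step([agent_output.action, opponent_action])
        total_reward += time_step.rewards[0]
    print("Total reward vs. uniform-random opponent:", total_reward)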