import numpy as np
import pyspiel

from open_spiel.python import rl_agent


class Agent(rl_agent.AbstractAgent):
    """Adaptive agent for Repeated Rock-Paper-Scissors (RRPS).

    Combines:
      - Variable-order Markov prediction (longest history matching)
      - Iterative best-response reasoning
      - Multi-armed bandit for online strategy selection
    """

    class MarkovModel:
        """Variable-order Markov model over the joint action history.

        Tracks opponent move frequencies conditioned on recent history.
        """

        def __init__(self, max_order: int = 6):
            self.max_order = max_order
            self.counts = {}   # history tuple -> [R, P, S] counts
            self.history = []  # joint action history encoded as 0..8

        def update(self, my_move: int, opponent_move: int) -> None:
            """Update statistics using the most recent joint action."""
            code = my_move * 3 + opponent_move
            self.history.append(code)
            n = len(self.history)
            if n < 2:
                return
            # Only contexts that actually precede the latest action exist,
            # hence the n - 1 bound.
            limit = min(self.max_order, n - 1)
            for length in range(1, limit + 1):
                key = tuple(self.history[n - 1 - length : n - 1])
                if key not in self.counts:
                    self.counts[key] = np.zeros(3)
                self.counts[key][opponent_move] += 1.0

        def predict_distribution(self) -> np.ndarray:
            """Predict the opponent's move distribution.

            Uses the longest matching history; falls back to a uniform
            distribution if no match is found.
            """
            n = len(self.history)
            if n == 0:
                # No history yet: return a uniform distribution.
                return np.ones(3) / 3.0
            limit = min(self.max_order, n)
            for length in range(limit, 0, -1):
                key = tuple(self.history[n - length : n])
                if key in self.counts:
                    stats = self.counts[key]
                    total = stats.sum()
                    if total > 0:
                        return stats / total
            # No pattern found: return a uniform distribution.
            return np.ones(3) / 3.0

    def __init__(self, num_actions: int, name: str = "agent"):
        assert num_actions == 3, "RRPS requires exactly 3 actions."
        self._num_actions = num_actions
        self._name = name

        # Internal history.
        self.my_moves = []
        self.opponent_moves = []

        # Markov model.
        self.markov = self.MarkovModel()

        # Iterative best-response depth.
        self.max_depth = 6
        # Strategy set: random + one strategy per depth level.
        self.num_strategies = 1 + self.max_depth

        # Bandit weights.
        self.weights = np.ones(self.num_strategies)
        # Last prediction made by each strategy.
        self.predictions = [0] * self.num_strategies
        # Bandit decay factor.
        self.decay = 0.8

    def restart(self):
        """Reset internal state between episodes."""
        self.my_moves.clear()
        self.opponent_moves.clear()
        self.markov = self.MarkovModel()
        self.weights = np.ones(self.num_strategies)
        self.predictions = [0] * self.num_strategies

    def _one_hot(self, action: int) -> np.ndarray:
        probs = np.zeros(self._num_actions)
        probs[action] = 1.0
        return probs

    def step(self, time_step, is_evaluation: bool = False):
        """Select an action for the current timestep."""
        if time_step.last():
            return

        _, state = pyspiel.deserialize_game_and_state(
            time_step.observations["serialized_state"]
        )
        history = state.history()

        # First move: uniform random.
        if not history:
            action = np.random.randint(0, 3)
            self.my_moves.append(action)
            return rl_agent.StepOutput(
                action=action, probs=self._one_hot(action)
            )

        # Observe the opponent's last move.
        opponent_move = history[-1]
        self.opponent_moves.append(opponent_move)

        # Update the Markov model with the last joint action.
        self.markov.update(self.my_moves[-1], opponent_move)

        # Update bandit weights based on the last outcome: reward strategies
        # whose prediction would have beaten the opponent's actual move,
        # penalize those that would have lost.
        winning_move = (opponent_move + 1) % 3
        losing_move = (opponent_move + 2) % 3
        for i in range(self.num_strategies):
            pred = self.predictions[i]
            # Apply decay.
            self.weights[i] *= self.decay
            if pred == winning_move:
                self.weights[i] += 1.0  # Reward
            elif pred == losing_move:
                self.weights[i] -= 1.0  # Penalty

        # Strategy 0: random (Nash fallback).
        self.predictions[0] = np.random.randint(0, 3)

        # Predict the opponent's move via the Markov model.
        dist = self.markov.predict_distribution()
        # Add small noise to break ties randomly (avoids bias).
        predicted_opponent = np.argmax(dist + np.random.uniform(0, 0.01, 3))

        # Iterative best-response strategies.
        # Level 1: play the move that beats the predicted opponent move.
        current_move = (predicted_opponent + 1) % 3
        # Fill levels 1 to max_depth.
        for d in range(self.max_depth):
            self.predictions[d + 1] = current_move
            # Next level: assume the opponent anticipates current_move and
            # counters it, so play the move that beats that counter.
            current_move = (current_move + 2) % 3

        # Select the strategy with the highest bandit weight.
        best_idx = np.argmax(self.weights)
        # If even the best weight is negative we are being exploited,
        # so fall back to the random strategy (strategy 0) to defend.
        action = (
            self.predictions[0]
            if self.weights[best_idx] < 0
            else self.predictions[best_idx]
        )

        self.my_moves.append(action)
        return rl_agent.StepOutput(
            action=action, probs=self._one_hot(action)
        )
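

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the agent above). A minimal
# example of running this agent in an OpenSpiel RL environment, assuming the
# "repeated_game" transform around "matrix_rps" and an environment created
# with include_full_state=True (needed because step() reads
# observations["serialized_state"]). The game string, the 100-round episode
# length, and the random stand-in opponent are assumptions for illustration;
# in the RRPS population setting the opponent would be a fixed bot. The agent
# is seated as player 0, consistent with step() reading history[-1] as the
# opponent's move.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from open_spiel.python import rl_environment

    game = pyspiel.load_game(
        "repeated_game(stage_game=matrix_rps(),num_repetitions=100)"
    )
    env = rl_environment.Environment(game, include_full_state=True)

    agent = Agent(num_actions=3)
    time_step = env.reset()
    while not time_step.last():
        my_action = agent.step(time_step).action
        opponent_action = int(np.random.randint(0, 3))  # Stand-in opponent.
        time_step = env.step([my_action, opponent_action])
    agent.step(time_step)  # Let the agent observe the terminal step.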