import numpy as np
import pyspiel

from open_spiel.python import rl_agent


class Agent(rl_agent.AbstractAgent):
    """Adaptive agent for Repeated Rock-Paper-Scissors (RRPS).

    Combines:
      - Variable-order Markov prediction (longest history matching)
      - Iterative best-response reasoning
      - Multi-armed bandit for online strategy selection
    """

    class MarkovModel:
        """Variable-order Markov model over the joint action history.

        Tracks opponent move frequencies conditioned on recent history.
        """

        def __init__(self, max_order: int = 6):
            self.max_order = max_order
            self.counts = {}   # history tuple -> [R, P, S] counts
            self.history = []  # joint action history encoded as 0..8

        def update(self, my_move: int, opponent_move: int) -> None:
            """Update statistics using the most recent joint action."""
            code = my_move * 3 + opponent_move
            self.history.append(code)
            n = len(self.history)
            if n < 2:
                return
            # Only contexts that actually precede the latest action exist,
            # hence the n - 1 bound.
            limit = min(self.max_order, n - 1)
            for length in range(1, limit + 1):
                key = tuple(self.history[n - 1 - length : n - 1])
                if key not in self.counts:
                    self.counts[key] = np.zeros(3)
                self.counts[key][opponent_move] += 1.0

        def predict_distribution(self) -> np.ndarray:
            """Predict the opponent's move distribution.

            Uses the longest matching history; falls back to a uniform
            distribution if no match is found.
            """
            n = len(self.history)
            if n == 0:
                # No history yet: return a uniform distribution.
                return np.ones(3) / 3.0
            limit = min(self.max_order, n)
            for length in range(limit, 0, -1):
                key = tuple(self.history[n - length : n])
                if key in self.counts:
                    stats = self.counts[key]
                    total = stats.sum()
                    if total > 0:
                        return stats / total
            # No pattern found: return a uniform distribution.
            return np.ones(3) / 3.0

    def __init__(self, num_actions: int, name: str = "agent"):
        assert num_actions == 3, "RRPS requires exactly 3 actions."
        self._num_actions = num_actions
        self._name = name

        # Internal history.
        self.my_moves = []
        self.opponent_moves = []

        # Markov model.
        self.markov = self.MarkovModel()

        # Iterative best-response depth.
        self.max_depth = 6
        # Strategy set: random + one strategy per depth level.
        self.num_strategies = 1 + self.max_depth

        # Bandit weights.
        self.weights = np.ones(self.num_strategies)
        # Last prediction made by each strategy.
        self.predictions = [0] * self.num_strategies
        # Bandit decay factor.
        self.decay = 0.8

    def restart(self):
        """Reset internal state between episodes."""
        self.my_moves.clear()
        self.opponent_moves.clear()
        self.markov = self.MarkovModel()
        self.weights = np.ones(self.num_strategies)
        self.predictions = [0] * self.num_strategies

    def _one_hot(self, action: int) -> np.ndarray:
        probs = np.zeros(self._num_actions)
        probs[action] = 1.0
        return probs

    def step(self, time_step, is_evaluation: bool = False):
        """Select an action for the current timestep."""
        if time_step.last():
            return

        _, state = pyspiel.deserialize_game_and_state(
            time_step.observations["serialized_state"]
        )
        history = state.history()

        # First move: uniform random.
        if not history:
            action = np.random.randint(0, 3)
            self.my_moves.append(action)
            return rl_agent.StepOutput(
                action=action, probs=self._one_hot(action)
            )

        # Observe the opponent's last move.
        opponent_move = history[-1]
        self.opponent_moves.append(opponent_move)

        # Update the Markov model with the last joint action.
        self.markov.update(self.my_moves[-1], opponent_move)

        # Update bandit weights based on the last outcome: reward strategies
        # whose prediction would have beaten the opponent's actual move,
        # penalize those that would have lost.
        winning_move = (opponent_move + 1) % 3
        losing_move = (opponent_move + 2) % 3
        for i in range(self.num_strategies):
            pred = self.predictions[i]
            # Apply decay.
            self.weights[i] *= self.decay
            if pred == winning_move:
                self.weights[i] += 1.0  # Reward
            elif pred == losing_move:
                self.weights[i] -= 1.0  # Penalty

        # Strategy 0: random (Nash fallback).
        self.predictions[0] = np.random.randint(0, 3)

        # Predict the opponent's move via the Markov model.
        dist = self.markov.predict_distribution()
        # Add small noise to break ties randomly (avoids bias).
        predicted_opponent = np.argmax(dist + np.random.uniform(0, 0.01, 3))

        # Iterative best-response strategies.
        # Level 1: play the move that beats the predicted opponent move.
        current_move = (predicted_opponent + 1) % 3
        # Fill levels 1 to max_depth.
        for d in range(self.max_depth):
            self.predictions[d + 1] = current_move
            # Next level: assume the opponent anticipates current_move and
            # counters it, so play the move that beats that counter.
            current_move = (current_move + 2) % 3

        # Select the strategy with the highest bandit weight.
        best_idx = np.argmax(self.weights)
        # If even the best weight is negative we are being exploited,
        # so fall back to the random strategy (strategy 0) to defend.
        action = (
            self.predictions[0]
            if self.weights[best_idx] < 0
            else self.predictions[best_idx]
        )

        self.my_moves.append(action)
        return rl_agent.StepOutput(
            action=action, probs=self._one_hot(action)
        )
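

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the agent above). A minimal
# example of running this agent in an OpenSpiel RL environment, assuming the
# "repeated_game" transform around "matrix_rps" and an environment created
# with include_full_state=True (needed because step() reads
# observations["serialized_state"]). The game string, the 100-round episode
# length, and the random stand-in opponent are assumptions for illustration;
# in the RRPS population setting the opponent would be a fixed bot. The agent
# is seated as player 0, consistent with step() reading history[-1] as the
# opponent's move.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from open_spiel.python import rl_environment

    game = pyspiel.load_game(
        "repeated_game(stage_game=matrix_rps(),num_repetitions=100)"
    )
    env = rl_environment.Environment(game, include_full_state=True)

    agent = Agent(num_actions=3)
    time_step = env.reset()
    while not time_step.last():
        my_action = agent.step(time_step).action
        opponent_action = int(np.random.randint(0, 3))  # Stand-in opponent.
        time_step = env.step([my_action, opponent_action])
    agent.step(time_step)  # Let the agent observe the terminal step.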