from collections import namedtuple

import numpy as np
import tensorflow as tf
from gym.spaces import Discrete

from deepq.playground.policies.base import BaseModelMixin, Policy, Config
from deepq.playground.policies.memory import ReplayMemory
from deepq.playground.utils.misc import plot_learning_curve
from deepq.playground.utils.tf_ops import dense_nn


class PPOPolicy(Policy, BaseModelMixin):
    """Proximal Policy Optimization with the clipped surrogate objective and
    generalized advantage estimation (GAE). Discrete action spaces only."""

    def __init__(self, env, name, training=True, gamma=0.99, lam=0.95,
                 actor_layers=[64, 32], critic_layers=[128, 64],
                 clip_norm=None, **kwargs):
        Policy.__init__(self, env, name, training=training, gamma=gamma, **kwargs)
        BaseModelMixin.__init__(self, name)

        assert isinstance(self.env.action_space, Discrete), \
            "Current PPOPolicy implementation only works for discrete action space."

        self.lam = lam  # lambda for GAE.
        self.actor_layers = actor_layers
        self.critic_layers = critic_layers
        self.clip_norm = clip_norm

    def act(self, state, **kwargs):
        # Sample an action from the current policy distribution.
        probas = self.sess.run(self.actor_proba, {self.s: [state]})[0]
        action = np.random.choice(range(self.act_size), size=1, p=probas)[0]
        return action

    def _build_networks(self):
        # Define input placeholders.
        self.s = tf.placeholder(tf.float32, shape=[None] + self.state_dim, name='state')
        self.a = tf.placeholder(tf.int32, shape=(None,), name='action')
        self.s_next = tf.placeholder(tf.float32, shape=[None] + self.state_dim, name='next_state')
        self.r = tf.placeholder(tf.float32, shape=(None,), name='reward')
        self.done = tf.placeholder(tf.float32, shape=(None,), name='done_flag')
        self.old_logp_a = tf.placeholder(tf.float32, shape=(None,), name='old_logp_actor')
        self.v_target = tf.placeholder(tf.float32, shape=(None,), name='v_target')
        self.adv = tf.placeholder(tf.float32, shape=(None,), name='advantage')

        with tf.variable_scope('actor'):
            # Actor: action probabilities.
            self.actor = dense_nn(self.s, self.actor_layers + [self.act_size], name='actor')
            self.actor_proba = tf.nn.softmax(self.actor)
            a_ohe = tf.one_hot(self.a, self.act_size, 1.0, 0.0, name='action_ohe')
            # The small epsilon keeps the log finite if a probability collapses to zero.
            self.logp_a = tf.reduce_sum(tf.log(self.actor_proba + 1e-10) * a_ohe,
                                        axis=-1, name='new_logp_actor')
            self.actor_vars = self.scope_vars('actor')

        with tf.variable_scope('critic'):
            # Critic: state value (V value).
            self.critic = tf.squeeze(dense_nn(self.s, self.critic_layers + [1], name='critic'))
            self.critic_next = tf.squeeze(dense_nn(self.s_next, self.critic_layers + [1],
                                                   name='critic', reuse=True))
            self.critic_vars = self.scope_vars('critic')

    def _build_train_ops(self):
        self.lr_a = tf.placeholder(tf.float32, shape=None, name='learning_rate_actor')
        self.lr_c = tf.placeholder(tf.float32, shape=None, name='learning_rate_critic')
        self.clip_range = tf.placeholder(tf.float32, shape=None, name='ratio_clip_range')

        with tf.variable_scope('actor_train'):
            # PPO clipped surrogate objective.
            ratio = tf.exp(self.logp_a - self.old_logp_a)
            ratio_clipped = tf.clip_by_value(ratio, 1.0 - self.clip_range, 1.0 + self.clip_range)
            loss_a = - tf.reduce_mean(tf.minimum(self.adv * ratio, self.adv * ratio_clipped))

            optim_a = tf.train.AdamOptimizer(self.lr_a)
            grads_a = optim_a.compute_gradients(loss_a, var_list=self.actor_vars)
            if self.clip_norm:
                grads_a = [(tf.clip_by_norm(g, self.clip_norm), v) for g, v in grads_a]
            self.train_op_a = optim_a.apply_gradients(grads_a)

        with tf.variable_scope('critic_train'):
            # Regression of V(s) onto the GAE-based value targets.
            loss_c = tf.reduce_mean(tf.square(self.v_target - self.critic))

            optim_c = tf.train.AdamOptimizer(self.lr_c)
            grads_c = optim_c.compute_gradients(loss_c, var_list=self.critic_vars)
            if self.clip_norm:
                grads_c = [(tf.clip_by_norm(g, self.clip_norm), v) for g, v in grads_c]
            self.train_op_c = optim_c.apply_gradients(grads_c)

        self.train_ops = [self.train_op_a, self.train_op_c]

        with tf.variable_scope('summary'):
            self.ep_reward = tf.placeholder(tf.float32, name='episode_reward')
            self.summary = [
                tf.summary.scalar('loss/adv', tf.reduce_mean(self.adv)),
                tf.summary.scalar('loss/ratio', tf.reduce_mean(ratio)),
                tf.summary.scalar('loss/loss_actor', loss_a),
                tf.summary.scalar('loss/loss_critic', loss_c),
                tf.summary.scalar('episode_reward', self.ep_reward)
            ]
            # self.summary += [tf.summary.scalar('grads/' + v.name, tf.norm(g))
            #                  for g, v in grads_a if g is not None]
            # self.summary += [tf.summary.scalar('grads/' + v.name, tf.norm(g))
            #                  for g, v in grads_c if g is not None]
            self.merged_summary = tf.summary.merge_all(key=tf.GraphKeys.SUMMARIES)

        self.sess.run(tf.global_variables_initializer())

    def build(self):
        self._build_networks()
        self._build_train_ops()
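    # A minimal NumPy restatement of the clipped surrogate objective built in
    # `_build_train_ops` above: L = -E[min(r * A, clip(r, 1 - eps, 1 + eps) * A)]
    # with r = exp(logp - old_logp). This is a reference sketch for readers
    # only; it is not called anywhere in this class.
    @staticmethod
    def _clip_loss_reference(logp, old_logp, adv, eps):
        adv = np.asarray(adv)
        ratio = np.exp(np.asarray(logp) - np.asarray(old_logp))
        ratio_clipped = np.clip(ratio, 1.0 - eps, 1.0 + eps)
        return -np.mean(np.minimum(ratio * adv, ratio_clipped * adv))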
    class TrainConfig(Config):
        lr_a = 0.005
        lr_c = 0.005
        batch_size = 32
        n_iterations = 100
        n_rollout_workers = 5
        train_epochs = 5
        log_every_iteration = 10
        ratio_clip_range = 0.2
        ratio_clip_decay = True

    def _generate_rollout(self, buffer):
        # Generate one complete trajectory and store its transitions in `buffer`.
        ob = self.env.reset()
        done = False
        rewards = []
        episode_reward = 0.0

        obs = []
        actions = []

        while not done:
            a = self.act(ob)
            ob_next, r, done, info = self.env.step(a)
            obs.append(ob)
            actions.append(a)
            rewards.append(r)
            episode_reward += r
            ob = ob_next

        # Length of the episode.
        T = len(rewards)

        # Compute the current log pi(a|s) and the predicted V values.
        with self.sess.as_default():
            logp_a = self.logp_a.eval({self.a: np.array(actions), self.s: np.array(obs)})
            v_pred = self.critic.eval({self.s: np.array(obs)})

        # Compute the TD errors; the terminal state is assigned zero value.
        td_errors = [rewards[t] + self.gamma * v_pred[t + 1] - v_pred[t]
                     for t in range(T - 1)]
        td_errors += [rewards[T - 1] + self.gamma * 0.0 - v_pred[T - 1]]
        assert len(logp_a) == len(v_pred) == len(td_errors) == T

        # Estimate the advantages backwards.
        advs = []
        adv_so_far = 0.0
        for delta in td_errors[::-1]:
            adv_so_far = delta + self.gamma * self.lam * adv_so_far
            advs.append(adv_so_far)
        advs = advs[::-1]
        assert len(advs) == T

        # Add the transitions into the memory buffer.
        v_targets = np.array(advs) + np.array(v_pred)
        for i, (s, a, s_next, r, old_logp_a, v_target, adv) in enumerate(zip(
                obs, actions, np.array(obs[1:] + [ob_next]), rewards,
                np.squeeze(logp_a), v_targets, advs)):
            done = float(i == T - 1)
            buffer.add(buffer.tuple_class(s, a, s_next, r, done, old_logp_a, v_target, adv))

        return episode_reward, len(advs)
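    # Reference-only NumPy restatement of the GAE recursion in
    # `_generate_rollout` above (a sketch for readers; not called anywhere):
    #     delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)   (V = 0 past the terminal state)
    #     A_t     = delta_t + gamma * lam * A_{t+1}     (A = 0 past the terminal state)
    #     v_target_t = A_t + V(s_t)
    @staticmethod
    def _gae_reference(rewards, v_pred, gamma, lam):
        rewards = np.asarray(rewards, dtype=np.float64)
        v_pred = np.asarray(v_pred, dtype=np.float64)
        v_next = np.append(v_pred[1:], 0.0)  # V(s_{t+1}); zero after the terminal step.
        deltas = rewards + gamma * v_next - v_pred
        advs = np.zeros_like(deltas)
        adv_so_far = 0.0
        for t in reversed(range(len(deltas))):
            adv_so_far = deltas[t] + gamma * lam * adv_so_far
            advs[t] = adv_so_far
        return advs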
    def train(self, config: TrainConfig):
        BufferRecord = namedtuple('BufferRecord', [
            's', 'a', 's_next', 'r', 'done', 'old_logp_actor', 'v_target', 'adv'])
        buffer = ReplayMemory(tuple_class=BufferRecord)

        reward_history = []
        reward_averaged = []
        step = 0
        total_rec = 0

        clip = config.ratio_clip_range
        if config.ratio_clip_decay:
            clip_delta = clip / config.n_iterations
        else:
            clip_delta = 0.0

        for n_iteration in range(config.n_iterations):
            # Ideally multiple rollout workers would run in parallel; here the
            # rollouts are generated sequentially.
            for _ in range(config.n_rollout_workers):
                episode_reward, n_rec = self._generate_rollout(buffer)
                # One trajectory is complete.
                reward_history.append(episode_reward)
                reward_averaged.append(np.mean(reward_history[-10:]))
                total_rec += n_rec

            # Now train the model for a few epochs on the collected transitions.
            for batch in buffer.loop(config.batch_size, epoch=config.train_epochs):
                _, summ_str = self.sess.run(
                    [self.train_ops, self.merged_summary], feed_dict={
                        self.lr_a: config.lr_a,
                        self.lr_c: config.lr_c,
                        self.clip_range: clip,
                        self.s: batch['s'],
                        self.a: batch['a'],
                        self.s_next: batch['s_next'],
                        self.r: batch['r'],
                        self.done: batch['done'],
                        self.old_logp_a: batch['old_logp_actor'],
                        self.v_target: batch['v_target'],
                        self.adv: batch['adv'],
                        self.ep_reward: np.mean(reward_history[-10:]) if reward_history else 0.0,
                    })
                self.writer.add_summary(summ_str, step)
                step += 1

            # Decay the ratio clip range over the course of training.
            clip = max(0.0, clip - clip_delta)

            if (reward_history and config.log_every_iteration and
                    n_iteration % config.log_every_iteration == 0):
                # Report the performance every `log_every_iteration` iterations.
                print("[iteration:{}/step:{}], best:{}, avg:{:.2f}, hist:{}, clip:{:.2f}; {} transitions.".format(
                    n_iteration, step, np.max(reward_history),
                    np.mean(reward_history[-10:]),
                    list(map(lambda x: round(x, 2), reward_history[-5:])),
                    clip, total_rec,
                ))
                # self.save_checkpoint(step=step)

        self.save_checkpoint(step=step)

        print("[FINAL] episodes: {}, Max reward: {}, Average reward: {}".format(
            len(reward_history), np.max(reward_history), np.mean(reward_history)))

        data_dict = {
            'reward': reward_history,
            'reward_smooth10': reward_averaged,
        }
        plot_learning_curve(self.model_name, data_dict, xlabel='episode')
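
# ---------------------------------------------------------------------------
# Hypothetical usage sketch, for illustration only. It assumes a discrete-
# action gym environment and that `Policy`/`BaseModelMixin` provide the
# session, summary writer, and checkpoint plumbing used above; the env name
# and policy name below are placeholders, not part of this module.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    import gym

    env = gym.make('CartPole-v1')
    policy = PPOPolicy(env, name='ppo-cartpole', training=True)
    policy.build()
    policy.train(policy.TrainConfig())  # Train with the default hyperparameters.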