from collections import namedtuple

import numpy as np
import tensorflow as tf
from gym.spaces import Discrete

from deepq.playground.policies.base import BaseModelMixin, Policy, Config
from deepq.playground.policies.memory import ReplayMemory
from deepq.playground.utils.misc import plot_learning_curve
from deepq.playground.utils.tf_ops import dense_nn


class PPOPolicy(Policy, BaseModelMixin):
    """Proximal Policy Optimization with the clipped surrogate objective and
    generalized advantage estimation (GAE). Discrete action spaces only."""

    def __init__(self, env, name, training=True, gamma=0.99, lam=0.95,
                 actor_layers=[64, 32], critic_layers=[128, 64],
                 clip_norm=None, **kwargs):
        Policy.__init__(self, env, name, training=training, gamma=gamma, **kwargs)
        BaseModelMixin.__init__(self, name)

        assert isinstance(self.env.action_space, Discrete), \
            "Current PPOPolicy implementation only works for discrete action space."

        self.lam = lam  # lambda for GAE.
        self.actor_layers = actor_layers
        self.critic_layers = critic_layers
        self.clip_norm = clip_norm

    def act(self, state, **kwargs):
        # Sample an action from the current policy distribution.
        probas = self.sess.run(self.actor_proba, {self.s: [state]})[0]
        action = np.random.choice(range(self.act_size), size=1, p=probas)[0]
        return action

    def _build_networks(self):
        # Define input placeholders.
        self.s = tf.placeholder(tf.float32, shape=[None] + self.state_dim, name='state')
        self.a = tf.placeholder(tf.int32, shape=(None,), name='action')
        self.s_next = tf.placeholder(tf.float32, shape=[None] + self.state_dim, name='next_state')
        self.r = tf.placeholder(tf.float32, shape=(None,), name='reward')
        self.done = tf.placeholder(tf.float32, shape=(None,), name='done_flag')
        self.old_logp_a = tf.placeholder(tf.float32, shape=(None,), name='old_logp_actor')
        self.v_target = tf.placeholder(tf.float32, shape=(None,), name='v_target')
        self.adv = tf.placeholder(tf.float32, shape=(None,), name='advantage')

        with tf.variable_scope('actor'):
            # Actor: action probabilities.
            self.actor = dense_nn(self.s, self.actor_layers + [self.act_size], name='actor')
            self.actor_proba = tf.nn.softmax(self.actor)
            a_ohe = tf.one_hot(self.a, self.act_size, 1.0, 0.0, name='action_ohe')
            # The small epsilon keeps the log finite if a probability collapses to zero.
            self.logp_a = tf.reduce_sum(tf.log(self.actor_proba + 1e-10) * a_ohe,
                                        axis=-1, name='new_logp_actor')
            self.actor_vars = self.scope_vars('actor')

        with tf.variable_scope('critic'):
            # Critic: state value (V value).
            self.critic = tf.squeeze(dense_nn(self.s, self.critic_layers + [1], name='critic'))
            self.critic_next = tf.squeeze(dense_nn(self.s_next, self.critic_layers + [1],
                                                   name='critic', reuse=True))
            self.critic_vars = self.scope_vars('critic')

    def _build_train_ops(self):
        self.lr_a = tf.placeholder(tf.float32, shape=None, name='learning_rate_actor')
        self.lr_c = tf.placeholder(tf.float32, shape=None, name='learning_rate_critic')
        self.clip_range = tf.placeholder(tf.float32, shape=None, name='ratio_clip_range')

        with tf.variable_scope('actor_train'):
            # PPO clipped surrogate objective.
            ratio = tf.exp(self.logp_a - self.old_logp_a)
            ratio_clipped = tf.clip_by_value(ratio, 1.0 - self.clip_range, 1.0 + self.clip_range)
            loss_a = - tf.reduce_mean(tf.minimum(self.adv * ratio, self.adv * ratio_clipped))

            optim_a = tf.train.AdamOptimizer(self.lr_a)
            grads_a = optim_a.compute_gradients(loss_a, var_list=self.actor_vars)
            if self.clip_norm:
                grads_a = [(tf.clip_by_norm(g, self.clip_norm), v) for g, v in grads_a]
            self.train_op_a = optim_a.apply_gradients(grads_a)

        with tf.variable_scope('critic_train'):
            # Regression of V(s) onto the GAE-based value targets.
            loss_c = tf.reduce_mean(tf.square(self.v_target - self.critic))

            optim_c = tf.train.AdamOptimizer(self.lr_c)
            grads_c = optim_c.compute_gradients(loss_c, var_list=self.critic_vars)
            if self.clip_norm:
                grads_c = [(tf.clip_by_norm(g, self.clip_norm), v) for g, v in grads_c]
            self.train_op_c = optim_c.apply_gradients(grads_c)

        self.train_ops = [self.train_op_a, self.train_op_c]

        with tf.variable_scope('summary'):
            self.ep_reward = tf.placeholder(tf.float32, name='episode_reward')
            self.summary = [
                tf.summary.scalar('loss/adv', tf.reduce_mean(self.adv)),
                tf.summary.scalar('loss/ratio', tf.reduce_mean(ratio)),
                tf.summary.scalar('loss/loss_actor', loss_a),
                tf.summary.scalar('loss/loss_critic', loss_c),
                tf.summary.scalar('episode_reward', self.ep_reward)
            ]
            # self.summary += [tf.summary.scalar('grads/' + v.name, tf.norm(g))
            #                  for g, v in grads_a if g is not None]
            # self.summary += [tf.summary.scalar('grads/' + v.name, tf.norm(g))
            #                  for g, v in grads_c if g is not None]
            self.merged_summary = tf.summary.merge_all(key=tf.GraphKeys.SUMMARIES)

        self.sess.run(tf.global_variables_initializer())

    def build(self):
        self._build_networks()
        self._build_train_ops()
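    # A minimal NumPy restatement of the clipped surrogate objective built in
    # `_build_train_ops` above: L = -E[min(r * A, clip(r, 1 - eps, 1 + eps) * A)]
    # with r = exp(logp - old_logp). This is a reference sketch for readers
    # only; it is not called anywhere in this class.
    @staticmethod
    def _clip_loss_reference(logp, old_logp, adv, eps):
        adv = np.asarray(adv)
        ratio = np.exp(np.asarray(logp) - np.asarray(old_logp))
        ratio_clipped = np.clip(ratio, 1.0 - eps, 1.0 + eps)
        return -np.mean(np.minimum(ratio * adv, ratio_clipped * adv))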
    class TrainConfig(Config):
        lr_a = 0.005
        lr_c = 0.005
        batch_size = 32
        n_iterations = 100
        n_rollout_workers = 5
        train_epochs = 5
        log_every_iteration = 10
        ratio_clip_range = 0.2
        ratio_clip_decay = True

    def _generate_rollout(self, buffer):
        # Generate one complete trajectory and store its transitions in `buffer`.
        ob = self.env.reset()
        done = False
        rewards = []
        episode_reward = 0.0

        obs = []
        actions = []

        while not done:
            a = self.act(ob)
            ob_next, r, done, info = self.env.step(a)
            obs.append(ob)
            actions.append(a)
            rewards.append(r)
            episode_reward += r
            ob = ob_next

        # Length of the episode.
        T = len(rewards)

        # Compute the current log pi(a|s) and the predicted V values.
        with self.sess.as_default():
            logp_a = self.logp_a.eval({self.a: np.array(actions), self.s: np.array(obs)})
            v_pred = self.critic.eval({self.s: np.array(obs)})

        # Compute the TD errors; the terminal state is assigned zero value.
        td_errors = [rewards[t] + self.gamma * v_pred[t + 1] - v_pred[t]
                     for t in range(T - 1)]
        td_errors += [rewards[T - 1] + self.gamma * 0.0 - v_pred[T - 1]]
        assert len(logp_a) == len(v_pred) == len(td_errors) == T

        # Estimate the advantages backwards.
        advs = []
        adv_so_far = 0.0
        for delta in td_errors[::-1]:
            adv_so_far = delta + self.gamma * self.lam * adv_so_far
            advs.append(adv_so_far)
        advs = advs[::-1]
        assert len(advs) == T

        # Add the transitions into the memory buffer.
        v_targets = np.array(advs) + np.array(v_pred)
        for i, (s, a, s_next, r, old_logp_a, v_target, adv) in enumerate(zip(
                obs, actions, np.array(obs[1:] + [ob_next]), rewards,
                np.squeeze(logp_a), v_targets, advs)):
            done = float(i == T - 1)
            buffer.add(buffer.tuple_class(s, a, s_next, r, done, old_logp_a, v_target, adv))

        return episode_reward, len(advs)
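    # Reference-only NumPy restatement of the GAE recursion in
    # `_generate_rollout` above (a sketch for readers; not called anywhere):
    #     delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)   (V = 0 past the terminal state)
    #     A_t     = delta_t + gamma * lam * A_{t+1}     (A = 0 past the terminal state)
    #     v_target_t = A_t + V(s_t)
    @staticmethod
    def _gae_reference(rewards, v_pred, gamma, lam):
        rewards = np.asarray(rewards, dtype=np.float64)
        v_pred = np.asarray(v_pred, dtype=np.float64)
        v_next = np.append(v_pred[1:], 0.0)  # V(s_{t+1}); zero after the terminal step.
        deltas = rewards + gamma * v_next - v_pred
        advs = np.zeros_like(deltas)
        adv_so_far = 0.0
        for t in reversed(range(len(deltas))):
            adv_so_far = deltas[t] + gamma * lam * adv_so_far
            advs[t] = adv_so_far
        return advs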
    def train(self, config: TrainConfig):
        BufferRecord = namedtuple('BufferRecord', [
            's', 'a', 's_next', 'r', 'done', 'old_logp_actor', 'v_target', 'adv'])
        buffer = ReplayMemory(tuple_class=BufferRecord)

        reward_history = []
        reward_averaged = []
        step = 0
        total_rec = 0

        clip = config.ratio_clip_range
        if config.ratio_clip_decay:
            clip_delta = clip / config.n_iterations
        else:
            clip_delta = 0.0

        for n_iteration in range(config.n_iterations):
            # Ideally multiple rollout workers would run in parallel; here the
            # rollouts are generated sequentially.
            for _ in range(config.n_rollout_workers):
                episode_reward, n_rec = self._generate_rollout(buffer)
                # One trajectory is complete.
                reward_history.append(episode_reward)
                reward_averaged.append(np.mean(reward_history[-10:]))
                total_rec += n_rec

            # Now train the model for a few epochs on the collected transitions.
            for batch in buffer.loop(config.batch_size, epoch=config.train_epochs):
                _, summ_str = self.sess.run(
                    [self.train_ops, self.merged_summary], feed_dict={
                        self.lr_a: config.lr_a,
                        self.lr_c: config.lr_c,
                        self.clip_range: clip,
                        self.s: batch['s'],
                        self.a: batch['a'],
                        self.s_next: batch['s_next'],
                        self.r: batch['r'],
                        self.done: batch['done'],
                        self.old_logp_a: batch['old_logp_actor'],
                        self.v_target: batch['v_target'],
                        self.adv: batch['adv'],
                        self.ep_reward: np.mean(reward_history[-10:]) if reward_history else 0.0,
                    })
                self.writer.add_summary(summ_str, step)
                step += 1

            # Decay the ratio clip range over the course of training.
            clip = max(0.0, clip - clip_delta)

            if (reward_history and config.log_every_iteration and
                    n_iteration % config.log_every_iteration == 0):
                # Report the performance every `log_every_iteration` iterations.
                print("[iteration:{}/step:{}], best:{}, avg:{:.2f}, hist:{}, clip:{:.2f}; {} transitions.".format(
                    n_iteration, step, np.max(reward_history),
                    np.mean(reward_history[-10:]),
                    list(map(lambda x: round(x, 2), reward_history[-5:])),
                    clip, total_rec,
                ))
                # self.save_checkpoint(step=step)

        self.save_checkpoint(step=step)

        print("[FINAL] episodes: {}, Max reward: {}, Average reward: {}".format(
            len(reward_history), np.max(reward_history), np.mean(reward_history)))

        data_dict = {
            'reward': reward_history,
            'reward_smooth10': reward_averaged,
        }
        plot_learning_curve(self.model_name, data_dict, xlabel='episode')
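
# ---------------------------------------------------------------------------
# Hypothetical usage sketch, for illustration only. It assumes a discrete-
# action gym environment and that `Policy`/`BaseModelMixin` provide the
# session, summary writer, and checkpoint plumbing used above; the env name
# and policy name below are placeholders, not part of this module.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    import gym

    env = gym.make('CartPole-v1')
    policy = PPOPolicy(env, name='ppo-cartpole', training=True)
    policy.build()
    policy.train(policy.TrainConfig())  # Train with the default hyperparameters.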