diff --git a/.gitignore b/.gitignore
index 485bab7c5..f7129a03d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -56,4 +56,11 @@ comparison/
 
 
 # Plotting
-examples/scripts/plot*
\ No newline at end of file
+examples/scripts/plot*
+
+
+#pycharm
+.idea*
+
+#models
+nav_models
diff --git a/examples/train/enjoy_husky_navigate_ppo2_image.py b/examples/train/enjoy_husky_navigate_ppo2_image.py
new file mode 100644
index 000000000..5866fa568
--- /dev/null
+++ b/examples/train/enjoy_husky_navigate_ppo2_image.py
@@ -0,0 +1,104 @@
+#add parent dir to find package. Only needed for source code build, pip install doesn't need it.
+import os, inspect
+currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
+parentdir = os.path.dirname(os.path.dirname(currentdir))
+os.sys.path.insert(0,parentdir)
+
+import gym, logging
+from mpi4py import MPI
+from realenv.envs.husky_env import HuskyNavigateEnv
+from baselines.common import set_global_seeds
+import pposgd_simple
+import baselines.common.tf_util as U
+from fuse_policy2 import CnnPolicy, MlpPolicy
+from baselines.common.atari_wrappers import make_atari, wrap_deepmind
+import utils
+import datetime
+from baselines import logger
+#from baselines.ppo2 import ppo2
+import ppo2_imgs
+from monitor import Monitor
+import os.path as osp
+import tensorflow as tf
+import random
+import sys
+import numpy as np
+
+## Training code adapted from: https://github.com/openai/baselines/blob/master/baselines/ppo1/run_atari.py
+
+def enjoy(num_timesteps, seed):
+    rank = MPI.COMM_WORLD.Get_rank()
+    #sess = U.single_threaded_session()
+    sess = utils.make_gpu_session(args.num_gpu)
+    sess.__enter__()
+    if args.meta != "":
+        saver = tf.train.import_meta_graph(args.meta)
+        saver.restore(sess,tf.train.latest_checkpoint('./'))
+
+    if rank == 0:
+        logger.configure()
+    else:
+        logger.configure(format_strs=[])
+    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
+    set_global_seeds(workerseed)
+
+    use_filler = not args.disable_filler
+
+    #raw_env = HuskyNavigateEnv(human=args.human, is_discrete=True, mode=args.mode, gpu_count=args.gpu_count, use_filler=use_filler, resolution=args.resolution)
+
+    #env = Monitor(raw_env, logger.get_dir() and
+    #    osp.join(logger.get_dir(), str(rank)))
+    #env.seed(workerseed)
+
+    gym.logger.setLevel(logging.WARN)
+
+    ppo2_imgs.enjoy(policy=CnnPolicy, imgs=np.zeros((100,256,256,3)), nsteps=600, nminibatches=4,
+        lam=0.95, gamma=0.996, noptepochs=4, log_interval=1,
+        ent_coef=.01,
+        lr=lambda f : f * 2.5e-4,
+        cliprange=lambda f : f * 0.2,
+        total_timesteps=int(num_timesteps * 1.1),
+        save_interval=10,
+        reload_name=args.reload_name)
+
+    '''
+    pposgd_fuse.learn(env, policy_fn,
+        max_timesteps=int(num_timesteps * 1.1),
+        timesteps_per_actorbatch=1024,
+        clip_param=0.2, entcoeff=0.0001,
+        optim_epochs=10, optim_stepsize=3e-6, optim_batchsize=64,
+        gamma=0.995, lam=0.95,
+        schedule='linear',
+        save_name=args.save_name,
+        save_per_acts=10000,
+        reload_name=args.reload_name
+    )
+
+    env.close()
+    '''
+
+def callback(lcl, glb):
+    # stop training if reward exceeds 199
+    total = sum(lcl['episode_rewards'][-101:-1]) / 100
+    totalt = lcl['t']
+    is_solved = totalt > 2000 and total >= -50
+    return is_solved
+
+
+def main():
+    enjoy(num_timesteps=10000000, seed=5)
+
+if __name__ == '__main__':
+    import argparse
+    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('--mode', type=str, default="RGB")
+    parser.add_argument('--num_gpu', type=int, default=1)
+    parser.add_argument('--human', action='store_true', default=False)
+    parser.add_argument('--gpu_count', type=int, default=0)
+    parser.add_argument('--disable_filler', action='store_true', default=False)
+    parser.add_argument('--meta', type=str, default="")
+    parser.add_argument('--resolution', type=str, default="SMALL")
+    parser.add_argument('--reload_name', type=str, default=None)
+    parser.add_argument('--save_name', type=str, default=None)
+    args = parser.parse_args()
+    main()
diff --git a/examples/train/ppo2.py b/examples/train/ppo2.py
index 154a066a8..45e7d3a40 100644
--- a/examples/train/ppo2.py
+++ b/examples/train/ppo2.py
@@ -94,7 +94,8 @@ class Runner(object):
         nenv = 1
         self.obs = np.zeros((nenv,) + env.observation_space.shape, dtype=model.train_model.X.dtype.name)
         self.obs_sensor = np.zeros((nenv,) + env.sensor_space.shape, dtype=model.train_model.X.dtype.name)
-
+        print(self.obs.shape)
+        print(self.obs_sensor.shape)
         self.obs[:], self.obs_sensor[:] = env.reset()
         self.gamma = gamma
         self.lam = lam
@@ -189,6 +190,9 @@ def learn(*, policy, env, nsteps, total_timesteps, ent_coef, lr,
     nenvs = 1
     ob_space = env.observation_space
     ac_space = env.action_space
+
+
+
 
     nbatch = nenvs * nsteps
     nbatch_train = nbatch // nminibatches
@@ -293,6 +297,11 @@ def enjoy(*, policy, env, nsteps, total_timesteps, ent_coef, lr,
     nenvs = 1
     ob_space = env.observation_space
     ac_space = env.action_space
+
+    print(env.observation_space)
+    print(env.action_space)
+
+
 
     nbatch = nenvs * nsteps
     nbatch_train = nbatch // nminibatches
diff --git a/examples/train/ppo2_imgs.py b/examples/train/ppo2_imgs.py
new file mode 100644
index 000000000..2ee9e5f65
--- /dev/null
+++ b/examples/train/ppo2_imgs.py
@@ -0,0 +1,195 @@
+import os
+import time
+import joblib
+import numpy as np
+import os.path as osp
+import tensorflow as tf
+from baselines import logger
+from collections import deque
+from baselines.common import explained_variance
+import gym
+from realenv.core.render.profiler import Profiler
+
+class Model(object):
+    def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train,
+                nsteps, ent_coef, vf_coef, max_grad_norm):
+        sess = tf.get_default_session()
+
+        act_model = policy(sess, ob_space, ac_space, nbatch_act, 1, reuse=False)
+        train_model = policy(sess, ob_space, ac_space, nbatch_train, nsteps, reuse=True)
+
+        A = train_model.pdtype.sample_placeholder([None])
+        ADV = tf.placeholder(tf.float32, [None])
+        R = tf.placeholder(tf.float32, [None])
+        OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
+        OLDVPRED = tf.placeholder(tf.float32, [None])
+        LR = tf.placeholder(tf.float32, [])
+        CLIPRANGE = tf.placeholder(tf.float32, [])
+
+        neglogpac = train_model.pd.neglogp(A)
+        entropy = tf.reduce_mean(train_model.pd.entropy())
+
+        vpred = train_model.vf
+        vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, - CLIPRANGE, CLIPRANGE)
+        vf_losses1 = tf.square(vpred - R)
+        vf_losses2 = tf.square(vpredclipped - R)
+        vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))
+        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)
+        pg_losses = -ADV * ratio
+        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)
+        pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
+        approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
+        clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))
+        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef
+        with tf.variable_scope('model'):
+            params = tf.trainable_variables()
+        grads = tf.gradients(loss, params)
+        if max_grad_norm is not None:
+            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
+        grads = list(zip(grads, params))
+        trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)
+        _train = trainer.apply_gradients(grads)
+
+        def train(lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None):
+            advs = returns - values
+            advs = (advs - advs.mean()) / (advs.std() + 1e-8)
+            td_map = {train_model.X:obs, A:actions, ADV:advs, R:returns, LR:lr,
+                    CLIPRANGE:cliprange, OLDNEGLOGPAC:neglogpacs, OLDVPRED:values}
+            if states is not None:
+                td_map[train_model.S] = states
+                td_map[train_model.M] = masks
+            return sess.run(
+                [pg_loss, vf_loss, entropy, approxkl, clipfrac, _train],
+                td_map
+            )[:-1]
+        self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac']
+
+        def save(save_path):
+            ps = sess.run(params)
+            joblib.dump(ps, save_path)
+
+        def load(load_path):
+            loaded_params = joblib.load(load_path)
+            restores = []
+            for p, loaded_p in zip(params, loaded_params):
+                restores.append(p.assign(loaded_p))
+            sess.run(restores)
+
+        self.train = train
+        self.train_model = train_model
+        self.act_model = act_model
+        self.step = act_model.step
+        self.value = act_model.value
+        self.initial_state = act_model.initial_state
+        self.save = save
+        self.load = load
+        tf.global_variables_initializer().run(session=sess) #pylint: disable=E1101
+
+class Runner(object):
+
+    def __init__(self, *, imgs, model, nsteps, gamma, lam):
+
+        self.model = model
+        #nenv = env.num_envs
+        nenv = 1
+
+        observation_space_shape = (256, 256, 3)
+        sensor_space_shape = (20,)
+        self.obs = np.zeros((nenv,) + observation_space_shape, dtype=model.train_model.X.dtype.name)
+        self.obs_sensor = np.zeros((nenv,) + sensor_space_shape, dtype=model.train_model.X.dtype.name)
+
+        #self.obs[:], self.obs_sensor[:] = env.reset()
+        self.gamma = gamma
+        self.lam = lam
+        self.nsteps = nsteps
+        self.states = model.initial_state
+        #self.dones = [False for _ in range(nenv)]
+        self.dones = False
+
+        self.imgs = imgs
+
+    def run(self):
+        mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_neglogpacs = [],[],[],[],[],[]
+        mb_states = self.states
+        epinfos = []
+        for i in range(len(self.imgs)):
+            #with Profiler("PPO2 step"):
+            self.obs[:] = self.imgs[i]
+            actions, values, self.states, neglogpacs = self.model.step(self.obs, self.states, self.dones)
+            #print("actions", actions)
+            mb_obs.append(self.obs.copy())
+            mb_actions.append([actions])
+            mb_values.append(values)
+            mb_neglogpacs.append(neglogpacs)
+            mb_dones.append([self.dones])
+
+            print(i, self.obs.shape, actions)
+
+        return
+# obs, returns, masks, actions, values, neglogpacs, states = runner.run()
+def sf01(arr):
+    """
+    swap and then flatten axes 0 and 1
+    """
+    s = arr.shape
+    #print(arr)
+    #print("arr shape", s)
+    return arr.swapaxes(0, 1).reshape(s[0] * s[1], *s[2:])
+
+def constfn(val):
+    def f(_):
+        return val
+    return f
+
+
+def safemean(xs):
+    return np.nan if len(xs) == 0 else np.mean(xs)
+
+
+
+
+def enjoy(*, policy, imgs, nsteps, total_timesteps, ent_coef, lr,
+            vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95,
+            log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2,
+            save_interval=0, reload_name=None):
+
+    if isinstance(lr, float): lr = constfn(lr)
+    else: assert callable(lr)
+    if isinstance(cliprange, float): cliprange = constfn(cliprange)
+    else: assert callable(cliprange)
+    total_timesteps = int(total_timesteps)
+
+    #nenv = env.num_envs
+    nenvs = 1
+    ac_space = gym.spaces.Discrete(5)
+
+    observation_space_shape = (256, 256, 3)
+    sensor_space_shape = (20,)
+
+    obs_high = np.inf * np.ones(observation_space_shape)
+    ob_space = gym.spaces.Box(-obs_high, obs_high)
+
+
+    nbatch = nenvs * nsteps
+    nbatch_train = nbatch // nminibatches
+
+    make_model = lambda : Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train,
+                    nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
+                    max_grad_norm=max_grad_norm)
+
+    if save_interval and logger.get_dir():
+        import cloudpickle
+        with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
+            fh.write(cloudpickle.dumps(make_model))
+
+    model = make_model()
+    if reload_name:
+        model.load(reload_name)
+
+    runner = Runner(imgs=imgs, model=model, nsteps=nsteps, gamma=gamma, lam=lam)
+
+    epinfobuf = deque(maxlen=100)
+    tfirststart = time.time()
+
+
+    runner.run() #pylint: disable=E0632
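
Reference note (not part of the patch): the Model class in ppo2_imgs.py above reproduces the clipped PPO objective from baselines' ppo2. As a plain-NumPy sketch of what that TensorFlow graph computes, the following function mirrors the same policy and value losses; the name ppo_losses and its argument names are illustrative only and do not exist in the diff.

import numpy as np

def ppo_losses(neglogpac, old_neglogpac, advs, vpred, old_vpred, returns, cliprange):
    # Illustrative sketch of the losses built in ppo2_imgs.Model (assumption: 1-D float arrays).
    ratio = np.exp(old_neglogpac - neglogpac)                              # pi_new(a|s) / pi_old(a|s)
    pg_losses = -advs * ratio                                              # unclipped surrogate
    pg_losses2 = -advs * np.clip(ratio, 1.0 - cliprange, 1.0 + cliprange)  # clipped surrogate
    pg_loss = np.mean(np.maximum(pg_losses, pg_losses2))                   # pessimistic bound
    vpred_clipped = old_vpred + np.clip(vpred - old_vpred, -cliprange, cliprange)
    vf_loss = .5 * np.mean(np.maximum(np.square(vpred - returns),
                                      np.square(vpred_clipped - returns)))
    return pg_loss, vf_loss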