add enjoy from images
This commit is contained in:
parent 743dbb90f0
commit 574647075f
@@ -56,4 +56,11 @@ comparison/

# Plotting
examples/scripts/plot*
examples/scripts/plot*

#pycharm
.idea*

#models
nav_models
@@ -0,0 +1,104 @@
#add parent dir to find package. Only needed for source code build, pip install doesn't need it.
import os, inspect
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(os.path.dirname(currentdir))
os.sys.path.insert(0, parentdir)

import gym, logging
from mpi4py import MPI
from realenv.envs.husky_env import HuskyNavigateEnv
from baselines.common import set_global_seeds
import pposgd_simple
import baselines.common.tf_util as U
from fuse_policy2 import CnnPolicy, MlpPolicy
from baselines.common.atari_wrappers import make_atari, wrap_deepmind
import utils
import datetime
from baselines import logger
#from baselines.ppo2 import ppo2
import ppo2_imgs
from monitor import Monitor
import os.path as osp
import tensorflow as tf
import random
import sys
import numpy as np

## Training code adapted from: https://github.com/openai/baselines/blob/master/baselines/ppo1/run_atari.py

def enjoy(num_timesteps, seed):
    rank = MPI.COMM_WORLD.Get_rank()
    #sess = U.single_threaded_session()
    sess = utils.make_gpu_session(args.num_gpu)
    sess.__enter__()
    if args.meta != "":
        saver = tf.train.import_meta_graph(args.meta)
        saver.restore(sess, tf.train.latest_checkpoint('./'))

    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)

    use_filler = not args.disable_filler

    #raw_env = HuskyNavigateEnv(human=args.human, is_discrete=True, mode=args.mode, gpu_count=args.gpu_count, use_filler=use_filler, resolution=args.resolution)
    #env = Monitor(raw_env, logger.get_dir() and
    #              osp.join(logger.get_dir(), str(rank)))
    #env.seed(workerseed)

    gym.logger.setLevel(logging.WARN)

    ppo2_imgs.enjoy(policy=CnnPolicy, imgs=np.zeros((100, 256, 256, 3)), nsteps=600, nminibatches=4,
                    lam=0.95, gamma=0.996, noptepochs=4, log_interval=1,
                    ent_coef=.01,
                    lr=lambda f: f * 2.5e-4,
                    cliprange=lambda f: f * 0.2,
                    total_timesteps=int(num_timesteps * 1.1),
                    save_interval=10,
                    reload_name=args.reload_name)

    '''
    pposgd_fuse.learn(env, policy_fn,
                      max_timesteps=int(num_timesteps * 1.1),
                      timesteps_per_actorbatch=1024,
                      clip_param=0.2, entcoeff=0.0001,
                      optim_epochs=10, optim_stepsize=3e-6, optim_batchsize=64,
                      gamma=0.995, lam=0.95,
                      schedule='linear',
                      save_name=args.save_name,
                      save_per_acts=10000,
                      reload_name=args.reload_name
                      )

    env.close()
    '''

def callback(lcl, glb):
    # stop once the mean reward over the last 100 episodes reaches -50 (after 2000 steps)
    total = sum(lcl['episode_rewards'][-101:-1]) / 100
    totalt = lcl['t']
    is_solved = totalt > 2000 and total >= -50
    return is_solved


def main():
    enjoy(num_timesteps=10000000, seed=5)

if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--mode', type=str, default="RGB")
    parser.add_argument('--num_gpu', type=int, default=1)
    parser.add_argument('--human', action='store_true', default=False)
    parser.add_argument('--gpu_count', type=int, default=0)
    parser.add_argument('--disable_filler', action='store_true', default=False)
    parser.add_argument('--meta', type=str, default="")
    parser.add_argument('--resolution', type=str, default="SMALL")
    parser.add_argument('--reload_name', type=str, default=None)
    parser.add_argument('--save_name', type=str, default=None)
    args = parser.parse_args()
    main()
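Note that the new run script currently feeds a placeholder batch of zeros (np.zeros((100, 256, 256, 3))) into ppo2_imgs.enjoy. A minimal sketch of building that array from saved frames instead, assuming Pillow is available and using a hypothetical frames/ directory of RGB images (neither is part of this commit), might look like:

# Illustrative only: build an (N, 256, 256, 3) image batch for ppo2_imgs.enjoy.
# Assumes Pillow is installed; "frames/*.png" is a placeholder path, not from the commit.
import glob
import numpy as np
from PIL import Image

paths = sorted(glob.glob('frames/*.png'))
imgs = np.stack([np.asarray(Image.open(p).convert('RGB').resize((256, 256)))
                 for p in paths]).astype(np.float32)
# imgs could then replace the zero placeholder in the ppo2_imgs.enjoy(...) call above.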
@@ -94,7 +94,8 @@ class Runner(object):
        nenv = 1
        self.obs = np.zeros((nenv,) + env.observation_space.shape, dtype=model.train_model.X.dtype.name)
        self.obs_sensor = np.zeros((nenv,) + env.sensor_space.shape, dtype=model.train_model.X.dtype.name)

        print(self.obs.shape)
        print(self.obs_sensor.shape)
        self.obs[:], self.obs_sensor[:] = env.reset()
        self.gamma = gamma
        self.lam = lam
@@ -189,6 +190,9 @@ def learn(*, policy, env, nsteps, total_timesteps, ent_coef, lr,
    nenvs = 1
    ob_space = env.observation_space
    ac_space = env.action_space


    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches
@@ -293,6 +297,11 @@ def enjoy(*, policy, env, nsteps, total_timesteps, ent_coef, lr,
    nenvs = 1
    ob_space = env.observation_space
    ac_space = env.action_space

    print(env.observation_space)
    print(env.action_space)

    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches
@@ -0,0 +1,195 @@
import os
import time
import joblib
import numpy as np
import os.path as osp
import tensorflow as tf
from baselines import logger
from collections import deque
from baselines.common import explained_variance
import gym
from realenv.core.render.profiler import Profiler

class Model(object):
    def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train,
                 nsteps, ent_coef, vf_coef, max_grad_norm):
        sess = tf.get_default_session()

        act_model = policy(sess, ob_space, ac_space, nbatch_act, 1, reuse=False)
        train_model = policy(sess, ob_space, ac_space, nbatch_train, nsteps, reuse=True)

        A = train_model.pdtype.sample_placeholder([None])
        ADV = tf.placeholder(tf.float32, [None])
        R = tf.placeholder(tf.float32, [None])
        OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
        OLDVPRED = tf.placeholder(tf.float32, [None])
        LR = tf.placeholder(tf.float32, [])
        CLIPRANGE = tf.placeholder(tf.float32, [])

        neglogpac = train_model.pd.neglogp(A)
        entropy = tf.reduce_mean(train_model.pd.entropy())

        vpred = train_model.vf
        vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, - CLIPRANGE, CLIPRANGE)
        vf_losses1 = tf.square(vpred - R)
        vf_losses2 = tf.square(vpredclipped - R)
        vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))
        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)
        pg_losses = -ADV * ratio
        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)
        pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
        approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
        clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef
        with tf.variable_scope('model'):
            params = tf.trainable_variables()
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)
        _train = trainer.apply_gradients(grads)

        def train(lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None):
            advs = returns - values
            advs = (advs - advs.mean()) / (advs.std() + 1e-8)
            td_map = {train_model.X: obs, A: actions, ADV: advs, R: returns, LR: lr,
                      CLIPRANGE: cliprange, OLDNEGLOGPAC: neglogpacs, OLDVPRED: values}
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            return sess.run(
                [pg_loss, vf_loss, entropy, approxkl, clipfrac, _train],
                td_map
            )[:-1]
        self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac']

        def save(save_path):
            ps = sess.run(params)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            sess.run(restores)

        self.train = train
        self.train_model = train_model
        self.act_model = act_model
        self.step = act_model.step
        self.value = act_model.value
        self.initial_state = act_model.initial_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess) #pylint: disable=E1101

class Runner(object):

    def __init__(self, *, imgs, model, nsteps, gamma, lam):
        self.model = model
        #nenv = env.num_envs
        nenv = 1

        observation_space_shape = (256, 256, 3)
        sensor_space_shape = (20,)
        self.obs = np.zeros((nenv,) + observation_space_shape, dtype=model.train_model.X.dtype.name)
        self.obs_sensor = np.zeros((nenv,) + sensor_space_shape, dtype=model.train_model.X.dtype.name)

        #self.obs[:], self.obs_sensor[:] = env.reset()
        self.gamma = gamma
        self.lam = lam
        self.nsteps = nsteps
        self.states = model.initial_state
        #self.dones = [False for _ in range(nenv)]
        self.dones = False

        self.imgs = imgs

    def run(self):
        mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_neglogpacs = [], [], [], [], [], []
        mb_states = self.states
        epinfos = []
        for i in range(len(self.imgs)):
            #with Profiler("PPO2 step"):
            self.obs[:] = self.imgs[i]
            actions, values, self.states, neglogpacs = self.model.step(self.obs, self.states, self.dones)
            #print("actions", actions)
            mb_obs.append(self.obs.copy())
            mb_actions.append([actions])
            mb_values.append(values)
            mb_neglogpacs.append(neglogpacs)
            mb_dones.append([self.dones])

            print(i, self.obs.shape, actions)

        return

# obs, returns, masks, actions, values, neglogpacs, states = runner.run()
def sf01(arr):
    """
    swap and then flatten axes 0 and 1
    """
    s = arr.shape
    #print(arr)
    #print("arr shape", s)
    return arr.swapaxes(0, 1).reshape(s[0] * s[1], *s[2:])

def constfn(val):
    def f(_):
        return val
    return f


def safemean(xs):
    return np.nan if len(xs) == 0 else np.mean(xs)


def enjoy(*, policy, imgs, nsteps, total_timesteps, ent_coef, lr,
          vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95,
          log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2,
          save_interval=0, reload_name=None):

    if isinstance(lr, float): lr = constfn(lr)
    else: assert callable(lr)
    if isinstance(cliprange, float): cliprange = constfn(cliprange)
    else: assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    #nenv = env.num_envs
    nenvs = 1
    ac_space = gym.spaces.Discrete(5)

    observation_space_shape = (256, 256, 3)
    sensor_space_shape = (20,)

    obs_high = np.inf * np.ones(observation_space_shape)
    ob_space = gym.spaces.Box(-obs_high, obs_high)

    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches

    make_model = lambda: Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train,
                               nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
                               max_grad_norm=max_grad_norm)

    if save_interval and logger.get_dir():
        import cloudpickle
        with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
            fh.write(cloudpickle.dumps(make_model))

    model = make_model()
    if reload_name:
        model.load(reload_name)

    runner = Runner(imgs=imgs, model=model, nsteps=nsteps, gamma=gamma, lam=lam)

    epinfobuf = deque(maxlen=100)
    tfirststart = time.time()

    runner.run() #pylint: disable=E0632
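For readers of the diff: the TensorFlow graph in Model above builds the standard PPO clipped surrogate and clipped value losses. A minimal NumPy restatement of the same quantities, purely illustrative (the helper name ppo_losses is not part of the commit), would be:

# Illustrative NumPy sketch of the losses constructed in Model.__init__ above.
import numpy as np

def ppo_losses(neglogpac, old_neglogpac, adv, returns, vpred, old_vpred, cliprange):
    # probability ratio pi_new(a|s) / pi_old(a|s), recovered from negative log-probs
    ratio = np.exp(old_neglogpac - neglogpac)
    # clipped policy-gradient (surrogate) loss
    pg_loss = np.mean(np.maximum(-adv * ratio,
                                 -adv * np.clip(ratio, 1.0 - cliprange, 1.0 + cliprange)))
    # clipped value-function loss
    vpred_clipped = old_vpred + np.clip(vpred - old_vpred, -cliprange, cliprange)
    vf_loss = 0.5 * np.mean(np.maximum(np.square(vpred - returns),
                                       np.square(vpred_clipped - returns)))
    return pg_loss, vf_loss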