add enjoy from images
This commit is contained in:
parent 743dbb90f0
commit 574647075f
@@ -56,4 +56,11 @@ comparison/

# Plotting
examples/scripts/plot*
examples/scripts/plot*

#pycharm
.idea*

#models
nav_models
@@ -0,0 +1,104 @@
#add parent dir to find package. Only needed for source code build, pip install doesn't need it.
import os, inspect
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(os.path.dirname(currentdir))
os.sys.path.insert(0, parentdir)

import gym, logging
from mpi4py import MPI
from realenv.envs.husky_env import HuskyNavigateEnv
from baselines.common import set_global_seeds
import pposgd_simple
import baselines.common.tf_util as U
from fuse_policy2 import CnnPolicy, MlpPolicy
from baselines.common.atari_wrappers import make_atari, wrap_deepmind
import utils
import datetime
from baselines import logger
#from baselines.ppo2 import ppo2
import ppo2_imgs
from monitor import Monitor
import os.path as osp
import tensorflow as tf
import random
import sys
import numpy as np

## Training code adapted from: https://github.com/openai/baselines/blob/master/baselines/ppo1/run_atari.py

def enjoy(num_timesteps, seed):
    rank = MPI.COMM_WORLD.Get_rank()
    #sess = U.single_threaded_session()
    sess = utils.make_gpu_session(args.num_gpu)
    sess.__enter__()
    if args.meta != "":
        saver = tf.train.import_meta_graph(args.meta)
        saver.restore(sess, tf.train.latest_checkpoint('./'))

    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)

    use_filler = not args.disable_filler

    #raw_env = HuskyNavigateEnv(human=args.human, is_discrete=True, mode=args.mode, gpu_count=args.gpu_count, use_filler=use_filler, resolution=args.resolution)
    #env = Monitor(raw_env, logger.get_dir() and
    #              osp.join(logger.get_dir(), str(rank)))
    #env.seed(workerseed)

    gym.logger.setLevel(logging.WARN)

    ppo2_imgs.enjoy(policy=CnnPolicy, imgs=np.zeros((100, 256, 256, 3)), nsteps=600, nminibatches=4,
                    lam=0.95, gamma=0.996, noptepochs=4, log_interval=1,
                    ent_coef=.01,
                    lr=lambda f: f * 2.5e-4,
                    cliprange=lambda f: f * 0.2,
                    total_timesteps=int(num_timesteps * 1.1),
                    save_interval=10,
                    reload_name=args.reload_name)

    '''
    pposgd_fuse.learn(env, policy_fn,
                      max_timesteps=int(num_timesteps * 1.1),
                      timesteps_per_actorbatch=1024,
                      clip_param=0.2, entcoeff=0.0001,
                      optim_epochs=10, optim_stepsize=3e-6, optim_batchsize=64,
                      gamma=0.995, lam=0.95,
                      schedule='linear',
                      save_name=args.save_name,
                      save_per_acts=10000,
                      reload_name=args.reload_name
                      )

    env.close()
    '''

def callback(lcl, glb):
    # stop once the mean reward over the last 100 episodes reaches -50 (after 2000 steps)
    total = sum(lcl['episode_rewards'][-101:-1]) / 100
    totalt = lcl['t']
    is_solved = totalt > 2000 and total >= -50
    return is_solved


def main():
    enjoy(num_timesteps=10000000, seed=5)

if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--mode', type=str, default="RGB")
    parser.add_argument('--num_gpu', type=int, default=1)
    parser.add_argument('--human', action='store_true', default=False)
    parser.add_argument('--gpu_count', type=int, default=0)
    parser.add_argument('--disable_filler', action='store_true', default=False)
    parser.add_argument('--meta', type=str, default="")
    parser.add_argument('--resolution', type=str, default="SMALL")
    parser.add_argument('--reload_name', type=str, default=None)
    parser.add_argument('--save_name', type=str, default=None)
    args = parser.parse_args()
    main()
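Note that the new run script currently feeds a placeholder batch of zeros (np.zeros((100, 256, 256, 3))) into ppo2_imgs.enjoy. A minimal sketch of building that array from saved frames instead, assuming Pillow is available and using a hypothetical frames/ directory of RGB images (neither is part of this commit), might look like:

# Illustrative only: build an (N, 256, 256, 3) image batch for ppo2_imgs.enjoy.
# Assumes Pillow is installed; "frames/*.png" is a placeholder path, not from the commit.
import glob
import numpy as np
from PIL import Image

paths = sorted(glob.glob('frames/*.png'))
imgs = np.stack([np.asarray(Image.open(p).convert('RGB').resize((256, 256)))
                 for p in paths]).astype(np.float32)
# imgs could then replace the zero placeholder in the ppo2_imgs.enjoy(...) call above.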
@@ -94,7 +94,8 @@ class Runner(object):
        nenv = 1
        self.obs = np.zeros((nenv,) + env.observation_space.shape, dtype=model.train_model.X.dtype.name)
        self.obs_sensor = np.zeros((nenv,) + env.sensor_space.shape, dtype=model.train_model.X.dtype.name)

        print(self.obs.shape)
        print(self.obs_sensor.shape)
        self.obs[:], self.obs_sensor[:] = env.reset()
        self.gamma = gamma
        self.lam = lam
@@ -189,6 +190,9 @@ def learn(*, policy, env, nsteps, total_timesteps, ent_coef, lr,
    nenvs = 1
    ob_space = env.observation_space
    ac_space = env.action_space


    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches
@@ -293,6 +297,11 @@ def enjoy(*, policy, env, nsteps, total_timesteps, ent_coef, lr,
    nenvs = 1
    ob_space = env.observation_space
    ac_space = env.action_space

    print(env.observation_space)
    print(env.action_space)

    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches
@@ -0,0 +1,195 @@
import os
import time
import joblib
import numpy as np
import os.path as osp
import tensorflow as tf
from baselines import logger
from collections import deque
from baselines.common import explained_variance
import gym
from realenv.core.render.profiler import Profiler

class Model(object):
    def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train,
                 nsteps, ent_coef, vf_coef, max_grad_norm):
        sess = tf.get_default_session()

        act_model = policy(sess, ob_space, ac_space, nbatch_act, 1, reuse=False)
        train_model = policy(sess, ob_space, ac_space, nbatch_train, nsteps, reuse=True)

        A = train_model.pdtype.sample_placeholder([None])
        ADV = tf.placeholder(tf.float32, [None])
        R = tf.placeholder(tf.float32, [None])
        OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
        OLDVPRED = tf.placeholder(tf.float32, [None])
        LR = tf.placeholder(tf.float32, [])
        CLIPRANGE = tf.placeholder(tf.float32, [])

        neglogpac = train_model.pd.neglogp(A)
        entropy = tf.reduce_mean(train_model.pd.entropy())

        vpred = train_model.vf
        vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, - CLIPRANGE, CLIPRANGE)
        vf_losses1 = tf.square(vpred - R)
        vf_losses2 = tf.square(vpredclipped - R)
        vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))
        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)
        pg_losses = -ADV * ratio
        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)
        pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
        approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
        clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef
        with tf.variable_scope('model'):
            params = tf.trainable_variables()
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)
        _train = trainer.apply_gradients(grads)

        def train(lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None):
            advs = returns - values
            advs = (advs - advs.mean()) / (advs.std() + 1e-8)
            td_map = {train_model.X: obs, A: actions, ADV: advs, R: returns, LR: lr,
                      CLIPRANGE: cliprange, OLDNEGLOGPAC: neglogpacs, OLDVPRED: values}
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            return sess.run(
                [pg_loss, vf_loss, entropy, approxkl, clipfrac, _train],
                td_map
            )[:-1]
        self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac']

        def save(save_path):
            ps = sess.run(params)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            sess.run(restores)

        self.train = train
        self.train_model = train_model
        self.act_model = act_model
        self.step = act_model.step
        self.value = act_model.value
        self.initial_state = act_model.initial_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess) #pylint: disable=E1101

class Runner(object):

    def __init__(self, *, imgs, model, nsteps, gamma, lam):
        self.model = model
        #nenv = env.num_envs
        nenv = 1

        observation_space_shape = (256, 256, 3)
        sensor_space_shape = (20,)
        self.obs = np.zeros((nenv,) + observation_space_shape, dtype=model.train_model.X.dtype.name)
        self.obs_sensor = np.zeros((nenv,) + sensor_space_shape, dtype=model.train_model.X.dtype.name)

        #self.obs[:], self.obs_sensor[:] = env.reset()
        self.gamma = gamma
        self.lam = lam
        self.nsteps = nsteps
        self.states = model.initial_state
        #self.dones = [False for _ in range(nenv)]
        self.dones = False

        self.imgs = imgs

    def run(self):
        mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_neglogpacs = [], [], [], [], [], []
        mb_states = self.states
        epinfos = []
        for i in range(len(self.imgs)):
            #with Profiler("PPO2 step"):
            self.obs[:] = self.imgs[i]
            actions, values, self.states, neglogpacs = self.model.step(self.obs, self.states, self.dones)
            #print("actions", actions)
            mb_obs.append(self.obs.copy())
            mb_actions.append([actions])
            mb_values.append(values)
            mb_neglogpacs.append(neglogpacs)
            mb_dones.append([self.dones])

            print(i, self.obs.shape, actions)

        return

# obs, returns, masks, actions, values, neglogpacs, states = runner.run()
def sf01(arr):
    """
    swap and then flatten axes 0 and 1
    """
    s = arr.shape
    #print(arr)
    #print("arr shape", s)
    return arr.swapaxes(0, 1).reshape(s[0] * s[1], *s[2:])

def constfn(val):
    def f(_):
        return val
    return f


def safemean(xs):
    return np.nan if len(xs) == 0 else np.mean(xs)


def enjoy(*, policy, imgs, nsteps, total_timesteps, ent_coef, lr,
          vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95,
          log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2,
          save_interval=0, reload_name=None):

    if isinstance(lr, float): lr = constfn(lr)
    else: assert callable(lr)
    if isinstance(cliprange, float): cliprange = constfn(cliprange)
    else: assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    #nenv = env.num_envs
    nenvs = 1
    ac_space = gym.spaces.Discrete(5)

    observation_space_shape = (256, 256, 3)
    sensor_space_shape = (20,)

    obs_high = np.inf * np.ones(observation_space_shape)
    ob_space = gym.spaces.Box(-obs_high, obs_high)

    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches

    make_model = lambda: Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train,
                               nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
                               max_grad_norm=max_grad_norm)

    if save_interval and logger.get_dir():
        import cloudpickle
        with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
            fh.write(cloudpickle.dumps(make_model))

    model = make_model()
    if reload_name:
        model.load(reload_name)

    runner = Runner(imgs=imgs, model=model, nsteps=nsteps, gamma=gamma, lam=lam)

    epinfobuf = deque(maxlen=100)
    tfirststart = time.time()

    runner.run() #pylint: disable=E0632
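For readers of the diff: the TensorFlow graph in Model above builds the standard PPO clipped surrogate and clipped value losses. A minimal NumPy restatement of the same quantities, purely illustrative (the helper name ppo_losses is not part of the commit), would be:

# Illustrative NumPy sketch of the losses constructed in Model.__init__ above.
import numpy as np

def ppo_losses(neglogpac, old_neglogpac, adv, returns, vpred, old_vpred, cliprange):
    # probability ratio pi_new(a|s) / pi_old(a|s), recovered from negative log-probs
    ratio = np.exp(old_neglogpac - neglogpac)
    # clipped policy-gradient (surrogate) loss
    pg_loss = np.mean(np.maximum(-adv * ratio,
                                 -adv * np.clip(ratio, 1.0 - cliprange, 1.0 + cliprange)))
    # clipped value-function loss
    vpred_clipped = old_vpred + np.clip(vpred - old_vpred, -cliprange, cliprange)
    vf_loss = 0.5 * np.mean(np.maximum(np.square(vpred - returns),
                                       np.square(vpred_clipped - returns)))
    return pg_loss, vf_loss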