add enjoy from images

fxia22 2017-12-02 00:28:27 -06:00
parent 743dbb90f0
commit 574647075f
4 changed files with 317 additions and 2 deletions

9
.gitignore vendored

@@ -56,4 +56,11 @@ comparison/
# Plotting
examples/scripts/plot*
#pycharm
.idea*
#models
nav_models


@@ -0,0 +1,104 @@
# Add the parent dir to sys.path so the package can be found.
# Only needed for a source-code build; a pip install doesn't need it.
import os, inspect
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(os.path.dirname(currentdir))
os.sys.path.insert(0, parentdir)
import gym, logging
from mpi4py import MPI
from realenv.envs.husky_env import HuskyNavigateEnv
from baselines.common import set_global_seeds
import pposgd_simple
import baselines.common.tf_util as U
from fuse_policy2 import CnnPolicy, MlpPolicy
from baselines.common.atari_wrappers import make_atari, wrap_deepmind
import utils
import datetime
from baselines import logger
#from baselines.ppo2 import ppo2
import ppo2_imgs
from monitor import Monitor
import os.path as osp
import tensorflow as tf
import random
import sys
import numpy as np
## Training code adapted from: https://github.com/openai/baselines/blob/master/baselines/ppo1/run_atari.py
def enjoy(num_timesteps, seed):
    rank = MPI.COMM_WORLD.Get_rank()
    #sess = U.single_threaded_session()
    sess = utils.make_gpu_session(args.num_gpu)
    sess.__enter__()
    if args.meta != "":
        saver = tf.train.import_meta_graph(args.meta)
        saver.restore(sess, tf.train.latest_checkpoint('./'))

    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    use_filler = not args.disable_filler

    #raw_env = HuskyNavigateEnv(human=args.human, is_discrete=True, mode=args.mode, gpu_count=args.gpu_count, use_filler=use_filler, resolution=args.resolution)
    #env = Monitor(raw_env, logger.get_dir() and
    #              osp.join(logger.get_dir(), str(rank)))
    #env.seed(workerseed)

    gym.logger.setLevel(logging.WARN)

    # Run the policy over a fixed batch of images instead of a live environment
    # (the zeros here are placeholder frames).
    ppo2_imgs.enjoy(policy=CnnPolicy, imgs=np.zeros((100, 256, 256, 3)), nsteps=600, nminibatches=4,
                    lam=0.95, gamma=0.996, noptepochs=4, log_interval=1,
                    ent_coef=.01,
                    lr=lambda f: f * 2.5e-4,
                    cliprange=lambda f: f * 0.2,
                    total_timesteps=int(num_timesteps * 1.1),
                    save_interval=10,
                    reload_name=args.reload_name)
    '''
    pposgd_fuse.learn(env, policy_fn,
                      max_timesteps=int(num_timesteps * 1.1),
                      timesteps_per_actorbatch=1024,
                      clip_param=0.2, entcoeff=0.0001,
                      optim_epochs=10, optim_stepsize=3e-6, optim_batchsize=64,
                      gamma=0.995, lam=0.95,
                      schedule='linear',
                      save_name=args.save_name,
                      save_per_acts=10000,
                      reload_name=args.reload_name
                      )
    env.close()
    '''
def callback(lcl, glb):
    # Stop training once the mean reward over the last 100 episodes reaches -50
    # after at least 2000 timesteps. (Not referenced anywhere in this script.)
    total = sum(lcl['episode_rewards'][-101:-1]) / 100
    totalt = lcl['t']
    is_solved = totalt > 2000 and total >= -50
    return is_solved
def main():
    enjoy(num_timesteps=10000000, seed=5)

if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--mode', type=str, default="RGB")
    parser.add_argument('--num_gpu', type=int, default=1)
    parser.add_argument('--human', action='store_true', default=False)
    parser.add_argument('--gpu_count', type=int, default=0)
    parser.add_argument('--disable_filler', action='store_true', default=False)
    parser.add_argument('--meta', type=str, default="")
    parser.add_argument('--resolution', type=str, default="SMALL")
    parser.add_argument('--reload_name', type=str, default=None)
    parser.add_argument('--save_name', type=str, default=None)
    args = parser.parse_args()
    main()
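
A minimal sketch of how this script might be invoked. The script's file name is not shown by the diff viewer, so the name below is an assumption, as is the checkpoint path passed to --reload_name (nav_models is the models directory added to .gitignore above):

    # hypothetical invocation; file name and checkpoint path are assumptions
    python enjoy_husky_imgs.py --num_gpu 1 --reload_name nav_models/husky_ppo2_ckpt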


@@ -94,7 +94,8 @@ class Runner(object):
        nenv = 1
        self.obs = np.zeros((nenv,) + env.observation_space.shape, dtype=model.train_model.X.dtype.name)
        self.obs_sensor = np.zeros((nenv,) + env.sensor_space.shape, dtype=model.train_model.X.dtype.name)
        print(self.obs.shape)
        print(self.obs_sensor.shape)
        self.obs[:], self.obs_sensor[:] = env.reset()
        self.gamma = gamma
        self.lam = lam
@@ -189,6 +190,9 @@ def learn(*, policy, env, nsteps, total_timesteps, ent_coef, lr,
    nenvs = 1
    ob_space = env.observation_space
    ac_space = env.action_space
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches
@@ -293,6 +297,11 @@ def enjoy(*, policy, env, nsteps, total_timesteps, ent_coef, lr,
    nenvs = 1
    ob_space = env.observation_space
    ac_space = env.action_space
    print(env.observation_space)
    print(env.action_space)
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches

195
examples/train/ppo2_imgs.py Normal file

@@ -0,0 +1,195 @@
import os
import time
import joblib
import numpy as np
import os.path as osp
import tensorflow as tf
from baselines import logger
from collections import deque
from baselines.common import explained_variance
import gym
from realenv.core.render.profiler import Profiler
class Model(object):
    def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train,
                 nsteps, ent_coef, vf_coef, max_grad_norm):
        sess = tf.get_default_session()

        act_model = policy(sess, ob_space, ac_space, nbatch_act, 1, reuse=False)
        train_model = policy(sess, ob_space, ac_space, nbatch_train, nsteps, reuse=True)

        A = train_model.pdtype.sample_placeholder([None])
        ADV = tf.placeholder(tf.float32, [None])
        R = tf.placeholder(tf.float32, [None])
        OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
        OLDVPRED = tf.placeholder(tf.float32, [None])
        LR = tf.placeholder(tf.float32, [])
        CLIPRANGE = tf.placeholder(tf.float32, [])

        neglogpac = train_model.pd.neglogp(A)
        entropy = tf.reduce_mean(train_model.pd.entropy())

        vpred = train_model.vf
        vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, - CLIPRANGE, CLIPRANGE)
        vf_losses1 = tf.square(vpred - R)
        vf_losses2 = tf.square(vpredclipped - R)
        vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))
        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)
        pg_losses = -ADV * ratio
        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)
        pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
        approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
        clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef
        with tf.variable_scope('model'):
            params = tf.trainable_variables()
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)
        _train = trainer.apply_gradients(grads)

        def train(lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None):
            advs = returns - values
            advs = (advs - advs.mean()) / (advs.std() + 1e-8)
            td_map = {train_model.X: obs, A: actions, ADV: advs, R: returns, LR: lr,
                      CLIPRANGE: cliprange, OLDNEGLOGPAC: neglogpacs, OLDVPRED: values}
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            return sess.run(
                [pg_loss, vf_loss, entropy, approxkl, clipfrac, _train],
                td_map
            )[:-1]
        self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac']

        def save(save_path):
            ps = sess.run(params)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            sess.run(restores)

        self.train = train
        self.train_model = train_model
        self.act_model = act_model
        self.step = act_model.step
        self.value = act_model.value
        self.initial_state = act_model.initial_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess) #pylint: disable=E1101
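        # Illustrative note (not in the original commit): the graph above is the
        # standard clipped PPO objective from baselines.ppo2. With probability
        # ratio r = exp(OLDNEGLOGPAC - neglogpac), the policy loss is
        #   pg_loss = E[max(-ADV * r, -ADV * clip(r, 1 - CLIPRANGE, 1 + CLIPRANGE))]
        # the value loss is similarly clipped around OLDVPRED, and entropy enters
        # as a bonus weighted by ent_coef.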
class Runner(object):

    def __init__(self, *, imgs, model, nsteps, gamma, lam):
        self.model = model
        #nenv = env.num_envs
        nenv = 1
        observation_space_shape = (256, 256, 3)
        sensor_space_shape = (20,)
        self.obs = np.zeros((nenv,) + observation_space_shape, dtype=model.train_model.X.dtype.name)
        self.obs_sensor = np.zeros((nenv,) + sensor_space_shape, dtype=model.train_model.X.dtype.name)
        #self.obs[:], self.obs_sensor[:] = env.reset()
        self.gamma = gamma
        self.lam = lam
        self.nsteps = nsteps
        self.states = model.initial_state
        #self.dones = [False for _ in range(nenv)]
        self.dones = False
        self.imgs = imgs
    def run(self):
        mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_neglogpacs = [], [], [], [], [], []
        mb_states = self.states
        epinfos = []
        for i in range(len(self.imgs)):
            #with Profiler("PPO2 step"):
            self.obs[:] = self.imgs[i]
            actions, values, self.states, neglogpacs = self.model.step(self.obs, self.states, self.dones)
            #print("actions", actions)
            mb_obs.append(self.obs.copy())
            mb_actions.append([actions])
            mb_values.append(values)
            mb_neglogpacs.append(neglogpacs)
            mb_dones.append([self.dones])
            print(i, self.obs.shape, actions)
        return
# obs, returns, masks, actions, values, neglogpacs, states = runner.run()
def sf01(arr):
    """
    swap and then flatten axes 0 and 1
    """
    s = arr.shape
    #print(arr)
    #print("arr shape", s)
    return arr.swapaxes(0, 1).reshape(s[0] * s[1], *s[2:])
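
# Illustrative example (not part of the original commit): sf01 flattens
# per-step, per-env batches of shape (nsteps, nenvs, ...) into one batch, e.g.
#   sf01(np.arange(6).reshape(3, 2))  ->  array([0, 2, 4, 1, 3, 5])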
def constfn(val):
    def f(_):
        return val
    return f

def safemean(xs):
    return np.nan if len(xs) == 0 else np.mean(xs)
def enjoy(*, policy, imgs, nsteps, total_timesteps, ent_coef, lr,
          vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95,
          log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2,
          save_interval=0, reload_name=None):

    if isinstance(lr, float): lr = constfn(lr)
    else: assert callable(lr)
    if isinstance(cliprange, float): cliprange = constfn(cliprange)
    else: assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    #nenv = env.num_envs
    nenvs = 1
    ac_space = gym.spaces.Discrete(5)
    observation_space_shape = (256, 256, 3)
    sensor_space_shape = (20,)
    obs_high = np.inf * np.ones(observation_space_shape)
    ob_space = gym.spaces.Box(-obs_high, obs_high)

    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches

    make_model = lambda: Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train,
                               nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
                               max_grad_norm=max_grad_norm)
    if save_interval and logger.get_dir():
        import cloudpickle
        with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
            fh.write(cloudpickle.dumps(make_model))
    model = make_model()
    if reload_name:
        model.load(reload_name)
    runner = Runner(imgs=imgs, model=model, nsteps=nsteps, gamma=gamma, lam=lam)

    epinfobuf = deque(maxlen=100)
    tfirststart = time.time()

    runner.run() #pylint: disable=E0632
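    # Note: despite the training-style hyperparameters, this enjoy() never calls
    # model.train -- it builds the model, optionally restores weights from
    # reload_name (a joblib file written by Model.save), and runner.run() simply
    # steps the policy over the supplied images, printing each chosen action.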