import numpy as np
import MDP
import RL

# Construct the simple MDP described in Lecture 2a, Slides 13-14.
#
# T is a 2 x 4 x 4 array and R is a 2 x 4 array.
# NOTE(review): presumably T is indexed [action, state, next_state] and
# R is indexed [action, state] -- confirm against MDP.MDP.
T = np.array([
    [
        [0.5, 0.5, 0, 0],
        [0, 1, 0, 0],
        [0.5, 0.5, 0, 0],
        [0, 1, 0, 0],
    ],
    [
        [1, 0, 0, 0],
        [0.5, 0, 0, 0.5],
        [0.5, 0, 0.5, 0],
        [0, 0, 0.5, 0.5],
    ],
])
R = np.array([
    [0, 0, 10, 10],
    [0, 0, 10, 10],
])
discount = 0.9

mdp = MDP.MDP(T, R, discount)
# NOTE(review): np.random.normal is handed to RL.RL as its second argument;
# presumably it is used to sample noisy rewards -- confirm against RL.RL.
rlProblem = RL.RL(mdp, np.random.normal)

# Test Q-learning: start from state 0 with an all-zero Q table of shape
# (mdp.nActions, mdp.nStates). The third returned value is discarded.
Q, policy, _ = rlProblem.qLearning(
    s0=0,
    initialQ=np.zeros((mdp.nActions, mdp.nStates)),
    nEpisodes=1000,
    nSteps=100,
    epsilon=0.3,
)

print("\nQ-learning results")
print(Q)
print(policy)