From 7fe5720b368e33151c72740da2c464c9986f063d Mon Sep 17 00:00:00 2001 From: p74035216 Date: Wed, 22 Nov 2023 23:52:55 +0800 Subject: [PATCH] ADD file via upload --- TestMaze.py | 345 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 345 insertions(+) create mode 100644 TestMaze.py diff --git a/TestMaze.py b/TestMaze.py new file mode 100644 index 0000000..954473f --- /dev/null +++ b/TestMaze.py @@ -0,0 +1,345 @@ +import numpy as np +import MDP +import RL + +''' Construct a simple maze MDP + + Grid world layout: + + --------------------- + | 0 | 1 | 2 | 3 | + --------------------- + | 4 | 5 | 6 | 7 | + --------------------- + | 8 | 9 | 10 | 11 | + --------------------- + | 12 | 13 | 14 | 15 | + --------------------- + + Goal state: 15 + Bad state: 9 + End state: 16 + + The end state is an absorbing state that the agent transitions + to after visiting the goal state. + + There are 17 states in total (including the end state) + and 4 actions (up, down, left, right).''' + +# Transition function: |A| x |S| x |S'| array +T = np.zeros([4,17,17]) +a = 0.8; # intended move +b = 0.1; # lateral move + +# up (a = 0) + +T[0,0,0] = a+b; +T[0,0,1] = b; + +T[0,1,0] = b; +T[0,1,1] = a; +T[0,1,2] = b; + +T[0,2,1] = b; +T[0,2,2] = a; +T[0,2,3] = b; + +T[0,3,2] = b; +T[0,3,3] = a+b; + +T[0,4,4] = b; +T[0,4,0] = a; +T[0,4,5] = b; + +T[0,5,4] = b; +T[0,5,1] = a; +T[0,5,6] = b; + +T[0,6,5] = b; +T[0,6,2] = a; +T[0,6,7] = b; + +T[0,7,6] = b; +T[0,7,3] = a; +T[0,7,7] = b; + +T[0,8,8] = b; +T[0,8,4] = a; +T[0,8,9] = b; + +T[0,9,8] = b; +T[0,9,5] = a; +T[0,9,10] = b; + +T[0,10,9] = b; +T[0,10,6] = a; +T[0,10,11] = b; + +T[0,11,10] = b; +T[0,11,7] = a; +T[0,11,11] = b; + +T[0,12,12] = b; +T[0,12,8] = a; +T[0,12,13] = b; + +T[0,13,12] = b; +T[0,13,9] = a; +T[0,13,14] = b; + +T[0,14,13] = b; +T[0,14,10] = a; +T[0,14,15] = b; + +T[0,15,16] = 1; +T[0,16,16] = 1; + +# down (a = 1) + +T[1,0,0] = b; +T[1,0,4] = a; +T[1,0,1] = b; + +T[1,1,0] = b; +T[1,1,5] = a; +T[1,1,2] = b; + +T[1,2,1] = b; +T[1,2,6] = a; +T[1,2,3] = b; + +T[1,3,2] = b; +T[1,3,7] = a; +T[1,3,3] = b; + +T[1,4,4] = b; +T[1,4,8] = a; +T[1,4,5] = b; + +T[1,5,4] = b; +T[1,5,9] = a; +T[1,5,6] = b; + +T[1,6,5] = b; +T[1,6,10] = a; +T[1,6,7] = b; + +T[1,7,6] = b; +T[1,7,11] = a; +T[1,7,7] = b; + +T[1,8,8] = b; +T[1,8,12] = a; +T[1,8,9] = b; + +T[1,9,8] = b; +T[1,9,13] = a; +T[1,9,10] = b; + +T[1,10,9] = b; +T[1,10,14] = a; +T[1,10,11] = b; + +T[1,11,10] = b; +T[1,11,15] = a; +T[1,11,11] = b; + +T[1,12,12] = a+b; +T[1,12,13] = b; + +T[1,13,12] = b; +T[1,13,13] = a; +T[1,13,14] = b; + +T[1,14,13] = b; +T[1,14,14] = a; +T[1,14,15] = b; + +T[1,15,16] = 1; +T[1,16,16] = 1; + +# left (a = 2) + +T[2,0,0] = a+b; +T[2,0,4] = b; + +T[2,1,1] = b; +T[2,1,0] = a; +T[2,1,5] = b; + +T[2,2,2] = b; +T[2,2,1] = a; +T[2,2,6] = b; + +T[2,3,3] = b; +T[2,3,2] = a; +T[2,3,7] = b; + +T[2,4,0] = b; +T[2,4,4] = a; +T[2,4,8] = b; + +T[2,5,1] = b; +T[2,5,4] = a; +T[2,5,9] = b; + +T[2,6,2] = b; +T[2,6,5] = a; +T[2,6,10] = b; + +T[2,7,3] = b; +T[2,7,6] = a; +T[2,7,11] = b; + +T[2,8,4] = b; +T[2,8,8] = a; +T[2,8,12] = b; + +T[2,9,5] = b; +T[2,9,8] = a; +T[2,9,13] = b; + +T[2,10,6] = b; +T[2,10,9] = a; +T[2,10,14] = b; + +T[2,11,7] = b; +T[2,11,10] = a; +T[2,11,15] = b; + +T[2,12,8] = b; +T[2,12,12] = a+b; + +T[2,13,9] = b; +T[2,13,12] = a; +T[2,13,13] = b; + +T[2,14,10] = b; +T[2,14,13] = a; +T[2,14,14] = b; + +T[2,15,16] = 1; +T[2,16,16] = 1; + +# right (a = 3) + +T[3,0,0] = b; +T[3,0,1] = a; +T[3,0,4] = b; + +T[3,1,1] = b; +T[3,1,2] = a; +T[3,1,5] = b; + +T[3,2,2] = b; +T[3,2,3] = a; +T[3,2,6] = b; + +T[3,3,3] = a+b; +T[3,3,7] = b; + +T[3,4,0] = b; +T[3,4,5] = a; +T[3,4,8] = b; + +T[3,5,1] = b; +T[3,5,6] = a; +T[3,5,9] = b; + +T[3,6,2] = b; +T[3,6,7] = a; +T[3,6,10] = b; + +T[3,7,3] = b; +T[3,7,7] = a; +T[3,7,11] = b; + +T[3,8,4] = b; +T[3,8,9] = a; +T[3,8,12] = b; + +T[3,9,5] = b; +T[3,9,10] = a; +T[3,9,13] = b; + +T[3,10,6] = b; +T[3,10,11] = a; +T[3,10,14] = b; + +T[3,11,7] = b; +T[3,11,11] = a; +T[3,11,15] = b; + +T[3,12,8] = b; +T[3,12,13] = a; +T[3,12,12] = b; + +T[3,13,9] = b; +T[3,13,14] = a; +T[3,13,13] = b; + +T[3,14,10] = b; +T[3,14,15] = a; +T[3,14,14] = b; + +T[3,15,16] = 1; +T[3,16,16] = 1; + +# Reward function: |A| x |S| array +R = -1 * np.ones([4,17]); + +# set rewards +R[:,15] = 100; # goal state +R[:,9] = -70; # bad state +R[:,16] = 0; # end state + +# Discount factor: scalar in [0,1) +discount = 0.95 + +# MDP object +mdp = MDP.MDP(T,R,discount) + +# RL problem +rlProblem = RL.RL(mdp, np.random.normal) + +# Test Q-learning + + +rewardList_sum_005 = np.zeros(200) +for i in range(100): + [Q,policy,rewardList] = rlProblem.qLearning(s0=0,initialQ=np.zeros([mdp.nActions,mdp.nStates]),nEpisodes=200,nSteps=100,epsilon=0.05) + rewardList_sum_005 += rewardList +print("epsilon=0.05\n ","rewardList_avg",rewardList_sum_005/100) + +rewardList_sum_01 = np.zeros(200) +for i in range(100): + [Q,policy,rewardList] = rlProblem.qLearning(s0=0,initialQ=np.zeros([mdp.nActions,mdp.nStates]),nEpisodes=200,nSteps=100,epsilon=0.1) + rewardList_sum_01 += rewardList +print("epsilon=0.1\n ","rewardList_avg",rewardList_sum_01/100) + +rewardList_sum_03 = np.zeros(200) +for i in range(100): + [Q,policy,rewardList] = rlProblem.qLearning(s0=0,initialQ=np.zeros([mdp.nActions,mdp.nStates]),nEpisodes=200,nSteps=100,epsilon=0.3) + rewardList_sum_03 += rewardList +print("epsilon=0.3\n ","rewardList_avg",rewardList_sum_03/100) + +rewardList_sum_05 = np.zeros(200) +for i in range(100): + [Q,policy,rewardList] = rlProblem.qLearning(s0=0,initialQ=np.zeros([mdp.nActions,mdp.nStates]),nEpisodes=200,nSteps=100,epsilon=0.5) + rewardList_sum_05 += rewardList +print("epsilon=0.5\n ","rewardList_avg",rewardList_sum_05/100) + +import matplotlib.pyplot as plt +fig = plt.figure() +X = np.arange(1,201) +plt.plot(X,rewardList_sum_005/100,label="0.05") +plt.plot(X,rewardList_sum_01/100,label="0.1",linestyle="--") +plt.plot(X,rewardList_sum_03/100,label="0.3",linestyle="-.") +plt.plot(X,rewardList_sum_05/100,label="0.5") +plt.title("XXX's result") +plt.legend() +plt.savefig('Q-learning.jpg') +plt.show() + + +#[3 3 3 1 3 3 3 1 1 3 3 1 3 3 3 0 0] \ No newline at end of file