ADD file via upload

p74035216 2023-11-22 23:52:55 +08:00
parent 664383c23c
commit 7fe5720b36
1 changed file with 345 additions and 0 deletions

TestMaze.py Normal file

@@ -0,0 +1,345 @@
import numpy as np
import MDP
import RL
''' Construct a simple maze MDP
Grid world layout:
---------------------
| 0 | 1 | 2 | 3 |
---------------------
| 4 | 5 | 6 | 7 |
---------------------
| 8 | 9 | 10 | 11 |
---------------------
| 12 | 13 | 14 | 15 |
---------------------
Goal state: 15
Bad state: 9
End state: 16
The end state is an absorbing state that the agent transitions
to after visiting the goal state.
There are 17 states in total (including the end state)
and 4 actions (up, down, left, right).'''
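# States are numbered row-major on the 4 x 4 grid, so cell (row, col) maps to
# state 4*row + col. A small illustrative helper (not used elsewhere in this script):
def state_id(row, col):
    return 4 * row + col  # e.g. state_id(3, 3) == 15, the goal state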
# Transition function: |A| x |S| x |S'| array
T = np.zeros([4,17,17])
a = 0.8; # probability of the intended move
b = 0.1; # probability of each lateral (slip) move
# up (a = 0)
T[0,0,0] = a+b;
T[0,0,1] = b;
T[0,1,0] = b;
T[0,1,1] = a;
T[0,1,2] = b;
T[0,2,1] = b;
T[0,2,2] = a;
T[0,2,3] = b;
T[0,3,2] = b;
T[0,3,3] = a+b;
T[0,4,4] = b;
T[0,4,0] = a;
T[0,4,5] = b;
T[0,5,4] = b;
T[0,5,1] = a;
T[0,5,6] = b;
T[0,6,5] = b;
T[0,6,2] = a;
T[0,6,7] = b;
T[0,7,6] = b;
T[0,7,3] = a;
T[0,7,7] = b;
T[0,8,8] = b;
T[0,8,4] = a;
T[0,8,9] = b;
T[0,9,8] = b;
T[0,9,5] = a;
T[0,9,10] = b;
T[0,10,9] = b;
T[0,10,6] = a;
T[0,10,11] = b;
T[0,11,10] = b;
T[0,11,7] = a;
T[0,11,11] = b;
T[0,12,12] = b;
T[0,12,8] = a;
T[0,12,13] = b;
T[0,13,12] = b;
T[0,13,9] = a;
T[0,13,14] = b;
T[0,14,13] = b;
T[0,14,10] = a;
T[0,14,15] = b;
T[0,15,16] = 1;
T[0,16,16] = 1;
# down (a = 1)
T[1,0,0] = b;
T[1,0,4] = a;
T[1,0,1] = b;
T[1,1,0] = b;
T[1,1,5] = a;
T[1,1,2] = b;
T[1,2,1] = b;
T[1,2,6] = a;
T[1,2,3] = b;
T[1,3,2] = b;
T[1,3,7] = a;
T[1,3,3] = b;
T[1,4,4] = b;
T[1,4,8] = a;
T[1,4,5] = b;
T[1,5,4] = b;
T[1,5,9] = a;
T[1,5,6] = b;
T[1,6,5] = b;
T[1,6,10] = a;
T[1,6,7] = b;
T[1,7,6] = b;
T[1,7,11] = a;
T[1,7,7] = b;
T[1,8,8] = b;
T[1,8,12] = a;
T[1,8,9] = b;
T[1,9,8] = b;
T[1,9,13] = a;
T[1,9,10] = b;
T[1,10,9] = b;
T[1,10,14] = a;
T[1,10,11] = b;
T[1,11,10] = b;
T[1,11,15] = a;
T[1,11,11] = b;
T[1,12,12] = a+b;
T[1,12,13] = b;
T[1,13,12] = b;
T[1,13,13] = a;
T[1,13,14] = b;
T[1,14,13] = b;
T[1,14,14] = a;
T[1,14,15] = b;
T[1,15,16] = 1;
T[1,16,16] = 1;
# left (a = 2)
T[2,0,0] = a+b;
T[2,0,4] = b;
T[2,1,1] = b;
T[2,1,0] = a;
T[2,1,5] = b;
T[2,2,2] = b;
T[2,2,1] = a;
T[2,2,6] = b;
T[2,3,3] = b;
T[2,3,2] = a;
T[2,3,7] = b;
T[2,4,0] = b;
T[2,4,4] = a;
T[2,4,8] = b;
T[2,5,1] = b;
T[2,5,4] = a;
T[2,5,9] = b;
T[2,6,2] = b;
T[2,6,5] = a;
T[2,6,10] = b;
T[2,7,3] = b;
T[2,7,6] = a;
T[2,7,11] = b;
T[2,8,4] = b;
T[2,8,8] = a;
T[2,8,12] = b;
T[2,9,5] = b;
T[2,9,8] = a;
T[2,9,13] = b;
T[2,10,6] = b;
T[2,10,9] = a;
T[2,10,14] = b;
T[2,11,7] = b;
T[2,11,10] = a;
T[2,11,15] = b;
T[2,12,8] = b;
T[2,12,12] = a+b;
T[2,13,9] = b;
T[2,13,12] = a;
T[2,13,13] = b;
T[2,14,10] = b;
T[2,14,13] = a;
T[2,14,14] = b;
T[2,15,16] = 1;
T[2,16,16] = 1;
# right (a = 3)
T[3,0,0] = b;
T[3,0,1] = a;
T[3,0,4] = b;
T[3,1,1] = b;
T[3,1,2] = a;
T[3,1,5] = b;
T[3,2,2] = b;
T[3,2,3] = a;
T[3,2,6] = b;
T[3,3,3] = a+b;
T[3,3,7] = b;
T[3,4,0] = b;
T[3,4,5] = a;
T[3,4,8] = b;
T[3,5,1] = b;
T[3,5,6] = a;
T[3,5,9] = b;
T[3,6,2] = b;
T[3,6,7] = a;
T[3,6,10] = b;
T[3,7,3] = b;
T[3,7,7] = a;
T[3,7,11] = b;
T[3,8,4] = b;
T[3,8,9] = a;
T[3,8,12] = b;
T[3,9,5] = b;
T[3,9,10] = a;
T[3,9,13] = b;
T[3,10,6] = b;
T[3,10,11] = a;
T[3,10,14] = b;
T[3,11,7] = b;
T[3,11,11] = a;
T[3,11,15] = b;
T[3,12,8] = b;
T[3,12,13] = a;
T[3,12,12] = b;
T[3,13,9] = b;
T[3,13,14] = a;
T[3,13,13] = b;
T[3,14,10] = b;
T[3,14,15] = a;
T[3,14,14] = b;
T[3,15,16] = 1;
T[3,16,16] = 1;
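# Sanity check (illustrative; assumes T is intended to be row-stochastic):
# for every action and state, the outgoing transition probabilities must sum to 1.
assert np.allclose(T.sum(axis=2), 1.0), "each T[a,s,:] should sum to 1"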
# Reward function: |A| x |S| array
R = -1 * np.ones([4,17]);
# set rewards
R[:,15] = 100; # goal state
R[:,9] = -70; # bad state
R[:,16] = 0; # end state
# Discount factor: scalar in [0,1)
discount = 0.95
# MDP object
mdp = MDP.MDP(T,R,discount)
# RL problem
rlProblem = RL.RL(mdp, np.random.normal)
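# (Assumption: the second argument to RL.RL is a reward-sampling function, so
# passing np.random.normal would make observed rewards noisy samples around the
# mean reward; the exact constructor signature is defined in the RL module.)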
# Test Q-learning
rewardList_sum_005 = np.zeros(200)
for i in range(100):
    [Q,policy,rewardList] = rlProblem.qLearning(s0=0,initialQ=np.zeros([mdp.nActions,mdp.nStates]),nEpisodes=200,nSteps=100,epsilon=0.05)
    rewardList_sum_005 += rewardList
print("epsilon=0.05\n ","rewardList_avg",rewardList_sum_005/100)
rewardList_sum_01 = np.zeros(200)
for i in range(100):
    [Q,policy,rewardList] = rlProblem.qLearning(s0=0,initialQ=np.zeros([mdp.nActions,mdp.nStates]),nEpisodes=200,nSteps=100,epsilon=0.1)
    rewardList_sum_01 += rewardList
print("epsilon=0.1\n ","rewardList_avg",rewardList_sum_01/100)
rewardList_sum_03 = np.zeros(200)
for i in range(100):
    [Q,policy,rewardList] = rlProblem.qLearning(s0=0,initialQ=np.zeros([mdp.nActions,mdp.nStates]),nEpisodes=200,nSteps=100,epsilon=0.3)
    rewardList_sum_03 += rewardList
print("epsilon=0.3\n ","rewardList_avg",rewardList_sum_03/100)
rewardList_sum_05 = np.zeros(200)
for i in range(100):
    [Q,policy,rewardList] = rlProblem.qLearning(s0=0,initialQ=np.zeros([mdp.nActions,mdp.nStates]),nEpisodes=200,nSteps=100,epsilon=0.5)
    rewardList_sum_05 += rewardList
print("epsilon=0.5\n ","rewardList_avg",rewardList_sum_05/100)
import matplotlib.pyplot as plt
fig = plt.figure()
X = np.arange(1,201)
plt.plot(X,rewardList_sum_005/100,label="0.05")
plt.plot(X,rewardList_sum_01/100,label="0.1",linestyle="--")
plt.plot(X,rewardList_sum_03/100,label="0.3",linestyle="-.")
plt.plot(X,rewardList_sum_05/100,label="0.5")
plt.title("XXX's result")
plt.legend()
plt.savefig('Q-learning.jpg')
plt.show()
# Reference policy output (one action per state 0..16; 0=up, 1=down, 2=left, 3=right):
# [3 3 3 1 3 3 3 1 1 3 3 1 3 3 3 0 0]
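# A more compact sketch of the epsilon sweep above (assuming rlProblem.qLearning
# keeps the signature used in this script); it averages the per-episode returns
# over 100 independent runs for each exploration rate:
def sweep_epsilons(epsilons=(0.05, 0.1, 0.3, 0.5), nRuns=100, nEpisodes=200):
    curves = {}
    for eps in epsilons:
        total = np.zeros(nEpisodes)
        for _ in range(nRuns):
            _, _, rewardList = rlProblem.qLearning(
                s0=0, initialQ=np.zeros([mdp.nActions, mdp.nStates]),
                nEpisodes=nEpisodes, nSteps=100, epsilon=eps)
            total += rewardList
        curves[eps] = total / nRuns
    return curves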