My Learning Channel

TD learning

2022-09-18

Introduction

state evaluation: is this state good, and how good is it?

policy learning:

Given my current policy, how good is this state?

What is the best policy? (the optimal control problem)

grid world: a grid of cells, like a small road network

goal 1 2 3
4 5 6 7
8 9 10 11
12 13 14 goal

Formulating this as a reinforcement learning problem

Objective: reach a goal cell as quickly as possible

1. state(s):

1–14

goal

2. reward(r):

r(goal) = 1

r(1-14) = -1

3. action(A):

up, down, left, right

4. episode:

each action moves one cell; an episode runs from a random start cell until a goal cell is reached

Algorithm

V(St) ← V(St) + α[Rt+1 + γV(St+1) - V(St)]

The simplest policy: the equiprobable random policy (every action chosen with equal probability).
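A minimal sketch of one such update (not from the original code; alpha, V_demo and R are throwaway names, set to the same α = 0.1 and γ = 1 used later):

# one hand-written TD(0) update with illustrative values
alpha, gamma = 0.1, 1.0             # same settings as the learner below
V_demo = {'s': 0.0, 's_next': 0.0}  # toy value table, initialized to zero

R = -1                                                  # step cost received on the transition
pred_err = R + gamma * V_demo['s_next'] - V_demo['s']   # TD error: target minus current estimate
V_demo['s'] = V_demo['s'] + alpha * pred_err            # move V(s) a little toward the target

print(pred_err, V_demo['s'])  # -1.0 -0.1, the same -0.1 that shows up in the first V printout below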

Code implementation

# environment: an m*n grid; cell types: goal / ordinary / windy / cliff / blocking / start point
# (other variations possible)
# task: can use temporal discounting, e.g. R(goal) = 0, R(ordinary) = -1, others = 0
# learning: TD evaluation; dynamic programming could be added
# GUI to be added
import numpy as np 
import matplotlib.pyplot as plt
from IPython.core.debugger import set_trace
# Environment setup: Sutton book, example 4.1
gsize = [4,4]
gw = np.zeros([gsize[0],gsize[1]])
gw[0,0] = 1   # goal cell in the top-left corner
gw[3,3] = 1   # goal cell in the bottom-right corner
print(gw)

# action and transition function
def state_act(state, action, gsize):
    # action is a character: 'u', 'd', 'l' or 'r' (up, down, left, right)
    # state is a length-2 list marking the current position
    newstate = state[:]
    if action == 'l':
        newstate[1] = max(0, state[1]-1)
    elif action == 'r':
        newstate[1] = min(gsize[1]-1, state[1]+1)
    elif action == 'u':
        newstate[0] = max(0, state[0]-1)
    elif action == 'd':
        newstate[0] = min(gsize[0]-1, state[0]+1)
    else:
        raise ValueError("action not valid")

    return newstate

# reward setup
def reward(state, gw):
    # state is the current position; gw is the grid-world layout
    if gw[state[0], state[1]] == 1:
        R = 1
    else:
        R = -1  # -1 per step on ordinary cells, so longer paths cost more
    return R
[[1. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 1.]]
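
A quick sanity check of the two functions above (my addition, not in the original notebook):

# quick sanity check of the transition and reward functions
print(state_act([1, 0], 'l', gsize))  # [1, 0]: moving left off the edge leaves the agent in place
print(reward([0, 0], gw))             # 1: the top-left corner is a goal cell
print(reward([1, 2], gw))             # -1: ordinary cells cost one step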
# Learner setup
A = .1      # learning rate
gamma = 1   # no temporal discount for future states
def policy(state):
    # equiprobable random policy: ignore the state, pick one of the four actions uniformly
    acts = ['u','d','l','r']
    return acts[np.random.randint(len(acts))]
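
To confirm the policy really is equiprobable, a throwaway check (assuming the policy definition and the numpy import above):

# sample the random policy many times and count each action
counts = {a: 0 for a in ['u', 'd', 'l', 'r']}
for _ in range(10000):
    counts[policy([0, 0])] += 1  # the state argument is ignored by this policy
print(counts)  # each action should come up in roughly a quarter of the samples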
# start learning
Nepis = 10  # total number of episodes
V = np.zeros_like(gw)
#V[0,0] = 1
#V[3,3] = 1

for k in range(Nepis):
    # start each episode from a random cell
    s1 = np.random.randint(0,4)
    s2 = np.random.randint(0,4)
    s = [s1,s2]
    print('episode:', k, 'start position:', s)

    # follow the random policy until a goal cell is reached
    while gw[s[0],s[1]] != 1:
        a = policy(s)
        print('action:', a)
        s_new = state_act(s, a, gsize)
        print('new state:', s_new)
        pred_err = reward(s, gw) + gamma * V[s_new[0], s_new[1]] - V[s[0], s[1]]  # TD error
        V[s[0], s[1]] = V[s[0], s[1]] + A * pred_err                              # TD(0) update
        s = s_new
    print(V)
episode: 0 start position: [0, 2]
action: u
new state: [0, 2]
action: r
new state: [0, 3]
action: l
new state: [0, 2]
action: d
new state: [1, 2]
action: l
new state: [1, 1]
action: d
new state: [2, 1]
action: d
new state: [3, 1]
action: d
new state: [3, 1]
action: l
new state: [3, 0]
action: l
new state: [3, 0]
action: r
new state: [3, 1]
action: r
new state: [3, 2]
action: l
new state: [3, 1]
action: u
new state: [2, 1]
action: l
new state: [2, 0]
action: d
new state: [3, 0]
action: r
new state: [3, 1]
action: l
new state: [3, 0]
action: l
new state: [3, 0]
action: r
new state: [3, 1]
action: r
new state: [3, 2]
action: d
new state: [3, 2]
action: r
new state: [3, 3]
[[ 0.         0.        -0.271     -0.119    ]
 [ 0.        -0.1       -0.1        0.       ]
 [-0.1209    -0.19       0.         0.       ]
 [-0.5262269 -0.5184831 -0.30439    0.       ]]
episode: 1 start position: [3, 0]
action: r
new state: [3, 1]
action: l
new state: [3, 0]
action: u
new state: [2, 0]
action: l
new state: [2, 0]
action: l
new state: [2, 0]
action: l
new state: [2, 0]
action: r
new state: [2, 1]
action: l
new state: [2, 0]
action: d
new state: [3, 0]
action: u
new state: [2, 0]
action: d
new state: [3, 0]
action: l
new state: [3, 0]
action: d
new state: [3, 0]
action: l
new state: [3, 0]
action: r
new state: [3, 1]
action: u
new state: [2, 1]
action: d
new state: [3, 1]
action: u
new state: [2, 1]
action: r
new state: [2, 2]
action: u
new state: [1, 2]
action: l
new state: [1, 1]
action: u
new state: [0, 1]
action: r
new state: [0, 2]
action: u
new state: [0, 2]
action: r
new state: [0, 3]
action: u
new state: [0, 3]
action: r
new state: [0, 3]
action: l
new state: [0, 2]
action: r
new state: [0, 3]
action: l
new state: [0, 2]
action: l
new state: [0, 1]
action: l
new state: [0, 0]
[[ 0.         -0.21439    -0.6026592  -0.5429508 ]
 [ 0.         -0.19       -0.2         0.        ]
 [-0.7308809  -0.51268322 -0.11        0.        ]
 [-1.12506338 -0.77435982 -0.30439     0.        ]]
episode: 2 start position: [2, 1]
action: u
new state: [1, 1]
action: l
new state: [1, 0]
action: u
new state: [0, 0]
[[ 0.         -0.21439    -0.6026592  -0.5429508 ]
 [-0.1        -0.271      -0.2         0.        ]
 [-0.7308809  -0.5804149  -0.11        0.        ]
 [-1.12506338 -0.77435982 -0.30439     0.        ]]
episode: 3 start position: [0, 0]
[[ 0.         -0.21439    -0.6026592  -0.5429508 ]
 [-0.1        -0.271      -0.2         0.        ]
 [-0.7308809  -0.5804149  -0.11        0.        ]
 [-1.12506338 -0.77435982 -0.30439     0.        ]]
episode: 4 start position: [3, 3]
[[ 0.         -0.21439    -0.6026592  -0.5429508 ]
 [-0.1        -0.271      -0.2         0.        ]
 [-0.7308809  -0.5804149  -0.11        0.        ]
 [-1.12506338 -0.77435982 -0.30439     0.        ]]
episode: 5 start position: [1, 2]
action: l
new state: [1, 1]
action: r
new state: [1, 2]
action: u
new state: [0, 2]
action: u
new state: [0, 2]
action: l
new state: [0, 1]
action: u
new state: [0, 1]
action: u
new state: [0, 1]
action: d
new state: [1, 1]
action: d
new state: [2, 1]
action: l
new state: [2, 0]
action: d
new state: [3, 0]
action: u
new state: [2, 0]
action: l
new state: [2, 0]
action: r
new state: [2, 1]
action: d
new state: [3, 1]
action: r
new state: [3, 2]
action: r
new state: [3, 3]
[[ 0.         -0.510412   -0.75383228 -0.5429508 ]
 [-0.1        -0.49519049 -0.43665592  0.        ]
 [-1.04281538 -0.80335133 -0.11        0.        ]
 [-1.19958695 -0.82736283 -0.373951    0.        ]]
episode: 6 start position: [1, 3]
action: d
new state: [2, 3]
action: u
new state: [1, 3]
action: u
new state: [0, 3]
action: r
new state: [0, 3]
action: r
new state: [0, 3]
action: r
new state: [0, 3]
action: l
new state: [0, 2]
action: r
new state: [0, 3]
action: r
new state: [0, 3]
action: u
new state: [0, 3]
action: u
new state: [0, 3]
action: u
new state: [0, 3]
action: d
new state: [1, 3]
action: d
new state: [2, 3]
action: l
new state: [2, 2]
action: l
new state: [2, 1]
action: r
new state: [2, 2]
action: d
new state: [3, 2]
action: d
new state: [3, 2]
action: l
new state: [3, 1]
action: r
new state: [3, 2]
action: r
new state: [3, 3]
[[ 0.         -0.510412   -0.87185295 -1.32506456]
 [-0.1        -0.49519049 -0.43665592 -0.33086557]
 [-1.04281538 -0.85094971 -0.38879672 -0.21      ]
 [-1.19958695 -0.90555577 -0.64836297  0.        ]]
episode: 7 start position: [1, 3]
action: l
new state: [1, 2]
action: d
new state: [2, 2]
action: r
new state: [2, 3]
action: d
new state: [3, 3]
[[ 0.         -0.510412   -0.87185295 -1.32506456]
 [-0.1        -0.49519049 -0.53187    -0.44144461]
 [-1.04281538 -0.85094971 -0.47091705 -0.289     ]
 [-1.19958695 -0.90555577 -0.64836297  0.        ]]
episode: 8 start position: [3, 0]
action: u
new state: [2, 0]
action: u
new state: [1, 0]
action: u
new state: [0, 0]
[[ 0.         -0.510412   -0.87185295 -1.32506456]
 [-0.19       -0.49519049 -0.53187    -0.44144461]
 [-1.04853384 -0.85094971 -0.47091705 -0.289     ]
 [-1.2839098  -0.90555577 -0.64836297  0.        ]]
episode: 9 start position: [3, 0]
action: l
new state: [3, 0]
action: u
new state: [2, 0]
action: l
new state: [2, 0]
action: d
new state: [3, 0]
action: l
new state: [3, 0]
action: r
new state: [3, 1]
action: u
new state: [2, 1]
action: r
new state: [2, 2]
action: u
new state: [1, 2]
action: r
new state: [1, 3]
action: d
new state: [2, 3]
action: r
new state: [2, 3]
action: u
new state: [1, 3]
action: r
new state: [1, 3]
action: u
new state: [0, 3]
action: r
new state: [0, 3]
action: u
new state: [0, 3]
action: l
new state: [0, 2]
action: l
new state: [0, 1]
action: r
new state: [0, 2]
action: u
new state: [0, 2]
action: l
new state: [0, 1]
action: u
new state: [0, 1]
action: u
new state: [0, 1]
action: u
new state: [0, 1]
action: r
new state: [0, 2]
action: d
new state: [1, 2]
action: d
new state: [2, 2]
action: l
new state: [2, 1]
action: r
new state: [2, 2]
action: d
new state: [3, 2]
action: r
new state: [3, 3]
[[ 0.         -1.06739073 -1.14997167 -1.5597434 ]
 [-0.19       -0.49519049 -0.71824595 -0.79608659]
 [-1.27871768 -0.99271238 -0.80438147 -0.50272001]
 [-1.58589056 -1.00009516 -0.68352667  0.        ]]
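
matplotlib is imported at the top but never used; as a possible follow-up (just a sketch, assuming V, gw and Nepis from the training cell are still in scope), the learned values can be drawn as a heatmap:

import matplotlib.pyplot as plt

# show the learned state values; more negative cells are expected to be further from a goal
plt.imshow(V, cmap='viridis')
plt.colorbar(label='estimated V(s)')
plt.title('TD(0) value estimates after %d episodes' % Nepis)
plt.xticks(range(gw.shape[1]))
plt.yticks(range(gw.shape[0]))
plt.show()

With only 10 episodes and a fixed learning rate the estimates are still noisy; running more episodes would push them toward the true values of the random policy.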