My Learning Channel

Off-policy learning

2022-09-18

Introduction

Question: why does an ε-greedy agent keep walking into the cliff?

Off-policy: the policy used during learning differs from the policy that is ultimately estimated (behave with a non-optimal policy, estimate the optimal policy); in short, learn one thing, do another.

Grid world with cliff (cell indices on the top three rows; start, cliff, and goal on the bottom row):

    0      1      2      3      4
    5      6      7      8      9
   10     11     12     13     14
    s  cliff  cliff  cliff   goal
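
To make the sketch concrete, here is a minimal numpy encoding of this small grid (an illustrative toy, using the same 0 / -100 / 1 convention as the full implementation later in the post; toy and start are hypothetical names):

import numpy as np

toy = np.zeros([4, 5])   # 4 rows x 5 columns; the top three rows are the cells numbered 0-14 above
toy[3, 1:4] = -100       # the three cliff cells on the bottom row
toy[3, 4] = 1            # goal in the bottom-right corner
start = [3, 0]           # 's', the start, in the bottom-left corner
print(toy)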

The Q-learning algorithm

# ε-greedy (epsilon-greedy): always exploit, sometimes explore
# exploit, greedy (p = 1 - ε): A_t = argmax_a Q_t(a)
# explore (p = ε): A_t = a random action
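
As a minimal standalone sketch of this rule (the actual e_greedy used in the experiments appears in the implementation section below; Q_t here is assumed to be a 1-D array of action values):

import numpy as np

def epsilon_greedy(Q_t, epsilon=0.1):
    # explore with probability epsilon: pick any action at random
    if np.random.rand() < epsilon:
        return np.random.randint(len(Q_t))
    # exploit with probability 1 - epsilon: pick a greedy action
    return int(np.argmax(Q_t))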

[Figure: reinforcement learning algorithms]

TD learning:

V(S_t) ← V(S_t) + α[R_{t+1} + γ V(S_{t+1}) − V(S_t)]
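
In code the update is a single line; a small self-contained sketch with illustrative numbers (not taken from the post):

import numpy as np

alpha, gamma = 0.5, 1.0
V = np.zeros(5)                    # state values for 5 states
s_t, s_next, r_next = 0, 1, -1.0   # one observed transition and its reward

# V(S_t) <- V(S_t) + alpha * [R_{t+1} + gamma * V(S_{t+1}) - V(S_t)]
V[s_t] = V[s_t] + alpha * (r_next + gamma * V[s_next] - V[s_t])
print(V[s_t])                      # -0.5 after this single update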

On-policy TD control (SARSA):

Q(S_t, A_t) ← Q(S_t, A_t) + α[R_{t+1} + γ Q(S_{t+1}, A_{t+1}) − Q(S_t, A_t)]

Off-policy TD control (Q-learning):

Q(S_t, A_t) ← Q(S_t, A_t) + α[R_{t+1} + γ max_a Q(S_{t+1}, a) − Q(S_t, A_t)]
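
The only difference between the two updates is the bootstrap target; a side-by-side sketch with hypothetical values (Q indexed as Q[state, action]):

import numpy as np

alpha, gamma = 0.5, 1.0
Q = np.zeros((5, 4))                          # 5 states x 4 actions, illustrative sizes
s, a, r, s_next, a_next = 0, 1, -1.0, 2, 3    # one observed transition

# SARSA (on-policy): bootstrap on the action the behavior policy actually chose next
target_sarsa = r + gamma * Q[s_next, a_next]

# Q-learning (off-policy): bootstrap on the greedy action, regardless of what is taken next
target_qlearning = r + gamma * np.max(Q[s_next])

Q[s, a] = Q[s, a] + alpha * (target_qlearning - Q[s, a])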

Code implementation

# environment: an m*n grid with a goal, cliff cells, and a start point (bottom-left corner)
# task: reach the goal; rewards R(goal) = 0, R(cliff) = -100, R(ordinary) = -1 (temporal discounting optional)
# learning algorithm: SARSA or Q-learning (selected by the flags below)
import numpy as np
import matplotlib.pyplot as plt
# Environment setup: Sutton & Barto, Example 6.6 (cliff walking)
# grid configuration
gsize = [4, 12]
s0 = [gsize[0]-1, 0]                 # initial state (bottom-left corner)
gw = np.zeros([gsize[0], gsize[1]])  # 0 marks an ordinary cell
gw[gsize[0]-1, gsize[1]-1] = 1       # goal (bottom-right corner)
gw[gsize[0]-1, 1:-1] = -100          # cliff cells along the bottom row
acts = ['u', 'd', 'l', 'r']
print(gw)

# action and transition function
def state_act(state, action, gsize):
    # action is a character ('u', 'd', 'l', 'r') or the matching index (0-3)
    # state is a length-2 list marking the current position [row, col]
    newstate = state[:]
    if action == 'l' or action == 2:
        newstate[1] = max(0, state[1]-1)
    elif action == 'r' or action == 3:
        newstate[1] = min(gsize[1]-1, state[1]+1)
    elif action == 'u' or action == 0:
        newstate[0] = max(0, state[0]-1)
    elif action == 'd' or action == 1:
        newstate[0] = min(gsize[0]-1, state[0]+1)
    else:
        raise ValueError("action not valid")

    # falling into the cliff returns the agent to the initial state
    if gw[newstate[0], newstate[1]] == -100:
        newstate = s0[:]

    return newstate

# reward setup
def reward(state, gw):
    # state is the current position; gw is the grid-world layout
    if gw[state[0], state[1]] == 1:        # goal
        R = 0
    elif gw[state[0], state[1]] == -100:   # cliff
        R = -100
    else:                                  # ordinary cell
        R = -1
    return R
[[   0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.]
 [   0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.]
 [   0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.]
 [   0. -100. -100. -100. -100. -100. -100. -100. -100. -100. -100.    1.]]
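
A quick sanity check of the two helpers, run after the cell above (s_above_cliff is just an illustrative name):

s_above_cliff = [gsize[0] - 2, 1]            # cell directly above the first cliff cell
print(state_act(s_above_cliff, 'd', gsize))  # falling into the cliff resets the agent to the start
print(reward([gsize[0] - 1, 1], gw))         # the cliff cell itself carries reward -100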
# learning setup
A = .5     # learning rate
gamma = 1  # no temporal discounting of future rewards

def e_greedy(state, Q):
    # ε-greedy action selection
    e = 0.1
    if np.random.rand(1) < e:
        action = np.random.randint(len(acts))
        # print('randomly chosen action:', action)
    else:
        Q_now = Q[state[0]][state[1]]
        # action = np.argmax(Q_now)
        allmax = [i for i, j in enumerate(Q_now) if j == max(Q_now)]  # all actions tied for the largest Q
        action = allmax[np.random.randint(len(allmax))]               # randomly pick one of them
        # print("Q_now:", allmax, "Q action:", action)
    return action
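
Because ties among the largest Q values are broken at random, e_greedy on an all-zero table behaves like a uniform random policy; a small check (Q_demo is a hypothetical name, run after the cells above):

Q_demo = np.zeros([gsize[0], gsize[1], len(acts)])      # all-zero action values: every action ties
picks = [e_greedy(s0, Q_demo) for _ in range(1000)]
print(np.bincount(picks, minlength=len(acts)) / 1000)   # roughly 0.25 per action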
# start learning
Nepis = 200  # total number of episodes
Q = np.zeros([gsize[0], gsize[1], len(acts)])  # Q(St, At)

isQLearning = True
isSARSA = False

for k in range(Nepis):
    s = s0[:]  # start each episode from the initial state
    nstep = 0
    tot_r = 0

    while gw[s[0], s[1]] != 1:
        a = e_greedy(s, Q)              # action from the behavior policy
        s_new = state_act(s, a, gsize)  # next state
        if isSARSA:
            a_new = e_greedy(s_new, Q)  # bootstrap on the same ε-greedy policy
        elif isQLearning:
            a_new = np.argmax(Q[s_new[0]][s_new[1]])  # bootstrap on the greedy action
        nstep = nstep + 1
        pred_err = reward(s, gw) + gamma*Q[s_new[0], s_new[1], a_new] - Q[s[0], s[1], a]
        tot_r = tot_r + reward(s, gw)

        Q[s[0], s[1], a] = Q[s[0], s[1], a] + A * pred_err
        s = s_new

        if a == 0:
            Q_steps = np.around(Q[:, :, 0], 1)
            print("Step {}: moved UP:\n".format(nstep), Q_steps)
        elif a == 1:
            Q_steps = np.around(Q[:, :, 1], 1)
            print("Step {}: moved DOWN:\n".format(nstep), Q_steps)
        elif a == 2:
            Q_steps = np.around(Q[:, :, 2], 1)
            print("Step {}: moved LEFT:\n".format(nstep), Q_steps)
        else:
            Q_steps = np.around(Q[:, :, 3], 1)
            print("Step {}: moved RIGHT:\n".format(nstep), Q_steps)

    print("Episode {}: total reward = {}\n".format(k, tot_r), Q_steps)
# Episode 199: total reward = -13
[[-13.6 -12.8 -11.7 -10.8 -9.9 -8.9 -7.9 -6.9 -5.9 -5. -4. -3. ]
[-12.8 -12. -11. -10. -9. -8. -7. -6. -5. -4. -3. -2. ]
[-13.9 -13.8 -13.2 -14.4 -14.4 -13.6 -13.6 -13.9 -13.5 -12.3 -13.7 -1. ]
[-13.5 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ]]
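
As a follow-up sketch (not part of the original code), the greedy policy can be read directly out of the learned Q table, one letter per cell, using the gw and acts defined above:

arrows = np.array(['u', 'd', 'l', 'r'])   # same ordering as acts
policy = arrows[np.argmax(Q, axis=2)]     # greedy action in every cell
policy[gw == -100] = 'C'                  # mark cliff cells
policy[gw == 1] = 'G'                     # mark the goal
print(policy)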