# Q-learning training loop: run 1000 episodes on the maze.
# Relies on names defined earlier in the file: epsilon, q_table, ac_matrix,
# x_matrix, y_matrix, maze, trace, r_step, learning_rate, gamma, np, plt.
for episode in range(1000):
    # Reset the agent to the start state at the beginning of each episode.
    x = 4
    y = 0
    initial_s = 24      # start state (linear index)
    terminal_s = 0      # goal state
    s = initial_s
    step = 1
    while s != 0:
        # NOTE(review): best action is chosen when uniform() < epsilon, which
        # inverts the usual epsilon-greedy convention — confirm epsilon's meaning.
        if np.random.uniform() < epsilon:
            # Exploit: take the action with the highest Q-value for state s.
            state_action = q_table.loc[s, :]
            # Several actions may share the maximum; break ties randomly.
            p_action = np.random.choice(
                state_action[state_action == np.max(state_action)].index)
        else:
            # Explore: pick uniformly among legal actions (Q != -inf marks
            # moves that would leave the grid).
            a = q_table.loc[s, :]
            p_action = np.random.choice(a[a != float('-inf')].index)
        # Apply the chosen action to get the successor state / coordinates.
        s_ = s + ac_matrix[p_action]
        x_ = x + x_matrix[p_action]
        y_ = y + y_matrix[p_action]
        trace.append(s_)
        step = step + 1
        # Stepping onto a wall cell is penalized more heavily than a free cell.
        if maze[s_] == 1:
            reward = -10
        else:
            reward = -1
        # Standard Q-learning update:
        #   Q(s,a) += lr * (r + gamma * max_a' Q(s',a') - Q(s,a))
        max_q = np.max(q_table.loc[s_, :])
        q_new = q_table.loc[s, p_action] + learning_rate * (
            reward + gamma * max_q - q_table.loc[s, p_action])
        # Bug fix: the original wrote with .iloc while reading with .loc;
        # with label-based s / p_action that targets the wrong cell.
        q_table.loc[s, p_action] = q_new
        s = s_
    r_step.append(step)

# Plot steps-per-episode as a learning curve and save it to disk.
plt.plot(r_step, linewidth=1)
plt.savefig("22.png")
MATLAB code:
Main program:
clc
% Q-learning over a randomly generated N-by-N maze (linear state indexing).
num_of_iteration = 1000;
N = 5;
r = 20;                      % wall density, in percent
gamma = 0.8;                 % discount factor
learning_rate = 0.1;
actions = 4; % North, East, West, South (N, E, W, S)
exploration_rate = 0.4;
% Linear-index offset for each action. Generalized from the hard-coded
% [-5 1 -1 5], which silently assumed N == 5.
action_matrix = [-N 1 -1 N];

maze = create_random_maze(N, r);
q_matrix = construct_q_matrix(N, actions);
% Bug fix: was i = 0, but MATLAB arrays are 1-based, so trace_index(0)
% below would raise an indexing error on the very first step.
i = 1;
reward = 0;
trace_index = [];
for episode = 1 : num_of_iteration
    initial_state = N*N;     % start in the last cell
    goal_state = 1;          % terminate on reaching cell 1
    current_state = initial_state;
    while current_state ~= goal_state
        if rand < exploration_rate
            % Explore: pick uniformly among legal actions (Q > -inf).
            eliminate_inf = find(q_matrix(current_state, :) > -inf);
            prefered_action = eliminate_inf(randi(numel(eliminate_inf)));
        else
            % Exploit: take the current best action (debug echo of the
            % max value removed).
            [~, prefered_action] = max( q_matrix(current_state,:) );
        end
        % Apply the action and receive the immediate reward.
        maze_exp_index = current_state + action_matrix(prefered_action);
        trace_index(i) = maze_exp_index;
        i = i + 1;
        if maze(maze_exp_index) == 1
            reward = -4;     % stepped onto a wall cell
        else
            reward = -1;     % ordinary step cost
        end
        % Q-learning update. Bug fix: in the original the learning term sat
        % after '...' on the same physical line; MATLAB treats everything
        % after '...' as a comment, so the term was lost. Split properly.
        q_matrix(current_state, prefered_action) = ...
            (1 - learning_rate) * q_matrix(current_state, prefered_action) + ...
            learning_rate * (reward + gamma * max(q_matrix(maze_exp_index, :)));
        % Observe the new state.
        current_state = maze_exp_index;
    end
end
maze
q_matrix
Q-table construction function:
function [ q_matrix ] = construct_q_matrix( N, actions )
% Build the initial (N*N)-by-actions Q-table for an N-by-N maze with
% linear state indexing. Illegal moves (those that would leave the grid)
% are marked -inf; all legal entries start at 0.
% Bug fix: the function was misspelled 'costruct_q_matrix' while the main
% program calls 'construct_q_matrix'.
q_matrix = zeros(N*N, actions);
% Action 1 (offset -N) is illegal from states 1..N.
q_matrix(1:N, 1) = -inf;
% Action 4 (offset +N) is illegal from the last N states.
% Bug fix: was N*4+1, which only equals N*(N-1)+1 when N == 5.
q_matrix(N*(N-1)+1 : N*N, 4) = -inf;
for i = 1 : N
    % Action 2 (offset +1) is illegal from states N, 2N, ..., N*N.
    q_matrix(i*N, 2) = -inf;
    % Action 3 (offset -1) is illegal from states 1, N+1, ..., N*(N-1)+1.
    % Bug fix: the original wrote rows i*N+1 for i = 1..N, which skipped
    % state 1 and wrote out-of-bounds row N*N+1 (then trimmed it off).
    q_matrix((i-1)*N + 1, 3) = -inf;
end
end
Environment (maze) matrix function:
function [ maze ] = create_random_maze(N, r)
% Generate an N-by-N maze in which each cell independently becomes a
% wall (1) with probability r/100, and stays free (0) otherwise.
grid = zeros(N, N);
for row = 1 : N
    for col = 1 : N
        % r is a percentage, so draw one uniform sample per cell and
        % compare against r/100.
        grid(row, col) = double(rand < 0.01 * r);
    end
end
maze = grid;
end
The MATLAB version mirrors the Python implementation; the logic is essentially the same, so it is not explained in further detail.