
Adding prizes to the OpenAI Gym maze environment


Is there a way to slightly modify the OpenAI Gym environment by adding a fixed number of prizes (e.g. 2) at random positions on the map, so that the agent not only finds the exit but also collects the prizes along the way?

I have the following code for a Q-learning algorithm using maze-v0:
import gym
import gym_maze 
import numpy as np

env = gym.make("maze-v0")

states_dic = {} #maps (x, y) grid coordinates to a row index of the Q-table
count = 0
for i in range(5):
    for j in range(5):
        states_dic[i,j] = count
        count+=1
        
n_actions = env.action_space.n

#Initialize the Q-table to 0
Q_table = np.zeros((len(states_dic),n_actions))


#number of episodes we will run
n_episodes = 10000
#maximum number of iterations per episode
max_iter_episode = 100
#initialize the exploration probability to 1
exploration_proba = 1
#exploration probability decay rate for exponential decay
exploration_decreasing_decay = 0.001
#minimum exploration probability
min_exploration_proba = 0.01
#discount factor
gamma = 0.99
#learning rate
lr = 0.1

rewards_per_episode = list()


#we iterate over episodes
for e in range(n_episodes):
    #we initialize the first state of the episode
    current_state = env.reset()
    done = False
    
    #sum the rewards that the agent gets from the environment
    total_episode_reward = 0

    for i in range(max_iter_episode): 
        env.render()  #must be called every step to draw the maze
        current_coordinate_x = int(current_state[0])
        current_coordinate_y = int(current_state[1])
        current_Q_table_coordinates = states_dic[current_coordinate_x,current_coordinate_y]

        if np.random.uniform(0,1) < exploration_proba:
            action = env.action_space.sample()
        else:
            action = int(np.argmax(Q_table[current_Q_table_coordinates]))


        next_state,reward,done,_ = env.step(action)

        next_coordinate_x = int(next_state[0]) #get coordinates to be used in dictionary
        next_coordinate_y = int(next_state[1]) #get coordinates to be used in dictionary


        # update our Q-table using the Q-learning iteration
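        # Q(s,a) <- (1 - lr)*Q(s,a) + lr*(reward + gamma*max over a' of Q(s',a'))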
        next_Q_table_coordinates = states_dic[next_coordinate_x,next_coordinate_y]
        
        Q_table[current_Q_table_coordinates, action] = (1 - lr) * Q_table[current_Q_table_coordinates, action] \
            + lr * (reward + gamma * np.max(Q_table[next_Q_table_coordinates, :]))
    
        total_episode_reward = total_episode_reward + reward
        # If the episode is finished, we leave the for loop
        if done:
            break
        current_state = next_state
    #We update the exploration probability using the exponential decay formula
    exploration_proba = max(min_exploration_proba,\
                            np.exp(-exploration_decreasing_decay*e))
    rewards_per_episode.append(total_episode_reward)
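
One way to add prizes without modifying gym_maze itself is to wrap the environment in a gym.Wrapper that intercepts reset() and step(): on reset it scatters a fixed number of prize cells, and on step it adds a bonus to the reward whenever the agent lands on one of them. The following is only a minimal sketch, not a gym-maze feature: the names PrizeWrapper, n_prizes, prize_bonus and grid_size are invented here for illustration, and it assumes the old Gym step API (obs, reward, done, info) and an (x, y) observation, matching the code above.

import gym
import gym_maze  #registers maze-v0
import numpy as np

class PrizeWrapper(gym.Wrapper):
    """Adds a fixed number of one-time bonus rewards at random cells (illustrative sketch)."""
    def __init__(self, env, n_prizes=2, prize_bonus=0.5, grid_size=5):
        super().__init__(env)
        self.n_prizes = n_prizes
        self.prize_bonus = prize_bonus
        self.grid_size = grid_size
        self.prizes = set()

    def reset(self, **kwargs):
        obs = self.env.reset(**kwargs)
        #scatter the prizes on random cells, avoiding the start cell
        self.prizes = set()
        start = (int(obs[0]), int(obs[1]))
        while len(self.prizes) < self.n_prizes:
            cell = (int(np.random.randint(self.grid_size)), int(np.random.randint(self.grid_size)))
            if cell != start:
                self.prizes.add(cell)
        return obs

    def step(self, action):
        obs, reward, done, info = self.env.step(action)
        cell = (int(obs[0]), int(obs[1]))
        if cell in self.prizes:
            reward += self.prize_bonus  #bonus for collecting a prize
            self.prizes.remove(cell)    #each prize can only be collected once
        return obs, reward, done, info

#the wrapped environment is used exactly like the original one:
env = PrizeWrapper(gym.make("maze-v0"), n_prizes=2)

Note that with such a wrapper the reward no longer depends only on the (x, y) position but also on which prizes are still uncollected, so a tabular Q-learner would strictly speaking need the set of remaining prizes in its state; the sketch above keeps only the position for simplicity.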
