
Deep Q-Learning in the Freeway Environment

How can I fix my deep Q-learning agent for the Freeway environment?

I am currently running a deep reinforcement learning algorithm on OpenAI Gym's Freeway environment. The model appears to learn for a while and then its performance resets, giving a periodic pattern. Below is a plot of the reward and the wall-clock time for each episode:

[Figure: per-episode reward and per-episode training time]
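
(The plot itself is not reproduced here. For anyone re-running the experiment, the curves can be recreated from the episodic_rewards and times arrays returned by run_simulation in the code below; this is just a minimal plotting sketch, assuming matplotlib, and is not part of the original post.)

import matplotlib.pyplot as plt

def plot_training_curves(episodic_rewards, times):
  # two stacked panels: per-episode return and per-episode wall-clock time
  fig, (ax_r, ax_t) = plt.subplots(2, 1, sharex=True, figsize=(8, 6))
  ax_r.plot(episodic_rewards)
  ax_r.set_ylabel("episode return")
  ax_t.plot(times)
  ax_t.set_ylabel("seconds per episode")
  ax_t.set_xlabel("episode")
  plt.tight_layout()
  plt.show()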

Here is the current code for our Q-learning agent:

import gym
import numpy as np
import time

class Q_agent():
  def __init__(self, environment, value_func, gamma=0.9, epsilon=0.001, alpha=0.2):
    self.epsilon = epsilon
    self.alpha = alpha
    self.gamma = gamma
    self.value_function = value_func

    # create the Freeway environment and grab an initial preprocessed frame
    self.environment = gym.make("Freeway-v0")
    state = self.environment.reset()
    state = grayscale_resize(state)

    # initialise the value-function models with a stack of identical frames
    states = [state, state, state]
    self.value_function.init_models(states)

    # replay memory, flushed to the value function every 25 samples
    self.memory_Box = []
  
  def run_episode(self, ep=2000):

    # initialise the environment
    self.environment = gym.make("Freeway-v0")
    state = self.environment.reset()
    state = grayscale_resize(state)
    initial = 0
    cumulative_return = 0

    # rolling stack of the four most recent frames (index 3 is the newest)
    state_compilations = np.array([state, state, state, state])
    terminal = False
    end = 0

    while not terminal:


      ##choosing the action
      if initial < 4: # we need 4 frames/observations to call predict
        action = np.random.choice(3)
      elif np.random.uniform()<self.epsilon: #epsilon greedy policy
        action = np.random.choice(3)
      else: #choose actions using the predict function
        state_action_values = self.value_function.predict_best(state_compilations)
        print('state action value:',state_action_values)
        action = np.argmax(state_action_values)

      ## taking the action and reshaping the reward:
      ## Freeway gives +1 per crossing, remapped here to +100, and -1 otherwise
      obs, reward, terminal, _ = self.environment.step(action)
      if reward == 0:
        reward = -1
      else:
        reward = 100
        end += 1  # count successful crossings

      cumulative_return+=reward
      obs = grayscale_resize(obs)

      ##adding the observation to our observation list
      if initial < 4:
        state_compilations[initial] = obs
      else:
        state_compilations[:3] = state_compilations[1:]
        state_compilations[3] = obs

      ##calculating the estimated value of our observation
      if initial < 4:
        initial+=1
        V=reward
      else:
        V = reward + self.gamma*np.max(self.value_function.predict_best(state_compilations))
      
      ## taking a sample for replay experience 10% of the time
      if np.random.uniform()<0.1 and initial >=4:
        self.memory_Box.append([state_compilations[0],state_compilations[1],state_compilations[2],state_compilations[3],action,V])

        if len(self.memory_Box) == 25:
          self.value_function.update(self.memory_Box)
          
          self.memory_Box = []

      # end the episode early after five successful crossings
      if end == 5:
        terminal = True

    self.environment.close()
    return cumulative_return


  def run_simulation(self, start_ep=0, n_episodes=2):
    self.episodic_rewards = np.zeros(n_episodes)
    self.times = np.zeros(n_episodes)

    for ep in range(n_episodes):
      start_time = time.time()
      self.episodic_rewards[ep] = self.run_episode(ep)
      self.times[ep] = time.time() - start_time
      print(self.episodic_rewards[ep])
      print("--- %s seconds ---" % self.times[ep])

    return self.episodic_rewards, self.times
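
Note that grayscale_resize is referenced throughout but not defined in the post. A plausible stand-in, assuming OpenCV and the 84x84 frame size common in the DQN literature (the author's actual preprocessing may differ), would be:

import cv2
import numpy as np

def grayscale_resize(frame, size=(84, 84)):
  # convert the 210x160x3 Atari frame to grayscale, downsample, and scale to [0, 1];
  # the 84x84 target size is an assumption borrowed from the DQN papers
  gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
  small = cv2.resize(gray, size, interpolation=cv2.INTER_AREA)
  return small.astype(np.float32) / 255.0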

There is more code, so it might be easier to share the Colab link, but we genuinely are not sure what is causing this periodicity. Our learning rate is 0.005, which seems quite high compared with earlier Q-learning papers.
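
On the learning-rate point: since value_func is not shown in the post, the following is only a hypothetical sketch of where that 0.005 would enter; the use of Keras/Adam, the layer sizes, and the 84x84x4 input are all assumptions rather than the author's actual setup. For comparison, the original DQN work trained with a much smaller rate (about 2.5e-4 with RMSProp).

from tensorflow import keras

def build_q_network(input_shape=(84, 84, 4), n_actions=3, learning_rate=0.005):
  # hypothetical Q-network; the real value_func may look very different
  model = keras.Sequential([
    keras.layers.Conv2D(16, 8, strides=4, activation="relu", input_shape=input_shape),
    keras.layers.Conv2D(32, 4, strides=2, activation="relu"),
    keras.layers.Flatten(),
    keras.layers.Dense(256, activation="relu"),
    keras.layers.Dense(n_actions),  # one Q-value per Freeway action
  ])
  # this is where the 0.005 from the post would go; DQN-style baselines
  # typically use rates closer to 1e-4 - 2.5e-4
  model.compile(optimizer=keras.optimizers.Adam(learning_rate=learning_rate), loss="mse")
  return model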
