How to solve the CartPole problem with deep reinforcement learning (DQN)
I tried to implement the simplest deep Q-learning algorithm. I believe I implemented it correctly, and I know deep Q-learning struggles with divergence, but the reward drops off very quickly and the loss diverges as well. I would be grateful if someone could point me toward the right hyperparameters, or tell me whether I implemented the algorithm incorrectly. I have tried many hyperparameter combinations and also varied the complexity of the QNet.
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import collections
import numpy as np
import matplotlib.pyplot as plt
import gym
from torch.nn.modules.linear import Linear
from torch.nn.modules.loss import MSELoss
class ReplayBuffer:
    def __init__(self, max_replay_size, batch_size):
        self.max_replay_size = max_replay_size
        self.batch_size = batch_size
        self.buffer = collections.deque()

    def push(self, *transition):
        # Drop the oldest transition once the buffer is full
        if len(self.buffer) == self.max_replay_size:
            self.buffer.popleft()
        self.buffer.append(transition)

    def sample_batch(self):
        # Sample a random batch of stored (state, action, reward, next_state, done) tuples
        indices = np.random.choice(len(self.buffer), self.batch_size, replace=False)
        batch = [self.buffer[index] for index in indices]
        state, action, reward, next_state, done = zip(*batch)
        state = np.array(state)
        action = np.array(action)
        reward = np.array(reward)
        next_state = np.array(next_state)
        done = np.array(done)
        return state, action, reward, next_state, done

    def __len__(self):
        return len(self.buffer)
class QNet(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(QNet, self).__init__()
        self.linear1 = Linear(in_features=state_dim, out_features=64)
        self.linear2 = Linear(in_features=64, out_features=action_dim)

    def forward(self, x):
        x = self.linear1(x)
        x = F.relu(x)
        x = self.linear2(x)
        return x
def train(replay_buffer, model, target_model, discount_factor, mse, optimizer):
    state, action, reward, next_state, done = replay_buffer.sample_batch()
    state, next_state = torch.tensor(state, dtype=torch.float), torch.tensor(next_state, dtype=torch.float)

    # Compute Q Value and Target Q Value
    q_values = model(state).gather(1, torch.tensor(action, dtype=torch.int64).unsqueeze(-1))
    with torch.no_grad():
        max_next_q_values = target_model(next_state).detach().max(1)[0]
        q_target_value = torch.tensor(reward, dtype=torch.float) + discount_factor * max_next_q_values

    optimizer.zero_grad()
    loss = mse(q_values, q_target_value.unsqueeze(1))
    loss.backward()
    optimizer.step()
    return loss.item()
def main():
    # Define Hyperparameters and Parameters
    EPISODES = 10000
    MAX_REPLAY_SIZE = 10000
    BATCH_SIZE = 32
    EPSILON = 1.0
    MIN_EPSILON = 0.05
    DISCOUNT_FACTOR = 0.95
    DECAY_RATE = 0.99
    LEARNING_RATE = 1e-3
    SYNCHRONISATION = 33
    EVALUATION = 32

    # Initialize Environment, Model, Target-Model, Optimizer, Loss Function and Replay Buffer
    env = gym.make("CartPole-v0")
    model = QNet(state_dim=env.observation_space.shape[0], action_dim=env.action_space.n)
    target_model = QNet(state_dim=env.observation_space.shape[0], action_dim=env.action_space.n)
    target_model.load_state_dict(model.state_dict())
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
    mse = MSELoss()
    replay_buffer = ReplayBuffer(max_replay_size=MAX_REPLAY_SIZE, batch_size=BATCH_SIZE)
    # Fill the replay buffer with random transitions before training starts
    while len(replay_buffer) != MAX_REPLAY_SIZE:
        state = env.reset()
        done = False
        while done != True:
            action = env.action_space.sample()
            next_state, reward, done, _ = env.step(action)
            replay_buffer.push(state, action, reward, next_state, done)
            state = next_state
    # Begin with the Main Loop where the QNet is trained
    count_until_synchronisation = 0
    count_until_evaluation = 0
    history = {'Episode': [], 'Reward': [], 'Loss': []}
    for episode in range(EPISODES):
        total_reward = 0.0
        total_loss = 0.0
        state = env.reset()
        iterations = 0
        done = False
        while done != True:
            count_until_synchronisation += 1
            count_until_evaluation += 1

            # Take an action (epsilon-greedy)
            if np.random.rand(1) < EPSILON:
                action = env.action_space.sample()
            else:
                with torch.no_grad():
                    output = model(torch.tensor(state, dtype=torch.float)).numpy()
                    action = np.argmax(output)

            # Observe new state and reward + store into replay_buffer
            next_state, reward, done, _ = env.step(action)
            total_reward += reward
            replay_buffer.push(state, action, reward, next_state, done)
            state = next_state

            if count_until_synchronisation % SYNCHRONISATION == 0:
                target_model.load_state_dict(model.state_dict())

            if count_until_evaluation % EVALUATION == 0:
                loss = train(replay_buffer=replay_buffer, model=model, target_model=target_model,
                             discount_factor=DISCOUNT_FACTOR, mse=mse, optimizer=optimizer)
                total_loss += loss

            iterations += 1

        print(f"Episode {episode} is concluded in {iterations} iterations with a total reward of {total_reward}")

        if EPSILON > MIN_EPSILON:
            EPSILON *= DECAY_RATE

        history['Episode'].append(episode)
        history['Reward'].append(total_reward)
        history['Loss'].append(total_loss)
    # Plot the Loss + Reward per Episode
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(history['Episode'], history['Reward'], label="Reward")
    ax.set_xlabel('Episodes', fontsize=15)
    ax.set_ylabel('Total Reward per Episode', fontsize=15)
    plt.legend(prop={'size': 15})
    plt.show()

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(history['Episode'], history['Loss'], label="Loss")
    ax.set_xlabel('Episodes', fontsize=15)
    ax.set_ylabel('Total Loss per Episode', fontsize=15)
    plt.legend(prop={'size': 15})
    plt.show()
if __name__ == "__main__":
    main()
Answers
Your code looks fine to me and is well written; the hyperparameters seem reasonable (except that maybe the update frequencies are too low), but I think the Q-network is quite small, with a single hidden dense layer.
A deeper model might do better (though probably no more than 3-4 layers), but you said you already tried different network sizes.
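For illustration, a slightly deeper Q-network could look like the sketch below; the layer widths (64 and 32) and the class name DeeperQNet are assumptions made for this example, not tuned values.

# A minimal sketch of a deeper Q-network, assuming the same state_dim/action_dim
# interface as the QNet in the question; layer widths are illustrative only.
import torch.nn as nn
import torch.nn.functional as F

class DeeperQNet(nn.Module):
    def __init__(self, state_dim, action_dim):
        super().__init__()
        self.linear1 = nn.Linear(state_dim, 64)
        self.linear2 = nn.Linear(64, 32)
        self.linear3 = nn.Linear(32, action_dim)

    def forward(self, x):
        x = F.relu(self.linear1(x))
        x = F.relu(self.linear2(x))
        return self.linear3(x)  # raw Q-values, one per action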
Another thing that comes to mind is the target update. You are doing a hard update every n steps; a soft update might help, but I would not count on it.
You could also try lowering the learning rate a bit, but I imagine you already did that.
My suggestions would be:
- try less frequent target updates
- try a bigger network (deeper, e.g. 2/3 dense layers with 32 nodes), if you have not already
- look into soft target updates (Polyak averaging and so on); a small sketch follows after this list
- try your implementation on other simple gym environments and check whether it still behaves the same way.
Unfortunately DQN is not ideal and will not converge for a lot of problems, but it should be able to solve CartPole.
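As a rough sketch of the soft target update mentioned above (Polyak averaging), assuming model and target_model are the two QNet instances from the question and tau is an assumed smoothing coefficient, something like the following could be called once per environment step in place of the periodic load_state_dict:

import torch

def soft_update(model, target_model, tau=0.005):
    # Blend the online weights into the target weights:
    # target <- tau * online + (1 - tau) * target, parameter by parameter.
    # tau = 0.005 is an assumed value, not something tuned for this problem.
    with torch.no_grad():
        for param, target_param in zip(model.parameters(), target_model.parameters()):
            target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)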
Your code looks fine to me, but I think your hyperparameters are not ideal. I would change two things, possibly three:
- If I am not mistaken, you update the target network every 32 steps. I think that interval is far too short. In the original paper by Mnih et al., they perform a hard update every 10k steps. Think about it: the target network is used to compute the loss, so you are essentially changing the loss function every 32 steps, which is more than once per episode.
- Your replay buffer is very small. I would set it to 100k or 1M, even if that is longer than you intend to train for. If the replay buffer is too small, you lose the oldest transitions, which can cause your network to "forget" things it has already learned. Not sure how dramatic this is for CartPole, but maybe it is worth a try...
- The learning rate could also be lower; I use 1e-4 with RMSProp. In general, changing the optimizer can also produce different results (a sketch of these settings follows this list).
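A minimal sketch of those suggested settings; the numbers are taken from the points above rather than tuned here, and model refers to the QNet instance from the question:

import torch.optim as optim

SYNCHRONISATION = 10_000    # hard target-network update every ~10k steps, as in Mnih et al.
MAX_REPLAY_SIZE = 100_000   # much larger replay buffer so old transitions are kept longer
LEARNING_RATE = 1e-4        # lower learning rate

# RMSProp in place of Adam, with the lower learning rate
optimizer = optim.RMSprop(model.parameters(), lr=LEARNING_RATE)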
Hope this helps, good luck :)