如何使用 Q-Learning 算法同时为多个对象查找最短路径

我正在尝试使用 Q-Learning 算法，为多个对象同时（并行而非依次）寻找最短路径。

这是我正在编写的代码

import networkx as nx
import numpy as np
import matplotlib.pyplot as plt

构建图并将其可视化（奖励矩阵和 Q 矩阵在随后的代码段中初始化）

# Edge list of the example graph. The graph built below is undirected, so the
# (0, 1) / (1, 0) pair describes one and the same edge.
edges = [
    (0, 1), (1, 0), (0, 3), (3, 2), (2, 4), (4, 5), (5, 6),
    (6, 7), (7, 8), (8, 9), (9, 10), (10, 11), (11, 12), (12, 6),
]

# Build the undirected graph from the edge list.
G = nx.Graph()
G.add_edges_from(edges)

# Compute a spring layout with a fixed seed, then draw nodes, edges and
# node labels using those positions and show the plot.
pos = nx.spring_layout(G, seed=1)
nx.draw_networkx_nodes(G, pos)
nx.draw_networkx_edges(G, pos)
nx.draw_networkx_labels(G, pos)
plt.show()

初始化奖励矩阵 R 和 Q 矩阵（接受起始节点并返回下一个节点的函数 next_number 在其后定义）

# Reward matrix, indexed as R[from_node, to_node]. Every entry stays 0 except
# the moves that step from a neighbour into node 10, which are set to 100.
R = np.matrix(np.zeros(shape=(13, 13)))
for node in G.nodes:
    if node != 10:
        continue
    for x in G[node]:
        R[x, node] = 100
    
            
# Q matrix, indexed as Q[from_node, to_node]. Every entry starts at -100 so
# that non-adjacent pairs initially rank below every real edge; the entries
# for real edges are then reset to 0 in both directions.
Q = np.matrix(np.zeros(shape=(13, 13))) - 100
for node in G.nodes:
    for x in G[node]:
        Q[node, x] = Q[x, node] = 0
        
    
import pandas as pd
# Wrap R and Q in DataFrames for tabular inspection. The results are neither
# assigned nor printed here -- presumably these lines come from notebook cells.
pd.DataFrame(R)
pd.DataFrame(Q)

函数 next_number 接受一个起始节点并返回下一个节点（用于更新所采取动作的 Q 值的函数 updateQ 在其后定义）

import random
def next_number(start, er):
    """Pick the next node to move to from ``start`` (epsilon-greedy).

    With probability ``er`` a neighbour of ``start`` in the module-level
    graph ``G`` is chosen uniformly at random (exploration). Otherwise one
    of the columns holding the maximum value in row ``start`` of the
    module-level matrix ``Q`` is chosen, with ties broken at random
    (exploitation).

    Args:
        start: Node to move from; used both as a key into ``G`` and as a
            row index into ``Q``.
        er: Exploration rate, compared against a uniform draw from [0, 1].

    Returns:
        The chosen next node as a plain ``int``.
    """
    if random.uniform(0, 1) < er:
        # Materialise the neighbours explicitly instead of relying on how
        # NumPy coerces a NetworkX adjacency view into an array.
        candidates = list(G[start])
    else:
        row = Q[start,]
        # Q is an np.matrix, so ``row`` is 2-D (1 x N) and np.where returns
        # (row_indices, column_indices); the columns are the target nodes.
        candidates = np.where(row == np.max(row))[1]
    # Draw a scalar (no size argument): int() on a 1-element array is
    # deprecated since NumPy 1.25.
    return int(np.random.choice(candidates))

函数 updateQ 用于更新所采取动作的 Q 值；函数 learn 步行 50000 次以提高 Q 值

def updateQ(node1, node2, lr, discount):
    """Update ``Q[node1, node2]`` after taking the step node1 -> node2.

    Applies the Q-learning update
    ``Q <- (1 - lr) * Q + lr * (R + discount * max(Q[node2, :]))``
    to the module-level matrices ``Q`` and ``R``, in place.

    Args:
        node1: Node the step starts from (row index).
        node2: Node the step ends at (column index, and the row whose
            maximum is used as the estimate of future value).
        lr: Learning rate, the weight given to the new estimate.
        discount: Discount factor applied to the best value at ``node2``.
    """
    # Only the maximum value of row node2 is needed. All tied indices hold
    # the same value, so no random tie-break (and no conversion of a
    # 1-element index array to int, deprecated in NumPy >= 1.25) is required.
    max_value = np.max(Q[node2,])
    # NOTE(review): int() truncates the updated value to a whole number, as
    # the original code did; kept so stored Q-values are unchanged.
    Q[node1, node2] = int(
        (1 - lr) * Q[node1, node2]
        + lr * (R[node1, node2] + discount * max_value)
    )

def learn(er, lr, discount, steps=50000):
    """Train the module-level ``Q`` matrix with random one-step walks.

    Each iteration picks a uniformly random start node, chooses the next
    node with ``next_number`` and applies ``updateQ`` to that step.

    Args:
        er: Exploration rate passed to ``next_number``.
        lr: Learning rate passed to ``updateQ``.
        discount: Discount factor passed to ``updateQ``.
        steps: Number of one-step updates to perform (default 50000).
    """
    # Node labels double as row/column indices of Q, so the number of rows
    # of Q is the exclusive upper bound for the random start node.
    n_nodes = Q.shape[0]
    for _ in range(steps):
        start = np.random.randint(0, n_nodes)
        next_node = next_number(start, er)
        # updateQ takes (node1, node2, lr, discount); the learning rate was
        # previously omitted from this call, which raised a TypeError.
        updateQ(start, next_node, lr, discount)
        

learn(0.5,0.8,0.8) 
# Arguments, in order: exploration rate, learning rate, discount factor.