
Model-free Q-learning agent is not improving

I am applying a model-free Q-learning algorithm to keep the response time within the range [0.4, 0.6] by adding and removing VMs (horizontal autoscaling). I use the factors [0, 1, 2, -1, -2], which means 0: don't scale, 1: add one VM, -1: remove one VM, and so on. I read the measurements from Amazon CloudWatch once per minute. I trained the algorithm for 10 episodes, each with 200 steps. The result is that the chosen action for every state is 0. The Q-table I got is:

    [[7.00356635 2.44257089 2.48274487 2.61551342 2.30166671]
     [7.06118893 2.88096418 3.68686476 4.12289437 3.78545427]
     [6.48884566 3.38330439 2.71853144 3.03036759 3.3645094 ]]

The first row is "low", by which I mean the response time is below 0.4. The last row is "high", meaning the response time is above 0.6. What is the problem, and why isn't the agent learning well? I expect the agent to add a VM when the response time is high and to release a VM when the response time is low.
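
For reference, reading the greedy policy out of this Q-table with an argmax per row (assuming the columns follow the same action order [0, 1, 2, -1, -2] as the factors listed above) gives action 0 for every state, because the first column holds the largest value in every row:

    import numpy as np

    actions = [0, 1, 2, -1, -2]   # assumed column order, matching the factors above
    q_table = np.array([
        [7.00356635, 2.44257089, 2.48274487, 2.61551342, 2.30166671],  # "L": response time < 0.4
        [7.06118893, 2.88096418, 3.68686476, 4.12289437, 3.78545427],  # "R": response time in range
        [6.48884566, 3.38330439, 2.71853144, 3.03036759, 3.3645094],   # "H": response time > 0.6
    ])
    for state, row in zip(["L", "R", "H"], q_table):
        print(state, "->", actions[np.argmax(row)])  # argmax is column 0 in every row, i.e. action 0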

The code is:

    import math
    import random
    import time
    import numpy

    # CloudwatchWrapper and AutoscalingWrapper are helper classes defined elsewhere,
    # as are the settings namespaces, metric_names, dimensions, statistics,
    # lower_value, R_value and upper_value.

    class AgentWrapper:

        states = ["L","R","H"]        # low / in-range / high response time

        actions = [0,1,2,-1,-2]       # number of VMs to add (+) or remove (-)

        q_table = numpy.zeros((len(states),len(actions)))

        def average_RT_normalize(self,a,b,R_value,average_measurment):
            # squash the measured response time into (-b, b) with an arctan
            normalize = b*2/math.pi*math.atan(average_measurment/R_value-a)
            return normalize

        def reward_fun(self,lower_value,upper_value):
            # triangular reward: 0 outside [lower_value, upper_value],
            # rising to 1 as the response time approaches R_value from either side
            response_time_reward = 0

            # calculate the average response time reward
            if self.list_metrics[0] <= lower_value:
                response_time_reward = 0
            elif self.list_metrics[0] > lower_value and self.list_metrics[0] <= R_value:
                response_time_reward = (self.list_metrics[0] - lower_value)/(R_value - lower_value)
            elif self.list_metrics[0] > R_value and self.list_metrics[0] <= upper_value:
                response_time_reward = (upper_value - self.list_metrics[0])/(upper_value - R_value)
            else:
                response_time_reward = 0

            reward_result = response_time_reward
            return reward_result


        def state_reward(self,namespaces,metric_names,dimensions,statistics,upper_value):
            reward = 0

            CWobject = CloudwatchWrapper()
            self.list_metrics = CWobject.getmetrics(namespaces,statistics)

            print ("Timestamp = ",self.list_metrics[1])
            print ("Response Time = ",self.list_metrics[0])

            # normalize the thresholds and the measured response time with the same parameters
            require_RT_normal = self.average_RT_normalize(0,100,R_value,R_value)
            lower_value_normalize = self.average_RT_normalize(0,100,R_value,lower_value)
            upper_value_normalize = self.average_RT_normalize(0,100,R_value,upper_value)

            normalize_measurment = self.average_RT_normalize(0,100,R_value,self.list_metrics[0])
            print ("normalized Response Time = ",normalize_measurment)

            # map the normalized measurement to one of the three states L / R / H
            if normalize_measurment < lower_value_normalize:
                state = type(self).states[0]
            if normalize_measurment >= lower_value_normalize and normalize_measurment <= upper_value_normalize:
                state = type(self).states[1]
            if normalize_measurment > upper_value_normalize:
                state = type(self).states[2]

            reward = self.reward_fun(R_value,upper_value)

            return [state,reward]

        def Q_learning(self,upper_value):

            print("required Response Time Range = [",lower_value,",",upper_value,"]")
            print("normalized required Response Time Range = [",
                  self.average_RT_normalize(0,100,R_value,lower_value),",",
                  self.average_RT_normalize(0,100,R_value,upper_value),"]")

            print("Q-table = ")
            print(type(self).q_table)
            print("------------------------------------------------")

            autoscaling = AutoscalingWrapper()

            # hyperparameters
            num_episodes = 10
            max_step = 200
            learning_rate = 0.1
            discount_rate = 0.99
            exploration_rate = 0.9

            for episode in range(num_episodes):

                print("Episode_number = ",episode)

                # read the current metrics to get the initial state of the episode
                state_reward_var = self.state_reward(namespaces,metric_names,dimensions,statistics,upper_value)
        
    

                for step in range(max_step):

                    # epsilon-greedy action selection
                    exploration_rate_threshold = random.uniform(0,1)

                    print("exploration_rate = ",exploration_rate)
                    print("exploration_rate_threshold = ",exploration_rate_threshold)

                    if exploration_rate_threshold >= exploration_rate:
                        # exploit: pick the best known action for the current state
                        action = type(self).actions[numpy.argmax(type(self).q_table[type(self).states.index(state_reward_var[0]),:])]
                        print("action q_table...........................")
                    else:
                        # explore: pick a random action
                        action = random.choice(type(self).actions)
                        print("action randomly............................")

                    print("action = ",action)
                    print("state = ",state_reward_var[0])

                    if action > 0:  # run inactive instances according to the action to scale out

                        inactive_instances = autoscaling.get_autoscaling_group_max(dimensions)-autoscaling.get_autoscaling_group_desired(dimensions)
                        print("inactive instances = ",inactive_instances)

                        # if the required number of inactive instances is available
                        if inactive_instances >= action and inactive_instances > 0:
                            print("inactive instances are available for scaling out................")
                            autoscaling.updat_autoscaling_group(dimensions,action+autoscaling.get_autoscaling_group_desired(dimensions))

                        # if fewer inactive instances than requested are available, fall back to action = 0
                        elif inactive_instances > 0 and inactive_instances < action:
                            print("inactive instances are not available for scaling out.................")
                            #autoscaling.updat_autoscaling_group(dimensions,autoscaling.get_autoscaling_group_max(dimensions))
                            action = 0
                            print("Action will be replaced by action = 0")
                        elif inactive_instances == 0:
                            print("Scaling out is not possible since no instances are available......")
                            action = 0
                            print("Action will be replaced by action = 0")

                    elif action < 0:  # release active instances according to the action to scale in

                        active_instances = autoscaling.get_autoscaling_group_desired(dimensions)
                        print("active instances = ",active_instances)

                        if active_instances == autoscaling.get_autoscaling_group_min(dimensions):
                            print("Scale in is not possible since active instances are at the minimum.....")
                            action = 0
                            print("Action will be replaced by action = 0")
                        elif active_instances > abs(action) and \
                                active_instances + action >= autoscaling.get_autoscaling_group_min(dimensions):
                            print("active instances are available for scaling in..............")
                            autoscaling.updat_autoscaling_group(dimensions,action+autoscaling.get_autoscaling_group_desired(dimensions))

                        # if removing that many instances would go below the group minimum, fall back to action = 0
                        elif active_instances + action < autoscaling.get_autoscaling_group_min(dimensions) and active_instances <= abs(action):
                            print("active instances are not available for scaling in..............")
                            #autoscaling.updat_autoscaling_group(dimensions,autoscaling.get_autoscaling_group_min(dimensions))
                            action = 0
                            print("Action will be replaced by action = 0")

                    elif action == 0:
                        print("scaling is not active..................................")



                    # observe the new state
                    print("sleep for 1 minute until new metrics arrive............")
                    time.sleep(60)
                    new_state_reward_var = self.state_reward(namespaces,metric_names,dimensions,statistics,upper_value)

                    # get the intermediate reward
                    print("reward = ",new_state_reward_var[1])

                    # update the Q-table estimate of Q(s,a)
                    old_estimate = type(self).q_table[type(self).states.index(state_reward_var[0]),type(self).actions.index(action)]
                    optimal_new_state = numpy.max(type(self).q_table[type(self).states.index(new_state_reward_var[0]),:])

                    print("Estimation equation = ",old_estimate,"+",learning_rate,"*(",new_state_reward_var[1],"+(",discount_rate,"*",optimal_new_state,")-",old_estimate,")")

                    type(self).q_table[type(self).states.index(state_reward_var[0])][type(self).actions.index(action)] = old_estimate + \
                        learning_rate * (new_state_reward_var[1] + (discount_rate * optimal_new_state) - old_estimate)

                    print("Q_Table = ")
                    print(type(self).q_table)

                    # make the new state the current state
                    state_reward_var = new_state_reward_var

                    #if new_state_reward_var[1] != 0:
                    #    break

                # decay the exploration rate by 0.1 per episode, floored at 0.1
                if exploration_rate > 0.1:
                    exploration_rate = round(exploration_rate-0.1,1)
                time.sleep(60)
                print("------------------------------------------------------------------------------------------")
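
For readability, the dense Q-table update inside the step loop is the standard one-step Q-learning rule; written out on its own as a sketch with integer state/action indices (not the exact code above):

    import numpy as np

    # Q(s, a) <- Q(s, a) + lr * (r + gamma * max_a' Q(s', a') - Q(s, a))
    def q_update(q_table, s, a, r, s_next, lr=0.1, gamma=0.99):
        best_next = np.max(q_table[s_next, :])
        q_table[s, a] = q_table[s, a] + lr * (r + gamma * best_next - q_table[s, a])
        return q_table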
