
Ignoring missing target values in keras (R)

I am fitting an LSTM model to a multivariate time series using the keras R package (answers about keras in Python or PyTorch would also help, as I could switch), and the model has multiple outputs (3 continuous, one categorical). Some of the targets are missing at some time steps (coded as -1, because all observed values are $\geq 0$, though I could obviously change that to anything else). What I think would make sense is that if a target value is missing (= -1), any prediction by the model is considered correct (= no loss is incurred). I have no interest in predicting whether a value is missing, so forcing the model to output -1 is of no interest to me, even if the model could reliably predict missingness. I would much rather get a prediction of what the missing value would have been (even if I have no way of checking whether that prediction is correct).

How do I create a custom loss function that "ignores" -1 target values / treats them as correct?

In case more context helps, below is a diagram illustrating my model, and below that is the R code that generates some example data and fits a model of this kind when no data are missing. Once you remove the comment from the # %>% mutate_at(vars(x1:x4,y1:y4),randomly_set_to_minus_one) line in the code below, you get some inputs and outputs coded as -1. I have no strong opinion on how these should be encoded as features; I could also set those values to the median input value and add a flag marking them as missing, or something similar (see the sketch right after this paragraph). What really matters (to me) is that my loss function handles the -1 target values correctly. At the end of the post is my failed attempt at writing such a loss function.
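A minimal sketch of that median-plus-indicator encoding, assuming missing inputs are coded as -1 and using the example tibble built by the code further down; example_encoded and the _missing column suffix are just illustrative names:

# Hypothetical sketch: flag which inputs were missing, then replace -1 with the
# median of the observed values of that column.
example_encoded <- example %>%
  mutate_at(vars(x1:x4), list(missing = ~ as.numeric(. == -1))) %>%  # adds x1_missing ... x4_missing flags
  mutate_at(vars(x1:x4), ~ ifelse(. == -1, median(.[. != -1]), .))   # impute -1 with the observed median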

[Diagram of the model architecture]

library(tidyverse)
library(keras)

# A function I use to set some values randomly to -1
randomly_set_to_minus_one = function(x){
  ifelse(rnorm(length(x))>1,-1,x)
}
# randomly_set_to_minus_one(rnorm(100))

set.seed(1234)
subjects = 250
records_per_subject = 25

# Simulate some time series for multiple subjects with multiple records per subject.
example = tibble(subject = rep(1:subjects, each = records_per_subject),
                 rand1 = rep(rnorm(subjects), each = records_per_subject),
                 rand2 = rep(rnorm(subjects), each = records_per_subject),
                 rand3 = rnorm(subjects*records_per_subject),
                 rand4 = rnorm(subjects*records_per_subject)) %>%
  mutate(x1 = 0.8*rand1 + 0.2*rand2 + 0.8*rand3 + 0.2*rand4 + rnorm(n = n(), sd = 0.1),
         x2 = 0.1*rand1 + 0.9*rand2 + 2*rand3 + rnorm(n = n(), sd = 0.1),   # sd assumed; lost in formatting
         x3 = 0.5*rand1 + 0.5*rand2 + 0.2*rand4 + rnorm(n = n(), sd = 0.25),
         x4 = 0.2*rand1 + 0.2*rand2 + 0.5*rand3 + 0.5*rand4 + rnorm(n = n(), sd = 0.1), # sd assumed; lost in formatting
         x5 = rep(1:records_per_subject, subjects),
         y1 = 1 + tanh(rand1 + rand2 + 0.05*rand3 + 0.05*rand4 + 2*x5/records_per_subject + rnorm(n = n(), sd = 0.05)),
         y2 = 10*plogis(0.2*rand1 + 0.2*rand2 + 0.2*rand3 + 0.2*rand4),
         y3 = 3*plogis(0.8*rand1 + 0.8*rand4 + 2*(x5 - records_per_subject/2)/records_per_subject),
         prob1 = exp(rand1/4*3 + rand3/4),
         prob2 = exp(rand2/4*3 + rand4/4),
         prob3 = exp(-rand1 - rand2 - rand3 - rand4),
         total = prob1 + prob2 + prob3,
         prob1 = prob1/total,
         prob2 = prob2/total,
         prob3 = prob3/total,
         y4 = pmap(list(prob1, prob2, prob3),
                   function(x, y, z) sample(1:3, 1, replace = TRUE, prob = c(x, y, z)))) %>%
  unnest(y4) %>%
  mutate(x1 = x1 + min(x1), x2 = x2 + min(x2), x3 = x3 + min(x3), x4 = x4 + min(x4)) %>%
  dplyr::select(subject, x1:x5, y1:y4)
# %>% mutate_at(vars(x1:x4,y1:y4),randomly_set_to_minus_one)
  
# Create arrays the way keras wants them as inputs/outputs:
# 250,25,5 array of predictors
x_array = map(sort(unique(example$subject)),function(x) {
  example %>%
    filter(subject==x) %>%
    dplyr::select(x1:x5) %>%
    as.matrix()
}) %>%
  abind::abind(along=3 ) %>%
  aperm(perm=c(3,1,2))

# 250,25,3 array of continuous target variables
y13_array = map(sort(unique(example$subject)),function(x) {
  example %>%
    filter(subject==x) %>%
    dplyr::select(y1:y3) %>%
    as.matrix()
}) %>%
  abind::abind(along=3 ) %>%
  aperm(perm=c(3,1,2))

# 250,25,3 array of categorical target variables (one-hot-encoded)
y4_array = map(sort(unique(example$subject)),function(x) {
  example %>%
    filter(subject==x) %>%
    mutate(y41 = case_when(y4==1~1,y4==-1~-1,TRUE~0),
           y42 = case_when(y4==2~1,y4==-1~-1,TRUE~0),
           y43 = case_when(y4==3~1,y4==-1~-1,TRUE~0)) %>%
    dplyr::select(y41:y43) %>%
    as.matrix()
}) %>%
  abind::abind(along=3 ) %>%
  aperm(perm=c(3,1,2))

# Define LSTM neural network
nn_inputs <- layer_input(shape = c(dim(x_array)[2],dim(x_array)[3])) 

nn_lstm_layers <- nn_inputs %>%
  layer_lstm(units = 32, return_sequences = TRUE,
             dropout = 0.3,               # dropout applied to the inputs
             recurrent_dropout = 0.3) %>% # recurrent dropout applied to the LSTM memory cells
  layer_lstm(units = 16, return_sequences = TRUE, # return_sequences assumed here so one prediction is made per time step
             recurrent_dropout = 0.3)

# First continuous output (3 variables)
cont_target <- nn_lstm_layers %>%
  layer_dense(units = dim(y13_array)[3],name = "cont_target")

# Categorical outcome (3 categories one-hot-encoded)
cat_target <- nn_lstm_layers %>%
  layer_dense(units = dim(y4_array)[3],activation = "sigmoid",name = "cat_target")

model <- keras_model(nn_inputs,list(cont_target,cat_target))
summary(model)

val_samples = sample(x=c( rep(FALSE,floor(dim(x_array)[1]*0.8)),rep(TRUE,ceiling(dim(x_array)[1]*0.2))),size = dim(x_array)[1],replace = F)

model %>% compile(
  optimizer = "rmsprop",
  loss = list(cont_target = "mse", cat_target = "categorical_crossentropy"),
  loss_weights = list(cont_target = 1.0, cat_target = 1.0))

history <- model %>%
  fit(
    x_array[!val_samples,,],
    list(cont_target = y13_array[!val_samples,,],
         cat_target = y4_array[!val_samples,,]),
    epochs = 100,
    batch_size = 32,
    validation_data = list(
      x_array[val_samples,,],
      list(cont_target = y13_array[val_samples,,],
           cat_target = y4_array[val_samples,,])),
    callbacks = list(
      callback_reduce_lr_on_plateau(monitor = "val_loss", factor = 0.5, patience = 10,
                                    verbose = 0, mode = "min", min_delta = 1e-04,
                                    cooldown = 0, min_lr = 0),
      callback_early_stopping(monitor = "val_loss", min_delta = 0, patience = 20,
                              restore_best_weights = TRUE, mode = "auto"))
  )

plot(history) + scale_y_log10()

[Training history plot]

Here is my (failed) attempt at writing a modified MSE loss function that ignores -1 values:

# Custom loss functions to deal with missing values (coded as -1)
mse_na_loss <- function(y_true,y_pred){
  K <- backend()
  #K$mean( K$switch(K$equal(y_true,-1),K$zeros(shape=K$constant(y_true)$shape),K$pow(y_true-y_pred,2)),axis=-1)
  #K$mean( K$pow(y_true-y_pred,2))
  #K$zeros(shape=K$constant(y_true)$shape)
  #K$equal(y_true,-1)
  K$mean(
    K$switch( K$equal(y_true,-1),
              K$zeros(shape=K$constant(y_true)$shape,dtype = "float64"),
              K$pow(y_true-y_pred,2)),
    axis=-1L)
}

Solution

What I think would make sense is that if a target value is missing (= -1), any prediction by the model is considered correct (= no loss is incurred).

You can achieve this by checking whether y_true differs from -1 (k_not_equal) and then casting the resulting booleans to numbers (k_cast). That gives you values like (1, 1, 0) which can be multiplied with the squared errors, so no loss is incurred wherever the target is -1.

mse_na_loss <- function(y_true,y_pred){
  k_pow(y_true-y_pred,2) * k_cast(k_not_equal(y_true,-1),'float32')
}
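A minimal sketch of how this loss could be plugged into the model from the question; the categorical output keeps its built-in loss here, since a masked cross-entropy would have to be written separately:

model %>% compile(
  optimizer = "rmsprop",
  loss = list(cont_target = mse_na_loss,                 # custom masked MSE for the continuous targets
              cat_target = "categorical_crossentropy"),  # categorical output unchanged in this sketch
  loss_weights = list(cont_target = 1.0, cat_target = 1.0))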

This essentially gives you the loss function you were trying to write at the end of your question, and it answers the part of your question quoted above.

However, I don't think this is a good approach. As you say, this loss function does not really "ignore" those observations; it merely learns that any value fits there, which may add unnecessary noise to your training.

Depending on the domain, other NA-handling approaches, such as last observation carried forward (na.locf), may work better than coding missing values as -1.
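A minimal sketch of that alternative, assuming the -1 codes are first turned back into NA; it uses zoo::na.locf within each subject, and leading NAs (with no earlier observation to carry forward) would still need separate handling:

library(zoo)   # for na.locf

example_locf <- example %>%
  mutate_at(vars(y1:y4), ~ na_if(., -1)) %>%                    # recode -1 back to NA
  group_by(subject) %>%
  mutate_at(vars(y1:y4), ~ zoo::na.locf(., na.rm = FALSE)) %>%  # carry last observed value forward per subject
  ungroup()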
