带有map2_df的2个数据集之间的差异未在函数中提供所需的结果，但单独运行时可以正常工作

如何解决带有map2_df的2个数据集之间的差异未在函数中提供所需的结果，但单独运行时可以正常工作

我有一个计算两个数据集之间差异的函数，该函数在小型数据集上可以正常使用，但是当我在大型数据集上尝试时，它给我一个错误，我无法理解的是，当我单独运行每个步骤都可以正常工作，但是一旦我运行了整个功能，就会给我一个错误。

# function 
soustraction.j=function(D,R,i,threshold){
  D=as.data.frame(D)
  R=as.data.frame(R)
  dif=purrr::map2_df(D[-1],R[i,-1],`-`)
  dif[dif<0] = 0
  dif$mismatch=rowSums(dif)
  dif= cbind(ID = D[1],R[1],dif)
  dif=dif[which(dif$mismatch <= threshold),]
  return(dif)
}

# trying it on a small dataset 
# small data sets
#####################################
# data frame for recipients
IDr= c(seq(1,4))
BTR=c("A","B","AB","O")
data_R=data.frame(IDr,BTR,A=rep(0,4),B=c(rep(0,3),1),C=c(rep(1,0),D=rep(1,E=c(rep(0,2),rep(1,stringsAsFactors=FALSE)

  data_R
  IDr BTR A B C D E
1   1   A 0 0 1 1 0
2   2   B 0 0 1 1 0
3   3  AB 0 0 1 1 1
4   4   O 0 1 0 1 0

# data frame for donors 
IDd= c(seq(1,8))
BTD= c(rep("A",each=2),rep("B",rep("AB",rep("O",each=2))
WD= c(rep(0.25,rep(0.125,rep(0.5,each=2))
data_D=data.frame(IDd,BTD,A=c(rep(0,6),1,7),8),WD,stringsAsFactors=FALSE)

  data_D
  IDd BTD A B C D E    WD
1   1   A 0 0 1 1 0 0.250
2   2   A 0 0 1 1 0 0.250
3   3   B 0 0 1 1 0 0.125
4   4   B 0 0 1 1 0 0.125
5   5  AB 0 0 1 1 0 0.125
6   6  AB 0 0 1 1 0 0.125
7   7   O 1 1 1 1 1 0.500
8   8   O 1 1 0 1 0 0.500

# Applying function
 soustraction.j(data_D[,c(1,3:7)],data_R[,3)
  IDd IDr A B C D E mismatch
1   1   1 0 0 0 0 0        0
2   2   2 0 0 0 0 0        0
3   3   3 0 0 0 0 0        0
4   4   4 0 0 0 0 0        0
5   5   1 0 0 0 0 0        0
6   6   2 0 0 0 0 0        0
7   7   3 1 1 0 0 1        3
8   8   4 1 1 0 0 0        2

###############################################
######  different datasets   #################

######### generating a pool of donor#########
set.seed(1023)
x=t(replicate(1000,rbinom(400,0.5)))
colnames(x)=paste0("epi",sprintf("%02d",1:400))
pool1=as.data.frame(x)
duptimes <- c(5,10),rep(0,298),rep(2,200),rep(3,100),rep(4,50),40),10)
# Create an index of the rows you want with duplication
idx <- rep(1:nrow(pool1),duptimes)
# Use that index to generate the new data frame
dupdf <- pool1[idx,]
pool=rbind(pool1,dupdf)
y=runif(2025)
freq.g=y/sum(y)
BTD=replicate(2025,sample(c("A","O"),prob = c(0.42,0.09,0.03,0.46)))
pooldup=as.data.frame(cbind(IDd=seq(1:2025),pool,freq.g))

pooldup[1:5,1:6]
  IDd BTD epi01 epi02 epi03 epi04
1   1   A     0     0     1     0
2   2   O     0     1     1     1
3   3   O     1     1     1     1
4   4  AB     1     0     0     0
5   5   A     1     1     1     0

######### generating recipient data#########
set.seed(1024)
x1=t(replicate(20,0.5)))
colnames(x1)=paste0("epi",1:400))
x1r=as.data.frame(x1)
BTR=replicate(20,0.46)))
rdata=as.data.frame(cbind(IDr=seq(1:20),x1r))

 rdata[1:5,1:6]
  IDr BTR epi01 epi02 epi03 epi04
1   1   B     0     1     0     0
2   2   B     1     1     0     0
3   3   O     1     1     1     1
4   4   A     0     0     0     0
5   5   O     1     1     0     0

# Applying the function 
soustraction.j(pooldup[,3:402)],rdata[,75)
#  Error in data.frame(...,check.names = FALSE) : 
# arguments imply differing number of rows: 2025,20

当我单独运行步骤purrr::map2_df(D[-1],-) 时，它可以工作，但不在功能内。如果我的代码太长，我很抱歉，我只是不知道此错误来自何处。谢谢您的帮助。

解决方法

错误在cbind中。 dif是具有2025行的数据帧，而R具有20行的数据帧。

cbind通常会回收值

cbind(1:2,1:3)
#> [,1] [,2]
#> [1,]    1    1
#> [2,]    2    2
#> [3,]    1    3
#> Warning message:
#>   In cbind(1:2,1:3) :
#>   number of rows of result is not a multiple of vector length (arg 1)

但是，对于数据帧，它会因部分回收而出错

x <- data.frame(a = 1:2)
y <- data.frame(a = 1:3)
cbind(x,y)
#> Error in data.frame(...,check.names = FALSE) : 
#>   arguments imply differing number of rows: 2,3

如果您希望小数重新整理与cbind一起使用，请转换为matrix并设置行数

cbind(matrix(unlist(x),ncol = 1,nrow = 3),matrix(unlist(y),nrow = 3))
#>        x
#> [1,] 1 1
#> [2,] 2 2
#> [3,] 1 3
#> Warning message:
#>   In matrix(unlist(x),nrow = 3) :
#>   data length [2] is not a sub-multiple or multiple of the number of rows [3]

带有map2_df的2个数据集之间的差异未在函数中提供所需的结果，但单独运行时可以正常工作

如何解决带有map2_df的2个数据集之间的差异未在函数中提供所需的结果，但单独运行时可以正常工作

解决方法

相关推荐