微信公众号搜"智元新知"关注
微信扫一扫可直接关注哦!

如何在R中的data.table中使用自定义函数

如何解决如何在R中的data.table中使用自定义函数

这是我的交易数据。它显示了从from列中的帐户到to列中的帐户的交易,并带有日期和金额信息

data 

id          from    to          date        amount  
<int>       <chr>   <chr>       <date>      <dbl>
19521       6644    6934        2005-01-01  700.0
19524       6753    8456        2005-01-01  600.0
19523       9242    9333        2005-01-01  1000.0
…           …       …           …           …
1056317     7819    7454        2010-12-31  60.2
1056318     6164    7497        2010-12-31  107.5
1056319     7533    7492        2010-12-31  164.1

我想在每笔特定交易发生之前的最近6个月中,对交易网络上的closeness centrality度量进行计算,并希望将此信息另存为原始数据中的新列。

>

我将在此处使用的示例数据是:

structure(list(id = c(83324L,87614L,88898L,89874L,94765L,100277L,101587L),from = c("5370","7816","8046","5492","8756","5370","9254"),to = c("9676","9105","5370"),date = structure(c(13391,13400,13404,13409,13428,13452,13452),class = "Date"),amount = c(261.1,16400,3500,2700,19882,182,14.6)),row.names = c(NA,-7L),class = "data.frame")

现在,以下代码可以很好地在一个小的数据集中完成此操作:

library(tnet)
closeness_fnc <- function(data){
  accounts <- data[date == max(date),from]
  id <- data[date == max(date),id]  
  
  # for directed networks
  df <- data %>% group_by(from,to) %>% mutate(weights = sum(amount)) %>% select(from,to,weights) %>% distinct
  cl <- closeness_w(df,directed = T,gconly=FALSE,alpha = 0.5) 
  
  list(
    id = id,closeness_directed = cl[,"n.closeness"][accounts]
  )

}

network_data <- data[,closeness_fnc(data[(date >= end_date - 180) & (date <= end_date)]),.(end_date = date)] %>% select(-end_date)
# adding this info into the original data
data <- merge(x = data,y = network_data,by = "id")

所以,输出结果与我预期的一样:

# data
id      from    to      date        amount  closeness_directed 
<int>   <chr>   <chr>   <date>      <dbl>   <dbl> 
83324   5370    9676    2006-08-31  261.1   1.00000000
87614   7816    5370    2006-09-09  16400.0 0.98744695
88898   8046    5370    2006-09-13  3500.0  0.35329017
89874   5492    5370    2006-09-18  2700.0  0.25176754
94765   8756    5370    2006-10-07  19882.0 0.39233504
100277  5370    9105    2006-10-31  182.0   0.07167582
101587  9254    5370    2006-10-31  14.6    0.02390589

但是,由于我的数据有超过一百万行,因此该代码将需要一天以上的时间才能完成(它运行了12个小时以上,但尚未完成)。

我有一个类似的运行时问题here,并且我想对这段代码应用相同的逻辑。因此,我将代码修改如下:

library(tnet)
closeness_fnc <- function(data){
  accounts <- data[date == max(date),alpha = 0.5) 
  
  closeness_directed <- cl[,"n.closeness"][accounts]
  closeness_directed <- as.data.frame(closeness_directed)
  closeness_directed$from <- rownames(closeness_directed)
  rownames(closeness_directed) <- NULL

  return(closeness_directed)
}

# this is the approach given in the link I provided:
setDT(data)[,date_minus_180 := date - 180]
data[,':=' (closeness_directed = data[data,closeness_fnc(data),on = .(from,date <= date,date >= date_minus_180),by = .EACHI]$closeness_directed
     )] %>% select(-date_minus_180)

但是,自那以后显然不起作用

data[data,by = .EACHI]

提供输出

from   date         date     closeness_directed      from   
<chr>  <date>      <date>       <dbl>                <chr>

5370    2006-08-31  2006-03-04  0.07167582           5370
5370    2006-08-31  2006-03-04  0.02390589           9254
7816    2006-09-09  2006-03-13  0.07167582           5370
7816    2006-09-09  2006-03-13  0.02390589           9254
8046    2006-09-13  2006-03-17  0.07167582           5370
8046    2006-09-13  2006-03-17  0.02390589           9254
5492    2006-09-18  2006-03-22  0.07167582           5370
5492    2006-09-18  2006-03-22  0.02390589           9254
8756    2006-10-07  2006-04-10  0.07167582           5370
8756    2006-10-07  2006-04-10  0.02390589           9254
1-10 of 14 rows

那么,现在我该如何调整代码解决问题?

更大的数据集

structure(list(id = c(19521L,19522L,19523L,19524L,19525L,19526L,19527L,19528L,19529L,19530L,19531L,0L,19532L,19533L,19534L,21971L,21972L,21973L,21974L,21975L,21976L,21977L,21978L,21979L,21980L,21981L,1L,21761L,21762L,21763L,21764L,21765L,21766L,21767L,21982L,21983L,21984L,21768L,21769L,21770L,21771L,21772L,21773L,2L,21774L,21775L,21776L,21777L,21778L,21779L,21780L,21781L,21782L,3L,21783L,21784L,21785L,21786L,21787L,21788L,21789L,21790L,21791L,21792L,21793L,21794L,21795L,21796L,4L,21797L,21798L,21799L,21800L,21801L,21802L,21803L,21804L,21805L,21806L,21807L,21808L,21809L,21810L,21811L,21812L,21813L,21814L,21815L,5L,21816L,21817L,21818L,21819L,21820L,21821L,21822L,21823L,21824L,21825L,21826L,21827L,21828L,21829L,21830L,6L,21831L,21832L,21833L,21834L,21835L,21836L,21837L,21838L,7L,21839L,21840L,21841L,21842L,21843L,21844L,21845L,21846L,21847L,21848L,21849L,21850L,21851L,21852L,21853L,21854L,21855L,21856L,21857L,8L,21858L,21859L,9L,10L,21860L,21861L,21862L,21863L,21864L,21865L,21866L,21867L,21868L,21869L,21870L,21871L,21872L,21873L,21874L,21875L,21876L,21877L,21878L,21879L,21880L,21881L,21882L,21883L,21884L,21885L,21886L,21887L,21888L,21889L,21890L,21891L,21892L,21893L,21894L,21895L,21896L,21897L,21898L,21899L,21900L,11L,21901L,21902L,21903L,21904L,21905L,21906L,21907L,21908L,21909L,12L,21910L,21911L,21912L,21913L,21914L,21915L,21916L,21917L,21918L,21919L,13L,21920L,21921L,21922L,21923L,21924L,21925L,21926L,21927L,21928L,21929L,21930L,21931L,21932L,21933L,21934L,21935L,21936L,14L,21937L,21938L,21939L,21940L,21941L,21942L,21957L,21958L,21959L,21960L,21961L,21962L,21963L,21964L,15L,21965L,21966L,21967L,21968L,21969L,21970L,21985L,21986L,21987L,21988L,21989L,21990L,21991L,21992L,21993L,21994L,21995L,21996L,16L,17L,21551L,21552L,21553L,21554L,21555L,21556L,21557L,21558L,21559L,21560L,21561L,21562L,21563L,21564L,21565L,21566L,21567L,21997L,21998L,18L,21568L,21569L,21570L,21571L,21572L,21573L,21574L,21575L,21576L,21577L,21578L,21579L,21580L,21581L,19L,21582L,21583L,21584L,21585L,21586L,21587L,21588L,21589L,21590L,21591L,21592L,20L,21593L,21594L,21595L,21596L,21597L,21598L,21599L,21600L,21601L,21602L,21603L,21604L,21605L,21606L,21L,21607L,21608L,21609L,21610L,21611L,21612L,21613L,21614L,21615L,21616L,21617L,21618L,21619L,21620L,21621L,21622L,21623L,21624L,21625L,21626L,22L,21627L,21628L,21629L,21630L,21631L,21632L,21633L,21634L,21635L,21636L,21637L,21638L,21639L,21640L,21641L,21642L,21643L,21644L,21645L,23L,21646L,21647L,21648L,21649L,21650L,21651L,21652L,21653L,21654L,21655L,21656L,21657L,21658L,24L,21659L,21660L,21661L,21662L,21663L,21664L,21665L,21666L,21667L,21668L,21669L,25L,21670L,21671L,21672L,21673L,21674L,21675L,21676L,21677L,21678L,21679L,21680L,21681L,21682L,21683L,26L,21684L,21685L,21686L,21687L,21688L,21689L,21690L,21691L,21692L,21693L,21694L,21695L,21696L,21697L,21698L,21699L,21700L,21701L,21702L,21703L,27L,21704L,21719L,21720L,21721L,21722L,21723L,21724L,21725L,21726L,21727L,21728L,21729L,21730L,21731L,21732L,28L,21733L,21734L,21735L,21736L,21737L,21738L,21739L,21740L,29L,21741L,21742L,21743L,21744L,21745L,21746L,21747L,21748L,21749L,21750L,21751L,21752L,21753L,21754L,21755L,21756L,21757L,21758L,30L,31L,32L,33L,34L,35L,36L,37L,21229L,21230L,21231L,21232L,21233L,21234L,21235L,21236L,21237L,21238L,21239L,21240L,21241L,21242L,21243L,21244L,21245L,21246L,21247L,21248L,21249L,21250L,21251L,21252L,21253L,21254L,21255L,21256L,21257L,21258L),from = c("6644","9843","9242","6753","7075","8685","5513","6340","6042","5587","7237","5695","9582","8539","7939","9077","8946","5591","8380","5865","7867","9457","6968","7971","6150","9361","9379","8409","9740","7226","7531","6752","7362","6661","5730","5417","9049","7057","6252","9476","6228","8896","7371","8170","7122","6694","5450","9435","5619","8289","9862","5504","6555","9845","7537","9482","6810","8257","8490","6588","9652","7303","5852","5746","9198","6917","8688","9460","9640","7054","8628","7065","9006","6832","6185","8422","6914","7069","7848","8436","5494","6375","5653","8912","9794","8413","6527","9101","5815","6923","8184","6811","8130","6539","8643","6329","7744","8211","9641","8003","5599","8715","7108","9573","8583","5648","6444","5660","8191","9830","5931","7921","8314","7940","6265","6604","6509","5618","5860","6469","9525","5887","6626","7145","6862","5741","9144","9163","7297","7599","8427","8865","9418","8636","6530","9155","6934","8817","9028","5521","5943","7443","9557","8239","6819","9761","5983","6830","6368","5381","8782","8008","9160","9615","6920","6164","6278","9729","8960","6358","5939","8902","9522","7344","9070","6594","8058","6639","7896","6325","7804","9554","9725","8475","7746","7536","9671","5415","6837","8327","9061","8981","9226","5862","7085","8925","6226","6849","8432","9545","5837","5440","9732","8695","7690","5829","9373","7977","6361","7320","7603","6303","7077","7850","5792","9588","9204","8648","8950","7106","6334","6843","7060","9606","5520","9350","7463","7947","9668","9490","6241","8830","6374","9528","7919","8532","6795","8162","9275","8106","8615","9206","8283","7052","7737","7815","7932","6125","6671","7800","9835","5573","7874","8931","6748","8192","6822","6950","8020","8555","8986","7644","5736","8421","6224","8374","8304","8677","9208","7008","6074","9409","6269","9721","9304","9117","5420","9691","7728","8579","7495","9838","8139","9571","5385","5454","9620","7723","9249","7033","7966","9844","5793","5747","6362","6925","9318","6780","7150","6818","7246","5514","9574","7838","5540","6646","6893","6417","8039","8721","8763","6401","6510","7970","7117","6001","7505","7646","5600","6522","8395","5601","5418","6296","8790","7622","9012","8165","7624","5468","9316","9030","7155","5702","7492","8503","9868","6807","6404","9076","7213","8735","7849","8551","9351","6693","9653","9504","6948","9358","9280","8168","5456","9138","8420","9312","8930","7699","5506","9475","5776","5517","5644","8680","5474","7534","9363","9586","6508","6193","5401","8032","8461","9387","5812","7564","5917","5434","5794","7840","9085","8331","7175","6669","6352","7432","9810","8776","6112","8869","8248","9450","6974","7264","7336","6880","7866","7777","7502","5615","9777","9214","6039","7714","9056","8358","8963","8657","8846","9319","7220","7764","8967","8683","9137","6971","9747","7449","8259","5373","7300","6273","8391","7862","5696","6622","9240","7021","7313","7247","6679","8102","6812","9473","6345","7935","9696","5541","8939","6887","8998","9110","8666","6670","8975","7518","7549","7841","8888","5808","9807","6860","9811","5935","8966","8684","5915","8892","8493","7894","6342","6382","7833","7201","7253","6720","6175","9201","5682","5473","7173","6094","8810","5874","6947","8462","6885","6201"),to = c("6934","9115","9333","8456","7207","6046","7047","6213","9493","6248","7468","6727","6912","9251","6460","5773","6951","6153","6634","9440","8220","8512","8105","8786","6454","5997","6253","7517","6935","6143","9628","6517","8078","6442","6254","6217","5605","7084","5985","5443","8665","6883","8472","5715","5409","8876","5610","6043","5668","6986","7382","8671","8336","9750","6566","6145","7134","5606","9682","5635","8820","8323","6616","8678","7293","7843","6192","9131","7143","6333","6995","9770","5835","5614","8134","8887","8631","9744","8697","9664","9202","8606","9037","6989","7248","9519","6033","9565","9723","7222","8739","6502","5980","7001","9200","7471","6794","5621","6114","6772","9535","7034","7446","6896","7345","8183","6479","6503","5592","9712","6844","7109","5562","6984","5922","9839","8041","9039","8178","7990","9803","8879","8221","8545","8321","6994","6673","7769","6104","7818","8941","7825","7770","6954","6194","9741","5384","8653","6659","9321","7124","5866","7718","7321","9084","6507","9033","6540","6857","8945","9390","6359","8757","8280","7049","6205","7604","6085","6299","6671"),date = structure(c(12784,12784,12785,12786,12787,12788,12789,12790,12791,12792,12793,12794,12795,12796,12797,12798,12799,12800,12801,12802,12803,12804,12805,12806,12807,12808,12809,12810,12811,12812,12813,12814,12815,12816,12817,12818,12819,12820,12821,12822,12823,12824,12825,12826,12827,12828,12829,12830,12831,12832,12833,12834,12835,12836,12837,12838,12839,12840,12841,12842,12842),amount = c(700,900,1000,600,400,1100,200,800,700,300,5123,3401,500,3242,5298,11832,6774,14264,13851,17366,6007,6207,12644,4276,6434,14779,4507,6446,17477,5009,5718,13967,6959,15914,4470,10737,44749,46552,13156,23323,10792,7268,16125,14440,49752,36518,19961,21899,12336,38000,16983,13.4,42.7,34700,12.6,47.5,13.3,37.1,17,11.1,15.5,22.2,55.8,11.8,50.1,45,15.9,38.8,38.2,20.1,38.9,7.1,107.1,48,62.4,2900,21.5,19.1,14,19.5,15.2,5282,94.7,19.4,28.2,110.2,0.8,23.1,20,19.6,2000,5100,11900,5500,7500,6000,25300,45647,7000,5224,30192,24381,23330,8500,3191,23041,5029,6238,20213,7618,19935,5859,3375,38645,1600,10600,3217,14626,4550,4356,6689,9612,5080,5039,4212,17632,3395,3399,14493,28157,1800,31348,5544,14100,33045,20066,27666,3151,1500,22452,44333,18347,6242,13900,19746,48098,7041,9100,10584,49590,3021,5400,1200,5072,47831,12015,5200,8905,23524,22349,31038,25200,43737,12154,23736,24863,29700,14622,4758,5810,19588,27078,32594,25609,20281,6310,14319,14400,9600,13875,12043,4391,4327,9000,6698,16392,15263,18729,3098,4729,24400,9658,14963,9800,20567,3058,18497,6148,15447,24009,12138,4200,1300,8000,29511,6600,44162,24023,33410,14800,19136,33900,4100,10500,3700,100,3300,9400,45925,41740,6200,3100,31332,62100,7100,28361,4000,4500,2500,11000,8900,23990,1400,10700,10900,3900,18.7,102,2.9,3,39285,52.1,34.1,18.5,21.3,38.3,160,110.5,58.6,83.4,34.7,68.6,31,20.3,5.4,89.3,110.6,61.5,72.7,13.7,20.7,25.9,110,16.2,39,73.8,23.7,249,29.6,117.3,199)),-500L),class = "data.frame")

解决方法

这是一种实现方法:

closeness_fnc <- function(DT,grp){
  cl <- tnet::closeness_w(DT[,sum(amount),by = .(from,to)],directed = T,gconly=FALSE,alpha = 0.5) 
  
  ind = DT[date == max(date),from]
  cl[ind,"n.closeness"]))
}

setDT(data)
data[,closeness_directed := closeness_fnc(data[(date >= end_date - 180) & (date <= end_date)]),by = .(end_date = date)]

##     end_date     id closeness_directed
##       <Date>  <int>              <num>
##1: 2006-08-31  83324         1.00000000
##2: 2006-09-09  87614         0.98744695
##3: 2006-09-13  88898         0.35329017
##4: 2006-09-18  89874         0.25176754
##5: 2006-10-07  94765         0.39233504
##6: 2006-10-31 100277         0.07167582
##7: 2006-10-31 101587         0.02390589

请注意,closeness_fnc()被简化为仅使用代码可以进行类似的调整。然后,我主要使用了您的原始方法,该方法似乎效果很好。

编辑 问题的一部分是,我们正在汇总每个日期的往返时间。相反,我们可以一次全部聚合并创建一个查找表。

closeness_fx3 = function(from,to,amount,date) {
  cl = closeness_w(data.frame(from,amount),directed = TRUE,gconly = FALSE,alpha = 0.5)
  
  cl[from[date == max(date)],"n.closeness"]
}

setDT(data)
agg_data = data[,.(tot_amount = sum(amount)),by = .(date)]
agg_data[,end_date := date - 180]

data[,closeness_directed := .SD[agg_data,on = .(date >= end_date,date <= date),closeness_fx3(from,x.date),by = .EACHI]$V1
]
data

使用bench::mark(包括检查结果是否相等),该方法比其他解决方案要快一些。

# A tibble: 3 x 13
  expression        min median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time
  <bch:expr>      <bch> <bch:>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm>
1 simple_refactor 414ms  592ms      1.81     188MB     2.89    10    16      5.54s
2 ekoam           337ms  356ms      2.39     183MB     3.59    10    15      4.18s
3 use_agg         299ms  314ms      2.77     178MB     3.88    10    14       3.6s
,

考虑如下重写您的closeness_func

closeness_info <- function(from,DT) {
  DT <- DT[,.(weights = sum(amount)),to)]
  res <- closeness_w(DT,TRUE,FALSE,alpha = 0.5)
  `names<-`(res[,"n.closeness"],row.names(res))[from]
}

然后,您可以使用以下data.table操作来实现您的目标:

set_closeness <- function(DT) {
  DT[,closeness_directed := closeness_info(.SD$from,DT[between(date,.BY$date - 180,.BY$date),]),by = date]
}

现在,让我们使用您提供的长数据集,对照您的原始性能对本实现的性能进行基准测试。在这里,我将其称为df。我们首先创建该数据集的两个副本。这是必需的,因为默认情况下data.table使用引用传递。如果我们没有创建数据集的两个副本,那么以下测试将始终应用于同一对象(即df)。

data1 <- copy(df)
data2 <- copy(df)

我们将您的原始实现应用于data1

> system.time({
+   closeness_fnc <- function(data){
+     accounts <- data[date == max(date),from]
+     id <- data[date == max(date),id]  
+     
+     # for directed networks
+     df <- data %>% group_by(from,to) %>% mutate(weights = sum(amount)) %>% select(from,weights) %>% distinct
+     cl <- closeness_w(df,alpha = 0.5) 
+     
+     list(
+       id = id,+       closeness_directed = cl[,"n.closeness"][accounts]
+     )
+     
+   }
+   network_data <- data1[,closeness_fnc(data1[(date >= end_date - 180) & (date <= end_date)]),.(end_date = date)] %>% select(-end_date)
+   data1 <- merge(x = data1,y = network_data,by = "id")
+ })
   user  system elapsed 
   1.19    0.07    1.26

然后我们将上面的新实现应用于data2

> system.time({
+   closeness_info <- function(from,DT) {
+     DT <- DT[,to)]
+     res <- closeness_w(DT,alpha = 0.5)
+     `names<-`(res[,row.names(res))[from]
+   }
+   set_closeness <- function(DT) {
+     DT[,by = date]
+   }
+   set_closeness(data2)
+ })
   user  system elapsed 
   0.33    0.07    0.40 

它们产生相同的结果集吗?

> all(data1[order(id),] == data2[order(id),])
[1] TRUE

输出:

> data2[order(id),]
        id from   to       date amount closeness_directed
  1:     0 5695 7468 2005-01-04    700       0.0344016544
  2:     1 9379 6213 2005-01-08  11832       0.0492013976
  3:     2 8170 7517 2005-01-10   1000       0.0097043019
  4:     3 9845 6143 2005-01-12   4276       0.0142486370
  5:     4 9640 6254 2005-01-14    200       0.0022874217
 ---                                                     
496: 21994 6671 6033 2005-02-07  14100       0.0064464840
497: 21995 7800 8967 2005-02-07  33045       0.0098688428
498: 21996 9835 8105 2005-02-07   1800       0.0023032952
499: 21997 7008 6794 2005-02-08    200       0.0006955321
500: 21998 6074 9457 2005-02-08    600       0.0026025058

鉴于您不想重写closeness_w函数,我认为这可能是最好的性能,按照David的建议,这可能是瓶颈。

版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。