如何解决如何在R中的data.table中使用自定义函数
这是我的交易数据。它显示了从from
列中的帐户到to
列中的帐户的交易,并带有日期和金额信息
data
id from to date amount
<int> <chr> <chr> <date> <dbl>
19521 6644 6934 2005-01-01 700.0
19524 6753 8456 2005-01-01 600.0
19523 9242 9333 2005-01-01 1000.0
… … … … …
1056317 7819 7454 2010-12-31 60.2
1056318 6164 7497 2010-12-31 107.5
1056319 7533 7492 2010-12-31 164.1
我想在每笔特定交易发生之前的最近6个月中,对交易网络上的closeness centrality
度量进行计算,并希望将此信息另存为原始数据中的新列。
我将在此处使用的示例数据是:
structure(list(id = c(83324L,87614L,88898L,89874L,94765L,100277L,101587L),from = c("5370","7816","8046","5492","8756","5370","9254"),to = c("9676","9105","5370"),date = structure(c(13391,13400,13404,13409,13428,13452,13452),class = "Date"),amount = c(261.1,16400,3500,2700,19882,182,14.6)),row.names = c(NA,-7L),class = "data.frame")
library(tnet)
closeness_fnc <- function(data){
accounts <- data[date == max(date),from]
id <- data[date == max(date),id]
# for directed networks
df <- data %>% group_by(from,to) %>% mutate(weights = sum(amount)) %>% select(from,to,weights) %>% distinct
cl <- closeness_w(df,directed = T,gconly=FALSE,alpha = 0.5)
list(
id = id,closeness_directed = cl[,"n.closeness"][accounts]
)
}
network_data <- data[,closeness_fnc(data[(date >= end_date - 180) & (date <= end_date)]),.(end_date = date)] %>% select(-end_date)
# adding this info into the original data
data <- merge(x = data,y = network_data,by = "id")
所以,输出结果与我预期的一样:
# data
id from to date amount closeness_directed
<int> <chr> <chr> <date> <dbl> <dbl>
83324 5370 9676 2006-08-31 261.1 1.00000000
87614 7816 5370 2006-09-09 16400.0 0.98744695
88898 8046 5370 2006-09-13 3500.0 0.35329017
89874 5492 5370 2006-09-18 2700.0 0.25176754
94765 8756 5370 2006-10-07 19882.0 0.39233504
100277 5370 9105 2006-10-31 182.0 0.07167582
101587 9254 5370 2006-10-31 14.6 0.02390589
但是,由于我的数据有超过一百万行,因此该代码将需要一天以上的时间才能完成(它运行了12个小时以上,但尚未完成)。
我有一个类似的运行时问题here,并且我想对这段代码应用相同的逻辑。因此,我将代码修改如下:
library(tnet)
closeness_fnc <- function(data){
accounts <- data[date == max(date),alpha = 0.5)
closeness_directed <- cl[,"n.closeness"][accounts]
closeness_directed <- as.data.frame(closeness_directed)
closeness_directed$from <- rownames(closeness_directed)
rownames(closeness_directed) <- NULL
return(closeness_directed)
}
# this is the approach given in the link I provided:
setDT(data)[,date_minus_180 := date - 180]
data[,':=' (closeness_directed = data[data,closeness_fnc(data),on = .(from,date <= date,date >= date_minus_180),by = .EACHI]$closeness_directed
)] %>% select(-date_minus_180)
但是,自那以后显然不起作用
data[data,by = .EACHI]
提供输出
from date date closeness_directed from
<chr> <date> <date> <dbl> <chr>
5370 2006-08-31 2006-03-04 0.07167582 5370
5370 2006-08-31 2006-03-04 0.02390589 9254
7816 2006-09-09 2006-03-13 0.07167582 5370
7816 2006-09-09 2006-03-13 0.02390589 9254
8046 2006-09-13 2006-03-17 0.07167582 5370
8046 2006-09-13 2006-03-17 0.02390589 9254
5492 2006-09-18 2006-03-22 0.07167582 5370
5492 2006-09-18 2006-03-22 0.02390589 9254
8756 2006-10-07 2006-04-10 0.07167582 5370
8756 2006-10-07 2006-04-10 0.02390589 9254
1-10 of 14 rows
更大的数据集
structure(list(id = c(19521L,19522L,19523L,19524L,19525L,19526L,19527L,19528L,19529L,19530L,19531L,0L,19532L,19533L,19534L,21971L,21972L,21973L,21974L,21975L,21976L,21977L,21978L,21979L,21980L,21981L,1L,21761L,21762L,21763L,21764L,21765L,21766L,21767L,21982L,21983L,21984L,21768L,21769L,21770L,21771L,21772L,21773L,2L,21774L,21775L,21776L,21777L,21778L,21779L,21780L,21781L,21782L,3L,21783L,21784L,21785L,21786L,21787L,21788L,21789L,21790L,21791L,21792L,21793L,21794L,21795L,21796L,4L,21797L,21798L,21799L,21800L,21801L,21802L,21803L,21804L,21805L,21806L,21807L,21808L,21809L,21810L,21811L,21812L,21813L,21814L,21815L,5L,21816L,21817L,21818L,21819L,21820L,21821L,21822L,21823L,21824L,21825L,21826L,21827L,21828L,21829L,21830L,6L,21831L,21832L,21833L,21834L,21835L,21836L,21837L,21838L,7L,21839L,21840L,21841L,21842L,21843L,21844L,21845L,21846L,21847L,21848L,21849L,21850L,21851L,21852L,21853L,21854L,21855L,21856L,21857L,8L,21858L,21859L,9L,10L,21860L,21861L,21862L,21863L,21864L,21865L,21866L,21867L,21868L,21869L,21870L,21871L,21872L,21873L,21874L,21875L,21876L,21877L,21878L,21879L,21880L,21881L,21882L,21883L,21884L,21885L,21886L,21887L,21888L,21889L,21890L,21891L,21892L,21893L,21894L,21895L,21896L,21897L,21898L,21899L,21900L,11L,21901L,21902L,21903L,21904L,21905L,21906L,21907L,21908L,21909L,12L,21910L,21911L,21912L,21913L,21914L,21915L,21916L,21917L,21918L,21919L,13L,21920L,21921L,21922L,21923L,21924L,21925L,21926L,21927L,21928L,21929L,21930L,21931L,21932L,21933L,21934L,21935L,21936L,14L,21937L,21938L,21939L,21940L,21941L,21942L,21957L,21958L,21959L,21960L,21961L,21962L,21963L,21964L,15L,21965L,21966L,21967L,21968L,21969L,21970L,21985L,21986L,21987L,21988L,21989L,21990L,21991L,21992L,21993L,21994L,21995L,21996L,16L,17L,21551L,21552L,21553L,21554L,21555L,21556L,21557L,21558L,21559L,21560L,21561L,21562L,21563L,21564L,21565L,21566L,21567L,21997L,21998L,18L,21568L,21569L,21570L,21571L,21572L,21573L,21574L,21575L,21576L,21577L,21578L,21579L,21580L,21581L,19L,21582L,21583L,21584L,21585L,21586L,21587L,21588L,21589L,21590L,21591L,21592L,20L,21593L,21594L,21595L,21596L,21597L,21598L,21599L,21600L,21601L,21602L,21603L,21604L,21605L,21606L,21L,21607L,21608L,21609L,21610L,21611L,21612L,21613L,21614L,21615L,21616L,21617L,21618L,21619L,21620L,21621L,21622L,21623L,21624L,21625L,21626L,22L,21627L,21628L,21629L,21630L,21631L,21632L,21633L,21634L,21635L,21636L,21637L,21638L,21639L,21640L,21641L,21642L,21643L,21644L,21645L,23L,21646L,21647L,21648L,21649L,21650L,21651L,21652L,21653L,21654L,21655L,21656L,21657L,21658L,24L,21659L,21660L,21661L,21662L,21663L,21664L,21665L,21666L,21667L,21668L,21669L,25L,21670L,21671L,21672L,21673L,21674L,21675L,21676L,21677L,21678L,21679L,21680L,21681L,21682L,21683L,26L,21684L,21685L,21686L,21687L,21688L,21689L,21690L,21691L,21692L,21693L,21694L,21695L,21696L,21697L,21698L,21699L,21700L,21701L,21702L,21703L,27L,21704L,21719L,21720L,21721L,21722L,21723L,21724L,21725L,21726L,21727L,21728L,21729L,21730L,21731L,21732L,28L,21733L,21734L,21735L,21736L,21737L,21738L,21739L,21740L,29L,21741L,21742L,21743L,21744L,21745L,21746L,21747L,21748L,21749L,21750L,21751L,21752L,21753L,21754L,21755L,21756L,21757L,21758L,30L,31L,32L,33L,34L,35L,36L,37L,21229L,21230L,21231L,21232L,21233L,21234L,21235L,21236L,21237L,21238L,21239L,21240L,21241L,21242L,21243L,21244L,21245L,21246L,21247L,21248L,21249L,21250L,21251L,21252L,21253L,21254L,21255L,21256L,21257L,21258L),from = c("6644","9843","9242","6753","7075","8685","5513","6340","6042","5587","7237","5695","9582","8539","7939","9077","8946","5591","8380","5865","7867","9457","6968","7971","6150","9361","9379","8409","9740","7226","7531","6752","7362","6661","5730","5417","9049","7057","6252","9476","6228","8896","7371","8170","7122","6694","5450","9435","5619","8289","9862","5504","6555","9845","7537","9482","6810","8257","8490","6588","9652","7303","5852","5746","9198","6917","8688","9460","9640","7054","8628","7065","9006","6832","6185","8422","6914","7069","7848","8436","5494","6375","5653","8912","9794","8413","6527","9101","5815","6923","8184","6811","8130","6539","8643","6329","7744","8211","9641","8003","5599","8715","7108","9573","8583","5648","6444","5660","8191","9830","5931","7921","8314","7940","6265","6604","6509","5618","5860","6469","9525","5887","6626","7145","6862","5741","9144","9163","7297","7599","8427","8865","9418","8636","6530","9155","6934","8817","9028","5521","5943","7443","9557","8239","6819","9761","5983","6830","6368","5381","8782","8008","9160","9615","6920","6164","6278","9729","8960","6358","5939","8902","9522","7344","9070","6594","8058","6639","7896","6325","7804","9554","9725","8475","7746","7536","9671","5415","6837","8327","9061","8981","9226","5862","7085","8925","6226","6849","8432","9545","5837","5440","9732","8695","7690","5829","9373","7977","6361","7320","7603","6303","7077","7850","5792","9588","9204","8648","8950","7106","6334","6843","7060","9606","5520","9350","7463","7947","9668","9490","6241","8830","6374","9528","7919","8532","6795","8162","9275","8106","8615","9206","8283","7052","7737","7815","7932","6125","6671","7800","9835","5573","7874","8931","6748","8192","6822","6950","8020","8555","8986","7644","5736","8421","6224","8374","8304","8677","9208","7008","6074","9409","6269","9721","9304","9117","5420","9691","7728","8579","7495","9838","8139","9571","5385","5454","9620","7723","9249","7033","7966","9844","5793","5747","6362","6925","9318","6780","7150","6818","7246","5514","9574","7838","5540","6646","6893","6417","8039","8721","8763","6401","6510","7970","7117","6001","7505","7646","5600","6522","8395","5601","5418","6296","8790","7622","9012","8165","7624","5468","9316","9030","7155","5702","7492","8503","9868","6807","6404","9076","7213","8735","7849","8551","9351","6693","9653","9504","6948","9358","9280","8168","5456","9138","8420","9312","8930","7699","5506","9475","5776","5517","5644","8680","5474","7534","9363","9586","6508","6193","5401","8032","8461","9387","5812","7564","5917","5434","5794","7840","9085","8331","7175","6669","6352","7432","9810","8776","6112","8869","8248","9450","6974","7264","7336","6880","7866","7777","7502","5615","9777","9214","6039","7714","9056","8358","8963","8657","8846","9319","7220","7764","8967","8683","9137","6971","9747","7449","8259","5373","7300","6273","8391","7862","5696","6622","9240","7021","7313","7247","6679","8102","6812","9473","6345","7935","9696","5541","8939","6887","8998","9110","8666","6670","8975","7518","7549","7841","8888","5808","9807","6860","9811","5935","8966","8684","5915","8892","8493","7894","6342","6382","7833","7201","7253","6720","6175","9201","5682","5473","7173","6094","8810","5874","6947","8462","6885","6201"),to = c("6934","9115","9333","8456","7207","6046","7047","6213","9493","6248","7468","6727","6912","9251","6460","5773","6951","6153","6634","9440","8220","8512","8105","8786","6454","5997","6253","7517","6935","6143","9628","6517","8078","6442","6254","6217","5605","7084","5985","5443","8665","6883","8472","5715","5409","8876","5610","6043","5668","6986","7382","8671","8336","9750","6566","6145","7134","5606","9682","5635","8820","8323","6616","8678","7293","7843","6192","9131","7143","6333","6995","9770","5835","5614","8134","8887","8631","9744","8697","9664","9202","8606","9037","6989","7248","9519","6033","9565","9723","7222","8739","6502","5980","7001","9200","7471","6794","5621","6114","6772","9535","7034","7446","6896","7345","8183","6479","6503","5592","9712","6844","7109","5562","6984","5922","9839","8041","9039","8178","7990","9803","8879","8221","8545","8321","6994","6673","7769","6104","7818","8941","7825","7770","6954","6194","9741","5384","8653","6659","9321","7124","5866","7718","7321","9084","6507","9033","6540","6857","8945","9390","6359","8757","8280","7049","6205","7604","6085","6299","6671"),date = structure(c(12784,12784,12785,12786,12787,12788,12789,12790,12791,12792,12793,12794,12795,12796,12797,12798,12799,12800,12801,12802,12803,12804,12805,12806,12807,12808,12809,12810,12811,12812,12813,12814,12815,12816,12817,12818,12819,12820,12821,12822,12823,12824,12825,12826,12827,12828,12829,12830,12831,12832,12833,12834,12835,12836,12837,12838,12839,12840,12841,12842,12842),amount = c(700,900,1000,600,400,1100,200,800,700,300,5123,3401,500,3242,5298,11832,6774,14264,13851,17366,6007,6207,12644,4276,6434,14779,4507,6446,17477,5009,5718,13967,6959,15914,4470,10737,44749,46552,13156,23323,10792,7268,16125,14440,49752,36518,19961,21899,12336,38000,16983,13.4,42.7,34700,12.6,47.5,13.3,37.1,17,11.1,15.5,22.2,55.8,11.8,50.1,45,15.9,38.8,38.2,20.1,38.9,7.1,107.1,48,62.4,2900,21.5,19.1,14,19.5,15.2,5282,94.7,19.4,28.2,110.2,0.8,23.1,20,19.6,2000,5100,11900,5500,7500,6000,25300,45647,7000,5224,30192,24381,23330,8500,3191,23041,5029,6238,20213,7618,19935,5859,3375,38645,1600,10600,3217,14626,4550,4356,6689,9612,5080,5039,4212,17632,3395,3399,14493,28157,1800,31348,5544,14100,33045,20066,27666,3151,1500,22452,44333,18347,6242,13900,19746,48098,7041,9100,10584,49590,3021,5400,1200,5072,47831,12015,5200,8905,23524,22349,31038,25200,43737,12154,23736,24863,29700,14622,4758,5810,19588,27078,32594,25609,20281,6310,14319,14400,9600,13875,12043,4391,4327,9000,6698,16392,15263,18729,3098,4729,24400,9658,14963,9800,20567,3058,18497,6148,15447,24009,12138,4200,1300,8000,29511,6600,44162,24023,33410,14800,19136,33900,4100,10500,3700,100,3300,9400,45925,41740,6200,3100,31332,62100,7100,28361,4000,4500,2500,11000,8900,23990,1400,10700,10900,3900,18.7,102,2.9,3,39285,52.1,34.1,18.5,21.3,38.3,160,110.5,58.6,83.4,34.7,68.6,31,20.3,5.4,89.3,110.6,61.5,72.7,13.7,20.7,25.9,110,16.2,39,73.8,23.7,249,29.6,117.3,199)),-500L),class = "data.frame")
解决方法
这是一种实现方法:
closeness_fnc <- function(DT,grp){
cl <- tnet::closeness_w(DT[,sum(amount),by = .(from,to)],directed = T,gconly=FALSE,alpha = 0.5)
ind = DT[date == max(date),from]
cl[ind,"n.closeness"]))
}
setDT(data)
data[,closeness_directed := closeness_fnc(data[(date >= end_date - 180) & (date <= end_date)]),by = .(end_date = date)]
## end_date id closeness_directed
## <Date> <int> <num>
##1: 2006-08-31 83324 1.00000000
##2: 2006-09-09 87614 0.98744695
##3: 2006-09-13 88898 0.35329017
##4: 2006-09-18 89874 0.25176754
##5: 2006-10-07 94765 0.39233504
##6: 2006-10-31 100277 0.07167582
##7: 2006-10-31 101587 0.02390589
请注意,closeness_fnc()
被简化为仅使用data.table。 dplyr代码可以进行类似的调整。然后,我主要使用了您的原始方法,该方法似乎效果很好。
编辑 问题的一部分是,我们正在汇总每个日期的往返时间。相反,我们可以一次全部聚合并创建一个查找表。
closeness_fx3 = function(from,to,amount,date) {
cl = closeness_w(data.frame(from,amount),directed = TRUE,gconly = FALSE,alpha = 0.5)
cl[from[date == max(date)],"n.closeness"]
}
setDT(data)
agg_data = data[,.(tot_amount = sum(amount)),by = .(date)]
agg_data[,end_date := date - 180]
data[,closeness_directed := .SD[agg_data,on = .(date >= end_date,date <= date),closeness_fx3(from,x.date),by = .EACHI]$V1
]
data
使用bench::mark
(包括检查结果是否相等),该方法比其他解决方案要快一些。
# A tibble: 3 x 13
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time
<bch:expr> <bch> <bch:> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm>
1 simple_refactor 414ms 592ms 1.81 188MB 2.89 10 16 5.54s
2 ekoam 337ms 356ms 2.39 183MB 3.59 10 15 4.18s
3 use_agg 299ms 314ms 2.77 178MB 3.88 10 14 3.6s
,
考虑如下重写您的closeness_func
:
closeness_info <- function(from,DT) {
DT <- DT[,.(weights = sum(amount)),to)]
res <- closeness_w(DT,TRUE,FALSE,alpha = 0.5)
`names<-`(res[,"n.closeness"],row.names(res))[from]
}
然后,您可以使用以下data.table
操作来实现您的目标:
set_closeness <- function(DT) {
DT[,closeness_directed := closeness_info(.SD$from,DT[between(date,.BY$date - 180,.BY$date),]),by = date]
}
现在,让我们使用您提供的长数据集,对照您的原始性能对本实现的性能进行基准测试。在这里,我将其称为df
。我们首先创建该数据集的两个副本。这是必需的,因为默认情况下data.table
使用引用传递。如果我们没有创建数据集的两个副本,那么以下测试将始终应用于同一对象(即df
)。
data1 <- copy(df)
data2 <- copy(df)
我们将您的原始实现应用于data1
:
> system.time({
+ closeness_fnc <- function(data){
+ accounts <- data[date == max(date),from]
+ id <- data[date == max(date),id]
+
+ # for directed networks
+ df <- data %>% group_by(from,to) %>% mutate(weights = sum(amount)) %>% select(from,weights) %>% distinct
+ cl <- closeness_w(df,alpha = 0.5)
+
+ list(
+ id = id,+ closeness_directed = cl[,"n.closeness"][accounts]
+ )
+
+ }
+ network_data <- data1[,closeness_fnc(data1[(date >= end_date - 180) & (date <= end_date)]),.(end_date = date)] %>% select(-end_date)
+ data1 <- merge(x = data1,y = network_data,by = "id")
+ })
user system elapsed
1.19 0.07 1.26
然后我们将上面的新实现应用于data2
:
> system.time({
+ closeness_info <- function(from,DT) {
+ DT <- DT[,to)]
+ res <- closeness_w(DT,alpha = 0.5)
+ `names<-`(res[,row.names(res))[from]
+ }
+ set_closeness <- function(DT) {
+ DT[,by = date]
+ }
+ set_closeness(data2)
+ })
user system elapsed
0.33 0.07 0.40
它们产生相同的结果集吗?
> all(data1[order(id),] == data2[order(id),])
[1] TRUE
输出:
> data2[order(id),]
id from to date amount closeness_directed
1: 0 5695 7468 2005-01-04 700 0.0344016544
2: 1 9379 6213 2005-01-08 11832 0.0492013976
3: 2 8170 7517 2005-01-10 1000 0.0097043019
4: 3 9845 6143 2005-01-12 4276 0.0142486370
5: 4 9640 6254 2005-01-14 200 0.0022874217
---
496: 21994 6671 6033 2005-02-07 14100 0.0064464840
497: 21995 7800 8967 2005-02-07 33045 0.0098688428
498: 21996 9835 8105 2005-02-07 1800 0.0023032952
499: 21997 7008 6794 2005-02-08 200 0.0006955321
500: 21998 6074 9457 2005-02-08 600 0.0026025058
鉴于您不想重写closeness_w
函数,我认为这可能是最好的性能,按照David的建议,这可能是瓶颈。
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。