r - 加速 data.table 相交-6ren

r - 加速 data.table 相交

转载作者：行者123 更新时间：2023-12-04 12:27:47

25

4

我正在寻找更快的方法来计算这种事情。 (真实数据库有 109 964 694 行)。

DT<-data.table(id=c(1,2,1,4,2,1,7,8,8,10),
               effect=c("A","A","B","B","B","C","C","C","A","D"),
               value=1)

我想拥有 effect =="A" 的 ID和 effect == "C"

intersect(DT[effect=="A",(id)], DT[effect=="C",(id)])

有没有其他更快的方法来做到这一点？

我也想拥有 effect =="A" 的 ID和 effect =="C" .. 但不是 effect=="B"

setdiff(
  intersect(DT[effect=="A",(id)],DT[effect=="C",(id)]),
DT[effect=="B",(id)]
)

有什么更好(更快)的想法吗？

问候

PS:我尝试过这种事情:

DT[,effect=="diag_998" | effect=="diag_1900",by=id][,sum(V1)==2,by=id][V1==TRUE,(id)]

但它非常非常非常慢...

编辑:感谢大家的建议。

似乎没有什么比 V1 和 A1 更快了:

system.time(V1<-intersect(DT[effect=="A",(id)],DT[effect=="C",(id)]))# 0.014 sec
system.time(V2<-DT[effect=="A" | effect=="C", unique(id[duplicated(id)])]) #17sec
system.time(V3<-DT[,list(cond=all(c("A","C") %in% effect)),by=id][cond==TRUE,id] ) #more than 1 min
system.time(V4<-DT[effect=="A" | effect=="C", .N[.N > 1], by = id]$id) # 17 sec
system.time(V5<-DT[, .GRP[sum(c("A", "C") %chin% effect)==2], id]$id) # 103 sec

system.time(V6<-DT[, .GRP[sum(c("A", "C") %in% effect)==2], id]$id)#more than 1 min
setkey(DT, effect)
system.time(V7<-DT[.(c("A", "C")), if (.N > 1) TRUE, by = id]$id)#0.19 sec



system.time(A1<-setdiff(intersect(DT[effect=="A",(id)],DT[effect=="C",(id)]),DT[effect=="D",(id)])) # 0.014 sec
system.time(A2<-DT[,list(cond=all(c("A","C") %in% effect) & (!"D" %in% effect)),by=id][cond==TRUE,id])#more than 1 min
system.time(A3<-DT[,list(cond=all(c("A","C") %chin% effect) & (!"D" %chin% effect)),by=id][cond==TRUE,id])
system.time(A4<-DT[.(c("A", "C", "D")), if (.N == 2 & !("D" %in% effect)) TRUE, by = id]$id)# 1sec

编辑:另一个基准，感谢@MichaelChirico

microbenchmark(times=50L,
+                bakal=intersect(DT[effect=="A", id], DT[effect=="C", id]),
+                bakal.keyed=intersect(DT["A", id], DT["C", id]),
+                rscr1=DT[effect %in% c("A","C"), unique(id[duplicated(id)])],
+                rscr1.keyed=DT[.(c("A","C")), unique(id[duplicated(id)])])
Unit: milliseconds
        expr       min        lq      mean    median        uq       max neval
       bakal 10.963171 11.003494 11.072844 11.019909 11.060331 12.641751    50
 bakal.keyed 10.738537 10.794715 10.878960 10.838630 10.892020 12.416713    50
       rscr1  9.504886  9.572026  9.662024  9.598426  9.645478 11.127047    50
 rscr1.keyed  9.013076  9.037370  9.065215  9.065669  9.083492  9.206366    50

最佳答案

看来你走对了路；我只能建议键入以加快速度。

这是一个小样本数据集，用于剔除最糟糕的尝试:

set.seed(10239)
NN<-1e6
DT<-data.table(id=sample(8e5,NN,T),
               effect=sample(LETTERS[1:4],NN,T),
               val=rnorm(NN),key="effect")

我最初的建议是使用键控，但不是最好的方式，基本上是: setkey(DT,effect); DT[.(c("A", "C")), if (.N > 1) TRUE, by = id]$id 。

这是这种方法的基准，您的原始方法以及评论中提到的其他方法(我做了一些优化，例如在@nicola 的方法中用 cond==TRUE 替换 (cond)，在@RichardScriven 的方法中用 effect=="A"|effect=="C" 替换 effect %in% c("A", C")):

library(microbenchmark)
microbenchmark(times=50L,
               bakal=intersect(DT[effect=="A", id], DT[effect=="C", id]),
               rscr1=DT[effect %in% c("A","C"), unique(id[duplicated(id)])],
               nicol=DT[,.(cond=all(c("A","C") %in% effect)), by=id][(cond), id],
               rscr2=DT[effect %in% c("A","C"), .N[.N > 1], by = id]$id,
               akrun=DT[, .GRP[sum(c("A", "C") %chin% effect) == 2], id]$id,
               mikec=DT[.(c("A", "C")), if (.N > 1) TRUE, by = id]$id)

结果在我的机器上:

Unit: milliseconds
  expr        min         lq       mean     median         uq        max neval   cld
 bakal   14.82926   15.18540   17.86200   15.56453   16.70924   64.99443    50 a    
 rscr1   13.41102   13.98252   20.11127   14.93054   18.02248   66.14476    50 a    
 nicol 1329.82013 1377.03884 1436.45650 1404.48956 1483.47944 1758.00831    50     e
 rscr2  260.54888  269.86605  294.05219  276.66802  310.76356  479.50419    50   c  
 akrun  997.43300 1075.17322 1103.06220 1095.08246 1118.16848 1360.80793    50    d 
 mikec  154.39418  158.90884  180.01096  163.32763  206.59246  235.45523    50  b

让我们看看这些在您提到的大小的 data.table 上的表现如何:

set.seed(12039)
NN<-1e8
DT<-data.table(id=sample(8e5,NN,T),
               effect=sample(LETTERS[1:4],NN,T),
               val=rnorm(NN),key="effect")
microbenchmark(times=50L,
               bakal=intersect(DT[effect=="A", id], DT[effect=="C", id]),
               bakal.keyed=intersect(DT["A", id], DT["C", id]),
               rscr1=DT[effect %in% c("A","C"), unique(id[duplicated(id)])],
               rscr1.keyed=DT[.(c("A","C")), unique(id[duplicated(id)])])
Unit: seconds
        expr      min       lq     mean   median       uq      max neval cld
       bakal 3.772309 3.991414 4.395669 4.408319 4.681609 5.170224    50  a 
 bakal.keyed 3.708500 3.807447 4.289518 4.384870 4.653427 5.085429    50  a 
       rscr1 4.962940 5.210845 5.721636 5.707369 6.162103 6.779900    50   b
 rscr1.keyed 4.904702 5.117411 5.727848 5.807186 6.194990 6.975508    50   b