作者热门文章
- html - 出于某种原因,IE8 对我的 Sass 文件中继承的 html5 CSS 不友好?
- JMeter 在响应断言中使用 span 标签的问题
- html - 在 :hover and :active? 上具有不同效果的 CSS 动画
- html - 相对于居中的 html 内容固定的 CSS 重复背景?
据我了解,data.table 比 dplyr 更高效、更快,但我今天在工作中发现了相反的情况。我创建了一个模拟来解释这种情况。
library(data.table)
library(dplyr)
library(microbenchmark)
# Simulated data: 10,000 rows of random integer columns A, B, C and D.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
dt <- data.table(
  A = sample(1:4247, 10000, replace = TRUE),
  B = sample(1:119, 10000, replace = TRUE),
  C = sample(1:6, 10000, replace = TRUE),
  D = sample(1:30, 10000, replace = TRUE)
)
# ID concatenates A, D and C; paste()'s default separator (a space) is kept,
# so the pieces are joined as "A ::: D ::: C".
dt[, ID := paste(A, ":::", D, ":::", C)]
# execution time
# Compare two ways of counting the distinct ID values in each (A, B, C) group.
# Each expression is evaluated 10 times (times = 10).
microbenchmark(
# data.table: uniqueN(ID) evaluated once per by-group
DATA_TABLE = dt[, .(count=uniqueN(ID)),
by=c("A","B","C")
],
# dplyr: n_distinct(ID) after grouping on the same three columns
DPLYR = dt %>%
group_by(A,B,C) %>%
summarise(count = n_distinct(ID)),
times = 10
)
Unit: milliseconds
expr min lq mean median uq max neval
DATA_TABLE 14241.57361 14305.67026 15585.80472 14651.16402 16244.22477 21367.56866 10
DPLYR 35.95123 37.63894 47.62637 48.56598 53.59919 62.63978 10
最佳答案
这是另一种选择:
# Grouping columns; defined here so that this snippet runs on its own.
cols <- c("A", "B", "C")

# Count the distinct ID values per (A, B, C) group without a by= clause.
# Rows are sorted by the grouping columns AND by ID: rleidv() numbers runs of
# consecutive identical rows, so equal IDs must be adjacent inside a group,
# otherwise an ID that reappears later in the group would be counted again.
dt[order(A, B, C, ID), {
  # Run id that increases whenever (A, B, C, ID) differs from the previous row.
  uniqn <- rleidv(c(.SD, .(ID)))
  # On sorted rows rowidv() restarts at 1 when a new (A, B, C) combination
  # begins, so a non-positive diff marks the last row of a group; .N closes
  # the final group.
  lastidx <- c(which(diff(rowidv(.SD)) < 1L), .N)
  # Distinct count per group = growth of the run id between group ends.
  c(.SD[lastidx], .(count = c(uniqn[lastidx[1L]], diff(uniqn[lastidx]))))
}, .SDcols = cols]
# Grouping columns shared by all benchmarked expressions.
cols <- c("A", "B", "C")

# Time each approach once (times = 1L). Every expression also assigns its
# result to a variable so the outputs can be compared afterwards.
microbenchmark(times = 1L,
  # data.table with uniqueN() per group
  DATA_TABLE = a00 <- dt[, .(count = uniqueN(ID)), cols],
  # data.table with length(unique()) per group
  DATA_TABLE1 = a01 <- dt[, .(count = length(unique(ID))), cols],
  # dplyr with n_distinct() per group
  DPLYR = a_dplyr <- dt %>%
    group_by(A, B, C) %>%
    summarise(count = n_distinct(ID)),
  # https://github.com/Rdatatable/data.table/issues/1120#issuecomment-463584656
  # de-duplicate on (cols, ID) first, then count rows per group
  mtd0 = a10 <- unique(dt, by = c(cols, "ID"))[, .(count = .N), cols],
  # https://github.com/Rdatatable/data.table/issues/1120#issuecomment-463597107
  # aggregate on (cols, ID) first, then count rows per group
  mtd1 = a11 <- dt[, .N, c(cols, "ID")][, .(count = .N), cols],
  # Run-length approach. ID is part of the sort key because rleidv() counts
  # runs of consecutive identical rows: without it, an ID that reappears
  # non-adjacently inside a group would be counted more than once.
  mtd2 = a2 <- dt[order(A, B, C, ID), {
    # run id that increases whenever (A, B, C, ID) changes
    uniqn <- rleidv(c(.SD, .(ID)))
    # last row index of every (A, B, C) group; .N closes the final group
    lastidx <- c(which(diff(rowidv(.SD)) < 1L), .N)
    # distinct count per group = growth of the run id between group ends
    c(.SD[lastidx], .(count = c(uniqn[lastidx[1L]], diff(uniqn[lastidx]))))
  }, .SDcols = cols]
)
> fsetequal(a00, a01)
[1] TRUE
> fsetequal(a00, setDT(a_dplyr))
[1] TRUE
> fsetequal(a00, a10)
[1] TRUE
> fsetequal(a00, a11)
[1] TRUE
> fsetequal(a00, a2)
[1] TRUE
Unit: milliseconds
expr min lq mean median uq max neval
DATA_TABLE 147478.1089 147478.1089 147478.1089 147478.1089 147478.1089 147478.1089 1
DATA_TABLE1 4998.8236 4998.8236 4998.8236 4998.8236 4998.8236 4998.8236 1
DPLYR 244081.6925 244081.6925 244081.6925 244081.6925 244081.6925 244081.6925 1
mtd0 4519.4046 4519.4046 4519.4046 4519.4046 4519.4046 4519.4046 1
mtd1 2866.5808 2866.5808 2866.5808 2866.5808 2866.5808 2866.5808 1
mtd2 809.7442 809.7442 809.7442 809.7442 809.7442 809.7442 1
#R-3.6.1 64bit Win10
library(data.table) #data.table_1.12.8 getDTthreads()==4
library(dplyr) #dplyr_1.0.0
library(microbenchmark)
# Simulated data: nr rows of random integer columns A, B, C and D.
set.seed(0L)  # fixed seed so the simulated table is reproducible
nr <- 1e6
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
dt <- data.table(
  A = sample(1:424700, nr, replace = TRUE),
  B = sample(1:11900, nr, replace = TRUE),
  C = sample(1:600, nr, replace = TRUE),
  D = sample(1:3000, nr, replace = TRUE)
)
# ID concatenates A, D and C; paste()'s default separator (a space) is kept,
# so the pieces are joined as "A ::: D ::: C".
dt[, ID := paste(A, ":::", D, ":::", C)]
关于 R:为什么 dplyr(n_distinct)按组计算唯一值比 data.table(uniqueN)更快?我们在 Stack Overflow 上找到一个类似的问题: https://stackoverflow.com/questions/60623235/
据我了解,data.table 比 dplyr 更高效、更快,但我今天在工作中发现了相反的情况。我创建了一个模拟来解释这种情况。 library(data.table) library(dplyr)
给定这样的数据集: test =data.table( id = c("a", "b", "c", "d", "e", "e", "e", "f", "g", "h", "i", "j", "k
我是一名优秀的程序员,十分优秀!