gpt4 book ai didi

R - 通过调节不同大小的序列来加速嵌套循环(矢量化?)

转载 作者:行者123 更新时间:2023-12-01 09:24:34 25 4
gpt4 key购买 nike

对于具有 3 列(X 位置、Y 位置和某个值 VAL)的数据集,我想对某些 XxY 间隔/箱中包含的所有 VAL 执行一些操作(例如,我想网格我的空间)。

我最初编写了简单的以下函数来执行此操作(myT 是传递的数据集,xboundsybounds 是连续区间的向量休息(垃圾箱)):

calcPerBin1 <- function(myT, xbounds, ybounds) {
newT <- data.frame(matrix(0, nrow=(length(xbounds)-1)*(length(ybounds)-1), ncol=3))
names(newT) <- c("X","Y","MEAN")
line <- 1
for (i in 1:(length(xbounds)-1)) {
for (j in 1:(length(ybounds)-1)) {
myTsubset <- myT[myT$X >= xbounds[i] & myT$X < xbounds[i+1] &
myT$Y >= ybounds[j] & myT$Y < ybounds[j+1], ]
newT$MEAN[line] <- mean(myTsubset$VAL)
newT$X[line] <- mean(c(xbounds[i], xbounds[i+1]))
newT$Y[line] <- mean(c(ybounds[j], ybounds[j+1]))
line <- line+1
}
}
return(newT)
}

SHORTCUT 问题:如何改进上述代码? (以下是我的第一次尝试-如果太长可以跳过!)


for 循环当然不是最理想的,而且它的执行时间很糟糕(无法将它与我的真实数据集一起使用)。因此,我尝试了以下代码(即,内部循环是矢量化的,如果我没记错的话):

calcPerBin2 <- function(myT,xbounds, ybounds) {
newT <- data.frame(matrix(0, nrow=(length(xbounds)-1)*(length(ybounds)-1), ncol=3))
names(newT) <- c("X","Y","MEAN")
xboundsmean <- vector() ; yboundsmean <- vector()
for (i in 1:(length(xbounds)-1)) {
xboundsmean <- c(xboundsmean, mean(c(xbounds[i],xbounds[i+1])))}
for (i in 1:(length(ybounds)-1)) {
yboundsmean <- c(yboundsmean, mean(c(ybounds[i],ybounds[i+1])))}
xyvals <- expand.grid(xmid=xboundsmean, ymid=yboundsmean)
xyvals$xmin <- xyvals$xmid-binsize/2
xyvals$xmax <- xyvals$xmid+binsize/2
xyvals$ymin <- xyvals$ymid-binsize/2
xyvals$ymax <- xyvals$ymid+binsize/2
res <- vector()
for (i in 1:dim(xyvals)[1]) {
cond <- (myT$X >= xyvals$xmin[i] & myT$X < xyvals$xmax[i] &
myT$Y >= xyvals$ymin[i] & myT$Y < xyvals$ymax[i])
res <- c(res, mean(myT$VAL[cond]))
}
newT$MEAN <- res
newT$X <- xyvals[,1]
newT$Y <- xyvals[,2]
return(newT)
}

非常丑陋,所以我尝试了以下变体:

calcPerBin2.2 <- function(myT,xbounds, ybounds, sizeofbin) {
newT <- data.frame(matrix(0, nrow=(length(xbounds)-1)*(length(ybounds)-1), ncol=3))
names(newT) <- c("X","Y","MEAN")
xcut <- cut(myT$X, breaks=xbounds)
ycut <- cut(myT$Y, breaks=ybounds)
xycut <- expand.grid(XCUT=levels(xcut), YCUT=levels(ycut))
xylowers <- cbind(xlower = as.numeric(sub("\\((.+),.*", "\\1", xycut$XCUT) ),
ylower = as.numeric(sub("\\((.+),.*", "\\1", xycut$YCUT) ))
res <- vector()
for (i in 1:dim(xycut)[1]) {
cond <- (xcut==xycut$XCUT[i] & ycut==xycut$YCUT[i])
res <- c(res, mean(myT$VAL[cond]))
}
newT$MEAN <- res
newT$X <- xylowers[,1]+sizeofbin/2
newT$Y <- xylowers[,2]+sizeofbin/2
return(newT)
}

我可以运行它,例如:

# Control parameters
xmax <- 500
ymax <- 1000
N <- 100000
binsize <- 50

xbins <- seq(0,xmax,binsize)
ybins <- seq(0,ymax,binsize) # xbins and ybins do NOT have the same size

# Generate dummy data
xcoords <- runif(N, 1, xmax)
ycoords <- runif(N, 1, ymax)
vals <- xcoords+ycoords**2
data <- data.frame(cbind(X=xcoords, Y=ycoords, VAL=vals))

# Run
system.time(test1 <- calcPerBin1(data, xbins, ybins))
system.time(test2 <- calcPerBin2(data, xbins, ybins))
system.time(test2.2 <- calcPerBin2.2(data, xbins, ybins, binsize))

calcPerBin2 产生了轻微的改进,但 calcPerBin2.2 甚至比 calcPerBin1 更差(而且,是的,所有代码都很难看)。我的问题是我不太清楚如何替换(矢量化?)calcPerBin2 中剩余的循环。例如,如何在 xyvals$xmin 的基础上有效地在 myT$X 上编写条件,而后者是向量形式(它们不具有相同的size) 而不是我在 calcPerBin2 中使用的索引形式?欢迎任何改进上述代码的建议 - 谢谢。

最佳答案

您可以在三行中完成 大部分 (使用 zoo 表示 rollmean):

library(zoo) # load the package
data$X <- cut(data$X, xbins, labels = rollmean(xbins, 2))
data$Y <- cut(data$Y, ybins, labels = rollmean(ybins, 2))
res <- aggregate(VAL ~ X + Y, data, mean)

检查结果:

# order it the same way as in test1, then show the first lines
head(res[order(res$X, res$Y),])
# X Y VAL
#1 25 25 900.8305
#11 25 75 5957.4972
#21 25 125 15680.8103
#31 25 175 30877.6696
#41 25 225 50688.4860
#51 25 275 75961.8558

将其与原始函数的结果进行比较:

test1 <- calcPerBin1(data, xbins, ybins)
head(test1)
# X Y MEAN
#1 25 25 900.8305
#2 25 75 5957.4972
#3 25 125 15680.8103
#4 25 175 30877.6696
#5 25 225 50688.4860
#6 25 275 75961.8558

基准测试:

fastbin <- function(data, xbins, ybins){
data$X <- cut(data$X, xbins, labels = rollmean(xbins, 2))
data$Y <- cut(data$Y, ybins, labels = rollmean(ybins, 2))
aggregate(VAL ~ X + Y, data, mean)
}

library(dplyr) # for faster aggregation
fastbin.dplyr <- function(data, xbins, ybins){
data %>%
mutate(X = cut(X, xbins, labels = rollmean(xbins, 2)),
Y = cut(Y, ybins, labels = rollmean(ybins, 2))) %>%
group_by(X, Y) %>%
summarise(Val = mean(VAL))
}

system.time(test1 <- calcPerBin1(data, xbins, ybins))
User System elapsed
3.47 0.12 3.59

system.time(res.fastbin <- fastbin(data, xbins, ybins))
User System elapsed
1.01 0.02 1.05

system.time(res.fastbin.dplyr <- fastbin.dplyr(data, xbins, ybins))
User System elapsed
0.06 0.00 0.06

关于R - 通过调节不同大小的序列来加速嵌套循环(矢量化?),我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/27420234/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com