gpt4 book ai didi

R 使用 apply() 或 lapply() 等加速 for 循环

转载 作者:行者123 更新时间:2023-12-01 07:53:13 25 4
gpt4 key购买 nike

我编写了一个特殊的“插补”函数,它根据特定的列名用 mean() 或 mode() 替换缺少 (NA) 值的列值。

输入数据帧是 400,000+ 行并且它的 vert 很慢,我如何使用 lapply() 或 apply() 加速插补部分。

这是我想用 START OPTIMIZE 和 END OPTIMIZE 优化的功能,标记部分:

specialImpute <- function(inputDF) 
{

discoveredDf <- data.frame(STUDYID_SUBJID=character(), stringsAsFactors=FALSE)
dfList <- list()
counter = 1;

Whilecounter = nrow(inputDF)
#for testing just do 10 iterations,i = 10;

while (Whilecounter >0)
{

studyid_subjid=inputDF[Whilecounter,"STUDYID_SUBJID"]

vect = which(discoveredDf$STUDYID_SUBJID == studyid_subjid)
#was discovered and subset before
if (!is.null(vect))
{
#not subset before
if (length(vect)<1)
{
#subset the dataframe base on regex inputDF$STUDYID_SUBJID
df <- subset(inputDF, regexpr(studyid_subjid, inputDF$STUDYID_SUBJID) > 0)

#START OPTIMIZE
for (i in nrow(df))
{
#impute , add column mean & add to list

#apply(df[,c("y1","y2","y3","etc..")],2,function(x){x[is.na(x)] =mean(x, na.rm=TRUE)})

if (is.na(df[i,"y1"])) {df[i,"y1"] = mean(df[,"y1"], na.rm = TRUE)}
if (is.na(df[i,"y2"])) {df[i,"y2"] =mean(df[,"y2"], na.rm = TRUE)}
if (is.na(df[i,"y3"])) {df[i,"y3"] =mean(df[,"y3"], na.rm = TRUE)}
#impute using mean for CONTINUOUS variables
if (is.na(df[i,"COVAR_CONTINUOUS_2"])) {df[i,"COVAR_CONTINUOUS_2"] =mean(df[,"COVAR_CONTINUOUS_2"], na.rm = TRUE)}
if (is.na(df[i,"COVAR_CONTINUOUS_3"])) {df[i,"COVAR_CONTINUOUS_3"] =mean(df[,"COVAR_CONTINUOUS_3"], na.rm = TRUE)}
if (is.na(df[i,"COVAR_CONTINUOUS_4"])) {df[i,"COVAR_CONTINUOUS_4"] =mean(df[,"COVAR_CONTINUOUS_4"], na.rm = TRUE)}
if (is.na(df[i,"COVAR_CONTINUOUS_5"])) {df[i,"COVAR_CONTINUOUS_5"] =mean(df[,"COVAR_CONTINUOUS_5"], na.rm = TRUE)}
if (is.na(df[i,"COVAR_CONTINUOUS_6"])) {df[i,"COVAR_CONTINUOUS_6"] =mean(df[,"COVAR_CONTINUOUS_6"], na.rm = TRUE)}
if (is.na(df[i,"COVAR_CONTINUOUS_7"])) {df[i,"COVAR_CONTINUOUS_7"] =mean(df[,"COVAR_CONTINUOUS_7"], na.rm = TRUE)}
if (is.na(df[i,"COVAR_CONTINUOUS_10"])) {df[i,"COVAR_CONTINUOUS_10"] =mean(df[,"COVAR_CONTINUOUS_10"], na.rm = TRUE)}
if (is.na(df[i,"COVAR_CONTINUOUS_14"])) {df[i,"COVAR_CONTINUOUS_14"] =mean(df[,"COVAR_CONTINUOUS_14"], na.rm = TRUE)}
if (is.na(df[i,"COVAR_CONTINUOUS_30"])) {df[i,"COVAR_CONTINUOUS_30"] =mean(df[,"COVAR_CONTINUOUS_30"], na.rm = TRUE)}
#impute using mode ordinal & nominal values
if (is.na(df[i,"COVAR_ORDINAL_1"])) {df[i,"COVAR_ORDINAL_1"] =Mode(df[,"COVAR_ORDINAL_1"])}
if (is.na(df[i,"COVAR_ORDINAL_2"])) {df[i,"COVAR_ORDINAL_2"] =Mode(df[,"COVAR_ORDINAL_2"])}
if (is.na(df[i,"COVAR_ORDINAL_3"])) {df[i,"COVAR_ORDINAL_3"] =Mode(df[,"COVAR_ORDINAL_3"])}
if (is.na(df[i,"COVAR_ORDINAL_4"])) {df[i,"COVAR_ORDINAL_4"] =Mode(df[,"COVAR_ORDINAL_4"])}
#nominal
if (is.na(df[i,"COVAR_NOMINAL_1"])) {df[i,"COVAR_NOMINAL_1"] =Mode(df[,"COVAR_NOMINAL_1"])}
if (is.na(df[i,"COVAR_NOMINAL_2"])) {df[i,"COVAR_NOMINAL_2"] =Mode(df[,"COVAR_NOMINAL_2"])}
if (is.na(df[i,"COVAR_NOMINAL_3"])) {df[i,"COVAR_NOMINAL_3"] =Mode(df[,"COVAR_NOMINAL_3"])}
if (is.na(df[i,"COVAR_NOMINAL_4"])) {df[i,"COVAR_NOMINAL_4"] =Mode(df[,"COVAR_NOMINAL_4"])}
if (is.na(df[i,"COVAR_NOMINAL_5"])) {df[i,"COVAR_NOMINAL_5"] =Mode(df[,"COVAR_NOMINAL_5"])}
if (is.na(df[i,"COVAR_NOMINAL_6"])) {df[i,"COVAR_NOMINAL_6"] =Mode(df[,"COVAR_NOMINAL_6"])}
if (is.na(df[i,"COVAR_NOMINAL_7"])) {df[i,"COVAR_NOMINAL_7"] =Mode(df[,"COVAR_NOMINAL_7"])}
if (is.na(df[i,"COVAR_NOMINAL_8"])) {df[i,"COVAR_NOMINAL_8"] =Mode(df[,"COVAR_NOMINAL_8"])}

}#for
#END OPTIMIZE

dfList[[counter]] <- df
#add to discoveredDf since already substed
discoveredDf[nrow(discoveredDf)+1,]<- c(studyid_subjid)
counter = counter +1;
#for debugging to check progress
if (counter %% 100 == 0)
{
print(counter)
}
}
}


Whilecounter = Whilecounter -1;
}#end while
return (dfList)

}

谢谢

最佳答案

只要您在每个 上使用矢量化函数,很可能可以通过多种方式提高性能。目前,您正在遍历每一行,然后分别处理每一列,这确实会减慢您的速度。另一个改进是概括代码,这样您就不必为每个变量都输入一个新行。在我将在下面给出的示例中,这是因为连续变量是数字变量,而分类变量是因子。

要直接得到答案,您可以用以下代码替换要优化的代码(尽管修复了变量名称),前提是您的数字变量是数字变量,而序数/分类变量不是(例如,因子):

impute <- function(x) {
if (is.numeric(x)) { # If numeric, impute with mean
x[is.na(x)] <- mean(x, na.rm = TRUE)
} else { # mode otherwise
x[is.na(x)] <- names(which.max(table(x)))
}
x
}

# Correct cols_to_impute with names of your variables to be imputed
# e.g., c("COVAR_CONTINUOUS_2", "COVAR_NOMINAL_3", ...)
cols_to_impute <- names(df) %in% c("names", "of", "columns")
library(purrr)
df[, cols_to_impute] <- dmap(df[, cols_to_impute], impute)

以下是五种方法的详细比较:
  • 使用 for 迭代行的原始方法;然后分别处理每一列。
  • 使用 for 循环。
  • 使用 lapply()
  • 使用 sapply()
  • 使用 dmap() 包中的 purrr

  • 新方法都通过 列对数据框 进行迭代,并使用称为 impute 的矢量化函数,该函数使用均值(如果是数字)或众数(否则)来估算向量中的缺失值。否则,它们的差异相对较小(除了您将看到的 sapply()),但检查起来很有趣。

    以下是我们将使用的实用函数:
    # Function to simulate a data frame of numeric and factor variables with
    # missing values and `n` rows
    create_dat <- function(n) {
    set.seed(13)
    data.frame(
    con_1 = sample(c(10:20, NA), n, replace = TRUE), # continuous w/ missing
    con_2 = sample(c(20:30, NA), n, replace = TRUE), # continuous w/ missing
    ord_1 = sample(c(letters, NA), n, replace = TRUE), # ordinal w/ missing
    ord_2 = sample(c(letters, NA), n, replace = TRUE) # ordinal w/ missing
    )
    }

    # Function that imputes missing values in a vector with mean (if numeric) or
    # mode (otherwise)
    impute <- function(x) {
    if (is.numeric(x)) { # If numeric, impute with mean
    x[is.na(x)] <- mean(x, na.rm = TRUE)
    } else { # mode otherwise
    x[is.na(x)] <- names(which.max(table(x)))
    }
    x
    }

    现在,每种方法的包装函数:
    # Original approach
    func0 <- function(d) {
    for (i in 1:nrow(d)) {
    if (is.na(d[i, "con_1"])) d[i,"con_1"] <- mean(d[,"con_1"], na.rm = TRUE)

    if (is.na(d[i, "con_2"])) d[i,"con_2"] <- mean(d[,"con_2"], na.rm = TRUE)

    if (is.na(d[i,"ord_1"])) d[i,"ord_1"] <- names(which.max(table(d[,"ord_1"])))

    if (is.na(d[i,"ord_2"])) d[i,"ord_2"] <- names(which.max(table(d[,"ord_2"])))
    }
    return(d)
    }

    # for loop operates directly on d
    func1 <- function(d) {
    for(i in seq_along(d)) {
    d[[i]] <- impute(d[[i]])
    }
    return(d)
    }

    # Use lapply()
    func2 <- function(d) {
    lapply(d, function(col) {
    impute(col)
    })
    }

    # Use sapply()
    func3 <- function(d) {
    sapply(d, function(col) {
    impute(col)
    })
    }

    # Use purrr::dmap()
    func4 <- function(d) {
    purrr::dmap(d, impute)
    }

    现在,我们将这些方法的性能与 n 范围从 10 到 100(非常小)进行比较:
    library(microbenchmark)
    ns <- seq(10, 100, by = 10)
    times <- sapply(ns, function(n) {
    dat <- create_dat(n)
    op <- microbenchmark(
    ORIGINAL = func0(dat),
    FOR_LOOP = func1(dat),
    LAPPLY = func2(dat),
    SAPPLY = func3(dat),
    DMAP = func4(dat)
    )
    by(op$time, op$expr, function(t) mean(t) / 1000)
    })
    times <- t(times)
    times <- as.data.frame(cbind(times, n = ns))

    # Plot the results
    library(tidyr)
    library(ggplot2)

    times <- gather(times, -n, key = "fun", value = "time")
    pd <- position_dodge(width = 0.2)
    ggplot(times, aes(x = n, y = time, group = fun, color = fun)) +
    geom_point(position = pd) +
    geom_line(position = pd) +
    theme_bw()

    enter image description here

    很明显,原始方法比在每列上使用矢量化函数 impute 的新方法慢得多。新人之间的区别是什么?让我们增加我们的样本量来检查:
    ns <- seq(5000, 50000, by = 5000)
    times <- sapply(ns, function(n) {
    dat <- create_dat(n)
    op <- microbenchmark(
    FOR_LOOP = func1(dat),
    LAPPLY = func2(dat),
    SAPPLY = func3(dat),
    DMAP = func4(dat)
    )
    by(op$time, op$expr, function(t) mean(t) / 1000)
    })
    times <- t(times)
    times <- as.data.frame(cbind(times, n = ns))
    times <- gather(times, -n, key = "fun", value = "time")
    pd <- position_dodge(width = 0.2)
    ggplot(times, aes(x = n, y = time, group = fun, color = fun)) +
    geom_point(position = pd) +
    geom_line(position = pd) +
    theme_bw()

    enter image description here

    看起来 sapply() 不太好(正如@Martin 指出的那样)。这是因为 sapply() 正在做额外的工作来将我们的数据变成矩阵形状(我们不需要)。如果您自己在没有 sapply() 的情况下运行它,您会发现其余的方法都非常具有可比性。

    所以主要的性能改进是在每一列上使用向量化函数。我建议在开始时使用 dmap,因为我通常喜欢函数风格和 purrr 包,但是您可以轻松地替换您喜欢的任何方法。

    除此之外,非常感谢@Martin 的非常有用的评论,让我改进了这个答案!

    关于R 使用 apply() 或 lapply() 等加速 for 循环,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/38649411/

    25 4 0
    Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
    广告合作:1813099741@qq.com 6ren.com