r - 我可以使 for 循环中的 for 循环更有效率吗？-6ren

r - 我可以使 for 循环中的 for 循环更有效率吗？

转载作者：行者123 更新时间：2023-12-05 08:37:38

我的 R 脚本的目的是从数据框中提取一列数据点，然后对 10 个数据点进行线性回归，保存系数，然后继续接下来的 10 个数据点，直到所有数据列的点已被使用(编辑:为了更清楚，这里我的意思是对数据点 1-10、然后是 2-11、然后是 3-12、然后是 4-13 等进行回归)。然后对来自同一列的所有系数进行排序，并将 5 个最高值的平均值存储在新的数据框中。然后脚本继续到原始数据框的下一列并再次进行整个回归，将顶部系数的平均值存储在“系数”数据框中。代码如下:

    file_list <- list.files(path=getwd())

    df <- read_excel(file_list[1])
    name_vector <- colnames(df)
    regressors<-setdiff(names(df),"Time [min]")

n = 33  
coef2 <- data.frame()
coef3 <- data.frame()

#dat <- data.frame(df[[1]],df[[n]])

for (i in 1:length(regressors)) {
  dat <- data.frame(df[[1]], df[[(i + 1)]])
  for (i in 1:length(dat[[1]])) {
    temp_data <- dat[i:(i + 15),]
    linm <- lm(temp_data[[2]] ~ temp_data[[1]], data = temp_data)
    Inter <- summary(linm)$coefficients[1]
    Slope <- summary(linm)$coefficients[2]
    coef2 <-
      rbindlist(list(coef2, data.frame(Inter, Slope)), use.names = T)
  }
  mean <- coef2 %>%
    arrange(desc(Slope)) %>%
    slice(2:7)
  meanx <-  sapply(mean, FUN = mean)
  meanx <-
    data.frame(lapply(meanx, type.convert), stringsAsFactors = FALSE)
  coef3 <-  rbindlist(list(coef3, meanx), use.names = T)
  coef2 <- data.frame()
  z <- function(x)
    (meanx[[2]] * x + meanx[[1]])
  p <- qplot(dat[[1]],dat[[2]], data=dat, xlab="X-axis", ylab="Y-axis")+ylim(0,(max(dat[[2]], na.rm = TRUE)+100))
  b <- stat_function(fun=z)
  v <- print(p + b)
}

这里有一些示例数据:

Time    A   B   C
330 102 179 303
340 103 194 308
350 101 198 348
360 114 199 347
370 120 214 371
380 131 224 420
390 128 226 430
400 128 246 481
410 138 260 541
420 146 277 583
430 155 290 640
440 154 315 653

代码如我所愿，但执行起来真的很慢。我能以某种方式提高效率吗？我是 R 的新手，很多代码都是从各种来源拼接在一起的。

提前致谢-威廉

最佳答案

slider package专门帮助进行此类轧制操作。使用时对于数据框，slide() 函数为每个窗口应用一个函数数据中的行数。您可以使用它来插入模型拟合过程。

首先阅读示例数据:

library(tidyverse)

df <- read.table(header = TRUE, text = "
Time   A   B   C
 330 102 179 303
 340 103 194 308
 350 101 198 348
 360 114 199 347
 370 120 214 371
 380 131 224 420
 390 128 226 430
 400 128 246 481
 410 138 260 541
 420 146 277 583
 430 155 290 640
 440 154 315 653")

然后我们可以用 map_dfr() 替换输出结果的外层循环将结果合并到一个数据框中，并使用 slide_dfr() 进行内循环它的作用相同，但对于一个行窗口:

window_width <- 10
outcomes <- c("A", "B", "C")

# Estimate coefficients for each outcome
coefs <- map_dfr(set_names(outcomes), function(outcome) {
  # Fit model for each complete window in `df`
  slider::slide_dfr(df, function(data) {
    # Build model formula for outcome and fit
    f <- reformulate("Time", outcome)
    m <- lm(formula = f, data = data)
    set_names(coef(m), c("Inter", "Slope"))
  }, .after = window_width - 1, .complete = TRUE, .names_to = "Window")
}, .id = "Outcome")

coefs
#>   Outcome Window       Inter     Slope
#> 1       A      1   -67.30909 0.5024242
#> 2       A      2   -89.20000 0.5600000
#> 3       A      3   -87.54545 0.5545455
#> 4       B      1  -158.98182 1.0151515
#> 5       B      2  -191.86667 1.1030303
#> 6       B      3  -265.72727 1.2927273
#> 7       C      1  -749.07273 3.0993939
#> 8       C      2  -939.80000 3.6018182
#> 9       C      3 -1019.60000 3.8000000

请注意，我们保留了上方所有窗口的系数，而不仅仅是顶部5.结果总结最好保存在外面循环，如果你有足够的内存。这样，如果你决定你毕竟想要一个不同的总结，你不必重做昂贵的拟合模型的一部分。

有了整洁数据框中的系数，很容易总结:

n_top_slopes <- 5

mean_top_coefs <- coefs %>% 
  group_by(Outcome) %>% 
  slice_max(Slope, n = n_top_slopes) %>%
  summarise(across(c(Inter, Slope), mean))
#> `summarise()` ungrouping output (override with `.groups` argument)

mean_top_coefs
#> # A tibble: 3 x 3
#>   Outcome  Inter Slope
#>   <chr>    <dbl> <dbl>
#> 1 A        -81.4 0.539
#> 2 B       -206.  1.14 
#> 3 C       -903.  3.50

用数据绘制这些线性拟合的快速方法是使用 geom_abline():

df_long <- df %>% 
  pivot_longer(
    cols = all_of(outcomes),
    names_to = "Outcome",
    values_to = "Value"
  )

scatterplot <- function() {
  ggplot(df_long, aes(Time, Value)) +
    facet_wrap(~ Outcome) + geom_point()
}

scatterplot() +
  geom_abline(data = mean_top_coefs, aes(intercept = Inter, slope = Slope))

稍微复杂但更通用的方法是构建首先是一个新的预测数据框，然后用 geom_line() 绘制:

mean_top_fits <- df_long %>% 
  inner_join(mean_top_coefs, by = "Outcome") %>% 
  mutate(Value = Inter + Slope * Time)

scatterplot() + geom_line(data = mean_top_fits)

使用这种方法，我们可以，例如绘制每个单独窗口的拟合:

time_range <- function(window) {
  range(df$Time[seq(window, window + window_width - 1)])
}

# Calculate predicted lines for each window
window_fits <- coefs %>% 
  mutate(Time = map(Window, time_range)) %>% 
  unnest_longer(Time) %>% 
  mutate(Value = Inter + Slope * Time)

scatterplot() + geom_line(data = window_fits, aes(group = Window))