gpt4 book ai didi

游程长度序列按时间和ID

转载 作者:行者123 更新时间:2023-12-04 05:11:30 28 4
gpt4 key购买 nike

之前似乎没有解决过这个问题。

我想找到连续6个小时得分为1的科目数量。
尚未对每小时的受试者进行评分,因此,如果缺少一个小时,则这些小时将不连续,并且该6小时内的输出应为NA。
分配NA的原因是,我们不知道受试者在错过的时间得分如何。此问题可用于计算连续命中次数,但仅在受试者已参与的情况下才计算此问题。

我的数据框如下所示:

ID<-c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2)
hour<-c(1,2,3,7,8,9,10,11,12,17,18,19,1,2,3,4,5,6,8,9,15)
A<-c(0,1,0,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1)
df<-data.frame(ID,hour,A)

我尝试使用rle函数(我确信它是可能的),但我无法同时使用小时和ID来进行调节。
输出将是这样的:
ID<-c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2)
hour<-c(1,2,3,7,8,9,10,11,12,17,18,19,1,2,3,4,5,6,8,9,15)
A<-c(0,1,0,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1)
six<-c(NA,NA,NA,0,0,0,0,0,1,NA,NA,NA,0,0,0,0,0,1,NA,NA,NA)
df<-data.frame(ID,hour,A,six)

先感谢您。

我相信我提供的原始数据集太小,无法使解决方案更具通用性。
我只是用此数据集尝试了代码,发现这将导致错误的结果。
ID<-c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4)
hour<-c(1,2,3,7,8,9,10,12,13,17,18,19,1,2,3,4,5,6,8,9,15,1:23,27,28,29,30,31)
A<-c(0,1,0,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1,rep(1,28))
df<-data.frame(ID,hour,A)

对于新数据集,输出应为:
ID<-c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4)
hour<-c(1,2,3,7,8,9,10,12,13,17,18,19,1,2,3,4,5,6,8,9,15,1:23,27,28,29,30,31)
A<-c(0,1,0,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1,rep(1,28))
six<-c(NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,0,0,0,0,0,1,NA,NA,NA,0,0,0,0,0,1,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA)
df<-data.frame(ID,hour,A,six)

最佳答案

这是在tidyverse中使用更新的数据集的一种方法:

library(tidyverse)

df %>%
group_by(ID) %>%
expand(hour = seq(min(hour), max(hour))) %>%
left_join(df) %>%
mutate(rle = rep(rle(A)$lengths, times = rle(A)$lengths)) %>%
group_by(ID, rle) %>%
mutate(sum = cumsum(A),
six = ifelse(rle >= 6 & A == 1, 0, NA),
six = ifelse(sum == 6, 1, ifelse(sum > 6, NA, six))) %>%
filter(!is.na(A)) %>%
ungroup() %>%
select(ID, hour, A, six) %>%
as.data.frame() -> df_out2

检查请求的输出:
ID<-c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4)
hour<-c(1,2,3,7,8,9,10,12,13,17,18,19,1,2,3,4,5,6,8,9,15,1:23,27,28,29,30,31)
A<-c(0,1,0,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1,rep(1,28))
six<-c(NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,0,0,0,0,0,1,NA,NA,NA,0,0,0,0,0,1,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA)
df<-data.frame(ID,hour,A,six)

all.equal(df, df_out2)
#output
TRUE

旧答案:
df %>%
mutate(rle = rep(rle(A)$lengths, times = rle(A)$lengths)) %>%
group_by(ID, rle) %>%
mutate(sum = cumsum(A),
six = ifelse(rle >= 6 & A == 1, 0, NA),
six = ifelse(sum == 6, 1, ifelse(sum > 6, NA, six))) %>%
ungroup() %>%
select(ID, hour, A, six) %>%
as.data.frame() -> df_out2

让我们检查结果是否如要求的那样:
ID <- c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2)
hour <- c(1,2,3,7,8,9,10,11,12,17,18,19,1,2,3,4,5,6,8,9,15)
A <- c(0,1,0,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1)
six <- c(NA,NA,NA,0,0,0,0,0,1,NA,NA,NA,0,0,0,0,0,1,NA,NA,NA)
df1 <- data.frame(ID, hour, A, six)

df1是请求的输出
all.equal(df1, df_out2)
#output
TRUE

一些基准测试:
library(microbenchmark)
library(data.table)

akrun <- function(df){
setDT(df)[, grp := rleid(A)][, Anew := A *((hour - shift(hour, fill = hour[1])) ==1), grp
][, sixnew :=if(sum(A)>=5) rep(c(0, 1), c(.N-1, 1)) else NA_real_,.(rleid(Anew), grp)]
i1 <- df[, .I[which(is.na(sixnew) & shift(sixnew == 0, type = 'lead'))], grp]$V1
df[i1, sixnew := 0][, c("Anew", "grp") := NULL][]
}

missuse <- function(df){
df %>%
mutate(rle = rep(rle(A)$lengths, times = rle(A)$lengths)) %>%
group_by(ID, rle) %>%
mutate(sum = cumsum(A),
six = ifelse(rle >= 6 & A == 1, 0, NA),
six = ifelse(sum == 6, 1, ifelse(sum > 6, NA, six))) %>%
ungroup() %>%
select(ID, hour, A, six)
}


Mike <- function(df){
ave(df$A,
cumsum(!(df$hour == shift(df$hour, fill = 0) + 1)),
FUN = function(x) {
if(all(x==1) & length(x) >= 6) return(c(rep(0, length(x) - 1), 1))
else return(rep(NA, length(x)))})
}

microbenchmark(Mike(df),
akrun(df),
missuse(df))

#output
Unit: microseconds
expr min lq mean median uq max neval
Mike(df) 491.291 575.7115 704.2213 597.7155 629.0295 9578.684 100
akrun(df) 6568.313 6725.5175 7867.4059 6843.5790 7279.2240 69790.755 100
missuse(df) 11042.822 11321.0505 12434.8671 11512.3200 12616.3485 43170.935 100

迈克·H(Mike H.)

关于游程长度序列按时间和ID,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/48788059/

28 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com