gpt4 book ai didi

r - 在 R 中合并大数据集并标记不匹配

转载 作者:行者123 更新时间:2023-12-04 05:16:23 24 4
gpt4 key购买 nike

我试图连续加入几个数据集,并标记第一个数据集中没有在后续数据集中找到匹配项的观察结果。下面是一个例子,我模拟了原始数据集加上三个要加入的附加数据。当前的代码做了我想要的,但效率很低。对于大型数据集,可能需要几天时间。是否可以使用 apply 或其他功能完成此任务?

#Toy datasets: x, y, z and w

#dataset X
id <- c(1:10, 1:100)
X1 <- rnorm(110, mean = 0, sd = 1)
year <- c("2004","2005","2006","2001","2002")
year <- rep(year, 22)

month = c("Jul","Aug","Sep","Oct","Nov","Dec","Jan","Feb","Mar","Apr")
month <- rep(month, 11)

x <- data.frame(id, X1, month, year)

#dataset Y
id2 <- c(1:10, 41:110)
Y1 <- rnorm(80, mean = 0 , sd = 1)
year <- c("2004","2005","2006","2001")
year <- rep(year, 20)

month = c("Jul","Aug","Sep","Oct","Nov","Dec","Jan","Feb","Mar","Apr")
month <- rep(month, 8)

y <- data.frame(id2,Y1, year,month)


#dataset z
id3 = c(1:60, 401:10000)
Z1 = rpois(9660, 10)
year = c('2004','2005','2006','2002')
year = rep(year, 2415)

month = c("Jul","Aug","Sep","Oct","Nov","Dec","Jan","Feb","Mar","Apr")
month <- rep(month, 966)

z = data.frame(id3,Z1,year,month)

#dataset w
id4 = c(1:300, 20:29)
W1 = rnorm(310, 20, 36)
year = c('2004','2005','2006','2000','2002')
year = rep(year, 62)

month = c("Jul","Aug","Sep","Oct","Nov","Dec","Jan","Feb","Mar","Apr")
month <- rep(month, 31)

w = data.frame(id4, W1, year, month)


x$id2 = x$yflag = x$zflag = x$wflag = rep(NA, nrow(x))


y.index = rep(NA, nrow(x))
z.index = rep(NA, nrow(x))
w.index = rep(NA, nrow(x))

for(i in 1:nrow(x)) {

#compare to dataset y, insert yflag == 1 if the same ID, month, year is in x, otherwise 0
y.index = which(as.character(y$id2) == as.character(x$id[i])
& as.character(y$year) == as.character(x$year[i])
& as.character(y$month) == as.character(x$month[i]))
x$yflag[i] = ifelse(length(y.index==1), 1, 0)
x$id2[i] = ifelse(length(y.index) == 1, y$id2[y.index], x$id[i])

## compare to dataset z, insert zflag == 1 if the same ID, month, year is in x, otherwise 0
z.index <- which(as.character(z$id3) == as.character(x$id[i])
& as.character(z$month) == as.character(x$month[i])
& as.character(z$year) == as.character(x$year[i]))
x$zflag[i] <- ifelse(length(z.index == 1), 1, 0)


## compare to dataset w, insert wflag == 1 if the same ID, month, year is in x, otherwise 0
w.index <- which(as.character(w$id4) == as.character(x$id[i])
& as.character(w$month) == as.character(x$month[i])
& as.character(w$year) == as.character(x$year[i]))
x$wflag[i] <- ifelse(length(w.index == 1), 1, 0)
}

print(x)

最佳答案

许多解决方案之一:
创建所有四个后 data.frames ,

x$match.idx <- do.call(paste, c(x[,c("id", "month", "year")], sep=":"))
y$match.idx <- do.call(paste, c(y[,c("id2", "month", "year")], sep=":"))
z$match.idx <- do.call(paste, c(z[,c("id3", "month", "year")], sep=":"))
w$match.idx <- do.call(paste, c(w[,c("id4", "month", "year")], sep=":"))

xy.m <- match(x$match.idx, y$match.idx)
xz.m <- match(x$match.idx, z$match.idx)
xw.m <- match(x$match.idx, w$match.idx)
x$yflag <- x$zflag <- x$wflag <- 0
x$yflag[which(!is.na(xy.m))] <- 1
x$zflag[which(!is.na(xz.m))] <- 1
x$wflag[which(!is.na(xw.m))] <- 1

x <- subset(x, select=-c(match.idx))
> head(x)

id X1 month year wflag zflag yflag
1 1 -0.2470932 Jul 2004 1 1 1
2 2 0.2262816 Aug 2005 1 1 1
3 3 0.8473442 Sep 2006 1 1 1
4 4 0.9338628 Oct 2001 0 0 1
5 5 -0.1385540 Nov 2002 1 0 0
6 6 0.7825385 Dec 2004 1 0 0

关于r - 在 R 中合并大数据集并标记不匹配,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/14215219/

24 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com