gpt4 book ai didi

r - 如何 rbind()/dplyr::bind_rows()/data.table::rbindlist() 包含数据框列的数据框?

转载 作者:行者123 更新时间:2023-12-02 01:35:00 26 4
gpt4 key购买 nike

base R、dplyr 和 data.table 无法绑定(bind)包含数据框列的数据框:

# Two one-row example data frames; column `b` of each is itself a data frame.
x <- data.frame(a = 1)
x[["b"]] <- data.frame(z = 2)
y <- data.frame(a = 3)
y[["b"]] <- data.frame(z = 4)

# Try to row-bind x and y (each has a data frame column `b`) with the three
# standard binding functions; the `#>` lines are the captured output.
# base and dplyr fail
rbind(x, y)
#> Warning: non-unique value when setting 'row.names': '1'
#> Error in `.rowNamesDF<-`(x, value = value): duplicate 'row.names' are not allowed
dplyr::bind_rows(x,y)
#> Error: Argument 2 can't be a list containing data frames

# data.table gives a result that doesn't make much sense to me
# (column `b` is reported as 1 obs. of 2 unnamed variables rather than
# 2 obs. of a single variable `z`)
str(data.table::rbindlist(list(x,y)))
#> Warning in setDT(ans): Some columns are a multi-column type (such as a matrix
#> column): [2]. setDT will retain these columns as-is but subsequent operations
#> like grouping and joining may fail. Please consider as.data.table() instead
#> which will create a new column for each embedded column.
#> Classes 'data.table' and 'data.frame': 2 obs. of 2 variables:
#> $ a: num 1 3
#> $ b:'data.frame': 1 obs. of 2 variables:
#> ..$ : num 2
#> ..$ : num 4
#> - attr(*, ".internal.selfref")=<externalptr>

由 reprex 包 (v0.3.0) 于 2020 年 1 月 3 日创建

我的预期输出是数据框列也一并按行绑定(rbind),因此我们最终会得到类似于下面的 res 的内容:

# Expected result: the regular column `a` and the nested data frame column `b`
# are both bound row-wise, so `b$z` holds 2 (from x) followed by 4 (from y).
res <- data.frame(a = c(1, 3))
res$b <- data.frame(z = c(2, 4))
res
#> a z
#> 1 1 2
#> 2 3 4
str(res)
#> 'data.frame': 2 obs. of 2 variables:
#> $ a: num 1 3
#> $ b:'data.frame': 2 obs. of 1 variable:
#> ..$ z: num 2 4

我该如何解决这个问题?

最佳答案

我们可以将数据框列与常规列分开绑定(bind),这里有 3 个类似的解决方案,包装了问题中提到的 3 个函数:

基础R

#' Row-bind data frames that contain data frame columns (base R).
#'
#' Regular columns are bound with `rbind()`. Every data frame column is bound
#' separately through a recursive call and then attached to the result.
#'
#' @param ... Data frames to bind.
#' @return The bound data frame, with each data frame column bound row-wise.
#' @details Stops with an error if a column is a data frame in one input but
#'   a non-data-frame column in another.
rbind_fixed <- function(...) {
  dfs <- list(...)
  # get all names of data.frame columns
  # vapply() always returns a logical vector, even for a zero-column data
  # frame (sapply() would return list() there and break the subsetting)
  get_df_col_ind <- function(df) {
    vapply(df, is.data.frame, logical(1), USE.NAMES = FALSE)
  }
  df_col_names_list <- lapply(dfs, function(df) names(df)[get_df_col_ind(df)])
  df_col_names <- unique(unlist(df_col_names_list, use.names = FALSE))
  # fail if these are not consistently data frames in all arguments
  for (df_col_name in df_col_names) {
    for (df in dfs) {
      col <- df[[df_col_name]]
      if (!is.null(col) && !is.data.frame(col)) {
        stop(
          df_col_name, " is not consistently a data frame column",
          call. = FALSE
        )
      }
    }
  }
  # bind data frames, except for data frame columns
  dfs_regular <- lapply(dfs, function(df) {
    df[setdiff(names(df), df_col_names)]
  })
  res <- do.call(rbind, dfs_regular)
  # bind data frame columns separately and add them to the result
  for (df_col_name in df_col_names) {
    subdfs <- lapply(dfs, function(df) {
      if (df_col_name %in% names(df)) {
        df[[df_col_name]]
      } else {
        # placeholder with the right number of rows and no columns;
        # seq_len() stays empty for zero rows, unlike seq.int()
        data.frame(row.names = seq_len(nrow(df)))
      }
    })
    # recursive to be robust in case of deep nested data frames
    res[[df_col_name]] <- do.call(rbind_fixed, subdfs)
  }
  res
}
# Apply the base R wrapper to the example data frames x and y; the nested
# column z is now bound row-wise together with a.
rbind_fixed(x, y)
#> a z
#> 1 1 2
#> 2 3 4

dplyr

#' Row-bind data frames that contain data frame columns (dplyr).
#'
#' Regular columns are bound with `dplyr::bind_rows()`. Every data frame
#' column is bound separately through a recursive call and then attached to
#' the result.
#'
#' @param ... Data frames to bind; a list can be spliced in with `!!!`.
#' @return The bound data frame, with each data frame column bound row-wise.
#' @details Stops with an error if a column is a data frame in one input but
#'   a non-data-frame column in another.
bind_rows_fixed <- function(...) {
  # use list2() so we can use `!!!`, as we lose the "autosplice" feature of
  # bind_rows
  dfs <- rlang::list2(...)
  # get all names of data.frame columns
  # vapply() always returns a logical vector, even for a zero-column input
  # (sapply() would return list() there and break the subsetting)
  get_df_col_ind <- function(df) {
    vapply(df, is.data.frame, logical(1), USE.NAMES = FALSE)
  }
  df_col_names_list <- lapply(dfs, function(df) names(df)[get_df_col_ind(df)])
  df_col_names <- unique(unlist(df_col_names_list, use.names = FALSE))
  # fail if these are not consistently data frames in all arguments
  for (df_col_name in df_col_names) {
    for (df in dfs) {
      col <- df[[df_col_name]]
      if (!is.null(col) && !is.data.frame(col)) {
        stop(
          df_col_name, " is not consistently a data frame column",
          call. = FALSE
        )
      }
    }
  }
  # bind data frames, except for data frame columns
  dfs_regular <- lapply(dfs, function(df) {
    df[setdiff(names(df), df_col_names)]
  })
  res <- dplyr::bind_rows(dfs_regular)
  # bind data frame columns separately and add them to the result
  for (df_col_name in df_col_names) {
    subdfs <- lapply(dfs, function(df) {
      if (df_col_name %in% names(df)) {
        df[[df_col_name]]
      } else {
        # zero-column placeholder with the right number of rows; namespaced
        # so the function works without tibble/dplyr being attached
        dplyr::tibble(.rows = nrow(df))
      }
    })

    # recursive to be robust in case of deep nested data frames
    res[[df_col_name]] <- bind_rows_fixed(!!!subdfs)
  }
  res
}
# Apply the dplyr wrapper to the example data frames x and y; the nested
# column z is now bound row-wise together with a.
bind_rows_fixed(x,y)
#> a z
#> 1 1 2
#> 2 3 4

data.table

#' Row-bind data frames that contain data frame columns (data.table).
#'
#' Regular columns are bound with `data.table::rbindlist()`. Every data frame
#' column is bound separately through a recursive call and then attached to
#' the result.
#'
#' @param l A list of data frames to bind.
#' @return The object returned by `data.table::rbindlist()` for the regular
#'   columns, with each data frame column added after being bound row-wise.
#' @details Stops with an error if a column is a data frame in one input but
#'   a non-data-frame column in another.
rbindlist_fixed <- function(l) {
  dfs <- l
  # get all names of data.frame columns
  # vapply() always returns a logical vector, even for a zero-column data
  # frame (sapply() would return list() there and break the subsetting)
  get_df_col_ind <- function(df) {
    vapply(df, is.data.frame, logical(1), USE.NAMES = FALSE)
  }
  df_col_names_list <- lapply(dfs, function(df) names(df)[get_df_col_ind(df)])
  df_col_names <- unique(unlist(df_col_names_list, use.names = FALSE))
  # fail if these are not consistently data frames in all arguments
  for (df_col_name in df_col_names) {
    for (df in dfs) {
      col <- df[[df_col_name]]
      if (!is.null(col) && !is.data.frame(col)) {
        stop(
          df_col_name, " is not consistently a data frame column",
          call. = FALSE
        )
      }
    }
  }
  # bind data frames, except for data frame columns
  dfs_regular <- lapply(dfs, function(df) {
    df[setdiff(names(df), df_col_names)]
  })
  res <- data.table::rbindlist(dfs_regular)
  # bind data frame columns separately and add them to the result
  for (df_col_name in df_col_names) {
    subdfs <- lapply(dfs, function(df) {
      if (df_col_name %in% names(df)) {
        df[[df_col_name]]
      } else {
        # placeholder with the right number of rows and no columns;
        # seq_len() stays empty for zero rows, unlike seq.int()
        data.frame(row.names = seq_len(nrow(df)))
      }
    })
    # recursive to be robust in case of deep nested data frames
    res[[df_col_name]] <- rbindlist_fixed(subdfs)
  }
  res
}
# Apply the data.table wrapper to the example data frames x and y. The printed
# table shows column b as <multi-column>; str() shows it holds z = 2, 4.
dt <- rbindlist_fixed(list(x,y))
dt
#> a b
#> 1: 1 <multi-column>
#> 2: 3 <multi-column>
str(dt)
#> Classes 'data.table' and 'data.frame': 2 obs. of 2 variables:
#> $ a: num 1 3
#> $ b:Classes 'data.table' and 'data.frame': 2 obs. of 1 variable:
#> ..$ z: num 2 4
#> ..- attr(*, ".internal.selfref")=<externalptr>
#> - attr(*, ".internal.selfref")=<externalptr>

关于r - 如何 rbind()/dplyr::bind_rows()/data.table::rbindlist() 包含数据框列的数据框?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/59571318/

26 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com