gpt4 book ai didi

r - 多个列表上多个变量的统计检验(tibble)

转载 作者:行者123 更新时间:2023-12-05 02:39:13 32 4
gpt4 key购买 nike

为了简化我的数据分析,我需要针对不同的变量和不同的数据组处理不同的统计测试(在该示例中为 shapiro 测试)。目的是不要写 150 次相同的代码。为此,我从我的数据集中创建了一个 tibble,其中每个列表对应一组数据。

数据集:

dput(head(PdataLaKo,25)) 
structure(list(ctd_file = c("2018-08-02T162038 SBE0251090", "2018-08-02T162038 SBE0251090",
"2018-08-02T162038 SBE0251090", "2018-08-02T162038 SBE0251090",
"2018-08-02T162038 SBE0251090", "2018-08-02T162038 SBE0251090",
"2018-08-02T162038 SBE0251090", "2018-08-02T162038 SBE0251090",
"2018-08-02T162038 SBE0251090", "2018-08-02T162038 SBE0251090",
"2018-08-02T175518 SBE0251090", "2018-08-02T175518 SBE0251090",
"2018-08-02T175518 SBE0251090", "2018-08-02T175518 SBE0251090",
"2018-08-02T175518 SBE0251090", "2018-08-02T175518 SBE0251090",
"2018-08-02T175518 SBE0251090", "2018-08-02T175518 SBE0251090",
"2018-08-02T175518 SBE0251090", "2018-08-02T175518 SBE0251090",
"2018-08-03T090018 SBE0251090", "2018-08-03T090018 SBE0251090",
"2018-08-03T090018 SBE0251090", "2018-08-03T090018 SBE0251090",
"2018-08-03T090018 SBE0251090"), station = c("1", "1", "1", "1",
"1", "1", "1", "1", "1", "1", "2", "2", "2", "2", "2", "2", "2",
"2", "2", "2", "3", "3", "3", "3", "3"), month = c(8L, 8L, 8L,
8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L,
8L, 8L, 8L, 8L, 8L, 8L), day = c(2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L,
3L, 3L), year = c(2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L,
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L,
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L
), time = c(16.38, 16.38, 16.38, 16.38, 16.38, 16.38, 16.38,
16.38, 16.38, 16.38, 17.57, 17.57, 17.57, 17.57, 17.57, 17.57,
17.57, 17.57, 17.57, 17.57, 9.05, 9.05, 9.05, 9.05, 9.05), LAT = c("69.27166667",
"69.27166667", "69.27166667", "69.27166667", "69.27166667", "69.27166667",
"69.27166667", "69.27166667", "69.27166667", "69.27166667", "69.25386667",
"69.25386667", "69.25386667", "69.25386667", "69.25386667", "69.25386667",
"69.25386667", "69.25386667", "69.25386667", "69.25386667", "69.23883333",
"69.23883333", "69.23883333", "69.23883333", "69.23883333"),
LONG = c("-25.15166667", "-25.15166667", "-25.15166667",
"-25.15166667", "-25.15166667", "-25.15166667", "-25.15166667",
"-25.15166667", "-25.15166667", "-25.15166667", "-25.0637",
"-25.0637", "-25.0637", "-25.0637", "-25.0637", "-25.0637",
"-25.0637", "-25.0637", "-25.0637", "-25.0637", "-24.9915",
"-24.9915", "-24.9915", "-24.9915", "-24.9915"), bottom_depth = c("226",
"226", "226", "226", "226", "226", "226", "226", "226", "226",
"229", "229", "229", "229", "229", "229", "229", "229", "229",
"229", "255", "255", "255", "255", "255"), timeS = c(223.5,
227.236, 228.558, 228.916, 229.336, 229.749, 230.119, 230.49,
228.871, 231.587, 242.497, 241.334, 241.591, 241.869, 242.205,
242.601, 249.68, 249.758, 250.013, 250.3, 266.6, 267.148,
267.57, 267.899, 268.245), pressure = c(0.607, 0.707, 0.808,
0.909, 1.01, 1.112, 1.212, 1.313, 1.516, 1.616, 0.503, 0.101,
0.202, 0.303, 0.404, 0.505, 0.606, 0.707, 0.808, 0.909, 0.303,
0.404, 0.505, 0.606, 0.707), depth = c(0.6, 0.7, 0.8, 0.9,
1, 1.1, 1.2, 1.3, 1.5, 1.6, 0.5, 0.1, 0.2, 0.3, 0.4, 0.5,
0.6, 0.7, 0.8, 0.9, 0.3, 0.4, 0.5, 0.6, 0.7), salinity = c(30.195,
30.1989, 30.199, 30.1737, 30.1894, 30.1824, 30.1546, 30.2275,
29.9942, 30.2885, 28.2285, 28.1332, 28.2182, 28.3279, 28.3815,
28.4242, 29.0786, 29.1525, 29.2647, 29.3672, 32.484, 32.3028,
31.885, 31.3775, 30.978), temperature = c(2.6459, 2.6477,
2.6263, 2.5913, 2.6188, 2.608, 2.6106, 2.6746, 2.6973, 2.6837,
1.9945, 2.0299, 1.9315, 1.7799, 1.734, 1.7082, 1.9263, 1.928,
1.9184, 1.8562, 1.1441, 1.073, 1.052, 1.0518, 1.0603), oxygen = c(352.891,
352.339, 352.12, 352.03, 351.884, 351.702, 351.505, 351.296,
347.288, 351.001, 345.827, 345.879, 345.866, 345.841, 345.782,
345.65, 346.849, 346.78, 346.718, 346.609, 340.888, 340.026,
339.348, 338.819, 338.245), oxygen2 = c(102.182, 102.228,
102.257, 102.281, 102.313, 102.347, 102.383, 102.427, 101.491,
102.644, 97.258, 97.311, 97.301, 97.29, 97.275, 97.253, 98.285,
98.293, 98.325, 98.357, 97.857, 97.839, 97.829, 97.812, 97.788
), fluorescence = c(0.17892, 0.17232, 0.17139, 0.17278, 0.17479,
0.17701, 0.1792, 0.18158, 0.18525, 0.18909, 0.10588, 0.10647,
0.10697, 0.10818, 0.10978, 0.11157, 0.1479, 0.14969, 0.15256,
0.15607, 0.30833, 0.31088, 0.31321, 0.31532, 0.31758), turbidity = c(0.164,
0.164, 0.164, 0.164, 0.163, 0.163, 0.163, 0.163, 0.161, 0.163,
0.165, 0.165, 0.165, 0.165, 0.165, 0.164, 0.159, 0.16, 0.16,
0.16, 4.742, 4.747, 4.751, 4.753, 4.755), conductivity = c(2.742475,
2.743057, 2.74087, 2.73591, 2.740138, 2.738197, 2.736985,
2.747572, 2.72913, 2.752912, 2.524555, 2.525942, 2.524426,
2.523692, 2.525012, 2.526692, 2.596153, 2.602249, 2.609861,
2.613963, 2.806078, 2.786396, 2.752134, 2.712379, 2.682113
), pH = c(8.094, 8.098, 8.099, 8.099, 8.099, 8.099, 8.099,
8.099, 8.014, 8.099, 8.16, 8.16, 8.16, 8.16, 8.159, 8.159,
8.149, 8.148, 8.147, 8.146, 8.121, 8.12, 8.12, 8.119, 8.118
), par = c(2627.7, 2627.7, 2627.7, 2627.7, 2627.7, 2627.7,
2627.7, 2627.7, 2600, 2627.7, 2627.8, 2627.8, 2627.8, 2627.8,
2627.8, 2627.8, 2627.8, 2627.8, 2627.8, 2627.8, 2627.6, NA,
2627.6, 2627.6, 2627.6), soundSpeed = c(1454.65, 1454.65,
1454.57, 1454.38, 1454.53, 1454.47, 1454.45, 1454.82, 1439.8,
1454.96, 1449.08, 1449.22, 1448.88, 1448.36, 1448.23, 1448.18,
1449.98, 1450.1, 1450.19, 1450.08, 1450.99, 1450.48, 1449.83,
1449.19, 1448.69), svDM = c(1454.55, 1454.55, 1454.48, 1454.29,
1454.43, 1454.37, 1454.35, 1454.72, 1439.71, 1454.86, 1448.98,
1449.12, 1448.78, 1448.26, 1448.14, 1448.08, 1449.89, 1450.01,
1450.1, 1449.98, 1450.91, 1450.4, 1449.76, 1449.12, 1448.61
), svWM = c(1454.57, 1454.57, 1454.49, 1454.3, 1454.45, 1454.39,
1454.37, 1454.74, 1439.73, 1454.89, 1448.89, 1449.03, 1448.69,
1448.18, 1448.05, 1448, 1449.84, 1449.96, 1450.05, 1449.94,
1451.01, 1450.49, 1449.83, 1449.16, 1448.62)), row.names = c(NA,
25L), class = "data.frame")

#to correspond to the number of observations, the rows are duplicated

PdataLaKo <- as.data.frame(lapply(PdataLaKo, rep, 250))

为此,我使用了包 dplyr 中的函数 group_split,它创建了与每个​​组 (ctd_file) 对应的不同列表的小标题。在我尝试将 rstatix 包中的 shapiro.test 应用于具有 sapply 的组中的每个选定变量以及具有 的每个组之后应用

PdataLaKo <- PdataLaKo %>%
group_split(ctd_file)
lnorm <- PdataLaKo[, c(11:24)] %>%
map(~ lapply(., sapply(., shapiro.test)))

如果我使用过滤器提取一组并且不创建 tibble,则应用于每个变量的 sapply 函数会起作用。因此,问题来自 tibble 而不是对选定列的应用。

#non-grouped data
PdataLaKot <- PdataLaKo %>% filter(ctd_file == "2018-08-05T092503 SBE0251090")
lnorm <- PdataLaKot[, c(11:24)] %>% sapply(shapiro.test)

最佳答案

library(tidyverse)

shapiro_test_safely <- possibly(.f = shapiro.test, otherwise = NA)

a <- df %>%
group_nest(ctd_file) %>%
mutate(
data = map(data, ~select(.x, where(is.numeric))[, -c(1:4)]),
res = map(data, ~map(.x, shapiro_test_safely)),
res = purrr::set_names(res, nm = ctd_file))

fltr <- map(a$res, ~!is.na(.x))

b <- map2(.x = a$res, .y = fltr, ~.x[.y])

map_df(b, ~map_df(.x, broom::glance)) %>%
mutate(nm = map(b, names) %>% unlist()) %>%
mutate(SET = rep(names(b), sapply(b, length)))

#> # A tibble: 43 x 5
#> statistic p.value method nm SET
#> <dbl> <dbl> <chr> <chr> <chr>
#> 1 0.862 0.0807 Shapiro-Wilk normality test timeS 2018-08-02T162038~
#> 2 0.969 0.879 Shapiro-Wilk normality test pressure 2018-08-02T162038~
#> 3 0.969 0.882 Shapiro-Wilk normality test depth 2018-08-02T162038~
#> 4 0.800 0.0144 Shapiro-Wilk normality test salinity 2018-08-02T162038~
#> 5 0.946 0.625 Shapiro-Wilk normality test temperature 2018-08-02T162038~
#> 6 0.711 0.00119 Shapiro-Wilk normality test oxygen 2018-08-02T162038~
#> 7 0.776 0.00738 Shapiro-Wilk normality test oxygen2 2018-08-02T162038~
#> 8 0.939 0.543 Shapiro-Wilk normality test fluorescence 2018-08-02T162038~
#> 9 0.750 0.00356 Shapiro-Wilk normality test turbidity 2018-08-02T162038~
#> 10 0.978 0.954 Shapiro-Wilk normality test conductivity 2018-08-02T162038~
#> # ... with 33 more rows

reprex package 创建于 2021-09-16 (v2.0.1)

数据

df <-
structure(
list(
ctd_file = c(
"2018-08-02T162038 SBE0251090",
"2018-08-02T162038 SBE0251090",
"2018-08-02T162038 SBE0251090",
"2018-08-02T162038 SBE0251090",
"2018-08-02T162038 SBE0251090",
"2018-08-02T162038 SBE0251090",
"2018-08-02T162038 SBE0251090",
"2018-08-02T162038 SBE0251090",
"2018-08-02T162038 SBE0251090",
"2018-08-02T162038 SBE0251090",
"2018-08-02T175518 SBE0251090",
"2018-08-02T175518 SBE0251090",
"2018-08-02T175518 SBE0251090",
"2018-08-02T175518 SBE0251090",
"2018-08-02T175518 SBE0251090",
"2018-08-02T175518 SBE0251090",
"2018-08-02T175518 SBE0251090",
"2018-08-02T175518 SBE0251090",
"2018-08-02T175518 SBE0251090",
"2018-08-02T175518 SBE0251090",
"2018-08-03T090018 SBE0251090",
"2018-08-03T090018 SBE0251090",
"2018-08-03T090018 SBE0251090",
"2018-08-03T090018 SBE0251090",
"2018-08-03T090018 SBE0251090"
),
station = c(
"1",
"1",
"1",
"1",
"1",
"1",
"1",
"1",
"1",
"1",
"2",
"2",
"2",
"2",
"2",
"2",
"2",
"2",
"2",
"2",
"3",
"3",
"3",
"3",
"3"
),
month = c(
8L,
8L,
8L,
8L,
8L,
8L,
8L,
8L,
8L,
8L,
8L,
8L,
8L,
8L,
8L,
8L,
8L,
8L,
8L,
8L,
8L,
8L,
8L,
8L,
8L
),
day = c(
2L,
2L,
2L,
2L,
2L,
2L,
2L,
2L,
2L,
2L,
2L,
2L,
2L,
2L,
2L,
2L,
2L,
2L,
2L,
2L,
3L,
3L,
3L,
3L,
3L
),
year = c(
2018L,
2018L,
2018L,
2018L,
2018L,
2018L,
2018L,
2018L,
2018L,
2018L,
2018L,
2018L,
2018L,
2018L,
2018L,
2018L,
2018L,
2018L,
2018L,
2018L,
2018L,
2018L,
2018L,
2018L,
2018L
),
time = c(
16.38,
16.38,
16.38,
16.38,
16.38,
16.38,
16.38,
16.38,
16.38,
16.38,
17.57,
17.57,
17.57,
17.57,
17.57,
17.57,
17.57,
17.57,
17.57,
17.57,
9.05,
9.05,
9.05,
9.05,
9.05
),
LAT = c(
"69.27166667",
"69.27166667",
"69.27166667",
"69.27166667",
"69.27166667",
"69.27166667",
"69.27166667",
"69.27166667",
"69.27166667",
"69.27166667",
"69.25386667",
"69.25386667",
"69.25386667",
"69.25386667",
"69.25386667",
"69.25386667",
"69.25386667",
"69.25386667",
"69.25386667",
"69.25386667",
"69.23883333",
"69.23883333",
"69.23883333",
"69.23883333",
"69.23883333"
),
LONG = c(
"-25.15166667",
"-25.15166667",
"-25.15166667",
"-25.15166667",
"-25.15166667",
"-25.15166667",
"-25.15166667",
"-25.15166667",
"-25.15166667",
"-25.15166667",
"-25.0637",
"-25.0637",
"-25.0637",
"-25.0637",
"-25.0637",
"-25.0637",
"-25.0637",
"-25.0637",
"-25.0637",
"-25.0637",
"-24.9915",
"-24.9915",
"-24.9915",
"-24.9915",
"-24.9915"
),
bottom_depth = c(
"226",
"226",
"226",
"226",
"226",
"226",
"226",
"226",
"226",
"226",
"229",
"229",
"229",
"229",
"229",
"229",
"229",
"229",
"229",
"229",
"255",
"255",
"255",
"255",
"255"
),
timeS = c(
223.5,
227.236,
228.558,
228.916,
229.336,
229.749,
230.119,
230.49,
228.871,
231.587,
242.497,
241.334,
241.591,
241.869,
242.205,
242.601,
249.68,
249.758,
250.013,
250.3,
266.6,
267.148,
267.57,
267.899,
268.245
),
pressure = c(
0.607,
0.707,
0.808,
0.909,
1.01,
1.112,
1.212,
1.313,
1.516,
1.616,
0.503,
0.101,
0.202,
0.303,
0.404,
0.505,
0.606,
0.707,
0.808,
0.909,
0.303,
0.404,
0.505,
0.606,
0.707
),
depth = c(
0.6,
0.7,
0.8,
0.9,
1,
1.1,
1.2,
1.3,
1.5,
1.6,
0.5,
0.1,
0.2,
0.3,
0.4,
0.5,
0.6,
0.7,
0.8,
0.9,
0.3,
0.4,
0.5,
0.6,
0.7
),
salinity = c(
30.195,
30.1989,
30.199,
30.1737,
30.1894,
30.1824,
30.1546,
30.2275,
29.9942,
30.2885,
28.2285,
28.1332,
28.2182,
28.3279,
28.3815,
28.4242,
29.0786,
29.1525,
29.2647,
29.3672,
32.484,
32.3028,
31.885,
31.3775,
30.978
),
temperature = c(
2.6459,
2.6477,
2.6263,
2.5913,
2.6188,
2.608,
2.6106,
2.6746,
2.6973,
2.6837,
1.9945,
2.0299,
1.9315,
1.7799,
1.734,
1.7082,
1.9263,
1.928,
1.9184,
1.8562,
1.1441,
1.073,
1.052,
1.0518,
1.0603
),
oxygen = c(
352.891,
352.339,
352.12,
352.03,
351.884,
351.702,
351.505,
351.296,
347.288,
351.001,
345.827,
345.879,
345.866,
345.841,
345.782,
345.65,
346.849,
346.78,
346.718,
346.609,
340.888,
340.026,
339.348,
338.819,
338.245
),
oxygen2 = c(
102.182,
102.228,
102.257,
102.281,
102.313,
102.347,
102.383,
102.427,
101.491,
102.644,
97.258,
97.311,
97.301,
97.29,
97.275,
97.253,
98.285,
98.293,
98.325,
98.357,
97.857,
97.839,
97.829,
97.812,
97.788
),
fluorescence = c(
0.17892,
0.17232,
0.17139,
0.17278,
0.17479,
0.17701,
0.1792,
0.18158,
0.18525,
0.18909,
0.10588,
0.10647,
0.10697,
0.10818,
0.10978,
0.11157,
0.1479,
0.14969,
0.15256,
0.15607,
0.30833,
0.31088,
0.31321,
0.31532,
0.31758
),
turbidity = c(
0.164,
0.164,
0.164,
0.164,
0.163,
0.163,
0.163,
0.163,
0.161,
0.163,
0.165,
0.165,
0.165,
0.165,
0.165,
0.164,
0.159,
0.16,
0.16,
0.16,
4.742,
4.747,
4.751,
4.753,
4.755
),
conductivity = c(
2.742475,
2.743057,
2.74087,
2.73591,
2.740138,
2.738197,
2.736985,
2.747572,
2.72913,
2.752912,
2.524555,
2.525942,
2.524426,
2.523692,
2.525012,
2.526692,
2.596153,
2.602249,
2.609861,
2.613963,
2.806078,
2.786396,
2.752134,
2.712379,
2.682113
),
pH = c(
8.094,
8.098,
8.099,
8.099,
8.099,
8.099,
8.099,
8.099,
8.014,
8.099,
8.16,
8.16,
8.16,
8.16,
8.159,
8.159,
8.149,
8.148,
8.147,
8.146,
8.121,
8.12,
8.12,
8.119,
8.118
),
par = c(
2627.7,
2627.7,
2627.7,
2627.7,
2627.7,
2627.7,
2627.7,
2627.7,
2600,
2627.7,
2627.8,
2627.8,
2627.8,
2627.8,
2627.8,
2627.8,
2627.8,
2627.8,
2627.8,
2627.8,
2627.6,
NA,
2627.6,
2627.6,
2627.6
),
soundSpeed = c(
1454.65,
1454.65,
1454.57,
1454.38,
1454.53,
1454.47,
1454.45,
1454.82,
1439.8,
1454.96,
1449.08,
1449.22,
1448.88,
1448.36,
1448.23,
1448.18,
1449.98,
1450.1,
1450.19,
1450.08,
1450.99,
1450.48,
1449.83,
1449.19,
1448.69
),
svDM = c(
1454.55,
1454.55,
1454.48,
1454.29,
1454.43,
1454.37,
1454.35,
1454.72,
1439.71,
1454.86,
1448.98,
1449.12,
1448.78,
1448.26,
1448.14,
1448.08,
1449.89,
1450.01,
1450.1,
1449.98,
1450.91,
1450.4,
1449.76,
1449.12,
1448.61
),
svWM = c(
1454.57,
1454.57,
1454.49,
1454.3,
1454.45,
1454.39,
1454.37,
1454.74,
1439.73,
1454.89,
1448.89,
1449.03,
1448.69,
1448.18,
1448.05,
1448,
1449.84,
1449.96,
1450.05,
1449.94,
1451.01,
1450.49,
1449.83,
1449.16,
1448.62
)
),
row.names = c(NA,
25L),
class = "data.frame"
)

关于r - 多个列表上多个变量的统计检验(tibble),我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/69210795/

32 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com