在 R 中使用聚集收集多列

如何解决在 R 中使用聚集收集多列

我想在 R 中收集数据帧的多列以使其“整洁”。

library(tidyverse)
set.seed(123)

df <- data.frame(time = seq(1,5,1),value_model_a = rnorm(5),ci_low_model_a = rnorm(5),ci_high_model_a = rnorm(5),value_model_b = rnorm(5),ci_low_model_b = rnorm(5),ci_high_model_b = rnorm(5))

#  time value_model_a ci_low_model_a ci_high_model_a value_model_b ci_low_model_b ci_high_model_b
#1    1    -0.3591146     -0.5881655      -0.4486189     0.7821898     -0.5315449      0.06015936
#2    2     0.8952444     -1.6314973       0.5618802     0.8228834     -0.2663575     -0.09029613
#3    3    -1.8961105      1.1529703       0.8896495    -0.1524523      0.5989563      0.35738994
#4    4     0.3471419      0.4373451      -0.7503646     0.3670275      1.7109441      0.11028077
#5    5     1.2780844     -1.3069509      -0.1286071     1.4340957      1.1876910     -1.69710214

预期输出

# time model  value       ci_low      ci_high
# 1    a      -0.3591146  -0.5881655  -0.4486189
# 2    a      0.8952444   ... and so on

问题

我正在努力使用 gather 包中的 tidyr 函数。如何正确使用它来重新组织此数据框？

解决方法

一开始可能会很艰难。

gather() 的新版本是 pivot_longer()。

以下是实现预期输出的方法。

首先，您可以告诉函数将所有内容设为默认值，仅使用时间作为标识符：

pivot_longer(df,-time) %>% head(5)
#> # A tibble: 30 x 3
#>     time name             value
#>    <dbl> <chr>            <dbl>
#>  1     1 value_model_a   -0.560
#>  2     1 ci_low_model_a   1.72 
#>  3     1 ci_high_model_a  1.22 
#>  4     1 value_model_b    1.79 
#>  5     1 ci_low_model_b  -1.07

这是一个开始，但您可以通过设置名称分隔符来进一步。您也可以使用 names_pattern 使用正则表达式。

df_l = pivot_longer(df,-time,names_sep="_model_",names_to=c("name","model"))
df_l
#> # A tibble: 30 x 4
#>     time name    model  value
#>    <dbl> <chr>   <chr>  <dbl>
#>  1     1 value   a     -0.560
#>  2     1 ci_low  a      1.72 
#>  3     1 ci_high a      1.22 
#>  4     1 value   b      1.79 
#>  5     1 ci_low  b     -1.07 
#>  6     1 ci_high b     -1.69 
#>  7     2 value   a     -0.230
#>  8     2 ci_low  a      0.461
#>  9     2 ci_high a      0.360
#> 10     2 value   b      0.498
#> # ... with 20 more rows

最后，您可以通过使用带有默认值的 pivot_wider() 来实现您的预期输出（我明确地出于学术目的而编写）：

pivot_wider(df_l,names_from = "name",values_from = "value")
#> # A tibble: 10 x 5
#>     time model   value ci_low ci_high
#>    <dbl> <chr>   <dbl>  <dbl>   <dbl>
#>  1     1 a     -0.560   1.72    1.22 
#>  2     1 b      1.79   -1.07   -1.69 
#>  3     2 a     -0.230   0.461   0.360
#>  4     2 b      0.498  -0.218   0.838
#>  5     3 a      1.56   -1.27    0.401
#>  6     3 b     -1.97   -1.03    0.153
#>  7     4 a      0.0705 -0.687   0.111
#>  8     4 b      0.701  -0.729  -1.14 
#>  9     5 a      0.129  -0.446  -0.556
#> 10     5 b     -0.473  -0.625   1.25

^{由 reprex package (v1.0.0) 于 2021 年 3 月 3 日创建}

set.seed(123)

library(tidyverse)
df <- data.frame(time = seq(1,5,1),value_model_a = rnorm(5),ci_low_model_a = rnorm(5),ci_high_model_a = rnorm(5),value_model_b = rnorm(5),ci_low_model_b = rnorm(5),ci_high_model_b = rnorm(5))

pivot_longer(df,names_sep = "_model_",names_to = c(".value","model"))
#> # A tibble: 10 x 5
#>     time model   value ci_low ci_high
#>    <dbl> <chr>   <dbl>  <dbl>   <dbl>
#>  1     1 a     -0.560   1.72    1.22 
#>  2     1 b      1.79   -1.07   -1.69 
#>  3     2 a     -0.230   0.461   0.360
#>  4     2 b      0.498  -0.218   0.838
#>  5     3 a      1.56   -1.27    0.401
#>  6     3 b     -1.97   -1.03    0.153
#>  7     4 a      0.0705 -0.687   0.111
#>  8     4 b      0.701  -0.729  -1.14 
#>  9     5 a      0.129  -0.446  -0.556
#> 10     5 b     -0.473  -0.625   1.25

^{由 reprex package (v1.0.0) 于 2021 年 3 月 3 日创建}