dplyr包中函数使用

本文总结一下dplyr包中函数的使用，包括：

select()
rename()
filter()
mutate()
arrange()
summarise()
group_by()
left_join()
right_join()
full_join()
semi_join()
anti_join()
%>%
separate()
unite()
distinct()
across()

1. select()

一些operators：

：a range of consecutive variables
！补集，即非
&、|交集与、并集或
c()combine selections

一些辅助函数如下，在select()中直接使用列时不需要引用””，但使用上述辅助函数时必须引用””。
对于列：

everything()选择所有变量，一般调整数据集中变量顺序时使用
last_cols()Select last variable, possibly with an offset.
group_cols()Select all grouping columns.

根据名字选择：

starts_with("X") 以 “X”开头的变量名
ends_with("X") 以 “X”结束的变量名
contains("X") 包含 “X”的变量名
matches("X")匹配正则表达式”X”的变量名
num_ranges("x",1:5)变量名为 x01, x02, x03, x04 and x05

library(tidyverse)

my_iris <- as_tibble(iris)
my_iris
# A tibble: 150 x 5
   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
          <dbl>       <dbl>        <dbl>       <dbl> <fct>  
 1          5.1         3.5          1.4         0.2 setosa 
 2          4.9         3            1.4         0.2 setosa 
 3          4.7         3.2          1.3         0.2 setosa 
# ... with 147 more rows

选择一列：

my_iris %>% select(Sepal.Length)
# A tibble: 150 x 1
   Sepal.Length
          <dbl>
 1          5.1
 2          4.9
 3          4.7
# ... with 147 more rows

选择多列：

my_iris %>% select(Species,Petal.Length)
# A tibble: 150 x 2
   Species Petal.Length
   <fct>          <dbl>
 1 setosa           1.4
 2 setosa           1.4
 3 setosa           1.3
# ... with 147 more rows

选择范围的列:

my_iris %>% select(Sepal.Length:Petal.Length)
# A tibble: 150 x 3
   Sepal.Length Sepal.Width Petal.Length
          <dbl>       <dbl>        <dbl>
 1          5.1         3.5          1.4
 2          4.9         3            1.4
 3          4.7         3.2          1.3
# ... with 147 more rows

反向选择列：

my_iris %>% select(!Sepal.Length)
# A tibble: 150 x 4
   Sepal.Width Petal.Length Petal.Width Species
         <dbl>        <dbl>       <dbl> <fct>  
 1         3.5          1.4         0.2 setosa 
 2         3            1.4         0.2 setosa 
 3         3.2          1.3         0.2 setosa 
# ... with 147 more rows

my_iris %>% select(!(Sepal.Length:Petal.Length))
# A tibble: 150 x 2
   Petal.Width Species
         <dbl> <fct>  
 1         0.2 setosa 
 2         0.2 setosa 
 3         0.2 setosa 
# ... with 147 more rows

根据列名选择列：

my_iris %>% select(starts_with("Petal"))
# A tibble: 150 x 2
   Petal.Length Petal.Width
          <dbl>       <dbl>
 1          1.4         0.2
 2          1.4         0.2
 3          1.3         0.2
 # ... with 147 more rows
my_iris %>% select(ends_with("Width"))
# A tibble: 150 x 2
   Sepal.Width Petal.Width
         <dbl>       <dbl>
 1         3.5         0.2
 2         3           0.2
 3         3.2         0.2
 # ... with 147 more rows
my_iris %>% select(contains("Petal"))
# A tibble: 150 x 2
   Petal.Length Petal.Width
          <dbl>       <dbl>
 1          1.4         0.2
 2          1.4         0.2
 3          1.3         0.2
 # ... with 147 more rows

其他用法：

my_iris %>% select(-(1:3))
my_iris %>% select(-1,-3)
my_iris %>% selcet(-"Petal.Width")

my_iris %>% select(!starts_with("Petal"))
my_iris %>% select(!ends_with("Width"))
my_iris %>% select(!contains("Petal"))
my_iris %>% select(matches(".Wid."))
my_iris %>% select(one_of(c("col1","col2")))

my_iris %>% select(is.numeric)
my_iris %>% select(where(is.character))

my_iris %>% select(starts_with("Petal") & ends_with("Width"))
my_iris %>% select(starts_with("Petal") | ends_with("Width"))
my_iris %>% select(starts_with("Petal") & !ends_with("Width"))

2. rename()

修改列的名字：

1	rename(.data,new_name=old_name)

my_iris %>% 
 select(Species,Petal.Length) %>%
 rename(Spe = Species)
# A tibble: 150 x 2
   Spe    Petal.Length
   <fct>         <dbl>
 1 setosa          1.4
 2 setosa          1.4
 3 setosa          1.3
# ... with 147 more rows

3. filter()

对数据行进行筛选，保留符合条件的行，以下为关系运算符：

== , > , >= etc
& , |,!, xor()
is.na() 、 !is.na()
%in%
between() , near()

my_iris %>% filter(Species=="virginica")
# A tibble: 50 x 5
   Sepal.Length Sepal.Width Petal.Length Petal.Width Species  
          <dbl>       <dbl>        <dbl>       <dbl> <fct>    
 1          6.3         3.3          6           2.5 virginica
 2          5.8         2.7          5.1         1.9 virginica
 3          7.1         3            5.9         2.1 virginica
# ... with 47 more rows

my_iris %>% filter(Species=="virginica",Sepal.Length>5)
my_iris %>% filter(Species=="virginica"&Sepal.Length>5)
# A tibble: 49 x 5
   Sepal.Length Sepal.Width Petal.Length Petal.Width Species  
          <dbl>       <dbl>        <dbl>       <dbl> <fct>    
 1          6.3         3.3          6           2.5 virginica
 2          5.8         2.7          5.1         1.9 virginica
 3          7.1         3            5.9         2.1 virginica
# ... with 46 more rows

my_iris %>% filter(Species=="virginica"&Sepal.Length>mean(Sepal.Length))
# A tibble: 44 x 5
   Sepal.Length Sepal.Width Petal.Length Petal.Width Species  
          <dbl>       <dbl>        <dbl>       <dbl> <fct>    
 1          6.3         3.3          6           2.5 virginica
 2          7.1         3            5.9         2.1 virginica
 3          6.3         2.9          5.6         1.8 virginica
# ... with 41 more rows

1	my_iris %>% filter(Sepal.Length == max(Sepal.Length))

其他：

# 选择全部为0的列：
myfun <- function(x) sum(x)==0
df %>% select(where(myfun))

#去掉全部为NA的列：
df %>% select(where(~ !all(is.na(.x))))
df %>% filter(if_any(everything(),~ !is.na(.x)))

4. mutate()

函数用于给数据生成新的列。


```R
my_iris %>% 
	select(Sepal.Length,Petal.Length,Species) %>%
	mutate(Length=Sepal.Length+Petal.Length,
		Length2=Sepal.Length-Petal.Length) %>%
	filter(Species=="virginica")
# A tibble: 50 x 5
   Sepal.Length Petal.Length Species   Length Length2
          <dbl>        <dbl> <fct>      <dbl>   <dbl>
 1          6.3          6   virginica   12.3   0.300
 2          5.8          5.1 virginica   10.9   0.7  
 3          7.1          5.9 virginica   13     1.2  
# ... with 47 more rows
# i Use `print(n = ...)` to see more rows

my_iris %>% 
  mutate(
    new_col = case_when(
      Petal.Length < 5                     ~ "A",
      Petal.Length >= 5 & Petal.Length < 6 ~ "B",
      Petal.Length >= 6 & Petal.Length < 7 ~ "C",
      TRUE                                 ~ "D"
  )
)

5. arrange()

根据一个变量或多个变量对数据的行进行排序，默认为升序排列。
根据变量1升序排列：

my_iris %>% arrange(Petal.Length)
# A tibble: 150 x 5
   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
          <dbl>       <dbl>        <dbl>       <dbl> <fct>  
 1          4.6         3.6          1           0.2 setosa 
 2          4.3         3            1.1         0.1 setosa 
 3          5.8         4            1.2         0.2 setosa 
 4          5           3.2          1.2         0.2 setosa 
 5          4.7         3.2          1.3         0.2 setosa 
# ... with 145 more rows

根据变量1降序排列,用desc()或加负号-：

my_iris %>% arrange(-Petal.Length)
my_iris %>% arrange(desc(Petal.Length))
# A tibble: 150 x 5
   Sepal.Length Sepal.Width Petal.Length Petal.Width Species  
          <dbl>       <dbl>        <dbl>       <dbl> <fct>    
 1          7.7         2.6          6.9         2.3 virginica
 2          7.7         3.8          6.7         2.2 virginica
 3          7.7         2.8          6.7         2   virginica
 4          7.6         3            6.6         2.1 virginica
 5          7.9         3.8          6.4         2   virginica
# ... with 145 more rows

先根据变量1升序排列，再根据变量2升序排列：

my_iris %>% arrange(Petal.Length,Petal.Width)
# A tibble: 150 x 5
   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
          <dbl>       <dbl>        <dbl>       <dbl> <fct>  
 1          4.6         3.6          1           0.2 setosa 
 2          4.3         3            1.1         0.1 setosa 
 3          5.8         4            1.2         0.2 setosa 
 4          5           3.2          1.2         0.2 setosa 
 5          4.7         3.2          1.3         0.2 setosa 
# ... with 145 more rows

先根据变量1降序排列，再根据变量2升序排列：

my_iris %>% arrange(desc(Petal.Length),Petal.Width)
# A tibble: 150 x 5
   Sepal.Length Sepal.Width Petal.Length Petal.Width Species  
          <dbl>       <dbl>        <dbl>       <dbl> <fct>    
 1          7.7         2.6          6.9         2.3 virginica
 2          7.7         2.8          6.7         2   virginica
 3          7.7         3.8          6.7         2.2 virginica
 4          7.6         3            6.6         2.1 virginica
 5          7.9         3.8          6.4         2   virginica
# ... with 145 more rows

my_iris%>% arrange(across(starts_with("Sepal")))
#>     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
#> 1            4.3         3.0          1.1         0.1     setosa
#> 2            4.4         2.9          1.4         0.2     setosa
#> 3            4.4         3.0          1.3         0.2     setosa
#> 4            4.4         3.2          1.3         0.2     setosa
#> 5            4.5         2.3          1.3         0.3     setosa
......
my_iris %>% arrange(across(starts_with("Sepal"),desc))
#>     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
#> 1            7.9         3.8          6.4         2.0  virginica
#> 2            7.7         3.8          6.7         2.2  virginica
#> 3            7.7         3.0          6.1         2.3  virginica
#> 4            7.7         2.8          6.7         2.0  virginica
#> 5            7.7         2.6          6.9         2.3  virginica
......

6. summarise()

用于统计汇总，往往与其他函数配合使用。
计算均值：

my_iris %>% summarise(mean_s.len = mean(Sepal.Length))
# A tibble: 1 x 1
  mean_s.len
       <dbl>
1       5.84

计算标准差：

my_iris %>% summarise(sd_s.len = sd(Sepal.Length))
# A tibble: 1 x 1
  sd_s.len
     <dbl>
1    0.828

多个统计：

my_iris %>% summarise(
	mean_s.len = mean(Sepal.Length),
	sd_s.len = sd(Sepal.Length),
	n=n(),
	sum_s.len = sum(Sepal.Length))
# A tibble: 1 x 4
  mean_s.len sd_s.len     n sum_s.len
       <dbl>    <dbl> <int>     <dbl>
1       5.84    0.828   150      876.

7. group_by()

经常和 ```group_by()``` 一起用，用于分组统计：


```R
my_iris %>% 
 group_by(Species) %>%
 summarise(
	mean_s.len = mean(Sepal.Length),
	sd_s.len = sd(Sepal.Length),
	n=n(),
	sum_s.len = sum(Sepal.Length))
# A tibble: 3 x 5
  Species    mean_s.len sd_s.len     n sum_s.len
  <fct>           <dbl>    <dbl> <int>     <dbl>
1 setosa           5.01    0.352    50      250.
2 versicolor       5.94    0.516    50      297.
3 virginica        6.59    0.636    50      329.

8. xxx_join()

xxx_join():

left_join() :左连接
right_jion() ：右连接
full_join() ：全连接
inner_join() ：内连接
semi_join(x,y) 、 anti_join(x,y) ：筛选连接

左连接，以df1中的gene_name为准

1 2	left_join(df1,df2,by="gene_name") df1 %>% left_join(df2,by = "gene_name")

右连接，以df2中的gene_name为准

1 2	right_join(df1,df2,by="gene_name") df1 %>% right_join(df2,by = "gene_name")

全连接，df1和df2中的gene_name的行全部都会保留

1 2	full_join(df1,df2,by="gene_name") df1 %>% full_join(df2,by = "gene_name")

内连接，df1和df2中都有的gene_name的行才会保留

1 2	inner_join(df1,df2,by="gene_name") df1 %>% inner_join(df2,by = "gene_name")

筛选连接，不改变x数据框的数量，但会影响x的观测，即剔除一些行，类似于filter()。
半连接 semi_join(x,y) 保留gene_name与df2的gene_name一样的所有行：

1 2	semi_join(df1,df2,by = "gene_name") df1 %>% semi_join(df2,by = "gene_name")

等价于：

1	df1 %>% filter(gene_name %in% df2$gene_name)

反连接 anti_join(x,y) ,丢弃与df2中gene_name一样的所有行：

1 2	anti_join(df1,df2,by = "gene_name") df1 %>% anti_join(df2,by = "gene_name")

等价于：

1	df1 %>% filter(!gene_name %in% df2$gene_name)

9. 管道符%>%

把管道符左边的传给右边。

1
2
3

df %>% head()
df %>% tail()
df %>% slice(1)

以下三个等价，取出Sepal.Length值最大的行：

my_iris %>% 
 group_by(Species) %>% 
 slice(1)

my_iris %>% 
 arrange(desc(Sepal.Length)) %>%
 slice(1)

my_iris %>% slice_max(Sepal.Length)

10. separate()

将某一列根据指定符号，分割成两列：

```R
tb <- tibble::tribble(
  ~day, ~price,
  1,   "30-45",
  2,   "40-95",
  3,   "89-65",
  4,   "45-63",
  5,   "52-42"
)
tb1 <- tb %>% 
  separate(price, into = c("low", "high"), sep = "-")
tb1
# A tibble: 5 x 3
    day low   high 
  <dbl> <chr> <chr>
1     1 30    45   
2     2 40    95   
3     3 89    65   
4     4 45    63   
5     5 52    42

11. unite()

将某两个列合并为一个列：

```R
tb1 %>% 
  unite(col = "price", c(low, high), sep = ":", remove = FALSE)
tb1
## # A tibble: 5 × 4
##     day price low   high 
##   <dbl> <chr> <chr> <chr>
## 1     1 30:45 30    45   
## 2     2 40:95 40    95   
## 3     3 89:65 89    65   
## 4     4 45:63 45    63   
## 5     5 52:42 52    42

12. distinct()

对数据框进行筛选，筛选出不重复的行，类似于unique()。

df <- tibble::tribble(
  ~x, ~y, ~z,
  1, 1, 1,
  1, 1, 2,
  1, 1, 1,
  2, 1, 2,
  2, 2, 3,
  3, 3, 1)
df
## # A tibble: 6 × 3
##       x     y     z
##   <dbl> <dbl> <dbl>
## 1     1     1     1
## 2     1     1     2
## 3     1     1     1
## 4     2     1     2
## 5     2     2     3
## 6     3     3     1
df %>%
  distinct()
## # A tibble: 5 × 3
##       x     y     z
##   <dbl> <dbl> <dbl>
## 1     1     1     1
## 2     1     1     2
## 3     2     1     2
## 4     2     2     3
## 5     3     3     1

筛选出df中不重复的x

df %>%
  distinct(x)
## # A tibble: 3 × 1
##       x
##   <dbl>
## 1     1
## 2     2
## 3     3

筛选不重复的x和y。

df %>%
  distinct(x, y)
## # A tibble: 4 × 2
##       x     y
##   <dbl> <dbl>
## 1     1     1
## 2     2     1
## 3     2     2
## 4     3     3

df %>%
  distinct(x, y, .keep_all = TRUE) # 只保留最先出现的row
## # A tibble: 4 × 3
##       x     y     z
##   <dbl> <dbl> <dbl>
## 1     1     1     1
## 2     2     1     2
## 3     2     2     3
## 4     3     3     1
df %>%
  distinct(
    across(c(x, y)),
    .keep_all = TRUE)
df %>%
  group_by(x) %>%
  distinct(y, .keep_all = TRUE)
## # A tibble: 4 × 3
## # Groups:   x [3]
##       x     y     z
##   <dbl> <dbl> <dbl>
## 1     1     1     1
## 2     2     1     2
## 3     2     2     3
## 4     3     3     1

处理的对象是vector；功能是统计不同的元素有多少个；返回一个数值。

```R
c(1, 1, 1, 2, 2, 1, 3, 3) %>% n_distinct()
## [1] 3

df$z %>% n_distinct()
## [1] 3

根据x分组统计每个元素的个数：

df %>%
  group_by(x) %>%
  summarise(
    n = n_distinct(z))
## # A tibble: 3 × 2
##       x     n
##   <dbl> <int>
## 1     1     2
## 2     2     2
## 3     3     1

13. 统计

统计数据框行数：

1 2	my_iris %>% summarise(n=n())

统计每个Species出现的次数：

my_iris %>% 
 group_by(Species) %>%
 summarise(n=n())

my_iris %>% 
 count(Species)

统计A、B不同组合出现的次数：

1	my_df %>% count(A,B)

以下两个等价，count通过创建一个新的变量进行计数(注：变量内容为TRUE/FALSE)：

my_iris %>% 
 filter(Sepal.Length>7) %>%
 summarise(n=n())

my_iris %>% count(new_col = Sepal.Length>7)

统计Sepal.Length>7的比例：

my_iris %>% 
 mutate(new_col = Sepal.Length >7) %>% 
 summarise(mean=mean(new_col))
# A tibble: 1 x 1
   mean
  <dbl>
1  0.08

14. across()

函数对多列执行相同的函数操作，返回数据框。


对Sepal.Length和Petal.Length两列求均值：

```R
my_iris %>% summarise(
	across(c(Sepal.Length,Petal.Length),mean)
	)

my_iris %>% summarise(
	across(ends_with("Length"),mean)
	)

多列多个统计函数：

my_iris %>% 
 group_by(Species) %>% 
 summarise(
 	across(ends_with("Length"),list(mean=mean,sd=sd),na.rm=TRUE)
 	)
# A tibble: 3 x 5
  Species    Sepal.Length_mean Sepal.Length_sd Petal.Length_mean Petal.Length_sd
  <fct>                  <dbl>           <dbl>             <dbl>           <dbl>
1 setosa                  5.01           0.352              1.46           0.174
2 versicolor              5.94           0.516              4.26           0.470
3 virginica               6.59           0.636              5.55           0.552

参考资料：