library(tidyverse)
library(tidyselect)
library(janitor)
library(lubridate)
You might as well load these packages; they are always useful.
Converting
Convert all character variables to numeric
# Coerce every character column to numeric. `is.character` is the base R
# predicate, so the selection does not rely on a re-exported helper.
mtcars %>%
  mutate(across(where(is.character), as.numeric))
# A tibble: 32 × 11
mpg cyl disp hp drat wt qsec vs am gear carb
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 21 6 160 110 3.9 2.62 16.5 0 1 4 4
2 21 6 160 110 3.9 2.88 17.0 0 1 4 4
3 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1
4 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1
5 18.7 8 360 175 3.15 3.44 17.0 0 0 3 2
6 18.1 6 225 105 2.76 3.46 20.2 1 0 3 1
7 14.3 8 360 245 3.21 3.57 15.8 0 0 3 4
8 24.4 4 147. 62 3.69 3.19 20 1 0 4 2
9 22.8 4 141. 95 3.92 3.15 22.9 1 0 4 2
10 19.2 6 168. 123 3.92 3.44 18.3 1 0 4 4
# … with 22 more rows
Convert all numeric variables to factor variables
# Convert every numeric column to a factor.
mtcars %>%
  mutate(across(where(is.numeric), as_factor))
# A tibble: 32 × 11
mpg cyl disp hp drat wt qsec vs am gear carb
<fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct>
1 21 6 160 110 3.9 2.62 16.46 0 1 4 4
2 21 6 160 110 3.9 2.875 17.02 0 1 4 4
3 22.8 4 108 93 3.85 2.32 18.61 1 1 4 1
4 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
5 18.7 8 360 175 3.15 3.44 17.02 0 0 3 2
6 18.1 6 225 105 2.76 3.46 20.22 1 0 3 1
7 14.3 8 360 245 3.21 3.57 15.84 0 0 3 4
8 24.4 4 146.7 62 3.69 3.19 20 1 0 4 2
9 22.8 4 140.8 95 3.92 3.15 22.9 1 0 4 2
10 19.2 6 167.6 123 3.92 3.44 18.3 1 0 4 4
# … with 22 more rows
Convert all character variables to factor variables
# Convert every character column to a factor; other column types
# (numeric, list) are not selected by where() and stay as they are.
starwars %>%
  mutate(across(where(is.character), as_factor))
# A tibble: 87 × 14
name height mass hair_color skin_color eye_color birth_year sex gender
<fct> <int> <dbl> <fct> <fct> <fct> <dbl> <fct> <fct>
1 Luke Sk… 172 77 blond fair blue 19 male mascu…
2 C-3PO 167 75 <NA> gold yellow 112 none mascu…
3 R2-D2 96 32 <NA> white, bl… red 33 none mascu…
4 Darth V… 202 136 none white yellow 41.9 male mascu…
5 Leia Or… 150 49 brown light brown 19 fema… femin…
6 Owen La… 178 120 brown, gr… light blue 52 male mascu…
7 Beru Wh… 165 75 brown light blue 47 fema… femin…
8 R5-D4 97 32 <NA> white, red red NA none mascu…
9 Biggs D… 183 84 black light brown 24 male mascu…
10 Obi-Wan… 182 77 auburn, w… fair blue-gray 57 male mascu…
# … with 77 more rows, and 5 more variables: homeworld <fct>, species <fct>,
# films <list>, vehicles <list>, starships <list>
Convert some (but not all) character variables to factor variables
# Only the columns listed inside c() are converted; the rest keep their type.
starwars %>%
  mutate(across(c(hair_color, skin_color), as_factor))
# A tibble: 87 × 14
name height mass hair_color skin_color eye_color birth_year sex gender
<chr> <int> <dbl> <fct> <fct> <chr> <dbl> <chr> <chr>
1 Luke Sk… 172 77 blond fair blue 19 male mascu…
2 C-3PO 167 75 <NA> gold yellow 112 none mascu…
3 R2-D2 96 32 <NA> white, bl… red 33 none mascu…
4 Darth V… 202 136 none white yellow 41.9 male mascu…
5 Leia Or… 150 49 brown light brown 19 fema… femin…
6 Owen La… 178 120 brown, gr… light blue 52 male mascu…
7 Beru Wh… 165 75 brown light blue 47 fema… femin…
8 R5-D4 97 32 <NA> white, red red NA none mascu…
9 Biggs D… 183 84 black light brown 24 male mascu…
10 Obi-Wan… 182 77 auburn, w… fair blue-gray 57 male mascu…
# … with 77 more rows, and 5 more variables: homeworld <chr>, species <chr>,
# films <list>, vehicles <list>, starships <list>
Convert multiple variables to date variables
# Parse the selected column with ymd_hms(); list further columns inside c()
# to convert several at once.
nycflights13::flights %>%
  select(time_hour) %>%
  mutate(across(c(time_hour), ymd_hms))
# A tibble: 336,776 × 1
time_hour
<dttm>
1 2013-01-01 05:00:00
2 2013-01-01 05:00:00
3 2013-01-01 05:00:00
4 2013-01-01 05:00:00
5 2013-01-01 06:00:00
6 2013-01-01 05:00:00
7 2013-01-01 06:00:00
8 2013-01-01 06:00:00
9 2013-01-01 06:00:00
10 2013-01-01 06:00:00
# … with 336,766 more rows
Collapsing
Collapse all character variables into 5 categories and an “other” category based on frequency
# Lump each character column with fct_lump_n(n = 5); everything that is
# lumped is labelled with `other_level`. The extra arguments are passed
# through a lambda because dplyr (>= 1.1.0) deprecates forwarding them via
# across()'s `...`.
starwars %>%
  mutate(across(
    where(is.character),
    ~ fct_lump_n(.x, n = 5, other_level = 'forgotten category')
  ))
# A tibble: 87 × 14
name height mass hair_color skin_color eye_color birth_year sex gender
<fct> <int> <dbl> <fct> <fct> <fct> <dbl> <fct> <fct>
1 Luke Sk… 172 77 blond fair blue 19 male mascu…
2 C-3PO 167 75 <NA> forgotten… yellow 112 none mascu…
3 R2-D2 96 32 <NA> forgotten… forgotte… 33 none mascu…
4 Darth V… 202 136 none forgotten… yellow 41.9 male mascu…
5 Leia Or… 150 49 brown light brown 19 fema… femin…
6 Owen La… 178 120 forgotten… light blue 52 male mascu…
7 Beru Wh… 165 75 brown light blue 47 fema… femin…
8 R5-D4 97 32 <NA> forgotten… forgotte… NA none mascu…
9 Biggs D… 183 84 black light brown 24 male mascu…
10 Obi-Wan… 182 77 forgotten… fair forgotte… 57 male mascu…
# … with 77 more rows, and 5 more variables: homeworld <fct>, species <fct>,
# films <list>, vehicles <list>, starships <list>
Renaming
Add a prefix to all variables of a particular type
# Prepend "test_" to the names of the columns selected by where(is.factor).
# `.x` is the vector of selected column names handed to the lambda.
iris %>%
  rename_with(~ str_c("test_", .x), where(is.factor))
# A tibble: 150 × 5
Sepal.Length Sepal.Width Petal.Length Petal.Width test_Species
<dbl> <dbl> <dbl> <dbl> <fct>
1 5.1 3.5 1.4 0.2 setosa
2 4.9 3 1.4 0.2 setosa
3 4.7 3.2 1.3 0.2 setosa
4 4.6 3.1 1.5 0.2 setosa
5 5 3.6 1.4 0.2 setosa
6 5.4 3.9 1.7 0.4 setosa
7 4.6 3.4 1.4 0.3 setosa
8 5 3.4 1.5 0.2 setosa
9 4.4 2.9 1.4 0.2 setosa
10 4.9 3.1 1.5 0.1 setosa
# … with 140 more rows
Add a prefix to all variable names
# Prepend "prefix_" to every column name.
mtcars %>%
  rename_with(~ paste0("prefix_", .x), everything())
# A tibble: 32 × 11
prefix_mpg prefix_cyl prefix_disp prefix_hp prefix_drat prefix_wt prefix_qsec
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 21 6 160 110 3.9 2.62 16.5
2 21 6 160 110 3.9 2.88 17.0
3 22.8 4 108 93 3.85 2.32 18.6
4 21.4 6 258 110 3.08 3.22 19.4
5 18.7 8 360 175 3.15 3.44 17.0
6 18.1 6 225 105 2.76 3.46 20.2
7 14.3 8 360 245 3.21 3.57 15.8
8 24.4 4 147. 62 3.69 3.19 20
9 22.8 4 141. 95 3.92 3.15 22.9
10 19.2 6 168. 123 3.92 3.44 18.3
# … with 22 more rows, and 4 more variables: prefix_vs <dbl>, prefix_am <dbl>,
# prefix_gear <dbl>, prefix_carb <dbl>
Limit variable names to a particular length
# Cut every column name down to its first three characters. The `.` inside
# names() is the magrittr placeholder for the piped data frame.
starwars %>%
  set_names(substr(names(.), 1, 3))
# A tibble: 87 × 14
nam hei mas hai ski eye bir sex gen hom spe fil veh
<chr> <int> <dbl> <chr> <chr> <chr> <dbl> <chr> <chr> <chr> <chr> <lis> <lis>
1 Luke… 172 77 blond fair blue 19 male masc… Tato… Human <chr> <chr>
2 C-3PO 167 75 <NA> gold yell… 112 none masc… Tato… Droid <chr> <chr>
3 R2-D2 96 32 <NA> whit… red 33 none masc… Naboo Droid <chr> <chr>
4 Dart… 202 136 none white yell… 41.9 male masc… Tato… Human <chr> <chr>
5 Leia… 150 49 brown light brown 19 fema… femi… Alde… Human <chr> <chr>
6 Owen… 178 120 brow… light blue 52 male masc… Tato… Human <chr> <chr>
7 Beru… 165 75 brown light blue 47 fema… femi… Tato… Human <chr> <chr>
8 R5-D4 97 32 <NA> whit… red NA none masc… Tato… Droid <chr> <chr>
9 Bigg… 183 84 black light brown 24 male masc… Tato… Human <chr> <chr>
10 Obi-… 182 77 aubu… fair blue… 57 male masc… Stew… Human <chr> <chr>
# … with 77 more rows, and 1 more variable: sta <list>
Creating
Duplicate each row “X” number of times
# Repeat every row 3 times (150 rows become 450).
iris %>%
  uncount(3)
# A tibble: 450 × 5
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
<dbl> <dbl> <dbl> <dbl> <fct>
1 5.1 3.5 1.4 0.2 setosa
2 5.1 3.5 1.4 0.2 setosa
3 5.1 3.5 1.4 0.2 setosa
4 4.9 3 1.4 0.2 setosa
5 4.9 3 1.4 0.2 setosa
6 4.9 3 1.4 0.2 setosa
7 4.7 3.2 1.3 0.2 setosa
8 4.7 3.2 1.3 0.2 setosa
9 4.7 3.2 1.3 0.2 setosa
10 4.6 3.1 1.5 0.2 setosa
# … with 440 more rows
Create an ID variable based on row number
# Add a leading integer column "id" holding the row number.
iris %>%
  rowid_to_column("id")
# A tibble: 150 × 6
id Sepal.Length Sepal.Width Petal.Length Petal.Width Species
<int> <dbl> <dbl> <dbl> <dbl> <fct>
1 1 5.1 3.5 1.4 0.2 setosa
2 2 4.9 3 1.4 0.2 setosa
3 3 4.7 3.2 1.3 0.2 setosa
4 4 4.6 3.1 1.5 0.2 setosa
5 5 5 3.6 1.4 0.2 setosa
6 6 5.4 3.9 1.7 0.4 setosa
7 7 4.6 3.4 1.4 0.3 setosa
8 8 5 3.4 1.5 0.2 setosa
9 9 4.4 2.9 1.4 0.2 setosa
10 10 4.9 3.1 1.5 0.1 setosa
# … with 140 more rows
Create an ID variable based on row number - 2nd method
# Same idea with mutate(): the new `id` column is appended as the last column.
iris %>%
  mutate(id = row_number())
# A tibble: 150 × 6
Sepal.Length Sepal.Width Petal.Length Petal.Width Species id
<dbl> <dbl> <dbl> <dbl> <fct> <int>
1 5.1 3.5 1.4 0.2 setosa 1
2 4.9 3 1.4 0.2 setosa 2
3 4.7 3.2 1.3 0.2 setosa 3
4 4.6 3.1 1.5 0.2 setosa 4
5 5 3.6 1.4 0.2 setosa 5
6 5.4 3.9 1.7 0.4 setosa 6
7 4.6 3.4 1.4 0.3 setosa 7
8 5 3.4 1.5 0.2 setosa 8
9 4.4 2.9 1.4 0.2 setosa 9
10 4.9 3.1 1.5 0.1 setosa 10
# … with 140 more rows
Create unique identifiers based on other variables
# Give every (cyl, carb) combination its own integer id, then drop the
# grouping so later verbs operate on the whole table again.
mtcars %>%
  group_by(cyl, carb) %>%
  mutate(id = cur_group_id()) %>%
  ungroup()
# A tibble: 32 × 12
mpg cyl disp hp drat wt qsec vs am gear carb id
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
1 21 6 160 110 3.9 2.62 16.5 0 1 4 4 4
2 21 6 160 110 3.9 2.88 17.0 0 1 4 4 4
3 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1 1
4 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1 3
5 18.7 8 360 175 3.15 3.44 17.0 0 0 3 2 6
6 18.1 6 225 105 2.76 3.46 20.2 1 0 3 1 3
7 14.3 8 360 245 3.21 3.57 15.8 0 0 3 4 8
8 24.4 4 147. 62 3.69 3.19 20 1 0 4 2 2
9 22.8 4 141. 95 3.92 3.15 22.9 1 0 4 2 2
10 19.2 6 168. 123 3.92 3.44 18.3 1 0 4 4 4
# … with 22 more rows
Create quantiles
# Assign each row to one of 4 buckets ranked by Sepal.Length.
iris %>%
  mutate(quartiles = ntile(Sepal.Length, 4))
# A tibble: 150 × 6
Sepal.Length Sepal.Width Petal.Length Petal.Width Species quartiles
<dbl> <dbl> <dbl> <dbl> <fct> <int>
1 5.1 3.5 1.4 0.2 setosa 1
2 4.9 3 1.4 0.2 setosa 1
3 4.7 3.2 1.3 0.2 setosa 1
4 4.6 3.1 1.5 0.2 setosa 1
5 5 3.6 1.4 0.2 setosa 1
6 5.4 3.9 1.7 0.4 setosa 2
7 4.6 3.4 1.4 0.3 setosa 1
8 5 3.4 1.5 0.2 setosa 1
9 4.4 2.9 1.4 0.2 setosa 1
10 4.9 3.1 1.5 0.1 setosa 1
# … with 140 more rows
Altering
Reverse 0s and 1s in a vector
# Swap the two codes of `vs`: 1 becomes 0 and 0 becomes 1. A value matching
# neither condition would become NA, because case_when() has no fallback here.
mtcars %>%
  mutate(vs = case_when(
    vs == 1 ~ 0,
    vs == 0 ~ 1
  ))
# A tibble: 32 × 11
mpg cyl disp hp drat wt qsec vs am gear carb
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 21 6 160 110 3.9 2.62 16.5 1 1 4 4
2 21 6 160 110 3.9 2.88 17.0 1 1 4 4
3 22.8 4 108 93 3.85 2.32 18.6 0 1 4 1
4 21.4 6 258 110 3.08 3.22 19.4 0 0 3 1
5 18.7 8 360 175 3.15 3.44 17.0 1 0 3 2
6 18.1 6 225 105 2.76 3.46 20.2 0 0 3 1
7 14.3 8 360 245 3.21 3.57 15.8 1 0 3 4
8 24.4 4 147. 62 3.69 3.19 20 0 0 4 2
9 22.8 4 141. 95 3.92 3.15 22.9 0 0 4 2
10 19.2 6 168. 123 3.92 3.44 18.3 0 0 4 4
# … with 22 more rows
Replace all particular values in a dataframe with another value
# Replace every 0 with the string "Zero". Inserting a string turns each
# column into character, as the <chr> types in the output show.
mtcars %>%
  mutate(across(everything(), ~ replace(.x, .x == 0, "Zero")))
# A tibble: 32 × 11
mpg cyl disp hp drat wt qsec vs am gear carb
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 21 6 160 110 3.9 2.62 16.46 Zero 1 4 4
2 21 6 160 110 3.9 2.875 17.02 Zero 1 4 4
3 22.8 4 108 93 3.85 2.32 18.61 1 1 4 1
4 21.4 6 258 110 3.08 3.215 19.44 1 Zero 3 1
5 18.7 8 360 175 3.15 3.44 17.02 Zero Zero 3 2
6 18.1 6 225 105 2.76 3.46 20.22 1 Zero 3 1
7 14.3 8 360 245 3.21 3.57 15.84 Zero Zero 3 4
8 24.4 4 146.7 62 3.69 3.19 20 1 Zero 4 2
9 22.8 4 140.8 95 3.92 3.15 22.9 1 Zero 4 2
10 19.2 6 167.6 123 3.92 3.44 18.3 1 Zero 4 4
# … with 22 more rows
Rounding to 1 digit in a single variable
# Round only `wt` to one decimal place; the other columns are untouched.
mtcars %>%
  mutate(wt = round(wt, 1))
# A tibble: 32 × 11
mpg cyl disp hp drat wt qsec vs am gear carb
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 21 6 160 110 3.9 2.6 16.5 0 1 4 4
2 21 6 160 110 3.9 2.9 17.0 0 1 4 4
3 22.8 4 108 93 3.85 2.3 18.6 1 1 4 1
4 21.4 6 258 110 3.08 3.2 19.4 1 0 3 1
5 18.7 8 360 175 3.15 3.4 17.0 0 0 3 2
6 18.1 6 225 105 2.76 3.5 20.2 1 0 3 1
7 14.3 8 360 245 3.21 3.6 15.8 0 0 3 4
8 24.4 4 147. 62 3.69 3.2 20 1 0 4 2
9 22.8 4 141. 95 3.92 3.1 22.9 1 0 4 2
10 19.2 6 168. 123 3.92 3.4 18.3 1 0 4 4
# … with 22 more rows
Rounding to 1 digit in all variables
# Round the numeric columns to one decimal place with janitor.
# NOTE(review): janitor's documentation suggests adorn_rounding() skips the
# first column by default -- confirm, and pass a tidyselect specification
# (e.g. everything()) if `mpg` must be rounded as well.
mtcars %>%
  adorn_rounding(digits = 1)
# A tibble: 32 × 11
mpg cyl disp hp drat wt qsec vs am gear carb
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 21 6 160 110 3.9 2.6 16.5 0 1 4 4
2 21 6 160 110 3.9 2.9 17 0 1 4 4
3 22.8 4 108 93 3.9 2.3 18.6 1 1 4 1
4 21.4 6 258 110 3.1 3.2 19.4 1 0 3 1
5 18.7 8 360 175 3.1 3.4 17 0 0 3 2
6 18.1 6 225 105 2.8 3.5 20.2 1 0 3 1
7 14.3 8 360 245 3.2 3.6 15.8 0 0 3 4
8 24.4 4 147. 62 3.7 3.2 20 1 0 4 2
9 22.8 4 141. 95 3.9 3.1 22.9 1 0 4 2
10 19.2 6 168. 123 3.9 3.4 18.3 1 0 4 4
# … with 22 more rows
Selecting/sorting
Select variables according to the number of distinct levels within that variable
# Keep only the columns that have at least 20 distinct values.
starwars %>%
  select(where(~ n_distinct(.x) >= 20))
# A tibble: 87 × 8
name height mass skin_color birth_year homeworld species films
<chr> <int> <dbl> <chr> <dbl> <chr> <chr> <lis>
1 Luke Skywalker 172 77 fair 19 Tatooine Human <chr>
2 C-3PO 167 75 gold 112 Tatooine Droid <chr>
3 R2-D2 96 32 white, bl… 33 Naboo Droid <chr>
4 Darth Vader 202 136 white 41.9 Tatooine Human <chr>
5 Leia Organa 150 49 light 19 Alderaan Human <chr>
6 Owen Lars 178 120 light 52 Tatooine Human <chr>
7 Beru Whitesun lars 165 75 light 47 Tatooine Human <chr>
8 R5-D4 97 32 white, red NA Tatooine Droid <chr>
9 Biggs Darklighter 183 84 light 24 Tatooine Human <chr>
10 Obi-Wan Kenobi 182 77 fair 57 Stewjon Human <chr>
# … with 77 more rows
Select variables with a certain degree of missingness
# Drop every column in which at least 5% of the values are NA.
airquality %>%
  discard(~ sum(is.na(.x)) / length(.x) * 100 >= 5)
# A tibble: 153 × 5
Solar.R Wind Temp Month Day
<int> <dbl> <int> <int> <int>
1 190 7.4 67 5 1
2 118 8 72 5 2
3 149 12.6 74 5 3
4 313 11.5 62 5 4
5 NA 14.3 56 5 5
6 NA 14.9 66 5 6
7 299 8.6 65 5 7
8 99 13.8 59 5 8
9 19 20.1 61 5 9
10 194 8.6 69 5 10
# … with 143 more rows
Selecting variables that contain a certain pattern
# Keep the columns whose name contains the literal text "color".
starwars %>%
  select(contains("color"))
# A tibble: 87 × 3
hair_color skin_color eye_color
<chr> <chr> <chr>
1 blond fair blue
2 <NA> gold yellow
3 <NA> white, blue red
4 none white yellow
5 brown light brown
6 brown, grey light blue
7 brown light blue
8 <NA> white, red red
9 black light brown
10 auburn, white fair blue-gray
# … with 77 more rows
Sort variables alphabetically
# Reorder the columns alphabetically: peek_vars() supplies the column names
# of the data being selected from, and sort() orders them.
mtcars %>%
  select(sort(peek_vars()))
# A tibble: 32 × 11
am carb cyl disp drat gear hp mpg qsec vs wt
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 4 6 160 3.9 4 110 21 16.5 0 2.62
2 1 4 6 160 3.9 4 110 21 17.0 0 2.88
3 1 1 4 108 3.85 4 93 22.8 18.6 1 2.32
4 0 1 6 258 3.08 3 110 21.4 19.4 1 3.22
5 0 2 8 360 3.15 3 175 18.7 17.0 0 3.44
6 0 1 6 225 2.76 3 105 18.1 20.2 1 3.46
7 0 4 8 360 3.21 3 245 14.3 15.8 0 3.57
8 0 2 4 147. 3.69 4 62 24.4 20 1 3.19
9 0 2 4 141. 3.92 4 95 22.8 22.9 1 3.15
10 0 4 6 168. 3.92 4 123 19.2 18.3 1 3.44
# … with 22 more rows