Lump together least/most common factor levels into "other"

fct_lump(f, n, prop, w = NULL, other_level = "Other",
ties.method = c("min", "average", "first", "last", "random", "max"))

fct_lump_min(f, min, w = NULL, other_level = "Other")

Arguments

f: A factor (or character vector).

n, prop: If both n and prop are missing, fct_lump lumps together the least frequent levels into "other", while ensuring that "other" is still the smallest level. It's particularly useful in conjunction with fct_inorder(). Positive n preserves the most common n values. Negative n preserves the least common -n values. If there are ties, you will get at least abs(n) values. Positive prop preserves values that appear at least prop of the time. Negative prop preserves values that appear at most -prop of the time.

w: An optional numeric vector giving weights for frequency of each value (not level) in f.

other_level: Value of level used for "other" values. Always placed at end of levels.

ties.method: A character string specifying how ties are treated. See rank() for details.

min: Preserves values that appear at least min number of times.

See also

fct_other() to convert specified levels to other.

Examples

x <- factor(rep(LETTERS[1:9], times = c(40, 10, 5, 27, 1, 1, 1, 1, 1)))
x %>% table()
#> .
#>  A  B  C  D  E  F  G  H  I
#> 40 10  5 27  1  1  1  1  1
x %>% fct_lump() %>% table()
#> .
#>     A     D Other
#>    40    27    20
x %>% fct_lump() %>% fct_inorder() %>% table()
#> .
#>     A Other     D
#>    40    20    27
x <- factor(letters[rpois(100, 5)])
x
#>   [1] c c d c f e g b c d g d h e c d c b f g c e f d f f e d d d c h f d g h f
#>  [38] f d c b b f c h h e h h c f e f d b e c b f e c h d b f e e f d c g f g d
#>  [75] d e c a b d c g e f d f f g g e c f b g g b g e g i
#> Levels: a b c d e f g h i
table(x)
#> x
#>  a  b  c  d  e  f  g  h  i
#>  1 10 17 17 14 19 13  8  1
table(fct_lump(x))
#>
#>     b     c     d     e     f     g     h Other
#>    10    17    17    14    19    13     8     2
# Use positive values to collapse the rarest
fct_lump(x, n = 3)
#>   [1] c     c     d     c     f     Other Other Other c     d     Other d
#>  [13] Other Other c     d     c     Other f     Other c     Other f     d
#>  [25] f     f     Other d     d     d     c     Other f     d     Other Other
#>  [37] f     f     d     c     Other Other f     c     Other Other Other Other
#>  [49] Other c     f     Other f     d     Other Other c     Other f     Other
#>  [61] c     Other d     Other f     Other Other f     d     c     Other f
#>  [73] Other d     d     Other c     Other Other d     c     Other Other f
#>  [85] d     f     f     Other Other Other c     f     Other Other Other Other
#>  [97] Other Other Other Other
#> Levels: c d f Other
fct_lump(x, prop = 0.1)
#>   [1] c     c     d     c     f     e     g     Other c     d     g     d
#>  [13] Other e     c     d     c     Other f     g     c     e     f     d
#>  [25] f     f     e     d     d     d     c     Other f     d     g     Other
#>  [37] f     f     d     c     Other Other f     c     Other Other e     Other
#>  [49] Other c     f     e     f     d     Other e     c     Other f     e
#>  [61] c     Other d     Other f     e     e     f     d     c     g     f
#>  [73] g     d     d     e     c     Other Other d     c     g     e     f
#>  [85] d     f     f     g     g     e     c     f     Other g     g     Other
#>  [97] g     e     g     Other
#> Levels: c d e f g Other
# Use negative values to collapse the most common
fct_lump(x, n = -3)
#>   [1] Other Other Other Other Other Other Other Other Other Other Other Other
#>  [13] h     Other Other Other Other Other Other Other Other Other Other Other
#>  [25] Other Other Other Other Other Other Other h     Other Other Other h
#>  [37] Other Other Other Other Other Other Other Other h     h     Other h
#>  [49] h     Other Other Other Other Other Other Other Other Other Other Other
#>  [61] Other h     Other Other Other Other Other Other Other Other Other Other
#>  [73] Other Other Other Other Other a     Other Other Other Other Other Other
#>  [85] Other Other Other Other Other Other Other Other Other Other Other Other
#>  [97] Other Other Other i
#> Levels: a h i Other
fct_lump(x, prop = -0.1)
#>   [1] Other Other Other Other Other Other Other b     Other Other Other Other
#>  [13] h     Other Other Other Other b     Other Other Other Other Other Other
#>  [25] Other Other Other Other Other Other Other h     Other Other Other h
#>  [37] Other Other Other Other b     b     Other Other h     h     Other h
#>  [49] h     Other Other Other Other Other b     Other Other b     Other Other
#>  [61] Other h     Other b     Other Other Other Other Other Other Other Other
#>  [73] Other Other Other Other Other a     b     Other Other Other Other Other
#>  [85] Other Other Other Other Other Other Other Other b     Other Other b
#>  [97] Other Other Other i
#> Levels: a b h i Other
# Use weighted frequencies
w <- c(rep(2, 50), rep(1, 50))
fct_lump(x, n = 5, w = w)
#>   [1] c     c     d     c     f     e     g     Other c     d     g     d
#>  [13] Other e     c     d     c     Other f     g     c     e     f     d
#>  [25] f     f     e     d     d     d     c     Other f     d     g     Other
#>  [37] f     f     d     c     Other Other f     c     Other Other e     Other
#>  [49] Other c     f     e     f     d     Other e     c     Other f     e
#>  [61] c     Other d     Other f     e     e     f     d     c     g     f
#>  [73] g     d     d     e     c     Other Other d     c     g     e     f
#>  [85] d     f     f     g     g     e     c     f     Other g     g     Other
#>  [97] g     e     g     Other
#> Levels: c d e f g Other
# Use ties.method to control how tied factors are collapsed
fct_lump(x, n = 6)
#>   [1] c     c     d     c     f     e     g     b     c     d     g     d
#>  [13] Other e     c     d     c     b     f     g     c     e     f     d
#>  [25] f     f     e     d     d     d     c     Other f     d     g     Other
#>  [37] f     f     d     c     b     b     f     c     Other Other e     Other
#>  [49] Other c     f     e     f     d     b     e     c     b     f     e
#>  [61] c     Other d     b     f     e     e     f     d     c     g     f
#>  [73] g     d     d     e     c     Other b     d     c     g     e     f
#>  [85] d     f     f     g     g     e     c     f     b     g     g     b
#>  [97] g     e     g     Other
#> Levels: b c d e f g Other
fct_lump(x, n = 6, ties.method = "max")
#>   [1] c     c     d     c     f     e     g     b     c     d     g     d
#>  [13] Other e     c     d     c     b     f     g     c     e     f     d
#>  [25] f     f     e     d     d     d     c     Other f     d     g     Other
#>  [37] f     f     d     c     b     b     f     c     Other Other e     Other
#>  [49] Other c     f     e     f     d     b     e     c     b     f     e
#>  [61] c     Other d     b     f     e     e     f     d     c     g     f
#>  [73] g     d     d     e     c     Other b     d     c     g     e     f
#>  [85] d     f     f     g     g     e     c     f     b     g     g     b
#>  [97] g     e     g     Other
#> Levels: b c d e f g Other
x <- factor(letters[rpois(100, 5)])
fct_lump_min(x, min = 10)
#>   [1] e     d     Other Other d     c     d     Other d     c     Other d
#>  [13] c     d     d     f     f     d     e     d     Other d     d     e
#>  [25] f     f     Other f     e     Other d     f     f     e     c     f
#>  [37] e     f     c     d     f     e     Other e     c     Other f     c
#>  [49] d     c     c     f     e     c     d     c     Other e     Other Other
#>  [61] c     c     f     c     d     f     c     Other f     e     Other d
#>  [73] Other c     Other e     d     e     d     Other Other Other f     e
#>  [85] e     d     Other Other Other Other Other e     Other d     f     d
#>  [97] Other f     d     c
#> Levels: c d e f Other