A family for lumping together levels that meet some criteria.

• fct_lump_min(): lumps levels that appear fewer than min times.

• fct_lump_prop(): lumps levels that appear in fewer than prop * n times.

• fct_lump_n(): lumps all levels except for the n most frequent (or least frequent if n < 0).

• fct_lump_lowfreq(): lumps together the least frequent levels, ensuring that "other" is still the smallest level.

fct_lump() exists primarily for historical reasons, as it automatically picks between these different methods depending on its arguments. We no longer recommend that you use it.

fct_lump(
f,
n,
prop,
w = NULL,
other_level = "Other",
ties.method = c("min", "average", "first", "last", "random", "max")
)

fct_lump_min(f, min, w = NULL, other_level = "Other")

fct_lump_prop(f, prop, w = NULL, other_level = "Other")

fct_lump_n(
f,
n,
w = NULL,
other_level = "Other",
ties.method = c("min", "average", "first", "last", "random", "max")
)

fct_lump_lowfreq(f, other_level = "Other")

## Arguments

f: A factor (or character vector).

n: Positive n preserves the most common n values. Negative n preserves the least common -n values. If there are ties, you will get at least abs(n) values.

prop: Positive prop lumps values which do not appear at least prop of the time. Negative prop lumps values that do not appear at most -prop of the time.

w: An optional numeric vector giving weights for frequency of each value (not level) in f.

other_level: Value of level used for "other" values. Always placed at end of levels.

ties.method: A character string specifying how ties are treated. See rank() for details.

min: Preserve levels that appear at least min number of times.

See also: fct_other() to convert specified levels to other.

## Examples

x <- factor(rep(LETTERS[1:9], times = c(40, 10, 5, 27, 1, 1, 1, 1, 1)))
x %>% table()
#> .
#>  A  B  C  D  E  F  G  H  I
#> 40 10  5 27  1  1  1  1  1
x %>% fct_lump_n(3) %>% table()
#> .
#>     A     B     D Other
#>    40    10    27    10
x %>% fct_lump_prop(0.10) %>% table()
#> .
#>     A     B     D Other
#>    40    10    27    10
x %>% fct_lump_min(5) %>% table()
#> .
#>     A     B     C     D Other
#>    40    10     5    27     5
x %>% fct_lump_lowfreq() %>% table()
#> .
#>     A     D Other
#>    40    27    20
x <- factor(letters[rpois(100, 5)])
x
#>   [1] h e c d c b f g c e f d f f e d d d c h f d g h f f d c b b f c h h e h h
#>  [38] c f e f d b e c b f e c h d b f e e f d c g f g d d e c a b d c g e f d f
#>  [75] f g g e c f b g g b g e g i e d g g d c d i d c b d
#> Levels: a b c d e f g h i
table(x)
#> x
#>  a  b  c  d  e  f  g  h  i
#>  1 10 15 19 14 18 13  8  2
table(fct_lump_lowfreq(x))
#>
#>     b     c     d     e     f     g     h Other
#>    10    15    19    14    18    13     8     3
# Use positive values to collapse the rarest
fct_lump_n(x, n = 3)
#>   [1] Other Other c     d     c     Other f     Other c     Other f     d
#>  [13] f     f     Other d     d     d     c     Other f     d     Other Other
#>  [25] f     f     d     c     Other Other f     c     Other Other Other Other
#>  [37] Other c     f     Other f     d     Other Other c     Other f     Other
#>  [49] c     Other d     Other f     Other Other f     d     c     Other f
#>  [61] Other d     d     Other c     Other Other d     c     Other Other f
#>  [73] d     f     f     Other Other Other c     f     Other Other Other Other
#>  [85] Other Other Other Other Other d     Other Other d     c     d     Other
#>  [97] d     c     Other d
#> Levels: c d f Other
fct_lump_prop(x, prop = 0.1)
#>   [1] Other e     c     d     c     Other f     g     c     e     f     d
#>  [13] f     f     e     d     d     d     c     Other f     d     g     Other
#>  [25] f     f     d     c     Other Other f     c     Other Other e     Other
#>  [37] Other c     f     e     f     d     Other e     c     Other f     e
#>  [49] c     Other d     Other f     e     e     f     d     c     g     f
#>  [61] g     d     d     e     c     Other Other d     c     g     e     f
#>  [73] d     f     f     g     g     e     c     f     Other g     g     Other
#>  [85] g     e     g     Other e     d     g     g     d     c     d     Other
#>  [97] d     c     Other d
#> Levels: c d e f g Other
# Use negative values to collapse the most common
fct_lump_n(x, n = -3)
#>   [1] h     Other Other Other Other Other Other Other Other Other Other Other
#>  [13] Other Other Other Other Other Other Other h     Other Other Other h
#>  [25] Other Other Other Other Other Other Other Other h     h     Other h
#>  [37] h     Other Other Other Other Other Other Other Other Other Other Other
#>  [49] Other h     Other Other Other Other Other Other Other Other Other Other
#>  [61] Other Other Other Other Other a     Other Other Other Other Other Other
#>  [73] Other Other Other Other Other Other Other Other Other Other Other Other
#>  [85] Other Other Other i     Other Other Other Other Other Other Other i
#>  [97] Other Other Other Other
#> Levels: a h i Other
fct_lump_prop(x, prop = -0.1)
#>   [1] h     Other Other Other Other b     Other Other Other Other Other Other
#>  [13] Other Other Other Other Other Other Other h     Other Other Other h
#>  [25] Other Other Other Other b     b     Other Other h     h     Other h
#>  [37] h     Other Other Other Other Other b     Other Other b     Other Other
#>  [49] Other h     Other b     Other Other Other Other Other Other Other Other
#>  [61] Other Other Other Other Other a     b     Other Other Other Other Other
#>  [73] Other Other Other Other Other Other Other Other b     Other Other b
#>  [85] Other Other Other i     Other Other Other Other Other Other Other i
#>  [97] Other Other b     Other
#> Levels: a b h i Other
# Use weighted frequencies
w <- c(rep(2, 50), rep(1, 50))
fct_lump_n(x, n = 5, w = w)
#>   [1] h     e     c     d     c     Other f     Other c     e     f     d
#>  [13] f     f     e     d     d     d     c     h     f     d     Other h
#>  [25] f     f     d     c     Other Other f     c     h     h     e     h
#>  [37] h     c     f     e     f     d     Other e     c     Other f     e
#>  [49] c     h     d     Other f     e     e     f     d     c     Other f
#>  [61] Other d     d     e     c     Other Other d     c     Other e     f
#>  [73] d     f     f     Other Other e     c     f     Other Other Other Other
#>  [85] Other e     Other Other e     d     Other Other d     c     d     Other
#>  [97] d     c     Other d
#> Levels: c d e f h Other
# Use ties.method to control how tied factors are collapsed
fct_lump_n(x, n = 6)
#>   [1] Other e     c     d     c     b     f     g     c     e     f     d
#>  [13] f     f     e     d     d     d     c     Other f     d     g     Other
#>  [25] f     f     d     c     b     b     f     c     Other Other e     Other
#>  [37] Other c     f     e     f     d     b     e     c     b     f     e
#>  [49] c     Other d     b     f     e     e     f     d     c     g     f
#>  [61] g     d     d     e     c     Other b     d     c     g     e     f
#>  [73] d     f     f     g     g     e     c     f     b     g     g     b
#>  [85] g     e     g     Other e     d     g     g     d     c     d     Other
#>  [97] d     c     b     d
#> Levels: b c d e f g Other
fct_lump_n(x, n = 6, ties.method = "max")
#>   [1] Other e     c     d     c     b     f     g     c     e     f     d
#>  [13] f     f     e     d     d     d     c     Other f     d     g     Other
#>  [25] f     f     d     c     b     b     f     c     Other Other e     Other
#>  [37] Other c     f     e     f     d     b     e     c     b     f     e
#>  [49] c     Other d     b     f     e     e     f     d     c     g     f
#>  [61] g     d     d     e     c     Other b     d     c     g     e     f
#>  [73] d     f     f     g     g     e     c     f     b     g     g     b
#>  [85] g     e     g     Other e     d     g     g     d     c     d     Other
#>  [97] d     c     b     d
#> Levels: b c d e f g Other
# Use fct_lump_min() to lump together all levels with fewer than `min` values
table(fct_lump_min(x, min = 10))
#>
#>     b     c     d     e     f     g Other
#>    10    15    19    14    18    13    11
table(fct_lump_min(x, min = 15))
#>
#>     c     d     f Other
#>    15    19    18    48