A family of functions for lumping together levels that meet some criteria.
- fct_lump_min(): lumps levels that appear fewer than `min` times.
- fct_lump_prop(): lumps levels that appear in fewer than (or equal to) `prop * n` times.
- fct_lump_n(): lumps all levels except for the `n` most frequent (or least frequent if `n < 0`).
- fct_lump_lowfreq(): lumps together the least frequent levels, ensuring that "other" is still the smallest level.
fct_lump()
exists primarily for historical reasons, as it automatically
picks between these different methods depending on its arguments.
We no longer recommend that you use it.
Usage
fct_lump(
f,
n,
prop,
w = NULL,
other_level = "Other",
ties.method = c("min", "average", "first", "last", "random", "max")
)
fct_lump_min(f, min, w = NULL, other_level = "Other")
fct_lump_prop(f, prop, w = NULL, other_level = "Other")
fct_lump_n(
f,
n,
w = NULL,
other_level = "Other",
ties.method = c("min", "average", "first", "last", "random", "max")
)
fct_lump_lowfreq(f, w = NULL, other_level = "Other")
Arguments
- f
A factor (or character vector).
- n
Positive `n` preserves the most common `n` values. Negative `n` preserves the least common `-n` values. If there are ties, you will get at least `abs(n)` values.
- prop
Positive `prop` lumps values which do not appear at least `prop` of the time. Negative `prop` lumps values that do not appear at most `-prop` of the time.
- w
An optional numeric vector giving weights for frequency of each value (not level) in f.
- other_level
Value of level used for "other" values. Always placed at end of levels.
- ties.method
A character string specifying how ties are treated. See `rank()` for details.
- min
Preserve levels that appear at least `min` number of times.
See also
fct_other()
to convert specified levels to other.
Examples
x <- factor(rep(LETTERS[1:9], times = c(40, 10, 5, 27, 1, 1, 1, 1, 1)))
x %>% table()
#> .
#> A B C D E F G H I
#> 40 10 5 27 1 1 1 1 1
x %>%
fct_lump_n(3) %>%
table()
#> .
#> A B D Other
#> 40 10 27 10
x %>%
fct_lump_prop(0.10) %>%
table()
#> .
#> A B D Other
#> 40 10 27 10
x %>%
fct_lump_min(5) %>%
table()
#> .
#> A B C D Other
#> 40 10 5 27 5
x %>%
fct_lump_lowfreq() %>%
table()
#> .
#> A D Other
#> 40 27 20
x <- factor(letters[rpois(100, 5)])
x
#> [1] h e e d i a f e m a j i b d e e e f g g b g h a f c d c d e c e a h d
#> [36] h d d d e f f d g c d b f e e f h h a g c f c g d c e e e d f d c d i
#> [71] e e e d i d j e d i c e f f c c d g h f c b d d d e g e e
#> Levels: a b c d e f g h i j m
table(x)
#> x
#> a b c d e f g h i j m
#> 5 4 12 21 22 12 8 7 5 2 1
table(fct_lump_lowfreq(x))
#>
#> a b c d e f g h i Other
#> 5 4 12 21 22 12 8 7 5 3
# Use positive values to collapse the rarest
fct_lump_n(x, n = 3)
#> [1] Other e e d Other Other f e Other Other Other
#> [12] Other Other d e e e f Other Other Other Other
#> [23] Other Other f c d c d e c e Other
#> [34] Other d Other d d d e f f d Other
#> [45] c d Other f e e f Other Other Other Other
#> [56] c f c Other d c e e e d f
#> [67] d c d Other e e e d Other d Other
#> [78] e d Other c e f f c c d Other
#> [89] Other f c Other d d d e Other e e
#> Levels: c d e f Other
fct_lump_prop(x, prop = 0.1)
#> [1] Other e e d Other Other f e Other Other Other
#> [12] Other Other d e e e f Other Other Other Other
#> [23] Other Other f c d c d e c e Other
#> [34] Other d Other d d d e f f d Other
#> [45] c d Other f e e f Other Other Other Other
#> [56] c f c Other d c e e e d f
#> [67] d c d Other e e e d Other d Other
#> [78] e d Other c e f f c c d Other
#> [89] Other f c Other d d d e Other e e
#> Levels: c d e f Other
# Use negative values to collapse the most common
fct_lump_n(x, n = -3)
#> [1] Other Other Other Other Other Other Other Other m Other j
#> [12] Other b Other Other Other Other Other Other Other b Other
#> [23] Other Other Other Other Other Other Other Other Other Other Other
#> [34] Other Other Other Other Other Other Other Other Other Other Other
#> [45] Other Other b Other Other Other Other Other Other Other Other
#> [56] Other Other Other Other Other Other Other Other Other Other Other
#> [67] Other Other Other Other Other Other Other Other Other Other j
#> [78] Other Other Other Other Other Other Other Other Other Other Other
#> [89] Other Other Other b Other Other Other Other Other Other Other
#> Levels: b j m Other
fct_lump_prop(x, prop = -0.1)
#> [1] h Other Other Other i a Other Other m a j
#> [12] i b Other Other Other Other Other g g b g
#> [23] h a Other Other Other Other Other Other Other Other a
#> [34] h Other h Other Other Other Other Other Other Other g
#> [45] Other Other b Other Other Other Other h h a g
#> [56] Other Other Other g Other Other Other Other Other Other Other
#> [67] Other Other Other i Other Other Other Other i Other j
#> [78] Other Other i Other Other Other Other Other Other Other g
#> [89] h Other Other b Other Other Other Other g Other Other
#> Levels: a b g h i j m Other
# Use weighted frequencies
w <- c(rep(2, 50), rep(1, 50))
fct_lump_n(x, n = 5, w = w)
#> Error in fct_lump_n(x, n = 5, w = w): `w` must be the same length as `f` (99), not length 100.
# Use ties.method to control how tied factors are collapsed
fct_lump_n(x, n = 6)
#> [1] h e e d Other Other f e Other Other Other
#> [12] Other Other d e e e f g g Other g
#> [23] h Other f c d c d e c e Other
#> [34] h d h d d d e f f d g
#> [45] c d Other f e e f h h Other g
#> [56] c f c g d c e e e d f
#> [67] d c d Other e e e d Other d Other
#> [78] e d Other c e f f c c d g
#> [89] h f c Other d d d e g e e
#> Levels: c d e f g h Other
fct_lump_n(x, n = 6, ties.method = "max")
#> [1] h e e d Other Other f e Other Other Other
#> [12] Other Other d e e e f g g Other g
#> [23] h Other f c d c d e c e Other
#> [34] h d h d d d e f f d g
#> [45] c d Other f e e f h h Other g
#> [56] c f c g d c e e e d f
#> [67] d c d Other e e e d Other d Other
#> [78] e d Other c e f f c c d g
#> [89] h f c Other d d d e g e e
#> Levels: c d e f g h Other
# Use fct_lump_min() to lump together all levels with fewer than `min` values
table(fct_lump_min(x, min = 10))
#>
#> c d e f Other
#> 12 21 22 12 32
table(fct_lump_min(x, min = 15))
#>
#> d e Other
#> 21 22 56