In R, prioritize n-order minimum value when sorting multiple numeric columns by group

89 views Asked by At

I'm currently using a code to order my data in R from minimum to maximum values within group.
Where multiple species share the same minimum value, the species with the lower second value(if exist) is assigned a lower rank order.

library(tidyverse)

species <- data.frame(
  species = c("dog", "dog", "cat", "cat", "fish", "fish", "lion"),
  overall.percentage = c(12, 13, 20, 12, 20, 50, 12)
)

find_min <- function(x, rank) {
  if (rank > length(x)) return(-Inf)
  x[row_number(x) == rank]
}

rank <- species |> 
  summarise(
    min_1 = find_min(overall.percentage, 1L),
    min_2 = find_min(overall.percentage, 2L),
    .by = species
  ) |> 
  mutate(rank = row_number(pick(min_1, min_2))) |> 
  select(species, rank)

species |> 
  left_join(rank, join_by(species))

#>   species overall.percentage  rank
#>   <chr>                <dbl> <int>
#> 1 dog                     12     2
#> 2 dog                     13     2
#> 3 cat                     20     3
#> 4 cat                     12     3
#> 5 fish                    20     4
#> 6 fish                    50     4
#> 7 lion                    12     1

I would like to refactor the code so when multiple species share also the second/third/N minimum value, the species with the lower N+1 value is assigned a lower rank order.
And when two species have the same minimum values, the species with less columns will assign the lower rank.

So the output of the following:

species <- data.frame(
  species = c("dog", "dog", "dog", "cat", "cat", "cat", "lion", "lion", "Fish"),
  overall.percentage = c(11, 12, 14, 11, 12, 13, 11, 12, 20)
)

will be:

#>   species overall.percentage  rank
#>   <chr>                <dbl> <int>
#> 1 dog                     11     3
#> 2 dog                     12     3
#> 3 dog                     14     3
#> 4 cat                     11     2
#> 5 cat                     12     2
#> 6 cat                     13     2
#> 7 lion                    11     1
#> 8 lion                    12     1
#> 8 Fish                    20     4
3

There are 3 answers

0
one On

We can programmatically create N minimum value and then the rank.

Note that I am using find_min(rank,x) instead of find_min(x,rank)

library(dplyr)

n_min <- 3
values <- setNames(seq(n_min),paste0("min_",seq(n_min)))

find_min <- function(rank,x) {
  if (rank > length(x)) return(-Inf)
  x[row_number(x) == rank]
}

species %>%
  mutate(data.frame(lapply(values,find_min,overall.percentage)),.by=species) %>%
  arrange(pick(starts_with("min_"))) %>%
  select(-starts_with("min_")) %>%
  mutate(rank=cur_group_id(),.by=species)

  species overall.percentage rank
1    lion                 11    1
2    lion                 12    1
3     cat                 11    2
4     cat                 12    2
5     cat                 13    2
6     dog                 11    3
7     dog                 12    3
8     dog                 14    3
9    Fish                 20    4
0
jblood94 On

With data.table using transpose followed by setorder and a join.

x <- setDT(species)[,.(.(sort(overall.percentage))), species]
species[
  setorder(setDT(transpose(x[[2]]))[,species := x[[1]]])[,rank := .I],
  on = "species", rank := rank
][]
#>    species overall.percentage  rank
#>     <char>              <num> <int>
#> 1:     dog                 11     3
#> 2:     dog                 12     3
#> 3:     dog                 14     3
#> 4:     cat                 11     2
#> 5:     cat                 12     2
#> 6:     cat                 13     2
#> 7:    lion                 11     1
#> 8:    lion                 12     1
#> 9:    Fish                 20     4
0
Ben On

Just in case if helpful, I placed the data into wide form (this assumes that within species the values are already sorted from lowest to highest):

library(tidyverse)

df <- species %>%
  mutate(rn = row_number(), .by = species) %>%
  pivot_wider(id_cols = species, names_from = rn, values_from = overall.percentage)

  species   `1`   `2`   `3`
  <chr>   <dbl> <dbl> <dbl>
1 dog        11    12    14
2 cat        11    12    13
3 lion       11    12    NA
4 Fish       20    NA    NA

And then used order with na.last = FALSE to sequentially order with NA higher than all numeric values. Using arrange in dplyr does not have that option.

df[do.call(order, c(df[-1], na.last = FALSE)),] %>%
  mutate(rank = row_number()) %>%
  select(species, rank) %>%
  right_join(species)

  species  rank overall.percentage
  <chr>   <int>              <dbl>
1 lion        1                 11
2 lion        1                 12
3 cat         2                 11
4 cat         2                 12
5 cat         2                 13
6 dog         3                 11
7 dog         3                 12
8 dog         3                 14
9 Fish        4                 20