Mam następujący zestaw danych z identyfikatorami i wartością liczbową na identyfikator. Chcę pokazać 10 największych identyfikatorów i zgrupować resztę w kategorii „Inne”. Zbadałem to pytanie/...

1
Andres Mora 24 czerwiec 2021, 17:23

5 odpowiedzi

Najlepsza odpowiedź

Możesz zrobić coś takiego, jeśli dobrze rozumiem.

Posortuj kolumnę Value, zachowaj identyfikator dla pierwszej dziesiątki, a resztę nazwij „Inne”. Potem po prostu zsumuj ten nowy identyfikator.

To pokazuje pierwszą dziesiątkę plus "inne" pole. Z oczekiwanych wyników możesz zamiast tego chcieć n = 9.

library(dplyr)

n <- 10

df %>% 
  arrange(desc(Value)) %>% 
  mutate(ID2 = if_else(row_number() <= n, ID, "Other"),
         SortOrder = if_else(row_number() <= n, row_number(), as.integer(n + 1))) %>% 
  group_by(ID2, SortOrder) %>%
  summarize(Value = sum(Value), .groups = "drop") %>% 
  arrange(SortOrder) %>% 
  select(ID = ID2, Value)

# # A tibble: 11 x 2
#    ID             Value
#    <chr>          <dbl>
#  1 63366849    42084408
#  2 63475529    21879648
#  3 6774736     21321888
#  4 91274582    17393328
#  5 63445915    15215002
#  6 1097911044   9926442
#  7 1095812405   8971332
#  8 1097910557   5388376
#  9 37548920     5114854
# 10 1005153076   5093730
# 11 Other      148494118
2
Adam 24 czerwiec 2021, 14:45

Aby uzyskać 10 największych identyfikatorów (na podstawie wartości, jak przypuszczam):

top10 <- df %>%
   arrange(desc(Value)) %>%
   slice_head(n = 10)

W przypadku kategorii „Inne” chcesz utworzyć nową kolumnę, czy je wyodrębnić? (Lub utworzyć grupę, tak jak robi to group_by() z dplyr?

-2
MonJeanJean 24 czerwiec 2021, 14:30

Nieco czystsza opcja przy użyciu pakietu forcats:

df %>%
  mutate(ID = fct_lump_n(factor(ID), n = 9, w = Value, other_level = "OTHERS")) %>%
  group_by(ID) %>%
  summarize(Value = sum(Value)) %>%
  arrange(ID == "OTHERS", desc(Value))

# A tibble: 10 × 2
# ID    Value
# <fct> <dbl>
# 63366849  42084408
# 63475529  21879648
# 6774736   21321888
# 91274582  17393328
# 63445915  15215002
# 1097911044    9926442
# 1095812405    8971332
# 1097910557    5388376
# 37548920  5114854
# OTHERS    153587848
4
Robin Gertenbach 24 czerwiec 2021, 14:55

Oto rozwiązanie z dplyr:

library(dplyr)

arrange(df, desc(Value)) %>%
  split(c(rep("top", 10), rep("rest", nrow(.) - 10))) %>%
  (\(s) bind_rows(s$top, data.frame(ID = "Other", Value = sum(s$rest$Value))))

Zwroty:

# A tibble: 11 x 2
   ID             Value
   <chr>          <dbl>
 1 63366849    42084408
 2 63475529    21879648
 3 6774736     21321888
 4 91274582    17393328
 5 63445915    15215002
 6 1097911044   9926442
 7 1095812405   8971332
 8 1097910557   5388376
 9 37548920     5114854
10 1005153076   5093730
11 Other      148494118

(Wykorzystane dane:)

df <- structure(list(ID = c("63366849", "63475529", "6774736", "91274582", "63445915", "1097911044", "1095812405", "1097910557", "37548920", "1005153076", "13513021", "51991938", "1010090155", "91265898", "91237574", "91344448", "1095956598", "28065538", "63341531", "63335642", "5637749", "17419836", "5567029", "5651301", "37839500", "63450446", "28424298", "29264885", "63328148", "63562603", "51702988", "7416450", "1005234045", "91520220", "91159937", "13801492", "9260536", "37722978", "63355924", "1098711718", "63443769", "60306461", "28485149", "2151526", "2127233", "1096063398", "27948572", "13834945", "27938888", "91179848", "41678031", "37837953", "91233367", "5557908", "28012681", "33446782", "91243709", "1007790961", "1095826946", "1095926858", "20189860", "24049724", "27903426", "28133395", "41446577", "37319481", "2033552", "27950302", "91200190", "91205127", "9527919", "1095303440", "13826841", "91218008", "63280472", "24074310", "37805433", "13801074", "28037341", "63353740", "1095793583", "1102371309", "1116493966", "2127316", "2153230", "37316948", "51778219", "5503745", "1099735096", "1222254600", "27786180", "2162536", "27964677", "2020408", "91349686", "37514200", "30208806", "1102351224", "28354311", "27929976", "63523710", "13838438", "28296329", "28410262", "28494063", "5721778", "91275248", "91353706", "37751608", "1097610166", "1098080648", "13810090", "63444976", "1221464992", "1097497475", "91100475", "1095312933", "28131889", "63506738", "7922341", "28238547", "27985819", "27958501", "23912889", "5707630", "1102388693", "24028654", "28128637", "5554160", "63369230", "36521093", "91525097", "27929902", "63441041", "1097919703", "40984557", "1097129811", "13843871", "63301433", "28236969", "37831774", "1000942168", "1005220322", "1098661759", "1095843775", "28399609", "37211106", "5683927", "28254346", "91268076", "28295325", "28496274", "13820294", "28354968", "37793321", "63495244", "63299126", "13849979", "63357743", "60262321", "1098753025", "91435400", "2195064", "63345546", "63251135", "13827671", "2007126", "40378506", "1097121371", "37828831", "5707059", "63324477", "1102644458", "37796841", "28334234", "28010824", "28495363", "27929493", "5558783", "51974763", "28296333", "63489129", "1102385538", "13814150", "13829893", "28377189", "91342964", "164497637", "52326169", "63286143", "1099738639", "63503298", "1097913686", "91070753", "37652636", "60352132"), Value = c(42084408, 21879648, 21321888, 17393328, 15215002, 9926442, 8971332, 5388376, 5114854, 5093730, 5004621, 3696756, 3424872, 3347432, 3337895, 3105759, 3104409, 3056238, 2748870, 2738808, 2662730, 2555784, 2268480, 2126700, 2051811, 2035200, 2025495, 1962000, 1926012, 1829736, 1822980, 1657659, 1574901, 1546320, 1539355, 1480530, 1456140, 1372302, 1372302, 1367235, 1350330, 1330560, 1327761, 1289520, 1265556, 1258350, 1245060, 1228770, 1228545, 1209705, 1196100, 1180533, 1175274, 1170286, 1144792, 1106820, 1106820, 1080621, 1080621, 1080621, 1080621, 1080621, 1080621, 1080621, 1080621, 1080264, 1067400, 1067400, 1003000, 993510, 992520, 988560, 957408, 957408, 922350, 889893, 888300, 883800, 855150, 835920, 833481, 833481, 833481, 833481, 833481, 833481, 833481, 833481, 823458, 775300, 741420, 734760, 734760, 729882, 672660, 666740, 631020, 621000, 621000, 600651, 598050, 553410, 553410, 553410, 553410, 553410, 553410, 553410, 552690, 503340, 468120, 439800, 416040, 415136, 403235, 398700, 383629, 383629, 383629, 367380, 364941, 360207, 360088, 346680, 339582, 339201, 339201, 339201, 339201, 303120, 300450, 289080, 282042, 238080, 234060, 219531, 211300, 195447, 195447, 192960, 192960, 169000, 169000, 169000, 163868, 163868, 156540, 143460, 136080, 132048, 127818, 127818, 124920, 124920, 124920, 124920, 105120, 93690, 93690, 77580, 66024, 64650, 62460, 62460, 62376, 58800, 58338, 57240, 50524, 49260, 49260, 48240, 46528, 44100, 43110, 42930, 41640, 40755, 38715, 37050, 35685, 33012, 30687, 28620, 28620, 24678, 24570, 15189, 15120, 15120, 14504, 14310, 11424, 6000, 5940, 5428)), row.names = c(NA, -196L), class = c("tbl_df", "tbl", "data.frame"))
1
ktiu 24 czerwiec 2021, 15:28

Jeszcze jeden schludny sposób

library(tidyverse)
df %>%
  split(dense_rank(-.$Value) >= 10) %>% 
  map_if(c(F, T), ~ .x %>% summarise(ID = 'OTHERS',
                                            Value = sum(Value))) %>%
  map_dfr(~.x)

# A tibble: 10 x 2
   ID             Value
   <chr>          <dbl>
 1 63366849    42084408
 2 63475529    21879648
 3 6774736     21321888
 4 91274582    17393328
 5 63445915    15215002
 6 1097911044   9926442
 7 1095812405   8971332
 8 1097910557   5388376
 9 37548920     5114854
10 OTHERS     153587848
0
AnilGoyal 24 czerwiec 2021, 15:54