Webscrapuję adres URL i otrzymuję zagnieżdżoną listę. W następnym kroku chcę przekonwertować to na dataframe. Ale moje rozwiązanie nie działa. Myślę, że problem polega na tym, że tabela ma nazwy i nie mogę ich usunąć przed konwersją. Mam nadzieję, że masz jakieś wskazówki.

# Use library() rather than require(): require() only returns FALSE on a
# missing package, so a failed load would surface later as a confusing
# "could not find function" error instead of failing fast here.
library(tidyverse)
library(rvest)

# Create a list that mimics what the webscrape returns (the actual scraping
# code is at the end of this post): each element is a 2-row data.frame whose
# first row holds header labels and second row the values.

# Build one fake scraped table for course `n`.
#   n      - course number, used to derive the "id_courseN"/"courseN" names.
#   values - numeric vector: participants followed by the five mark counts.
# Every column is c(label, value), so data.frame() coerces it to character —
# exactly the shape html_table() yields for these pages. Duplicate "courseN"
# names are deduplicated by data.frame(check.names = TRUE) to courseN.1, ...,
# matching the original hand-written objects.
make_item <- function(n, values) {
  id <- paste0("id_course", n)
  labels <- c("participants", paste0("mark", 1:5))
  cols <- c(
    setNames(list(c(id, id)), id),
    setNames(Map(c, labels, values), rep(paste0("course", n), length(labels)))
  )
  do.call(data.frame, cols)
}

item1 <- make_item(1, c(15, 1, 2, 3, 4, 5))
item2 <- make_item(2, c(30, 10, 8, 6, 4, 2))
item3 <- make_item(3, c(15, 2, 4, 5, 3, 1))
my.list <- list(item1, item2, item3)

#create dataframe, but the result is not what I want
# (all three row-binders below stack the 2-row tables on top of each other;
# because every item has different column names — id_course1 vs id_course2 —
# fill=TRUE / bind_rows pad with NA instead of aligning the values)
require(data.table)
data.table::rbindlist(my.list, fill=TRUE)
dplyr::bind_rows(my.list)
dplyr::bind_rows(unname(my.list))

# try to use only the second row of the table, but the result is not what I want
# ([[ with index 2 extracts the second COLUMN of each data.frame, not the
# second row, so only the "participants" column is combined here)
do.call("cbind", lapply(my.list, "[[", 2) )
do.call("rbind", lapply(my.list, "[[", 2) )
lapply(my.list, "[[", 2) %>% dplyr::bind_rows

#at the end I want a table that looks like this
# (one row per course; everything is character, matching the scraped tables)
target_header <- c("id_course1", "participants",
                   "mark1", "mark2", "mark3", "mark4", "mark5")
course_rows <- rbind(
  c("id_course1", 15, 1, 2, 3, 4, 5),
  c("id_course2", 30, 10, 8, 6, 4, 2),
  c("id_course3", 15, 2, 4, 5, 3, 1)
)
df_what_i_want <- as.data.frame(course_rows, stringsAsFactors = FALSE)
rownames(df_what_i_want) <- NULL
colnames(df_what_i_want) <- target_header


# scrape the website
url <- "https://www.fernuni-hagen.de/wirtschaftswissenschaft/studium/klausurstatistik.shtml"
# Grab every <table> nested inside an <li> and parse each one into a
# data.frame; the result is a list of 2-row tables like my.list above.
# fill = TRUE pads ragged rows so malformed tables still parse.
courses_list <- read_html(url) %>%
  html_nodes("li") %>%
  html_nodes("table") %>%
  html_table(fill = TRUE) 
2
Alexander 26 marzec 2021, 12:07

2 odpowiedzi

Najlepsza odpowiedź

To również zadziała:

library(janitor)
library(tidyverse)
# For each scraped table: promote its first row to column names, then rename
# every column after the FIRST item's header row so all pieces share one
# schema and map_dfr can stack them into a single data.frame.
map_dfr(my.list, function(item) {
  item %>%
    as.data.frame() %>%
    janitor::row_to_names(1) %>%
    setNames(my.list[[1]][1, ])
})

  id_course1 participants mark1 mark2 mark3 mark4 mark5
1 id_course1           15     1     2     3     4     5
2 id_course2           30    10     8     6     4     2
3 id_course3           15     2     4     5     3     1
1
AnilGoyal 26 marzec 2021, 10:05

Myślę, że byłoby lepiej, gdybyś poprawił dane już na etapie samego scrapowania. Spróbuj tego:

library(rvest)

url <- "https://www.fernuni-hagen.de/wirtschaftswissenschaft/studium/klausurstatistik.shtml"

# Clean up during the scrape itself: for every parsed table, use its first
# row as the column names and drop that row, then row-bind everything.
# Left-assignment replaces the trailing `->` (right assignment at the end of
# a long pipe is easy to miss and discouraged by the tidyverse style guide).
result <- read_html(url) %>%
  html_nodes("li") %>%
  html_nodes("table") %>%
  head %>% #remove this later — limits the run to the first 6 tables while testing
  html_table(fill = TRUE)  %>%
  purrr::map_df(~.x %>% setNames(.[1, ]) %>% slice(-1))

result
1
Ronak Shah 26 marzec 2021, 09:21