如何在抓取 HTML 时将网站上的值添加到已有表格中
这是我需要的数据:
我已经将表导入到 R 中:
library(tidyverse)
library(rvest)
# Read the Meteoritical Bulletin listing page for ungrouped achondrites.
webpage <- read_html("https://www.lpi.usra.edu/meteor/metbull.PHP?sea=%2A&sfor=names&ants=&nwas=&falls=&valids=&stype=contains&lrec=200&map=ge&browse=&country=All&srt=name&categ=Ungrouped+achondrites&mblist=All&rect=&phot=&strewn=&snew=0&pnt=normal%20table&dr=&page=0")

# Parse the 5th <table> on the page (the results table).
# fill = TRUE pads ragged rows so html_table() does not error.
tbls_ls <- webpage %>%
  html_nodes("table") %>%
  .[5] %>%
  html_table(fill = TRUE)

# as.tibble() is deprecated (tibble >= 2.0); use as_tibble() instead.
data <- as_tibble(tbls_ls[[1]])
然而,我还需要在表格中添加一件事。对于某些陨石,有可用的氧同位素值。单击“地块”部分下的陨石名称时,可以看到这一点。单击该图时,我们将被重定向到具有三个同位素值的页面。我想要做的是在我的表中添加三列,包含每个陨石各自的同位素值。我尝试分别为每个“情节”部分编写代码,但我觉得可能有一个更优雅的解决方案。
解决方法
如果您需要同位素数据,可以先抓取不含同位素的表格,然后模仿页面在选择“Oxygen isotopes”时发出的 POST 请求获取同位素表;再按 Name 列对这两张表做左连接(left join)。由于同一陨石可能有多条更新记录(Change values),连接结果的行数会多于左表(无同位素表),但这与您描述的方法中看到的结果是一致的——在图中,同位素值以逗号分隔的列表形式显示,而不是拆分成多行。
我选择更具选择性的 css 选择器来最初定位感兴趣的特定表,而不是索引到列表中。
我使用 write_excel_csv
在写出时保留标头的字符编码(我从 @stefan 得到的想法)。
在写出(子集/选择等)之前,您可以从 joint_table
的输出中删除不需要的列。
r
library(dplyr)   # BUG FIX: was library(dyplr) -- misspelled, would fail to load
library(httr)
library(rvest)
library(readr)
library(magrittr)
library(stringr)

# Scrape the "Normal table" listing (no isotope columns).
webpage <- read_html("https://www.lpi.usra.edu/meteor/metbull.php?sea=%2A&sfor=names&ants=&nwas=&falls=&valids=&stype=contains&lrec=200&map=ge&browse=&country=All&srt=name&categ=Ungrouped+achondrites&mblist=All&rect=&phot=&strewn=&snew=0&pnt=Normal%20table&dr=&page=0")
no_isotopes <- webpage %>%
  html_node("#maintable") %>%
  html_table(fill = TRUE)

# Form fields mimicking the POST the site sends when "Oxygen isotopes"
# is selected as the display option.
data <- list(
  "sfor" = "names", "stype" = "contains", "country" = "All",
  "categ" = "Ungrouped achondrites", "page" = "0", "map" = "ge",
  "srt" = "name", "lrec" = "200", "pnt" = "Oxygen isotopes",
  "mblist" = "All", "snew" = "0", "sea" = "*"
)
r <- httr::POST(url = "https://www.lpi.usra.edu/meteor/metbull.php", body = data)

# Parse the isotope table out of the POST response body.
isotopes <- content(r, "text") %>%
  read_html(encoding = "UTF-8") %>%
  html_node("#maintable") %>%
  html_table(fill = TRUE)

# Left join on Name; rows fan out where a meteorite has several
# isotope records (matches what the site shows).
joint_table <- dplyr::left_join(no_isotopes, isotopes, by = "Name", copy = FALSE)

# write_excel_csv preserves header character encoding for Excel.
# readr >= 1.4 uses `file =` (the `path =` argument is deprecated).
write_excel_csv(x = joint_table, file = "joint.csv", col_names = TRUE, na = "")
示例输出:
编辑:
根据您在评论中的要求添加来自其他网址的附加信息。我必须动态确定要选择哪个表号,以及处理没有表的情况。
library(tidyverse)
#> Warning: package 'tibble' was built under R version 4.0.3
#> Warning: package 'forcats' was built under R version 4.0.3
library(httr)
#> Warning: package 'httr' was built under R version 4.0.3
library(rvest)
#> Loading required package: xml2
#> Warning: package 'xml2' was built under R version 4.0.3
#>
#> Attaching package: 'rvest'
#> The following object is masked from 'package:purrr':
#>
#> pluck
#> The following object is masked from 'package:readr':
#>
#> guess_encoding
library(readr)
library(furrr)
# Fetch one meteorite detail page and return its info table (the table in
# the row following the "Data from:" label) transposed to a one-row
# data frame: attribute names become columns, values become the row.
get_table <- function(url) {
  page <- read_html(url)

  # Text of the left-hand column cells of the main table.
  left_cells <- page %>%
    html_nodes("#maintable tr > .inside:nth-child(odd)") %>%
    html_text()

  # The info table lives in the row right after the "Data from:" row,
  # so the 1-based row index is the match position plus one.
  row_index <- match(TRUE, stringr::str_detect(left_cells, "Data from:")) + 1

  info <- page %>%
    html_node(paste0("#maintable tr:nth-of-type(", row_index, ") table")) %>%
    html_table() %>%
    as_tibble()

  # Transpose: column 1 holds attribute names, column 2 holds values.
  # https://www.nesono.com/node/456 ; https://stackoverflow.com/a/7970267/6241235
  set_names(data.frame(t(info[, -1]), row.names = c()), t(info[, 1]))
}
start_url <- "https://www.lpi.usra.edu/meteor/metbull.php?sea=%2A&sfor=names&ants=&nwas=&falls=&valids=&stype=contains&lrec=200&map=ge&browse=&country=All&srt=name&categ=Ungrouped+achondrites&mblist=All&rect=&phot=&strewn=&snew=0&pnt=Normal%20table&dr=&page=0"
base <- "https://www.lpi.usra.edu"

webpage <- read_html(start_url)

# Main listing table (no isotope columns).
no_isotopes <- webpage %>%
  html_node("#maintable") %>%
  html_table(fill = TRUE)

# Form fields mimicking the site's "Oxygen isotopes" POST request.
data <- list(
  "sfor" = "names", "stype" = "contains", "country" = "All",
  "categ" = "Ungrouped achondrites", "page" = "0", "map" = "ge",
  "srt" = "name", "lrec" = "200", "pnt" = "Oxygen isotopes",
  "mblist" = "All", "snew" = "0", "sea" = "*"
)

# BUG FIX: the POST call had lost `body = data`, and the response was never
# passed through content() before read_html() (cf. the first snippet above).
r <- httr::POST(url = "https://www.lpi.usra.edu/meteor/metbull.php", body = data)
isotopes <- content(r, "text") %>%
  read_html(encoding = "UTF-8") %>%
  html_node("#maintable") %>%
  html_table(fill = TRUE)

# BUG FIX: left_join was missing the right-hand table and the join key.
joint_table <- dplyr::left_join(no_isotopes, isotopes, by = "Name", copy = FALSE)

# Name and absolute detail-page link for every meteorite in column 1;
# spaces in hrefs are percent-encoded so read_html() accepts them.
lookups <- webpage %>%
  html_node("#maintable") %>%
  html_nodes("td:nth-of-type(1) a") %>%
  map_df(~ c(html_text(.), html_attr(., "href")) %>%
    set_names("Name", "Link")) %>%
  mutate(Link = paste0(base, gsub("\\s+", "%20", Link)))

# Placeholder row used when a detail page has no info table.
error_df <- tibble(
  `State/Prov/County:` = NA_character_, `Origin or pseudonym:` = NA_character_,
  `Date:` = NA_character_, `Latitude:` = NA_character_,
  `Longitude:` = NA_character_, `Mass (g):` = NA_character_,
  `Pieces:` = NA_character_, `Class:` = NA_character_,
  `Shock stage:` = NA_character_, `Fayalite (mol%):` = NA_character_,
  `Ferrosilite (mol%):` = NA_character_, `Wollastonite (mol%):` = NA_character_,
  `Magnetic suscept.:` = NA_character_, `Classifier:` = NA_character_,
  `Type spec mass (g):` = NA_character_, `Type spec location:` = NA_character_,
  `Main mass:` = NA_character_, `Finder:` = NA_character_,
  `Comments:` = NA_character_
)

# Scrape every detail page in parallel; fall back to error_df on failure.
no_cores <- future::availableCores() - 1
future::plan(future::multisession, workers = no_cores)
df <- furrr::future_map_dfr(
  lookups$Link,
  ~ tryCatch(get_table(.x), error = function(e) error_df)
)

# Drop the trailing ":" from the transposed attribute names.
colnames(df) <- sub(":", "", colnames(df))

# BUG FIX: both gsub() calls below had lost their replacement argument "".
df2 <- df %>%
  mutate(
    `Mass (g)` = gsub(",", "", `Mass (g)`),  # strip thousands separators
    across(c(`Mass (g)`, `Magnetic suscept.`), as.numeric)
  )

if (nrow(df2) == nrow(no_isotopes)) {
  additional_info <- cbind(lookups, df2)
  # Strip the " **" marker so names match additional_info for the join.
  joint_table$Name <- gsub(" \\*\\*", "", joint_table$Name)
  final_table <- dplyr::left_join(joint_table, additional_info, copy = FALSE)
  write_excel_csv(x = final_table, file = "joint.csv", na = "")
}
由 reprex package (v0.3.0) 于 2021 年 2 月 27 日创建
注意
OP 由于某种原因在查找变量方面存在问题,所以这里是我写的一个对他们有用的替代方法:
# Alternative lookups construction for the OP.
# BUG FIX: the original had an extra closing parenthesis before
# `%>% as_tibble()`, which made the snippet a syntax error.
lookups <- map_df(
  webpage %>% html_node("#maintable") %>% html_nodes("td:nth-of-type(1) a"),
  ~ data.frame(
    Name = .x %>% html_text(),
    Link = paste0(base, .x %>% html_attr("href"))
  )
) %>% as_tibble()
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。