
R function loops over the same data in a web scraper

How to fix an R function that loops over the same data in a web scraper

Here is the program I wrote:

    library(rvest)
    library(RCurl)
    library(XML)
    library(stringr)


    #Getting the number of Page
    getPageNumber <- function(URL){
      parsedDocument = read_html(URL)
      Sort1 <- html_nodes(parsedDocument,'div')
      Sort2 <- Sort1[which(html_attr(Sort1,"class") == "pageNumbers al-pageNumbers")] 
      P <- str_count(html_text(Sort2),pattern = " \\d+\r\n")
      return(ifelse(length(P) == 0, 0, max(P)))
    }


    #Getting all articles based off of their DOI
    getAllArticles <-function(URL){
      parsedDocument = read_html(URL)
      Sort1 <- html_nodes(parsedDocument,'div')
      Sort2 <-  Sort1[which(html_attr(Sort1,"class") == "al-citation-list")]
      ArticleDOInumber = trimws(gsub(".*10.1093/dnares/","",html_text(Sort2)))
      URL3 <- "https://doi.org/10.1093/dnares/"
      URL4 <- paste(URL3,ArticleDOInumber,sep = "")
      return(URL4)
    }


    Title <- function(parsedDocument){
      Sort1 <- html_nodes(parsedDocument,'h1')
      Title <- gsub("<h1>\\n|\\n</h1>", "", Sort1)
      return(Title)
    }


    #main function with input as parameter year
    findURL <- function(year_chosen){
      if(year_chosen >= 1994){
      noYearURL = glue::glue("https://academic.oup.com/dnaresearch/search-results?rg_IssuePublicationDate=01%2F01%2F{year_chosen}%20TO%2012%2F31%2F{year_chosen}")
      pagesURl = "&fl_SiteID=5275&startpage="
      URL = paste(noYearURL,pagesURl,sep = "")
      #URL is working with parameter year_chosen
      Page <- getPageNumber(URL)
      

      Page2 <- 0
      while(Page < Page2 | Page != Page2){
        Page <- Page2
        URL3 <- paste(URL,Page-1,sep = "")
        Page2 <- getPageNumber(URL3)    
      }
      R_Data <- data.frame()
      for(i in 1:Page){ #0:Page-1
        URL2 <- getAllArticles(paste(URL,i,sep = ""))
        for(j in 1:(length(URL2))){
          parsedDocument <- read_html(URL2[j])
          print(URL2[j])
          R <- data.frame("Title" = Title(parsedDocument),stringsAsFactors = FALSE)
          #R <- data.frame("Title" = Title(parsedDocument),stringsAsFactors = FALSE)
          R_Data <- rbind(R_Data,R)
        } 
      }
      paste(URL2)
      suppressWarnings(write.csv(R_Data,"DNAresearch.csv",row.names = FALSE,sep = "\t"))
      #return(R_Data)
      } else {
        print("The Year you provide is out of range,this journal only contain articles from 2005 to present")
      }
    }

    findURL(2003)

My code outputs the following:

[1] "https://doi.org/10.1093/dnares/10.6.249"
[1] "https://doi.org/10.1093/dnares/10.6.263"
[1] "https://doi.org/10.1093/dnares/10.6.277"
[1] "https://doi.org/10.1093/dnares/10.6.229"
[1] "https://doi.org/10.1093/dnares/10.6.239"
[1] "https://doi.org/10.1093/dnares/10.6.287"
[1] "https://doi.org/10.1093/dnares/10.5.221"
[1] "https://doi.org/10.1093/dnares/10.5.203"
[1] "https://doi.org/10.1093/dnares/10.5.213"
[1] "https://doi.org/10.1093/dnares/10.4.137"
[1] "https://doi.org/10.1093/dnares/10.4.147"
[1] "https://doi.org/10.1093/dnares/10.4.167"
[1] "https://doi.org/10.1093/dnares/10.4.181"
[1] "https://doi.org/10.1093/dnares/10.4.155"
[1] "https://doi.org/10.1093/dnares/10.3.115"
[1] "https://doi.org/10.1093/dnares/10.3.85"
[1] "https://doi.org/10.1093/dnares/10.3.123"
[1] "https://doi.org/10.1093/dnares/10.3.129"
[1] "https://doi.org/10.1093/dnares/10.3.97"
[1] "https://doi.org/10.1093/dnares/10.2.59"
[1] "https://doi.org/10.1093/dnares/10.6.249"
[1] "https://doi.org/10.1093/dnares/10.6.263"

I am trying to scrape a journal with the year as a parameter. I have scraped one page, but when the loop should move on to the next page, it just goes back to the top and loops over the same data. My code should be correct, and I don't understand why this happens. Thanks in advance.

Solution

It is not reading the same url. It is that you are selecting the wrong nodes, which happens to produce duplicated information. As I mentioned in your previous question, you need to rework your Title function. The Title rewrite below extracts the actual article title based on a class name and a single-node match.

Note the removal of your sep argument: write.csv ignores sep and warns when it is set, which is what your suppressWarnings call was masking. Note also that the full script below paginates with &page= rather than your &startpage=. There are some other areas of the code that look like they could be simplified in terms of logic.
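
A minimal sketch of the write.csv behaviour behind that change, using a throwaway data frame:

df <- data.frame(Title = c("a", "b"))
# write.csv() hard-codes comma separation; an attempt to pass sep is
# ignored with a warning, which is what suppressWarnings() was masking.
write.csv(df, tempfile(fileext = ".csv"), row.names = FALSE, sep = "\t")
#> Warning message: attempt to set 'sep' ignored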


Title function:

Title <- function(parsedDocument) {
  Title <- parsedDocument %>%
    html_node(".article-title-main") %>%
    html_text() %>%
    gsub("\\r\\n\\s+","",.) %>%
    trimws(.)
  return(Title)
}
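
A quick usage sketch, assuming one of the article pages printed in the question's output still resolves; it also shows why the original h1 approach could pick up duplicated text:

library(rvest)

page <- read_html("https://doi.org/10.1093/dnares/10.6.249")
Title(page)                    # one cleaned article title
length(html_nodes(page, "h1")) # the original selector can match several nodes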

R:

library(rvest)
library(XML)
library(stringr)


# Getting the number of Page
getPageNumber <- function(URL) {
  # print(URL)
  parsedDocument <- read_html(URL)
  Sort1 <- html_nodes(parsedDocument,"div")
  Sort2 <- Sort1[which(html_attr(Sort1,"class") == "pagination al-pagination")]
  P <- str_count(html_text(Sort2),pattern = " \\d+\r\n")
  return(ifelse(length(P) == 0, 0, max(P)))
}

# Getting all articles based off of their DOI
getAllArticles <- function(URL) {
  print(URL)
  parsedDocument <- read_html(URL)
  Sort1 <- html_nodes(parsedDocument, "div")
  Sort2 <- Sort1[which(html_attr(Sort1, "class") == "al-citation-list")]
  ArticleDOInumber <- trimws(gsub(".*10.1093/dnares/", "", html_text(Sort2)))
  URL3 <- "https://doi.org/10.1093/dnares/"
  URL4 <- paste(URL3,ArticleDOInumber,sep = "")
  return(URL4)
}


Title <- function(parsedDocument) {
  Title <- parsedDocument %>%
    html_node(".article-title-main") %>%
    html_text() %>%
    gsub("\\r\\n\\s+",.) %>%
    trimws(.)
  return(Title)
}


# main function with input as parameter year
findURL <- function(year_chosen) {
  if (year_chosen >= 1994) {
    noYearURL <- glue::glue("https://academic.oup.com/dnaresearch/search-results?rg_IssuePublicationDate=01%2F01%2F{year_chosen}%20TO%2012%2F31%2F{year_chosen}")
    pagesURl <- "&fl_SiteID=5275&page="
    URL <- paste(noYearURL,pagesURl,sep = "")
    # URL is working with parameter year_chosen
    Page <- getPageNumber(URL)


    if (Page == 5) {
      Page2 <- 0
      while (Page < Page2 | Page != Page2) {
        Page <- Page2
        URL3 <- paste(URL,Page - 1,sep = "")
        Page2 <- getPageNumber(URL3)
      }
    }
    R_Data <- data.frame()
    for (i in 1:Page) {
      URL2 <- getAllArticles(paste(URL,i,sep = ""))
      for (j in 1:(length(URL2))) {
        parsedDocument <- read_html(URL2[j])
        #print(URL2[j])
        #print(Title(parsedDocument))
        R <- data.frame("Title" = Title(parsedDocument),stringsAsFactors = FALSE)
        #print(R)
        R_Data <- rbind(R_Data,R)
      }
    }
    write.csv(R_Data,"Group4.csv",row.names = FALSE)
  } else {
    print("The Year you provide is out of range,this journal only contain articles from 2005 to present")
  }
}

findURL(2003)
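
On the simplification point above, a hedged sketch of how the two helpers could select nodes directly by CSS class instead of filtering every div. The class names are the ones already used in the code above; whether they still match the live site is an assumption, and the one-second delay between requests is an addition, not part of the original fix:

library(rvest)
library(stringr)

getPageNumber <- function(URL) {
  parsedDocument <- read_html(URL)
  pagination <- html_node(parsedDocument, ".pagination.al-pagination")
  if (inherits(pagination, "xml_missing")) return(0)
  # one " 12\r\n"-style match per page link, so the match count is the page count
  str_count(html_text(pagination), pattern = " \\d+\r\n")
}

getAllArticles <- function(URL) {
  Sys.sleep(1) # be polite to the server between page requests
  parsedDocument <- read_html(URL)
  citations <- html_nodes(parsedDocument, ".al-citation-list")
  ArticleDOInumber <- trimws(gsub(".*10.1093/dnares/", "", html_text(citations)))
  paste0("https://doi.org/10.1093/dnares/", ArticleDOInumber)
}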
