如何抓取 50 个列表页,每个页面包含 10 个文章链接
我需要抓取 50 个列表页,每页包含 10 个文章链接。日期和作者直接从列表页上抓取;板块(vertical)和描述则需要访问每个文章链接后才能抓取。因此,在第一个列表页抓取完 10 个链接之后,我需要点击"下一页",如此循环,共 50 页。请帮我看看,下面是我的代码。
# Importing essential libraries required for scraping articles.
import pandas as pd
import selenium
from selenium import webdriver
# Fixed: the class is WebDriverWait, not webdriverwait (the original import fails).
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import xml.etree.ElementTree as ET
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.common.action_chains import ActionChains

driver = webdriver.Chrome(r"C:\Users\Scp\Desktop\fliprobo\chromedriver.exe")

# Accumulators for the scraped fields.
Dates = []
Authors = []
Verticals = []
Headlines = []
Descriptions = []
Hrefs = []

start = 948  # first listing page
end = 997    # last listing page (50 pages in total)

try:
    for page in range(start, end + 1):
        # Navigate directly to each listing page by URL instead of clicking
        # "Older Posts" after driver.back() — that click was the source of
        # the StaleElementReferenceException workaround in the original.
        driver.get("https://www.ebmnews.com/2020/page/{}/".format(page))

        # Author and date are scraped from the listing page itself.
        for elem in driver.find_elements_by_xpath('//i[@class="post-author author"]'):
            Authors.append(elem.text)
        for elem in driver.find_elements_by_xpath('//time[@class="post-published updated"]'):
            Dates.append(elem.text)

        # Collect all article hrefs FIRST: the WebElements go stale as soon as
        # driver.get() navigates to an article, so they must not be iterated
        # after leaving this page (the original iterated stale elements, and
        # also duplicated this find_elements call).
        hrefs = [a.get_attribute('href')
                 for a in driver.find_elements_by_xpath('//a[@class="post-url post-title"]')]
        Hrefs.extend(hrefs)

        for href in hrefs:
            driver.get(href)
            # Use attribute/class-based selectors instead of the original
            # '//*[@id="post-99531"]/...' xpaths, which were hard-coded to one
            # specific article's post id and matched nothing on other articles.
            for elem in driver.find_elements_by_xpath('//span[@itemprop="headline"]'):
                Headlines.append(elem.text)
            for elem in driver.find_elements_by_xpath('//div[@class="entry-content"]/p'):
                Descriptions.append(elem.text)
            # NOTE(review): the vertical/category selector is assumed from the
            # site's markup — confirm against a real article page.
            for elem in driver.find_elements_by_xpath('//div[@class="term-badges floated"]//a'):
                Verticals.append(elem.text)
finally:
    # Always release the browser, even if scraping raises midway.
    driver.quit()
解决方案
如果我没有理解错您的需求,下面的代码可以正确地实现这一目标。我改用 requests 模块(而不是 selenium),以使抓取过程更加稳定。
# Robust requests + BeautifulSoup version: iterate the 50 listing pages by
# URL, scrape author/date from each listing entry, then follow the article
# link to fetch headline and description.
import requests
from bs4 import BeautifulSoup

url = 'https://www.ebmnews.com/2020/page/{}/'

with requests.Session() as s:
    s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/87.0.4280.88 Safari/537.36'
    # Pages 948 through 997 inclusive — 50 listing pages.
    for current_page in range(948, 998):
        # Timeout so one hung request cannot stall the whole crawl.
        r = s.get(url.format(current_page), timeout=10)
        soup = BeautifulSoup(r.text, "html.parser")
        for item in soup.select('article.listing-item'):
            try:
                post_author = item.select_one("i.post-author").get_text(strip=True)
            except AttributeError:
                post_author = ""
            try:
                post_date = item.select_one("span.time > time").get_text(strip=True)
            except AttributeError:
                post_date = ""
            link = item.select_one("h2.title > a")
            if link is None:
                # Defensive: skip malformed listing entries instead of
                # raising AttributeError as the original did.
                continue
            inner_link = link.get("href")
            res = s.get(inner_link, timeout=10)
            sauce = BeautifulSoup(res.text, "html.parser")
            try:
                # Guarded for consistency with the author/date lookups above;
                # the original raised if the headline span was missing.
                title = sauce.select_one("span[itemprop='headline']").get_text(strip=True)
            except AttributeError:
                title = ""
            # Renamed the loop variable (the original shadowed `item`).
            desc = ' '.join(p.get_text(strip=True) for p in sauce.select(".entry-content > p"))
            print(post_author, post_date, title, desc)
另外,您也可以考虑使用 scrapy;相比之下,selenium 比 scrapy 更慢、也更复杂。
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。