如何解决使用 Requests bs4 Python3.8 从长元素中抓取文本
我在 Ubuntu 20.04 上使用 Python 3.8.5。我怎样才能抓取下面这个页面的 HTML，并把其中的目录列表整理成 Pandas DataFrame？
这是我当前的代码:
import pathlib
import sys
import lxml
import pandas as pd
import requests
from bs4 import BeautifulSoup
# Fetch the AEMO report index page and inspect the <pre> directory listing.
page = requests.get('http://nemweb.com.au/Reports/Current/')
soup = BeautifulSoup(page.text, 'lxml')
body = soup.find('body')
print(
    f"Type = {type(body)}\n"
    f"Length = {len(body)}\n"
)
listing = body.find('pre')
print(listing.text)
# NOTE: iterating a string yields single characters, so this prints
# the listing one character per line.
for character in listing.text:
    print(character)
# Do I need to use regex here?
解决方法
如果你想要一个 DataFrame，你可以试试下面这段代码：
顺便说一下，这段代码适用于 nemweb.com.au 上 /Reports/Current/ 下的任何报告网址。
注意:我使用 .head(10)
显示给定数据帧的前 10 个项目。
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tabulate import tabulate
# Column labels for the scraped directory listing; shared by
# build_dataframe() and the tabulate preview under __main__.
headers = ["Date","Time","Type","URL"]
def make_soup(catalog_url: str):
    """Fetch *catalog_url* over HTTP and parse the body with the lxml parser."""
    response = requests.get(catalog_url)
    return BeautifulSoup(response.text, "lxml")
def process_soup(soup: BeautifulSoup) -> tuple:
    """Split the directory-listing page into href targets and 8-token records.

    Returns a tuple ``(follow_urls, catalog)`` where *follow_urls* are the
    href values of every anchor except the first (the "parent dir" link),
    and *catalog* groups the page text into chunks of 8 whitespace tokens
    (date words, time, AM/PM, type, size), skipping the 8-token header.
    """
    tokens = soup.getText().split()[8:]
    hrefs = [anchor["href"] for anchor in soup.find_all("a", href=True)[1:]]
    records = [tokens[start:start + 8] for start in range(0, len(tokens), 8)]
    return hrefs, records
def build_dataframe(
    processed_soup: tuple,
    columns: tuple = ("Date", "Time", "Type", "URL"),
) -> pd.DataFrame:
    """Turn ``(follow_urls, catalog)`` from process_soup into a DataFrame.

    Each catalog item is 8 whitespace tokens: four date words, the hour,
    the AM/PM marker, the entry type (e.g. ``<dir>``) and a size field
    that is discarded.

    BUG FIX: the original appended only 3 values (Date, Time, URL) per row
    while declaring 4 column labels, so ``pd.DataFrame`` raised a shape
    mismatch — the unpacked ``type_`` token was never used. The Type column
    is now included, matching the 4 labels.

    :param processed_soup: tuple of (href list, list of 8-token records).
    :param columns: column labels; defaults match the module-level
        ``headers`` so existing callers are unaffected.
    :return: one row per listing entry with an absolute URL.
    """
    follow_urls, catalog = processed_soup
    frame = []
    # zip pairs each record with its anchor href positionally.
    for url, item in zip(follow_urls, catalog):
        *date, hour, meridiem, type_, _size = item
        frame.append(
            [
                " ".join(date),
                f"{hour} {meridiem}",
                type_,  # previously omitted — caused the column mismatch
                f"http://nemweb.com.au{url}",
            ]
        )
    return pd.DataFrame(frame, columns=list(columns))
def dump_to_csv(dataframe: pd.DataFrame, file_name: str = "default_name"):
    """Persist *dataframe* to ``<file_name>.csv`` without the index column."""
    target = f"{file_name}.csv"
    dataframe.to_csv(target, index=False)
    print(f"File {file_name} saved!")
if __name__ == "__main__":
    # Scrape the listing, preview the first 10 rows, then persist to CSV
    # named after the last path segment of the URL ("Current").
    target_url = "http://nemweb.com.au/Reports/Current/"
    dataframe = build_dataframe(process_soup(make_soup(target_url)))
    preview = tabulate(dataframe.head(10), headers=headers, showindex=False, tablefmt="pretty")
    print(preview)
    dump_to_csv(dataframe, file_name=target_url.rsplit("/")[-2])
输出:
+-----------------------------+----------+-------+-------------------------------------------------------------------+
| Date | Time | Type | URL |
+-----------------------------+----------+-------+-------------------------------------------------------------------+
| Saturday,April 3,2021 | 9:50 AM | <dir> | http://nemweb.com.au/Reports/Current/Adjusted_Prices_Reports/ |
| Monday,April 5,2021 | 8:00 AM | <dir> | http://nemweb.com.au/Reports/Current/Alt_Limits/ |
| Monday,2021 | 1:12 AM | <dir> | http://nemweb.com.au/Reports/Current/Ancillary_Services_Payments/ |
| Monday,2021 | 11:30 AM | <dir> | http://nemweb.com.au/Reports/Current/Auction_Units_Reports/ |
| Monday,2021 | 4:43 AM | <dir> | http://nemweb.com.au/Reports/Current/Bidmove_Complete/ |
| Thursday,April 1,2021 | 4:44 AM | <dir> | http://nemweb.com.au/Reports/Current/Bidmove_Summary/ |
| Wednesday,December 2,2020 | 10:44 AM | <dir> | http://nemweb.com.au/Reports/Current/Billing/ |
| Monday,2021 | 7:40 AM | <dir> | http://nemweb.com.au/Reports/Current/Causer_Pays/ |
| Thursday,February 4,2021 | 9:10 PM | <dir> | http://nemweb.com.au/Reports/Current/Causer_Pays_Elements/ |
| Monday,November 28,2016 | 7:50 PM | <dir> | http://nemweb.com.au/Reports/Current/Causer_Pays_Rslcpf/ |
+-----------------------------+----------+-------+-------------------------------------------------------------------+
File Current saved!
另一种更简单的做法（只打印每个条目的链接文本，不构建 DataFrame）：
import requests
from bs4 import BeautifulSoup
def main(url):
    """Print the link text of every listing entry at *url* (skipping the first anchor)."""
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')
    anchors = soup.findAll('a', href=True)
    print([anchor.text for anchor in anchors][1:])
# Entry point: scrape the AEMO "Current Reports" index page.
main('http://nemweb.com.au/Reports/Current/')
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。