微信公众号搜"智元新知"关注
微信扫一扫可直接关注哦!

格式化 Python beautifulsoup 数据并删除重复的第一列值

如何格式化 Python BeautifulSoup 抓取的数据,并删除第一列中的重复值

下面这段代码已经可以正常运行,但我想去掉第一列中重复出现的数据来整理输出格式,使其更易读。

当前代码:

from urllib.request import Request,urlopen
from bs4 import BeautifulSoup
import re,random,ctypes
import requests
from time import sleep

# Scrape the token-transfer table from BscScan and print one line per row:
# transaction hash, value and token name.

# urljoin is used below to build absolute links; it lives in urllib.parse.
from urllib.parse import urljoin

url = 'https://bscscan.com/tokentxns'

# Pool of browser User-Agent strings; one is picked at random per run.
user_agent_list = [
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:86.0) Gecko/20100101 Firefox/86.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36',
]

# requests expects the headers as a dict, so wrap the chosen UA string in one.
header = {'User-Agent': random.choice(user_agent_list)}

# Random value between 1.0 and 3.0 in steps of 0.0001; not used in this
# snippet (presumably meant as a sleep() delay between requests).
pausesleep = float(random.randint(10000, 30000)) / 10000

# The second positional argument of requests.get() is `params` (query string),
# so the headers must be passed by keyword or they are never sent.
req = requests.get(url, headers=header, timeout=10)
req.raise_for_status()  # fail early on an HTTP error instead of parsing an error page
soup = BeautifulSoup(req.content, 'html.parser')

table = soup.find('table')
if table is None:
    raise SystemExit('No <table> element found in the response from ' + url)
rows = table.find_all('tr')

# rows[0] is skipped; presumably it is the table's header row.
for row in rows[1:]:
    tds = row.find_all('td')
    if len(tds) < 9:
        # Cells up to index 8 are read below; skip rows that do not have them.
        continue
    txnhash = tds[1].text
    age = tds[2].text
    value = tds[7].text
    token = tds[8].text
    # Absolute URL of the token page, or None when the cell has no link.
    anchor = tds[8].find('a')
    href = anchor.get('href') if anchor is not None else None
    link = urljoin(url, href) if href else None
    print(txnhash + "  " + value + "   " + token)

当前输出(第一列的交易哈希有重复,需要改进):

0x70e16e1cbcd30d1c3a2abb03a3d3c43fc324aa794c45b10cd5ef1001e9af0915  899.885819768    TrusterCoin (TSC)
0x70e16e1cbcd30d1c3a2abb03a3d3c43fc324aa794c45b10cd5ef1001e9af0915  0.62679168    Wrapped BNB (WBNB)
0x52d862d3f920370d84039f2dccb40edc7343699310d3436b71738d4176997398  388,214,984,514.909719227    WoofCoin (WOOF)
0x52d862d3f920370d84039f2dccb40edc7343699310d3436b71738d4176997398  0.003    Wrapped BNB (WBNB)
0x4fe83f2ebad772b4292e81f418a6f54572f7462934358a356787f8d777c58c8b  26.737674146727101117    Binance-Peg ... (BUSD)
0x4fe83f2ebad772b4292e81f418a6f54572f7462934358a356787f8d777c58c8b  1.251364193609566793    Binance-Peg ... (ADA)
0x4fe83f2ebad772b4292e81f418a6f54572f7462934358a356787f8d777c58c8b  0.03997685638568537    Binance-Peg ... (ADA)
0x4fe83f2ebad772b4292e81f418a6f54572f7462934358a356787f8d777c58c8b  0.041171860015645402    Binance-Peg ... (ADA)
0x4fe83f2ebad772b4292e81f418a6f54572f7462934358a356787f8d777c58c8b  0.089939749761843203    Wrapped BNB (WBNB)

解决方法

试试这个:

from urllib.request import Request,urlopen,urljoin
from bs4 import BeautifulSoup
import re,random,ctypes
import requests
from time import sleep

# Scrape the token-transfer table from BscScan and print one line per row,
# showing each transaction hash only the first time it appears.

url = 'https://bscscan.com/tokentxns'

# Pool of browser User-Agent strings; one is picked at random per run.
user_agent_list = [
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:86.0) Gecko/20100101 Firefox/86.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36',
]

# requests expects the headers as a dict, so wrap the chosen UA string in one.
header = {'User-Agent': random.choice(user_agent_list)}

# Random value between 1.0 and 3.0 in steps of 0.0001; not used in this
# snippet (presumably meant as a sleep() delay between requests).
pausesleep = float(random.randint(10000, 30000)) / 10000

# The second positional argument of requests.get() is `params` (query string),
# so the headers must be passed by keyword or they are never sent.
req = requests.get(url, headers=header, timeout=10)
req.raise_for_status()  # fail early on an HTTP error instead of parsing an error page
soup = BeautifulSoup(req.content, 'html.parser')

table = soup.find('table')
if table is None:
    raise SystemExit('No <table> element found in the response from ' + url)
rows = table.find_all('tr')

# Transaction hashes that have already been printed.
ne = []
# rows[0] is skipped; presumably it is the table's header row.
for row in rows[1:]:
    tds = row.find_all('td')
    if len(tds) < 9:
        # Cells up to index 8 are read below; skip rows that do not have them.
        continue
    txnhash = tds[1].text
    age = tds[2].text
    value = tds[7].text
    token = tds[8].text
    # Absolute URL of the token page, or None when the cell has no link.
    anchor = tds[8].find('a')
    href = anchor.get('href') if anchor is not None else None
    link = urljoin(url, href) if href else None
    if txnhash not in ne:
        ne.append(txnhash)
        print(txnhash, end=" ")
    else:
        # Repeated hash: print blanks of the same width so the value column
        # stays aligned with the rows that show the hash.
        print(" " * len(txnhash), end=" ")
    print(value + "   " + token)

我们创建一个列表 ne 来保存已经出现过的 txnhash,然后每次检查新的 txnhash 是否已在该列表中:不在则打印该哈希并把它加入列表,已在则用空白占位,不再重复打印。

版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。