格式化 Python beautifulsoup 数据并删除重复的第一列值

如何格式化 Python BeautifulSoup 抓取的数据并删除第一列中重复的值

我有以下代码段已经可用，但是我想通过删除一些重复的第一列数据来清理格式中的一些内容，使其更具可读性。

dropdown-toggle

当前代码：

from urllib.request import Request,urlopen
from bs4 import BeautifulSoup
import re,random,ctypes
import requests
from time import sleep

url = 'https://bscscan.com/tokentxns'

# Plain browser User-Agent strings. The original entries were strings of the
# form "header = {...}", which are not usable as header values, and the
# literals themselves were malformed (missing ") " / missing space after
# "KHTML,").
user_agent_list = [
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:86.0) Gecko/20100101 Firefox/86.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36',
]

# requests expects headers as a dict, so wrap the randomly chosen string.
header = {'User-Agent': random.choice(user_agent_list)}
# Random value in [1.0, 3.0]; presumably a delay for sleep(), but it is not
# used anywhere in the visible code.
pausesleep = float(random.randint(10000, 30000)) / 10000


def format_transfer(txnhash, value, token):
    """Return one output line: hash, amount and token name, space separated."""
    return str(txnhash) + "  " + str(value) + "   " + str(token)


def main():
    """Download the token-transfer page and print one line per table row.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        SystemExit: if the response contains no <table> element.
    """
    # The second positional argument of requests.get() is `params`, so the
    # header dict must be passed by keyword to actually be sent.
    req = requests.get(url, headers=header, timeout=10)
    req.raise_for_status()
    soup = BeautifulSoup(req.content, 'html.parser')

    table = soup.find('table')
    if table is None:
        raise SystemExit("No <table> found in the response from " + url)

    # Skip the first row, as the original loop did.
    for row in table.find_all('tr')[1:]:
        tds = row.find_all('td')
        if len(tds) < 9:
            # Row does not have the columns indexed below; ignore it
            # instead of failing with IndexError.
            continue
        print(format_transfer(tds[1].text, tds[7].text, tds[8].text))


if __name__ == "__main__":
    main()

当前输出（需要改进）：

0x70e16e1cbcd30d1c3a2abb03a3d3c43fc324aa794c45b10cd5ef1001e9af0915  899.885819768    TrusterCoin (TSC)
0x70e16e1cbcd30d1c3a2abb03a3d3c43fc324aa794c45b10cd5ef1001e9af0915  0.62679168    Wrapped BNB (WBNB)
0x52d862d3f920370d84039f2dccb40edc7343699310d3436b71738d4176997398  388,214,984,514.909719227    WoofCoin (WOOF)
0x52d862d3f920370d84039f2dccb40edc7343699310d3436b71738d4176997398  0.003    Wrapped BNB (WBNB)
0x4fe83f2ebad772b4292e81f418a6f54572f7462934358a356787f8d777c58c8b  26.737674146727101117    Binance-Peg ... (BUSD)
0x4fe83f2ebad772b4292e81f418a6f54572f7462934358a356787f8d777c58c8b  1.251364193609566793    Binance-Peg ... (ADA)
0x4fe83f2ebad772b4292e81f418a6f54572f7462934358a356787f8d777c58c8b  0.03997685638568537    Binance-Peg ... (ADA)
0x4fe83f2ebad772b4292e81f418a6f54572f7462934358a356787f8d777c58c8b  0.041171860015645402    Binance-Peg ... (ADA)
0x4fe83f2ebad772b4292e81f418a6f54572f7462934358a356787f8d777c58c8b  0.089939749761843203    Wrapped BNB (WBNB)

解决方法

试试这个：

from urllib.request import Request,urlopen,urljoin
from bs4 import BeautifulSoup
import re,random,ctypes
import requests
from time import sleep

url = 'https://bscscan.com/tokentxns'

# Plain browser User-Agent strings. The original entries were strings of the
# form "header = {...}", which are not usable as header values, and the
# literals themselves were malformed (missing ") " / missing space after
# "KHTML,").
user_agent_list = [
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:86.0) Gecko/20100101 Firefox/86.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36',
]

# requests expects headers as a dict, so wrap the randomly chosen string.
header = {'User-Agent': random.choice(user_agent_list)}
# Random value in [1.0, 3.0]; presumably a delay for sleep(), but it is not
# used anywhere in the visible code.
pausesleep = float(random.randint(10000, 30000)) / 10000


def format_rows(records):
    """Format (txnhash, value, token) records, hiding repeated hashes.

    Args:
        records: iterable of (txnhash, value, token) tuples.

    Returns:
        A list of output lines. The hash is shown the first time it occurs;
        on later rows with an already-seen hash the first column is filled
        with spaces of the same width so the remaining columns stay aligned.
    """
    ne = set()  # hashes that have already been printed
    lines = []
    for txnhash, value, token in records:
        txnhash = str(txnhash)
        if txnhash not in ne:
            ne.add(txnhash)
            first_col = txnhash
        else:
            # Pad to the hash width; fixed tabs do not line up with it.
            first_col = " " * len(txnhash)
        lines.append(first_col + " " + str(value) + "   " + str(token))
    return lines


def main():
    """Download the token-transfer page and print the de-duplicated rows.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        SystemExit: if the response contains no <table> element.
    """
    # The second positional argument of requests.get() is `params`, so the
    # header dict must be passed by keyword to actually be sent.
    req = requests.get(url, headers=header, timeout=10)
    req.raise_for_status()
    soup = BeautifulSoup(req.content, 'html.parser')

    table = soup.find('table')
    if table is None:
        raise SystemExit("No <table> found in the response from " + url)

    records = []
    # Skip the first row, as the original loop did.
    for row in table.find_all('tr')[1:]:
        tds = row.find_all('td')
        if len(tds) < 9:
            # Row does not have the columns indexed below; ignore it
            # instead of failing with IndexError.
            continue
        records.append((tds[1].text, tds[7].text, tds[8].text))

    for line in format_rows(records):
        print(line)


if __name__ == "__main__":
    main()

我们创建 ne 来保存已经出现过的 txnhash，然后每次检查新的 txnhash 是否已在其中：如果不在，就打印该哈希并记录下来；如果已存在，则不再重复打印第一列。