在Python中抓取需要登录验证的网页

如何在Python中抓取需要登录验证才能访问的网页

from bs4 import BeautifulSoup
import requests

# Open a session so the same Session object is used for every request below.
session = requests.Session()

# Form fields submitted with the login request.
payload = dict(_username='[USERNAME]', _password='[PASSWORD]')

# Send the login form; the response is not inspected.
session.post("https://github.com/login", data=payload)

# Fetch the profile page through the same session and parse its HTML.
profile_page = session.get('https://github.com/[USERNAME]')
soup = BeautifulSoup(profile_page.text, 'html.parser')

# Find the element carrying the pinned-items container class, then every
# <div> inside it whose class attribute matches the full string below.
results = soup.find(class_='js-pinned-items-reorder-container')
pinned_item_classes = (
    'Box pinned-item-list-item d-flex p-3 width-full '
    'js-pinned-item-list-item public fork reorderable sortable-button-item'
)
for pinned_item in results.find_all('div', class_=pinned_item_classes):
    # Print the stripped text of the <span class="repo"> in each matched <div>.
    repo_span = pinned_item.find('span', class_='repo')
    print(repo_span.text.strip())

这段代码无法正常运行,请帮忙看看。上面代码中的"[USERNAME]"和"[PASSWORD]"分别代表GitHub账号的用户名和密码。

解决方法

试试这段代码:

from bs4 import BeautifulSoup
import requests


# Credentials for the account to sign in with (placeholders; replace before running).
login = 'USERNAME'
password = 'PASSWORD'

# Fixed form fields sent with the sign-in request. The hidden fields that the
# login page provides are read from its HTML and added further down.
data = {
    'login': login,
    'password': password,
    'js-webauthn-support': 'supported',
    'js-webauthn-iuvpaa-support': 'unsupported',
    'commit': 'Sign in',
}

# Headers for the sign-in POST. This name was previously used without being
# defined, which raised NameError before the login request was ever sent.
# NOTE(review): the browser-like User-Agent value is an assumption - adjust as needed.
headers = {'User-Agent': 'Mozilla/5.0'}

with requests.Session() as sess:
    # Load the login page first: its hidden <input> values are copied into the
    # form data that is posted below.
    login_page = sess.get('https://github.com/login')
    html = BeautifulSoup(login_page.text, 'html.parser')

    # Update data with the hidden form fields (same order as before).
    for field_name in ('timestamp_secret', 'authenticity_token', 'timestamp'):
        hidden_input = html.find('input', {'name': field_name})
        if hidden_input is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise RuntimeError(
                'Login page has no {!r} input; cannot build the sign-in form'.format(field_name)
            )
        data[field_name] = hidden_input.get('value')

    # Login
    sess.post("https://github.com/session", data=data, headers=headers)

    # Check login: read the <meta name="user-login"> tag on the home page.
    res = sess.get('https://github.com/')
    user_meta = BeautifulSoup(res.text, 'html.parser').find('meta', {'name': 'user-login'})
    username = user_meta.get('content') if user_meta is not None else None

    # A missing tag or an empty value both count as "not logged in"; an explicit
    # check replaces the bare `except:` that also swallowed unrelated errors.
    if not username:
        print('Your username or password is incorrect')
    else:
        print("You have successfully logged in as", username)