微信公众号搜"智元新知"关注
微信扫一扫可直接关注哦!

(golang)HTTP基本认证机制及使用gocolly登录爬取

内网有个网页用了HTTP基本认证机制,想用gocolly爬取,不知道怎么登录,只好研究HTTP基本认证机制

参考这里:https://www.jb51.net/article/89070.htm  

下面开始参考作者dotcoo了:-)

看了《HTTP权威指南》第12章"HTTP基本认证机制"(本站下载地址:https://www.jb51.net/books/93254.html),感觉讲得蛮详细的,写了一个小例子测试。

请求响应过程:

==>
GET /hello HTTP/1.1
Host: 127.0.0.1:12345
<==
HTTP/1.1 401 Unauthorized
WWW-Authenticate: Basic realm="Dotcoo User Login"
==>
GET /hello HTTP/1.1
Host: 127.0.0.1:12345
Authorization: Basic YWRtaW46YWRtaW5wd2Q=
<==
HTTP/1.1 200 OK
Content-Type: text/plain; charset=utf-8

golang HTTP基本认证机制的实现代码

package main
import (
    "fmt"
    "io"
    "net/http"
    "log"
    "encoding/base64"
    "strings"
)
// HelloServer is a demo handler that performs HTTP Basic authentication by
// hand (instead of calling req.BasicAuth) so every step of the exchange is
// visible.
//
// A request with no Authorization header, a malformed header, a scheme other
// than Basic, credentials that are not valid base64, or decoded credentials
// without a ":" separator is answered with 401 Unauthorized plus a
// WWW-Authenticate challenge. Any well-formed Basic credentials are accepted
// (the demo does not compare them against known users) and the handler
// replies "hello,world!".
//
// The received header and credentials are printed to stdout purely for
// demonstration; never log passwords in real code.
func HelloServer(w http.ResponseWriter, req *http.Request) {
	// reject answers 401 and repeats the challenge so the client knows which
	// scheme and realm to authenticate against.
	reject := func(msg string) {
		w.Header().Set("WWW-Authenticate", `Basic realm="Dotcoo User Login"`)
		w.WriteHeader(http.StatusUnauthorized)
		if msg != "" {
			io.WriteString(w, msg)
		}
	}

	auth := req.Header.Get("Authorization")
	if auth == "" {
		// First round trip: no credentials yet, so only send the challenge.
		reject("")
		return
	}
	fmt.Println(auth)

	// The header value has the form "<scheme> <credentials>".
	auths := strings.SplitN(auth, " ", 2)
	if len(auths) != 2 {
		fmt.Println("error")
		reject("Unauthorized!\n")
		return
	}
	authMethod, authB64 := auths[0], auths[1]

	// Authentication scheme names are matched case-insensitively.
	if !strings.EqualFold(authMethod, "Basic") {
		fmt.Println("error")
		reject("Unauthorized!\n")
		return
	}

	authstr, err := base64.StdEncoding.DecodeString(authB64)
	if err != nil {
		fmt.Println(err)
		reject("Unauthorized!\n")
		return
	}
	fmt.Println(string(authstr))

	// Split on the first ":" only, so the password itself may contain colons.
	userPwd := strings.SplitN(string(authstr), ":", 2)
	if len(userPwd) != 2 {
		fmt.Println("error")
		reject("Unauthorized!\n")
		return
	}
	username, password := userPwd[0], userPwd[1]
	fmt.Println("Username:", username)
	fmt.Println("Password:", password)
	fmt.Println()

	io.WriteString(w, "hello,world!\n")
}
// main registers HelloServer on /hello and serves HTTP on port 8000; it exits
// through log.Fatal if the server stops with an error.
func main() {
	http.HandleFunc("/hello", HelloServer)

	if err := http.ListenAndServe(":8000", nil); err != nil {
		log.Fatal("ListenAndServe: ", err)
	}
}

试验了上面的例子后,基本明白了HTTP基本认证的过程。但是怎么用gocolly访问呢?

参考:https://stackoverflow.com/questions/50576248/using-colly-framework-i-cant-login-to-the-evernote-account

但是答复者Matías Insaurralde提供的模拟浏览器访问的例子编译不通过,不明白其中的hptsKey的意思。代码放在下面供参考(可跳过):

package evernote

import (
    "bytes"
    "errors"
    "fmt"
    "io/ioutil"
    "net/http"
    "net/http/cookiejar"
    "net/url"
    "regexp"
    "strings"
)

const (
    // evernoteLoginURL is the login endpoint: Login first fetches it with GET
    // and then POSTs the login form back to the same URL.
    evernoteLoginURL = "https://www.evernote.com/Login.action"
)

var (
    // evernoteJSParamsExpr captures the element id (group 1) and the assigned
    // value (group 2) from inline JavaScript of the form
    // document.getElementById("<id>").value = "<value>".
    evernoteJSParamsExpr = regexp.MustCompile(`document.getElementById\("(.*)"\).value = "(.*)"`)
    // evernoteRedirectExpr captures the link target (group 1) from a
    // `Redirecting to <a href="...">` fragment in a response body.
    evernoteRedirectExpr = regexp.MustCompile(`Redirecting to <a href="(.*)">`)

    // errNoMatches is returned by extractJSParams when the page contains no
    // assignment matching evernoteJSParamsExpr.
    errNoMatches   = errors.New("No matches")
    // errRedirectURL is returned by Login when the final response body
    // contains no redirect link.
    errRedirectURL = errors.New("Redirect URL not found")
)

// EvernoteClient wraps all methods required to interact with the website.
// It holds the account credentials, the HTTP client used for every request,
// and the form values captured from the login page.
type EvernoteClient struct {
    // Username and Password are the credentials submitted by Login.
    Username   string
    Password   string
    // httpClient performs every request; NewEvernoteClient equips it with a
    // cookie jar.
    httpClient *http.Client

    // These parameters persist during the login process: extractJSParams
    // fills them in from the login page and Login sends them back as the
    // "hpts" and "hptsh" form fields.
    hpts  string
    hptsh string
}

// NewEvernoteClient builds a client for the given account credentials.
//
// The embedded http.Client copies the transport, redirect policy and timeout
// of http.DefaultClient and adds a fresh cookie jar, mimicking a browser that
// keeps cookies between requests.
func NewEvernoteClient(username, password string) *EvernoteClient {
	// The error returned by cookiejar.New is deliberately discarded here.
	jar, _ := cookiejar.New(nil)

	browser := &http.Client{
		Transport:     http.DefaultTransport,
		CheckRedirect: http.DefaultClient.CheckRedirect,
		Jar:           jar,
		Timeout:       http.DefaultClient.Timeout,
	}

	return &EvernoteClient{
		Username:   username,
		Password:   password,
		httpClient: browser,
	}
}

// extractJSParams scans the login page for inline JavaScript assignments of
// the form document.getElementById("<id>").value = "<value>" and stores the
// values assigned to the elements with id "hpts" and "hptsh" on the client.
// Assignments to any other id are ignored.
//
// It returns errNoMatches when body contains no such assignment or when a
// match lacks the two expected capture groups.
func (e *EvernoteClient) extractJSParams(body []byte) (err error) {
	matches := evernoteJSParamsExpr.FindAllSubmatch(body, -1)
	if len(matches) == 0 {
		return errNoMatches
	}
	for _, submatches := range matches {
		// Index 0 is the whole match, 1 the element id, 2 the value.
		if len(submatches) < 3 {
			return errNoMatches
		}
		value := string(submatches[2])

		// The ids are assumed to be the same names Login submits as form
		// fields ("hpts" and "hptsh").
		switch string(submatches[1]) {
		case "hpts":
			e.hpts = value
		case "hptsh":
			e.hptsh = value
		}
	}
	return nil
}

// Login handles the login action.
func (e *EvernoteClient) Login() error {
    // First step: fetch the login page as a browser visitor would do:
    res,err := e.httpClient.Get(evernoteLoginURL)
    if err != nil {
        return err
    }
    if res.Body == nil {
        return errors.New("No response body")
    }
    body,err := IoUtil.ReadAll(res.Body)
    if err != nil {
        return err
    }
    err = e.extractJSParams(body)
    if err != nil {
        return err
    }

    // Second step: we have extracted the "hpts" and "hptsh" parameters
    // We send a request using only the username and setting "evaluateUsername":
    values := &url.Values{}
    values.Set("username",e.Username)
    values.Set("evaluateUsername","")
    values.Set("analyticsloginorigin","login_action")
    values.Set("clipperFlow","false")
    values.Set("showSwitchService","true")
    values.Set("hpts",e.hpts)
    values.Set("hptsh",e.hptsh)

    rawValues := values.Encode()
    req,err := http.NewRequest(http.MethodPost,evernoteLoginURL,bytes.NewBufferString(rawValues))
    if err != nil {
        return err
    }
    req.Header.Set("Accept","application/json")
    req.Header.Set("Content-Type","application/x-www-form-urlencoded; charset=UTF-8")
    req.Header.Set("x-requested-with","XMLHttpRequest")
    req.Header.Set("referer",evernoteLoginURL)
    res,err = e.httpClient.Do(req)
    if err != nil {
        return err
    }
    body,err = IoUtil.ReadAll(res.Body)
    if err != nil {
        return err
    }
    bodyStr := string(body)
    if !strings.Contains(bodyStr,`"usePasswordAuth":true`) {
        return errors.New("Password auth not enabled")
    }

    // Third step: do the final request,append password to form data:
    values.Del("evaluateUsername")
    values.Set("password",e.Password)
    values.Set("login","Sign in")

    rawValues = values.Encode()
    req,err = http.NewRequest(http.MethodPost,"text/html")
    req.Header.Set("Content-Type",err = e.httpClient.Do(req)
    if err != nil {
        return err
    }

    // Check the body in order to find the redirect URL:
    body,err = IoUtil.ReadAll(res.Body)
    if err != nil {
        return err
    }
    bodyStr = string(body)
    matches := evernoteRedirectExpr.FindAllStringSubmatch(bodyStr,-1)
    if len(matches) == 0 {
        return errRedirectURL
    }
    m := matches[0]
    if len(m) < 2 {
        return errRedirectURL
    }
    redirectURL := m[1]
    fmt.Println("Login is ok,redirect URL:",redirectURL)
    return nil
}
After you successfully get the redirect URL, you should be able to send authenticated requests as long as you keep using the HTTP client that was used for the login process; the cookie jar plays a very important role here.

To call this code use:

// main logs in with placeholder credentials and panics if the login fails.
func main() {
	if err := NewEvernoteClient("[email protected]", "password").Login(); err != nil {
		panic(err)
	}
}

只好自己写,经反复试验,发现对于本文开头自己写的server,只需以下代码即可通过验证,输出了hello,world!(将访问方式改为POST也一样。)

package main

import (
    "fmt"

    "io/ioutil"
    "net/http"
)

// Login sends one request carrying HTTP Basic credentials to the demo server
// at http://localhost:8000/hello and prints the response headers, the
// WWW-Authenticate value (empty when the server sent no challenge) and the
// body.
//
// Any failure is printed to stdout and ends the call; nothing is returned.
func Login() {
	client := &http.Client{}
	// URL to visit and the method to use.
	target := "http://localhost:8000/hello"
	method := "GET"

	req, err := http.NewRequest(method, target, nil)
	if err != nil {
		fmt.Println("NewRequest:", err)
		return
	}
	// The key line: the demo server accepts any user name and password.
	req.SetBasicAuth("aa", "bb")
	fmt.Println(method + "访问")

	res, err := client.Do(req)
	if err != nil {
		// res is nil here, so it must not be touched.
		fmt.Println("Do:", err)
		return
	}
	defer res.Body.Close()

	fmt.Println("header:")
	fmt.Println(res.Header)
	fmt.Println("realm:")
	fmt.Println(res.Header.Get("Www-Authenticate"))
	fmt.Println("body:")
	body, err := ioutil.ReadAll(res.Body)
	if err != nil {
		fmt.Println("ReadAll:", err)
		return
	}
	fmt.Println(string(body))
}

// main runs the Basic-auth client demo once.
func main() {   
    Login()  
}

查看SetBasicAuth的定义为(liteide中在光标位置按Ctrl+shift+J):

// SetBasicAuth (as quoted from net/http) sets the request's Authorization
// header to "Basic " followed by the value basicAuth returns for the given
// username and password.
func (r *Request) SetBasicAuth(username,password string) {
    r.Header.Set("Authorization","Basic "+basicAuth(username,password))
}

而basicAuth的定义为

// basicAuth returns the standard base64 encoding of "username:password",
// which is the credentials part of a Basic Authorization header.
func basicAuth(username, password string) string {
	auth := username + ":" + password
	return base64.StdEncoding.EncodeToString([]byte(auth))
}

那么,用gocolly访问的代码如下:

package main

import (
    "encoding/base64"
    "fmt"
    "net/http"

    "github.com/gocolly/colly"
)

// basicAuth joins username and password with ":" and returns the result in
// standard base64, ready to follow "Basic " in an Authorization header.
func basicAuth(username, password string) string {
	credentials := username + ":" + password
	return base64.StdEncoding.EncodeToString([]byte(credentials))
}
// main fetches http://localhost:8000/hello through a colly collector, sending
// a hand-built Basic Authorization header, and prints the response body.
func main() {
	c := colly.NewCollector()

	h := http.Header{}
	h.Set("Authorization", "Basic "+basicAuth("aaaa", "bbbb"))

	c.OnResponse(func(r *colly.Response) {
		fmt.Println(string(r.Body))
	})

	// Request takes (method, URL, request body, context, headers). Both the
	// body and the context are nil: a GET has no body and no context is shared.
	if err := c.Request("GET", "http://localhost:8000/hello", nil, nil, h); err != nil {
		fmt.Println("request failed:", err)
	}
}

注:对于其他网站,也许要用fiddler抓包,设置相应的header和cookie才行。

版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。

相关推荐