微信公众号搜"智元新知"关注
微信扫一扫可直接关注哦!

javascript – Python:无法在网页中使用selenium下载

我的目的是从https://www.shareinvestor.com/prices/price_download_zip_file.zip?type=history_all&market=bursa

下载一个zip文件
它是此网页https://www.shareinvestor.com/prices/price_download.html#/?type=price_download_all_stocks_bursa中的一个链接.然后将其保存到目录“/home/vinvin/shKLSE/”(我正在使用PythonAnywhere).最后将其解压缩,把其中的csv文件提取到该目录中.

代码运行到最后没有错误,但没有下载.
手动单击https://www.shareinvestor.com/prices/price_download_zip_file.zip?type=history_all&market=bursa时会自动下载zip文件.

使用带有工作用户名和密码的代码.使用真实的用户名和密码,以便更容易理解问题.

    #!/usr/bin/python
    # Python 2 script from the question: logs in to shareinvestor.com through a
    # Firefox session on a virtual display, clicks the "All Historical Data"
    # zip link and then tries to unpack the archive.
    print "hello from python 2"

    import urllib2
    from selenium import webdriver
    from selenium.webdriver.common.keys import Keys
    import time
    # NOTE(review): pyvirtualdisplay's class is normally spelled `Display`; the
    # lowercase name here looks like a transcription artifact -- confirm.
    from pyvirtualdisplay import display
    import requests, zipfile, os    

    # Start an invisible 800x600 virtual display for the browser to render on.
    display = display(visible=0, size=(800, 600))
    display.start()

    # Firefox profile preferences intended to save downloads into a fixed
    # directory without prompting.
    profile = webdriver.FirefoxProfile()
    profile.set_preference('browser.download.folderList', 2)
    profile.set_preference('browser.download.manager.showWhenStarting', False)
    profile.set_preference('browser.download.dir', "/home/vinvin/shKLSE/")
    # NOTE(review): '/zip' does not look like a valid MIME type (zip archives
    # are usually served as 'application/zip'), and Firefox spells this
    # preference 'browser.helperApps.neverAsk.saveToDisk' -- confirm both.
    profile.set_preference('browser.helperApps.neverAsk.savetodisk', '/zip')

    # Try up to five times to start Firefox, sleeping 3 s after each failure.
    # The bare `except` swallows every error, so if all five attempts fail
    # `browser` is never bound and the first use below raises NameError.
    for retry in range(5):
        try:
            browser = webdriver.Firefox(profile)
            print "firefox"
            break
        except:
            time.sleep(3)
    time.sleep(1)

    # Open the site, wait a fixed 10 s, then follow the login link.
    browser.get("https://www.shareinvestor.com/my")
    time.sleep(10)
    login_main = browser.find_element_by_xpath("//*[@href='/user/login.html']").click()
    print browser.current_url
    # Fill in the header login form and submit it.
    username = browser.find_element_by_id("sic_login_header_username")
    password = browser.find_element_by_id("sic_login_header_password")
    print "find id done"
    username.send_keys("bkcollection")
    password.send_keys("123456")
    print "log in done"
    login_attempt = browser.find_element_by_xpath("//*[@type='submit']")
    login_attempt.submit()
    # Go to the download page right after submitting (nothing here waits for
    # the login to complete), then wait a fixed 20 s.
    browser.get("https://www.shareinvestor.com/prices/price_download.html#/?type=price_download_all_stocks_bursa")
    print browser.current_url
    time.sleep(20)
    # Click the zip link and wait a fixed 30 s for the download.
    dl = browser.find_element_by_xpath("//*[@href='/prices/price_download_zip_file.zip?type=history_all&market=bursa']").click()
    time.sleep(30)

    # Shut down the browser and the virtual display.
    browser.close()
    browser.quit()
    display.stop()

   # NOTE(review): the paths below are unquoted (a syntax error as written) and
   # point at /home/vinvin/sh/KLSE rather than the download directory
   # /home/vinvin/shKLSE/ configured above; os.remove() is also given the
   # ZipFile object instead of a file path.
   zip_ref = zipfile.ZipFile(/home/vinvin/sh/KLSE, 'r')
   zip_ref.extractall(/home/vinvin/sh/KLSE)
   zip_ref.close()
   os.remove(zip_ref)

HTML片段:

<li><a href="/prices/price_download_zip_file.zip?type=history_all&amp;market=bursa">All Historical Data</a> <span>About 220 MB</span></li>

请注意,复制代码段时会显示&amp;.该链接在查看网页源代码(view source)时是看不到的,所以我猜它是由JavaScript生成的.

观察我发现了

>即使我运行代码没有错误,目录/home/vinvin/shKLSE也没有被创建
>我尝试下载一个小得多的zip文件,它可以在一秒钟内下载完成,但等待30秒后仍然没有下载. dl = browser.find_element_by_xpath("//*[@href='/prices/price_download_zip_file.zip?type=history_daily&date=20170519&market=bursa']").click()

enter image description here

解决方法:

我重写了你的脚本,并用注释解释了我做每处更改的原因.我认为你的主要问题可能是错误的MIME类型,不过你的脚本还存在一系列系统性问题,这会使它充其量也只是不可靠.此重写使用显式等待(explicit wait),这完全消除了使用time.sleep()的需要,使脚本能够尽可能快地运行,同时还消除了网络拥塞引起的错误.

您需要执行以下操作以确保安装所有模块:

pip install requests explicit selenium retry pyvirtualdisplay

脚本:

#!/usr/bin/python

from __future__ import print_function  # Makes your code portable

import glob
import os
import zipfile
from contextlib import contextmanager

import requests
from explicit import waiter, XPATH, ID
from pyvirtualdisplay import Display
from retry import retry
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait

# Directory Firefox is configured to save downloads into (see build_profile);
# main() also polls it to detect when the download has finished.
DOWNLOAD_DIR = "/tmp/shKLSE/"


def build_profile():
    """Build a Firefox profile that saves zip downloads without prompting.

    Returns:
        A ``webdriver.FirefoxProfile`` whose downloads go to ``DOWNLOAD_DIR``
        and for which the save dialog is suppressed for the listed MIME types.
    """
    profile = webdriver.FirefoxProfile()
    # folderList=2 selects the custom directory given in browser.download.dir
    # instead of the desktop / default downloads folder.
    profile.set_preference('browser.download.folderList', 2)
    profile.set_preference('browser.download.manager.showWhenStarting', False)
    profile.set_preference('browser.download.dir', DOWNLOAD_DIR)
    # I think your `/zip` mime type was incorrect. This works for me
    # Preference names are case-sensitive: it has to be `saveToDisk`, otherwise
    # Firefox ignores the setting and still shows the save dialog.
    profile.set_preference('browser.helperApps.neverAsk.saveToDisk',
                           'application/vnd.ms-excel,application/zip')

    return profile


# Retry is an elegant way to retry the browser creation
# Though you should narrow the scope to whatever the actual exception is you are
# retrying on
@retry(Exception, tries=5, delay=3)
def _start_firefox(profile):
    """Start Firefox with ``profile``; retried up to 5 times, 3 s apart."""
    return webdriver.Firefox(profile)


@contextmanager  # This turns get_browser into a context manager
def get_browser():
    """Yield a Firefox driver running inside an invisible virtual display.

    The retry lives on ``_start_firefox`` rather than on this function:
    calling a ``@contextmanager`` function only builds the context-manager
    object, and the body (where Firefox actually starts) runs later in
    ``__enter__``, so a ``@retry`` stacked on top would never see a failure.

    Yields:
        The started ``webdriver.Firefox`` instance. The browser is quit and
        the display stopped when the ``with`` block exits, even on error.
    """
    # Use a context manager with Display, so it will be closed even if an
    # exception is thrown
    profile = build_profile()
    with Display(visible=0, size=(800, 600)):
        browser = _start_firefox(profile)
        print("firefox")
        try:
            yield browser
        finally:
            # Let a try/finally block manage closing the browser, even if an
            # exception is raised
            browser.quit()


def main():
    """Log in to shareinvestor.com and download the full Bursa history zip.

    Drives a Firefox session from ``get_browser()``: logs in, opens the price
    download page, clicks the "All Historical Data" link and then blocks until
    the archive has finished downloading into ``DOWNLOAD_DIR``.

    Raises:
        selenium.common.exceptions.TimeoutException: if the zip does not
            appear, or its ``.part`` file does not disappear, within 300 s
            each.
    """
    print("hello from python 2")
    with get_browser() as browser:
        browser.get("https://www.shareinvestor.com/my")

        # Click the login button
        # waiter is a helper function that makes it easy to use explicit waits
        # with it you dont need to use time.sleep() calls at all
        login_xpath = '//*/div[@class="sic_logIn-bg"]/a'
        waiter.find_element(browser, login_xpath, XPATH).click()
        print(browser.current_url)

        # Log in
        # NOTE(review): credentials are hard-coded as in the question; move
        # them to environment variables or a config file for real use.
        username = "bkcollection"
        username_id = "sic_login_header_username"
        password = "123456"
        password_id = "sic_login_header_password"
        waiter.find_write(browser, username_id, username, by=ID)
        waiter.find_write(browser, password_id, password, by=ID, send_enter=True)

        # Wait for login process to finish by locating an element only found
        # after logging in, like the Logged In Nav
        nav_id = 'sic_loggedInNav'
        waiter.find_element(browser, nav_id, ID)

        print("log in done")

        # Load the target page
        target_url = ("https://www.shareinvestor.com/prices/price_download.html#/?"
                      "type=price_download_all_stocks_bursa")
        browser.get(target_url)
        print(browser.current_url)

        # Click download button
        all_data_xpath = ("//*[@href='/prices/price_download_zip_file.zip?"
                          "type=history_all&market=bursa']")
        waiter.find_element(browser, all_data_xpath, XPATH).click()

        # This is a bit challenging: You need to wait until the download is complete
        # This file is 220 MB, it takes a while to complete. This method waits until
        # there is at least one file in the dir, then waits until there are no
        # filenames that end in `.part`
        # Note that this is problematic if there is already a file in the target
        # dir. I suggest looking into using the tempfile module to create a unique,
        # temporary directory for downloading every time you run your script
        print("Waiting for download to complete")
        # glob matching is case-sensitive on Linux, so the pattern must be the
        # lowercase `.zip` that the downloaded file actually uses.
        zip_pattern = os.path.join(DOWNLOAD_DIR, "*.zip*")
        part_pattern = os.path.join(DOWNLOAD_DIR, "*.part")

        # WebDriverWait hands its first argument to the predicate, so passing
        # glob.glob lets us poll the filesystem with the same timeout logic.
        WebDriverWait(glob.glob, 300).until(lambda find: find(zip_pattern))
        WebDriverWait(glob.glob, 300).until(lambda find: not find(part_pattern))

        print("Download Done")

        # Now do whatever it is you need to do with the zip file, e.g. open the
        # downloaded archive with zipfile.ZipFile(<path to the .zip file>),
        # extractall(DOWNLOAD_DIR), then os.remove(<path to the .zip file>).
        # ZipFile and os.remove need the archive's file path, not the directory
        # or the ZipFile object.

        print("Done!")


# Run the download only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

完全披露:我维护explicit模块.它旨在让显式等待更容易使用,适用于像这样网站会根据用户交互缓慢加载动态内容的情况.你也可以用直接的显式等待(WebDriverWait)替换上面所有的waiter.XXX调用.

版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。

相关推荐