Python html.parser 模块,HTMLParser() 实例源码
我们从Python开源项目中,提取了以下49个代码示例,用于说明如何使用html.parser.HTMLParser()。
def handle_charref(self, name):
    """Translate a numeric character reference (&#NNN; or &#xNNN;) to text.

    The decoded character is forwarded to handle_data(); a code point that
    chr() rejects is replaced with U+FFFD.
    """
    # XXX workaround for a bug in HTMLParser. Remove this once
    # it's fixed in all supported versions.
    # http://bugs.python.org/issue13633
    prefix = name[:1]
    if prefix in ('x', 'X'):
        # Hexadecimal reference: drop the leading marker(s), parse base 16.
        codepoint = int(name.lstrip(prefix), 16)
    else:
        codepoint = int(name)
    try:
        text = chr(codepoint)
    except (ValueError, OverflowError):
        # Out of Unicode range -- substitute the replacement character.
        text = "\N{REPLACEMENT CHaraCTER}"
    self.handle_data(text)
def Feed(self, markup):
    """Run *markup* through a fresh BeautifulSoupHTMLParser bound to this soup.

    NOTE(review): this looks like bs4's HTMLParserTreeBuilder.feed(); the
    capitalisation of ``Feed`` here and in the inner ``parser.Feed`` call
    appears to be a scrape artifact -- the html.parser API spells it
    ``feed``. Confirm against the original bs4 source.
    """
    args, kwargs = self.parser_args
    parser = BeautifulSoupHTMLParser(*args, **kwargs)
    parser.soup = self.soup
    try:
        parser.Feed(markup)
    except HTMLParseError as e:
        # Built-in parser failed: warn the user to install lxml/html5lib,
        # then re-raise so the caller can fall back.
        warnings.warn(RuntimeWarning(
            "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib),and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
        raise e
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
def getXKCDImageTitle ( html ):
    """Extract the image title (hover) text of the XKCD comic from page HTML.

    :param html: full HTML of an xkcd.com page.
    :return: the cleaned title string, or None when the comic block or the
        alt attribute cannot be located.
    """
    # html.unescape replaces HTMLParser().unescape, which was deprecated in
    # 3.4 and removed in Python 3.9.  "from ... import" avoids clashing with
    # the `html` parameter name.
    from html import unescape
    comicBlock = find_last_between( html, 'div id="comic"', "</div>")
    if not comicBlock:
        return None
    imageTitle = find_last_between( comicBlock, "alt=", ">" )
    if not imageTitle:
        # No alt attribute found; the original crashed on None.split here.
        return None
    # Drop srcset= if there
    imageTitle = imageTitle.split('srcset=')[0]
    imageTitle = unescape(imageTitle)
    imageTitle = imageTitle.replace('"', '').strip()
    imageTitle = imageTitle.replace('/', '').strip()
    return imageTitle
# Garfield Minus Garfield Methods
# NOTE(review): corrupted scrape artifact -- fragments of bs4's
# handle_charref()/feed() collapsed onto single lines; not valid Python.
# Preserved verbatim; recover from the upstream bs4 source if needed.
def handle_charref(self, OverflowError) as e:
data = "\N{REPLACEMENT CHaraCTER}"
self.handle_data(data)
def Feed(self,and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
raise e
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
def get_steps(protocol_id):
    """
    Get steps of a protocol.

    :param protocol_id: int, protocol id
    :return: list, list of unresolved step dicts (id, parameter,
        specify_output, hash)
    """
    # html.unescape replaces HTMLParser().unescape, which was deprecated in
    # 3.4 and removed in Python 3.9.
    import html
    step_list = []
    steps = Protocol.objects.filter(parent=protocol_id).order_by('step_order')
    workspace_path = settings['env']['workspace']
    for index, step in enumerate(steps):
        # priority for self-compiled tool
        # os.path.join takes any number of components; the original nested
        # three calls redundantly.
        software_path = os.path.join(workspace_path, str(step.user_id), 'bin',
                                     str(step.software))
        if os.path.exists(software_path) and os.path.isfile(software_path):
            step.software = software_path
        step_list.append({
            'id': index,
            'parameter': html.unescape(str(step.software).rstrip() + " " + str(step.parameter)),
            'specify_output': step.specify_output,
            'hash': step.hash,
        })
    return step_list
# NOTE(review): corrupted scrape artifact (two copies) -- fragments of bs4's
# handle_charref()/feed() collapsed onto single lines; not valid Python.
# Preserved verbatim.
def handle_charref(self, OverflowError) as e:
data = "\N{REPLACEMENT CHaraCTER}"
self.handle_data(data)
def Feed(self,and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
raise e
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
def handle_charref(self, OverflowError) as e:
data = "\N{REPLACEMENT CHaraCTER}"
self.handle_data(data)
def Feed(self,and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
raise e
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
def unescape_html(html_):
    """Replace HTML entities (e.g. ``&pound;``) in a string.

    :param html_: The escaped HTML.
    :return: The input string with entities replaced.
    """
    # http://stackoverflow.com/a/2360639
    version = sys.version_info
    if version.major == 2:  # 2.7
        # noinspection PyUnresolvedReferences,pycompatibility
        from HTMLParser import HTMLParser
        return HTMLParser().unescape(html_)
    if version.minor == 3:  # 3.3
        # noinspection pycompatibility
        from html.parser import HTMLParser
        # noinspection PyDeprecation
        return HTMLParser().unescape(html_)
    # 3.4+
    # noinspection pycompatibility
    import html
    return html.unescape(html_)
# NOTE(review): corrupted scrape artifact -- fragments of bs4's
# handle_charref()/feed() collapsed onto single lines; not valid Python.
# Preserved verbatim.
def handle_charref(self, OverflowError) as e:
data = "\N{REPLACEMENT CHaraCTER}"
self.handle_data(data)
def Feed(self,and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
raise e
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
def _get_links(url):
    """Fetch *url* and return the set of href targets of its <a> tags.

    Trailing slashes are stripped from each link.  Network or parse errors
    are logged and swallowed; an empty (or partial) set is returned.

    :param url: page URL to scan.
    :return: set of href strings.
    """
    class LinkParser(HTMLParser):
        def handle_starttag(self, tag, attrs):
            if tag == 'a':
                href = dict(attrs).get('href')
                # bug fix: an <a> without href made the original call
                # None.rstrip and abort the whole parse.
                if href is not None:
                    links.add(href.rstrip('/'))
    links = set()
    try:
        log.debug('Getting links from %s' % url)
        u = urlopen(url)
        parser = LinkParser()
        # bug fix: html.parser's method is feed(); "Feed" raised
        # AttributeError, so every call returned an empty set.
        parser.feed(u.read().decode('utf-8'))
    except Exception as e:
        log.debug('Could not get links. %s', e)
    log.debug('links: %r', links)
    return links
def _provider_auth(self, url, qs, username, password, html):
    """Authenticate against a TV-everywhere credential provider and follow
    the redirect form it returns.

    :param url: provider auth endpoint; '?sid=0' is appended.
    :param qs: unused here; kept for interface compatibility.
    :param username: Ecom_User_ID credential.
    :param password: Ecom_Password credential.
    :param html: unused here; kept for interface compatibility.
    :return: requests Response from posting the hidden-form fields to the
        redirect URL.
    :raises Exception: when the redirect form is absent (bad credentials).
    """
    url += '?sid=0'
    # prepare auth
    r = self.session.post(url + '&id=tve&option=credential', proxies=self.proxy, headers={'Accept-Encoding': 'gzip'})
    # authenticate
    post_data = {
        'option': 'credential',
        'urlRedirect': url,
        'Ecom_User_ID': username,
        'Ecom_Password': password,
    }
    r1 = self.session.post(url, data=post_data, headers={'Accept-Encoding': 'gzip'})
    r2 = self.session.get(url, headers={'Accept-Encoding': 'gzip'})
    try:
        html_parser = HTMLParser.HTMLParser()
        redirurl = re.findall(r'<form method=\"POST\" enctype=\"application/x-www-form-urlencoded\" action=\"(.*)\">', r2.text)[0]
        argsre = dict([(match.group(1), html_parser.unescape(match.group(2))) for match in re.finditer(r'<input type=\"hidden\" name=\"(\w+)\" value=\"([^\"]+)\"/>', r2.text)])
        return self.session.post(redirurl, data=argsre, headers={'Accept-Encoding': 'gzip'})
    except IndexError as e:
        # bug fix: the original bare `except:` swallowed every exception,
        # including KeyboardInterrupt.  Only a missing redirect form (the
        # [0] lookup) indicates a failed login; chain the cause.
        raise Exception('Invalid user name or password.') from e
def zeroclick(irc, source, msgtarget, args):
    """Query DuckDuckGo's lite front-end and message the zero-click answer.

    :param irc: IRC connection object with a .msg(target, text) method.
    :param source: sender hostmask (unused in the current flow).
    :param msgtarget: channel/nick to reply to.
    :param args: list whose first element is the search query.
    """
    # html.unescape replaces HTMLParser().unescape, which was removed in 3.9.
    from html import unescape
    params = {"q": args[0]}
    url = "http://duckduckgo.com/lite/?"
    data = requests.get(url, params=params).content.decode()
    search = re.findall("""\t<td>.\t\s+(.*?).<\/td>""", data, re.M | re.DOTALL)
    if search:
        answer = unescape(search[-1].replace("<br>", " ").replace("<code>", " ").replace("</code>", " "))
        answer = re.sub("<[^<]+?>", " ", answer)
        # bug fix: the original `re.sub("\s+", answer.strip())` omitted the
        # replacement argument and raised TypeError; collapse whitespace runs.
        out = re.sub(r"\s+", " ", answer).strip()
        if out:
            irc.msg(msgtarget, out.split(" More at")[0].split("}")[-1].strip())
        else:
            irc.msg(msgtarget, "No results")
    else:
        irc.msg(msgtarget, "No results found.")
# NOTE(review): corrupted scrape artifact (three copies) -- fragments of
# bs4's handle_charref()/feed() collapsed onto single lines; not valid
# Python. Preserved verbatim.
def handle_charref(self, OverflowError) as e:
data = "\N{REPLACEMENT CHaraCTER}"
self.handle_data(data)
def Feed(self,and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
raise e
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
def handle_charref(self, OverflowError) as e:
data = "\N{REPLACEMENT CHaraCTER}"
self.handle_data(data)
def Feed(self,and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
raise e
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
def handle_charref(self, OverflowError) as e:
data = "\N{REPLACEMENT CHaraCTER}"
self.handle_data(data)
def Feed(self,and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
raise e
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
def request_first_token(self):
    """Fetch the reCAPTCHA anchor page and extract the initial token from
    the element with id="recaptcha-token".

    Stores the token in self.first_token and self.current_token.

    :raises RuntimeError: when the response contains no token.
    """
    class Parser(HTMLParser):
        def __init__(p_self):
            p_self.token = None
            super().__init__()
        # bug fix: HTMLParser invokes handle_starttag(tag, attrs); the
        # original omitted `tag` and raised TypeError on the first tag seen.
        def handle_starttag(p_self, tag, attrs):
            attrs = dict(attrs)
            if attrs.get("id") == "recaptcha-token":
                p_self.token = attrs.get("value")
    text = self.get("anchor", params={"co": self.co}).text
    parser = Parser()
    # bug fix: the parser method is spelled feed(); "Feed" raised
    # AttributeError.
    parser.feed(text)
    if not parser.token:
        raise RuntimeError(
            "Could not get first token. Response:\n{}".format(text),
        )
    self.first_token = parser.token
    self.current_token = self.first_token
# NOTE(review): corrupted scrape artifact (three copies) -- fragments of
# bs4's handle_charref()/feed() collapsed onto single lines; not valid
# Python. Preserved verbatim.
def handle_charref(self, OverflowError) as e:
data = "\N{REPLACEMENT CHaraCTER}"
self.handle_data(data)
def Feed(self,and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
raise e
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
def handle_charref(self, OverflowError) as e:
data = "\N{REPLACEMENT CHaraCTER}"
self.handle_data(data)
def Feed(self,and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
raise e
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
def handle_charref(self, OverflowError) as e:
data = "\N{REPLACEMENT CHaraCTER}"
self.handle_data(data)
def Feed(self,and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
raise e
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
def htmlparser_trace(data):
    """Print out the HTMLParser events that occur during parsing.
    This lets you see how HTMLParser parses a document when no
    Beautiful Soup code is running.

    :param data: markup string to parse.
    """
    parser = AnnouncingParser()
    # bug fix: html.parser's method is feed(); "Feed" raised AttributeError.
    parser.feed(data)
def benchmark_parsers(num_elements=100000):
    """Very basic head-to-head performance benchmark.

    Parses one large generated document with each available BS4 backend,
    then with raw lxml and raw html5lib, printing the elapsed time of each.
    """
    print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
    data = rdoc(num_elements)
    print("Generated a large invalid HTML document (%d bytes)." % len(data))
    for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
        success = False
        try:
            start = time.time()
            soup = BeautifulSoup(data, parser)
            elapsed = time.time() - start
            success = True
        except Exception as e:
            print("%s Could not parse the markup." % parser)
            traceback.print_exc()
        if success:
            print("BS4+%s parsed the markup in %.2fs." % (parser, elapsed))
    # Raw lxml, bypassing Beautiful Soup entirely.
    from lxml import etree
    start = time.time()
    etree.HTML(data)
    print("Raw lxml parsed the markup in %.2fs." % (time.time() - start))
    # Raw html5lib, bypassing Beautiful Soup entirely.
    import html5lib
    parser = html5lib.HTMLParser()
    start = time.time()
    parser.parse(data)
    print("Raw html5lib parsed the markup in %.2fs." % (time.time() - start))
def htmlparser_trace(data):
    """Print out the HTMLParser events that occur during parsing.
    This lets you see how HTMLParser parses a document when no
    Beautiful Soup code is running.

    :param data: markup string to parse.
    """
    parser = AnnouncingParser()
    # bug fix: html.parser's method is feed(); "Feed" raised AttributeError.
    parser.feed(data)
# NOTE(review): corrupted scrape artifact -- the BS4 parser loop of
# benchmark_parsers() was collapsed into the "for parser in [...]" line
# below; not valid Python. An intact copy of this function exists earlier
# in this file. Preserved verbatim.
def benchmark_parsers(num_elements=100000):
    """Very basic head-to-head performance benchmark."""
    print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
    data = rdoc(num_elements)
    print("Generated a large invalid HTML document (%d bytes)." % len(data))
    for parser in ["lxml", b-a))
    from lxml import etree
    a = time.time()
    etree.HTML(data)
    b = time.time()
    print("Raw lxml parsed the markup in %.2fs." % (b-a))
    import html5lib
    parser = html5lib.HTMLParser()
    a = time.time()
    parser.parse(data)
    b = time.time()
    print("Raw html5lib parsed the markup in %.2fs." % (b-a))
def close(self):
    """Finish parsing and return the accumulated output text.

    NOTE(review): appears to be html2text's close() (unicode-snob variant);
    it relies on self.pbr/self.o/self.outtextlist/self.unicode_snob defined
    on the enclosing class, which is not visible in this scrape.
    """
    HTMLParser.HTMLParser.close(self)
    self.pbr()
    self.o('', 0, 'end')
    # Join all buffered output fragments into one string (outtext is '').
    self.outtext = self.outtext.join(self.outtextlist)
    if self.unicode_snob:
        nbsp = unichr(name2cp('nbsp'))  # real U+00A0 when unicode output is wanted
    else:
        nbsp = u' '
    # NOTE(review): upstream html2text replaces '&nbsp_place_holder;' here;
    # the '&nbsp' prefix may have been dropped by the scraper -- confirm.
    self.outtext = self.outtext.replace(u' _place_holder;', nbsp)
    return self.outtext
def htmlparser_trace(data):
    """Print out the HTMLParser events that occur during parsing.
    This lets you see how HTMLParser parses a document when no
    Beautiful Soup code is running.

    :param data: markup string to parse.
    """
    parser = AnnouncingParser()
    # bug fix: html.parser's method is feed(); "Feed" raised AttributeError.
    parser.feed(data)
# NOTE(review): corrupted scrape artifact -- the BS4 parser loop of
# benchmark_parsers() was collapsed into the "for parser in [...]" line
# below; not valid Python. An intact copy of this function exists earlier
# in this file. Preserved verbatim.
def benchmark_parsers(num_elements=100000):
    """Very basic head-to-head performance benchmark."""
    print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
    data = rdoc(num_elements)
    print("Generated a large invalid HTML document (%d bytes)." % len(data))
    for parser in ["lxml", b-a))
    from lxml import etree
    a = time.time()
    etree.HTML(data)
    b = time.time()
    print("Raw lxml parsed the markup in %.2fs." % (b-a))
    import html5lib
    parser = html5lib.HTMLParser()
    a = time.time()
    parser.parse(data)
    b = time.time()
    print("Raw html5lib parsed the markup in %.2fs." % (b-a))
def htmlparser_trace(data):
    """Print out the HTMLParser events that occur during parsing.
    This lets you see how HTMLParser parses a document when no
    Beautiful Soup code is running.

    :param data: markup string to parse.
    """
    parser = AnnouncingParser()
    # bug fix: html.parser's method is feed(); "Feed" raised AttributeError.
    parser.feed(data)
def htmlparser_trace(data):
    """Print out the HTMLParser events that occur during parsing.
    This lets you see how HTMLParser parses a document when no
    Beautiful Soup code is running.

    :param data: markup string to parse.
    """
    parser = AnnouncingParser()
    # bug fix: html.parser's method is feed(); "Feed" raised AttributeError.
    parser.feed(data)
# NOTE(review): corrupted scrape artifact -- the BS4 parser loop of
# benchmark_parsers() was collapsed into the "for parser in [...]" line
# below; not valid Python. An intact copy of this function exists earlier
# in this file. Preserved verbatim.
def benchmark_parsers(num_elements=100000):
    """Very basic head-to-head performance benchmark."""
    print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
    data = rdoc(num_elements)
    print("Generated a large invalid HTML document (%d bytes)." % len(data))
    for parser in ["lxml", b-a))
    from lxml import etree
    a = time.time()
    etree.HTML(data)
    b = time.time()
    print("Raw lxml parsed the markup in %.2fs." % (b-a))
    import html5lib
    parser = html5lib.HTMLParser()
    a = time.time()
    parser.parse(data)
    b = time.time()
    print("Raw html5lib parsed the markup in %.2fs." % (b-a))
def check(self, silent=None):
    """Validate module parameters and importability of prerequisites.

    :param silent: suppress error logging when truthy; when None, the value
        is taken from the SILENT parameter.
    :return: CHECK_PROBABLY when everything looks usable, CHECK_FAILURE
        otherwise.
    """
    if silent is None:
        silent = positive(self.parameters['SILENT'].value)
    # bug fix: the original re-read SILENT here unconditionally, clobbering
    # an explicit `silent` argument supplied by the caller.
    result = CHECK_PROBABLY
    # check parameters
    if not positive(self.parameters['BACKGROUND'].value) and not negative(self.parameters['BACKGROUND'].value):
        if not silent:
            log.err('Bad %s value: %s.', 'BACKGROUND', self.parameters['BACKGROUND'].value)
        result = CHECK_FAILURE
    # can import urllib and html.parser?
    try:
        from urllib.request import urlretrieve
    except ImportError:  # was a bare except; only an import error is expected
        if not silent:
            log.err('Cannot import urllib.request library (urllib5).')
        # Todo other ways?
        result = CHECK_FAILURE
    try:
        from html.parser import HTMLParser
    except ImportError:  # was a bare except; only an import error is expected
        if not silent:
            log.err('Cannot import html.parser.')
        result = CHECK_FAILURE
    return result
def __init__(self, out=None, baseurl=''):
    """Initialise html2text parser state.

    :param out: callable receiving output text chunks; defaults to the
        internal buffer writer self.outtextf.
    :param baseurl: base URL used to resolve relative links.

    NOTE(review): depends on module-level `options`, `unifiable`,
    `unifiable_n` and `name2cp` from html2text, not visible in this scrape.
    """
    HTMLParser.HTMLParser.__init__(self)
    if out is None: self.out = self.outtextf
    else: self.out = out
    self.outtextlist = [] # empty list to store output characters before they are "joined"
    try:
        self.outtext = unicode()  # Python 2
    except NameError: # python3
        self.outtext = str()
    self.quiet = 0
    self.p_p = 0 # number of newline character to print before next output
    self.outcount = 0
    self.start = 1
    self.space = 0
    self.a = []          # anchors seen so far
    self.astack = []     # currently open anchors
    self.acount = 0
    self.list = []       # currently open list contexts
    self.blockquote = 0
    self.pre = 0
    self.startpre = 0
    self.code = False
    self.br_toggle = ''
    self.lastWasNL = 0
    self.lastWasList = False
    self.style = 0
    self.style_def = {}
    self.tag_stack = []
    self.emphasis = 0
    self.drop_white_space = 0
    self.inheader = False
    self.abbr_title = None # current abbreviation definition
    self.abbr_data = None # last inner HTML (for abbr being defined)
    self.abbr_list = {} # stack of abbreviations to write later
    self.baseurl = baseurl
    if options.google_doc:
        # Google Docs mode: protect non-breaking spaces with a placeholder.
        del unifiable_n[name2cp('nbsp')]
        # NOTE(review): upstream html2text writes '&nbsp_place_holder;' --
        # the '&nbsp' prefix may have been dropped by the scraper; confirm.
        unifiable['nbsp'] = ' _place_holder;'
def close(self):
    """Finish parsing and return the accumulated output text.

    NOTE(review): html2text's close() (google_doc variant); relies on
    self.pbr/self.o/self.outtextlist and module-level `options`, which are
    not visible in this scrape.
    """
    HTMLParser.HTMLParser.close(self)
    self.pbr()
    # NOTE(review): upstream html2text calls self.o('', 0, 'end'); the middle
    # argument may have been dropped by the scraper -- confirm.
    self.o('', 'end')
    # Join all buffered output fragments into one string (outtext is '').
    self.outtext = self.outtext.join(self.outtextlist)
    if options.google_doc:
        # Swap the nbsp placeholder back to a plain space.
        self.outtext = self.outtext.replace(' _place_holder;', ' ');
    return self.outtext
def htmlparser_trace(data):
    """Print out the HTMLParser events that occur during parsing.
    This lets you see how HTMLParser parses a document when no
    Beautiful Soup code is running.

    :param data: markup string to parse.
    """
    parser = AnnouncingParser()
    # bug fix: html.parser's method is feed(); "Feed" raised AttributeError.
    parser.feed(data)
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。