Python html.parser 模块,HTMLParser() 实例源码
我们从Python开源项目中,提取了以下49个代码示例,用于说明如何使用html.parser.HTMLParser()。
def handle_charref(self, name):
    """Translate a numeric character reference (&#NNN; or &#xNNN;) to text.

    The decoded character is forwarded to handle_data(); a code point that
    chr() rejects is replaced with U+FFFD.
    """
    # XXX workaround for a bug in HTMLParser. Remove this once
    # it's fixed in all supported versions.
    # http://bugs.python.org/issue13633
    prefix = name[:1]
    if prefix in ('x', 'X'):
        # Hexadecimal reference: drop the leading marker(s), parse base 16.
        codepoint = int(name.lstrip(prefix), 16)
    else:
        codepoint = int(name)
    try:
        text = chr(codepoint)
    except (ValueError, OverflowError):
        # Out of Unicode range -- substitute the replacement character.
        text = "\N{REPLACEMENT CHaraCTER}"
    self.handle_data(text)
def Feed(self, markup):
    """Run *markup* through a fresh BeautifulSoupHTMLParser bound to this soup.

    NOTE(review): this looks like bs4's HTMLParserTreeBuilder.feed(); the
    capitalisation of ``Feed`` here and in the inner ``parser.Feed`` call
    appears to be a scrape artifact -- the html.parser API spells it
    ``feed``. Confirm against the original bs4 source.
    """
    args, kwargs = self.parser_args
    parser = BeautifulSoupHTMLParser(*args, **kwargs)
    parser.soup = self.soup
    try:
        parser.Feed(markup)
    except HTMLParseError as e:
        # Built-in parser failed: warn the user to install lxml/html5lib,
        # then re-raise so the caller can fall back.
        warnings.warn(RuntimeWarning(
            "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib),and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
        raise e
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
def getXKCDImageTitle ( html ):
    """Extract the image title (hover) text of the XKCD comic from page HTML.

    :param html: full HTML of an xkcd.com page.
    :return: the cleaned title string, or None when the comic block or the
        alt attribute cannot be located.
    """
    # html.unescape replaces HTMLParser().unescape, which was deprecated in
    # 3.4 and removed in Python 3.9.  "from ... import" avoids clashing with
    # the `html` parameter name.
    from html import unescape
    comicBlock = find_last_between( html, 'div id="comic"', "</div>")
    if not comicBlock:
        return None
    imageTitle = find_last_between( comicBlock, "alt=", ">" )
    if not imageTitle:
        # No alt attribute found; the original crashed on None.split here.
        return None
    # Drop srcset= if there
    imageTitle = imageTitle.split('srcset=')[0]
    imageTitle = unescape(imageTitle)
    imageTitle = imageTitle.replace('"', '').strip()
    imageTitle = imageTitle.replace('/', '').strip()
    return imageTitle
# Garfield Minus Garfield Methods
# NOTE(review): corrupted scrape artifact -- fragments of bs4's
# handle_charref()/feed() collapsed onto single lines; not valid Python.
# Preserved verbatim; recover from the upstream bs4 source if needed.
def handle_charref(self, OverflowError) as e:
data = "\N{REPLACEMENT CHaraCTER}"
self.handle_data(data)
def Feed(self,and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
raise e
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
def get_steps(protocol_id):
    """
    Get steps of a protocol.

    :param protocol_id: int, protocol id
    :return: list, list of unresolved step dicts (id, parameter,
        specify_output, hash)
    """
    # html.unescape replaces HTMLParser().unescape, which was deprecated in
    # 3.4 and removed in Python 3.9.
    import html
    step_list = []
    steps = Protocol.objects.filter(parent=protocol_id).order_by('step_order')
    workspace_path = settings['env']['workspace']
    for index, step in enumerate(steps):
        # priority for self-compiled tool
        # os.path.join takes any number of components; the original nested
        # three calls redundantly.
        software_path = os.path.join(workspace_path, str(step.user_id), 'bin',
                                     str(step.software))
        if os.path.exists(software_path) and os.path.isfile(software_path):
            step.software = software_path
        step_list.append({
            'id': index,
            'parameter': html.unescape(str(step.software).rstrip() + " " + str(step.parameter)),
            'specify_output': step.specify_output,
            'hash': step.hash,
        })
    return step_list
# NOTE(review): corrupted scrape artifact (two copies) -- fragments of bs4's
# handle_charref()/feed() collapsed onto single lines; not valid Python.
# Preserved verbatim.
def handle_charref(self, OverflowError) as e:
data = "\N{REPLACEMENT CHaraCTER}"
self.handle_data(data)
def Feed(self,and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
raise e
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
def handle_charref(self, OverflowError) as e:
data = "\N{REPLACEMENT CHaraCTER}"
self.handle_data(data)
def Feed(self,and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
raise e
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
def unescape_html(html_):
    """Replace HTML entities (e.g. ``&pound;``) in a string.

    :param html_: The escaped HTML.
    :return: The input string with entities replaced.
    """
    # http://stackoverflow.com/a/2360639
    version = sys.version_info
    if version.major == 2:  # 2.7
        # noinspection PyUnresolvedReferences,pycompatibility
        from HTMLParser import HTMLParser
        return HTMLParser().unescape(html_)
    if version.minor == 3:  # 3.3
        # noinspection pycompatibility
        from html.parser import HTMLParser
        # noinspection PyDeprecation
        return HTMLParser().unescape(html_)
    # 3.4+
    # noinspection pycompatibility
    import html
    return html.unescape(html_)
# NOTE(review): corrupted scrape artifact -- fragments of bs4's
# handle_charref()/feed() collapsed onto single lines; not valid Python.
# Preserved verbatim.
def handle_charref(self, OverflowError) as e:
data = "\N{REPLACEMENT CHaraCTER}"
self.handle_data(data)
def Feed(self,and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
raise e
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
def _get_links(url):
    """Fetch *url* and return the set of href targets of its <a> tags.

    Trailing slashes are stripped from each link.  Network or parse errors
    are logged and swallowed; an empty (or partial) set is returned.

    :param url: page URL to scan.
    :return: set of href strings.
    """
    class LinkParser(HTMLParser):
        def handle_starttag(self, tag, attrs):
            if tag == 'a':
                href = dict(attrs).get('href')
                # bug fix: an <a> without href made the original call
                # None.rstrip and abort the whole parse.
                if href is not None:
                    links.add(href.rstrip('/'))
    links = set()
    try:
        log.debug('Getting links from %s' % url)
        u = urlopen(url)
        parser = LinkParser()
        # bug fix: html.parser's method is feed(); "Feed" raised
        # AttributeError, so every call returned an empty set.
        parser.feed(u.read().decode('utf-8'))
    except Exception as e:
        log.debug('Could not get links. %s', e)
    log.debug('links: %r', links)
    return links
def _provider_auth(self, url, qs, username, password, html):
    """Authenticate against a TV-everywhere credential provider and follow
    the redirect form it returns.

    :param url: provider auth endpoint; '?sid=0' is appended.
    :param qs: unused here; kept for interface compatibility.
    :param username: Ecom_User_ID credential.
    :param password: Ecom_Password credential.
    :param html: unused here; kept for interface compatibility.
    :return: requests Response from posting the hidden-form fields to the
        redirect URL.
    :raises Exception: when the redirect form is absent (bad credentials).
    """
    url += '?sid=0'
    # prepare auth
    r = self.session.post(url + '&id=tve&option=credential', proxies=self.proxy, headers={'Accept-Encoding': 'gzip'})
    # authenticate
    post_data = {
        'option': 'credential',
        'urlRedirect': url,
        'Ecom_User_ID': username,
        'Ecom_Password': password,
    }
    r1 = self.session.post(url, data=post_data, headers={'Accept-Encoding': 'gzip'})
    r2 = self.session.get(url, headers={'Accept-Encoding': 'gzip'})
    try:
        html_parser = HTMLParser.HTMLParser()
        redirurl = re.findall(r'<form method=\"POST\" enctype=\"application/x-www-form-urlencoded\" action=\"(.*)\">', r2.text)[0]
        argsre = dict([(match.group(1), html_parser.unescape(match.group(2))) for match in re.finditer(r'<input type=\"hidden\" name=\"(\w+)\" value=\"([^\"]+)\"/>', r2.text)])
        return self.session.post(redirurl, data=argsre, headers={'Accept-Encoding': 'gzip'})
    except IndexError as e:
        # bug fix: the original bare `except:` swallowed every exception,
        # including KeyboardInterrupt.  Only a missing redirect form (the
        # [0] lookup) indicates a failed login; chain the cause.
        raise Exception('Invalid user name or password.') from e
def zeroclick(irc, source, msgtarget, args):
    """Query DuckDuckGo's lite front-end and message the zero-click answer.

    :param irc: IRC connection object with a .msg(target, text) method.
    :param source: sender hostmask (unused in the current flow).
    :param msgtarget: channel/nick to reply to.
    :param args: list whose first element is the search query.
    """
    # html.unescape replaces HTMLParser().unescape, which was removed in 3.9.
    from html import unescape
    params = {"q": args[0]}
    url = "http://duckduckgo.com/lite/?"
    data = requests.get(url, params=params).content.decode()
    search = re.findall("""\t<td>.\t\s+(.*?).<\/td>""", data, re.M | re.DOTALL)
    if search:
        answer = unescape(search[-1].replace("<br>", " ").replace("<code>", " ").replace("</code>", " "))
        answer = re.sub("<[^<]+?>", " ", answer)
        # bug fix: the original `re.sub("\s+", answer.strip())` omitted the
        # replacement argument and raised TypeError; collapse whitespace runs.
        out = re.sub(r"\s+", " ", answer).strip()
        if out:
            irc.msg(msgtarget, out.split(" More at")[0].split("}")[-1].strip())
        else:
            irc.msg(msgtarget, "No results")
    else:
        irc.msg(msgtarget, "No results found.")
# NOTE(review): corrupted scrape artifact (three copies) -- fragments of
# bs4's handle_charref()/feed() collapsed onto single lines; not valid
# Python. Preserved verbatim.
def handle_charref(self, OverflowError) as e:
data = "\N{REPLACEMENT CHaraCTER}"
self.handle_data(data)
def Feed(self,and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
raise e
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
def handle_charref(self, OverflowError) as e:
data = "\N{REPLACEMENT CHaraCTER}"
self.handle_data(data)
def Feed(self,and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
raise e
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
def handle_charref(self, OverflowError) as e:
data = "\N{REPLACEMENT CHaraCTER}"
self.handle_data(data)
def Feed(self,and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
raise e
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
def request_first_token(self):
    """Fetch the reCAPTCHA anchor page and extract the initial token from
    the element with id="recaptcha-token".

    Stores the token in self.first_token and self.current_token.

    :raises RuntimeError: when the response contains no token.
    """
    class Parser(HTMLParser):
        def __init__(p_self):
            p_self.token = None
            super().__init__()
        # bug fix: HTMLParser invokes handle_starttag(tag, attrs); the
        # original omitted `tag` and raised TypeError on the first tag seen.
        def handle_starttag(p_self, tag, attrs):
            attrs = dict(attrs)
            if attrs.get("id") == "recaptcha-token":
                p_self.token = attrs.get("value")
    text = self.get("anchor", params={"co": self.co}).text
    parser = Parser()
    # bug fix: the parser method is spelled feed(); "Feed" raised
    # AttributeError.
    parser.feed(text)
    if not parser.token:
        raise RuntimeError(
            "Could not get first token. Response:\n{}".format(text),
        )
    self.first_token = parser.token
    self.current_token = self.first_token
# NOTE(review): corrupted scrape artifact (three copies) -- fragments of
# bs4's handle_charref()/feed() collapsed onto single lines; not valid
# Python. Preserved verbatim.
def handle_charref(self, OverflowError) as e:
data = "\N{REPLACEMENT CHaraCTER}"
self.handle_data(data)
def Feed(self,and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
raise e
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
def handle_charref(self, OverflowError) as e:
data = "\N{REPLACEMENT CHaraCTER}"
self.handle_data(data)
def Feed(self,and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
raise e
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
def handle_charref(self, OverflowError) as e:
data = "\N{REPLACEMENT CHaraCTER}"
self.handle_data(data)
def Feed(self,and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
raise e
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
def htmlparser_trace(data):
    """Print out the HTMLParser events that occur during parsing.
    This lets you see how HTMLParser parses a document when no
    Beautiful Soup code is running.

    :param data: markup string to parse.
    """
    parser = AnnouncingParser()
    # bug fix: html.parser's method is feed(); "Feed" raised AttributeError.
    parser.feed(data)
def benchmark_parsers(num_elements=100000):
    """Very basic head-to-head performance benchmark.

    Parses one large generated document with each available BS4 backend,
    then with raw lxml and raw html5lib, printing the elapsed time of each.
    """
    print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
    data = rdoc(num_elements)
    print("Generated a large invalid HTML document (%d bytes)." % len(data))
    for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
        success = False
        try:
            start = time.time()
            soup = BeautifulSoup(data, parser)
            elapsed = time.time() - start
            success = True
        except Exception as e:
            print("%s Could not parse the markup." % parser)
            traceback.print_exc()
        if success:
            print("BS4+%s parsed the markup in %.2fs." % (parser, elapsed))
    # Raw lxml, bypassing Beautiful Soup entirely.
    from lxml import etree
    start = time.time()
    etree.HTML(data)
    print("Raw lxml parsed the markup in %.2fs." % (time.time() - start))
    # Raw html5lib, bypassing Beautiful Soup entirely.
    import html5lib
    parser = html5lib.HTMLParser()
    start = time.time()
    parser.parse(data)
    print("Raw html5lib parsed the markup in %.2fs." % (time.time() - start))
def htmlparser_trace(data):
    """Print out the HTMLParser events that occur during parsing.
    This lets you see how HTMLParser parses a document when no
    Beautiful Soup code is running.

    :param data: markup string to parse.
    """
    parser = AnnouncingParser()
    # bug fix: html.parser's method is feed(); "Feed" raised AttributeError.
    parser.feed(data)
# NOTE(review): corrupted scrape artifact -- the BS4 parser loop of
# benchmark_parsers() was collapsed into the "for parser in [...]" line
# below; not valid Python. An intact copy of this function exists earlier
# in this file. Preserved verbatim.
def benchmark_parsers(num_elements=100000):
    """Very basic head-to-head performance benchmark."""
    print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
    data = rdoc(num_elements)
    print("Generated a large invalid HTML document (%d bytes)." % len(data))
    for parser in ["lxml", b-a))
    from lxml import etree
    a = time.time()
    etree.HTML(data)
    b = time.time()
    print("Raw lxml parsed the markup in %.2fs." % (b-a))
    import html5lib
    parser = html5lib.HTMLParser()
    a = time.time()
    parser.parse(data)
    b = time.time()
    print("Raw html5lib parsed the markup in %.2fs." % (b-a))
def close(self):
    """Finish parsing and return the accumulated output text.

    NOTE(review): appears to be html2text's close() (unicode-snob variant);
    it relies on self.pbr/self.o/self.outtextlist/self.unicode_snob defined
    on the enclosing class, which is not visible in this scrape.
    """
    HTMLParser.HTMLParser.close(self)
    self.pbr()
    self.o('', 0, 'end')
    # Join all buffered output fragments into one string (outtext is '').
    self.outtext = self.outtext.join(self.outtextlist)
    if self.unicode_snob:
        nbsp = unichr(name2cp('nbsp'))  # real U+00A0 when unicode output is wanted
    else:
        nbsp = u' '
    # NOTE(review): upstream html2text replaces '&nbsp_place_holder;' here;
    # the '&nbsp' prefix may have been dropped by the scraper -- confirm.
    self.outtext = self.outtext.replace(u' _place_holder;', nbsp)
    return self.outtext
def htmlparser_trace(data):
    """Print out the HTMLParser events that occur during parsing.
    This lets you see how HTMLParser parses a document when no
    Beautiful Soup code is running.

    :param data: markup string to parse.
    """
    parser = AnnouncingParser()
    # bug fix: html.parser's method is feed(); "Feed" raised AttributeError.
    parser.feed(data)
# NOTE(review): corrupted scrape artifact -- the BS4 parser loop of
# benchmark_parsers() was collapsed into the "for parser in [...]" line
# below; not valid Python. An intact copy of this function exists earlier
# in this file. Preserved verbatim.
def benchmark_parsers(num_elements=100000):
    """Very basic head-to-head performance benchmark."""
    print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
    data = rdoc(num_elements)
    print("Generated a large invalid HTML document (%d bytes)." % len(data))
    for parser in ["lxml", b-a))
    from lxml import etree
    a = time.time()
    etree.HTML(data)
    b = time.time()
    print("Raw lxml parsed the markup in %.2fs." % (b-a))
    import html5lib
    parser = html5lib.HTMLParser()
    a = time.time()
    parser.parse(data)
    b = time.time()
    print("Raw html5lib parsed the markup in %.2fs." % (b-a))
def htmlparser_trace(data):
    """Print out the HTMLParser events that occur during parsing.
    This lets you see how HTMLParser parses a document when no
    Beautiful Soup code is running.

    :param data: markup string to parse.
    """
    parser = AnnouncingParser()
    # bug fix: html.parser's method is feed(); "Feed" raised AttributeError.
    parser.feed(data)
def htmlparser_trace(data):
    """Print out the HTMLParser events that occur during parsing.
    This lets you see how HTMLParser parses a document when no
    Beautiful Soup code is running.

    :param data: markup string to parse.
    """
    parser = AnnouncingParser()
    # bug fix: html.parser's method is feed(); "Feed" raised AttributeError.
    parser.feed(data)
# NOTE(review): corrupted scrape artifact -- the BS4 parser loop of
# benchmark_parsers() was collapsed into the "for parser in [...]" line
# below; not valid Python. An intact copy of this function exists earlier
# in this file. Preserved verbatim.
def benchmark_parsers(num_elements=100000):
    """Very basic head-to-head performance benchmark."""
    print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
    data = rdoc(num_elements)
    print("Generated a large invalid HTML document (%d bytes)." % len(data))
    for parser in ["lxml", b-a))
    from lxml import etree
    a = time.time()
    etree.HTML(data)
    b = time.time()
    print("Raw lxml parsed the markup in %.2fs." % (b-a))
    import html5lib
    parser = html5lib.HTMLParser()
    a = time.time()
    parser.parse(data)
    b = time.time()
    print("Raw html5lib parsed the markup in %.2fs." % (b-a))
def check(self, silent=None):
    """Validate module parameters and importability of prerequisites.

    :param silent: suppress error logging when truthy; when None, the value
        is taken from the SILENT parameter.
    :return: CHECK_PROBABLY when everything looks usable, CHECK_FAILURE
        otherwise.
    """
    if silent is None:
        silent = positive(self.parameters['SILENT'].value)
    # bug fix: the original re-read SILENT here unconditionally, clobbering
    # an explicit `silent` argument supplied by the caller.
    result = CHECK_PROBABLY
    # check parameters
    if not positive(self.parameters['BACKGROUND'].value) and not negative(self.parameters['BACKGROUND'].value):
        if not silent:
            log.err('Bad %s value: %s.', 'BACKGROUND', self.parameters['BACKGROUND'].value)
        result = CHECK_FAILURE
    # can import urllib and html.parser?
    try:
        from urllib.request import urlretrieve
    except ImportError:  # was a bare except; only an import error is expected
        if not silent:
            log.err('Cannot import urllib.request library (urllib5).')
        # Todo other ways?
        result = CHECK_FAILURE
    try:
        from html.parser import HTMLParser
    except ImportError:  # was a bare except; only an import error is expected
        if not silent:
            log.err('Cannot import html.parser.')
        result = CHECK_FAILURE
    return result
def __init__(self, out=None, baseurl=''):
    """Initialise html2text parser state.

    :param out: callable receiving output text chunks; defaults to the
        internal buffer writer self.outtextf.
    :param baseurl: base URL used to resolve relative links.

    NOTE(review): depends on module-level `options`, `unifiable`,
    `unifiable_n` and `name2cp` from html2text, not visible in this scrape.
    """
    HTMLParser.HTMLParser.__init__(self)
    if out is None: self.out = self.outtextf
    else: self.out = out
    self.outtextlist = [] # empty list to store output characters before they are "joined"
    try:
        self.outtext = unicode()  # Python 2
    except NameError: # python3
        self.outtext = str()
    self.quiet = 0
    self.p_p = 0 # number of newline character to print before next output
    self.outcount = 0
    self.start = 1
    self.space = 0
    self.a = []          # anchors seen so far
    self.astack = []     # currently open anchors
    self.acount = 0
    self.list = []       # currently open list contexts
    self.blockquote = 0
    self.pre = 0
    self.startpre = 0
    self.code = False
    self.br_toggle = ''
    self.lastWasNL = 0
    self.lastWasList = False
    self.style = 0
    self.style_def = {}
    self.tag_stack = []
    self.emphasis = 0
    self.drop_white_space = 0
    self.inheader = False
    self.abbr_title = None # current abbreviation definition
    self.abbr_data = None # last inner HTML (for abbr being defined)
    self.abbr_list = {} # stack of abbreviations to write later
    self.baseurl = baseurl
    if options.google_doc:
        # Google Docs mode: protect non-breaking spaces with a placeholder.
        del unifiable_n[name2cp('nbsp')]
        # NOTE(review): upstream html2text writes '&nbsp_place_holder;' --
        # the '&nbsp' prefix may have been dropped by the scraper; confirm.
        unifiable['nbsp'] = ' _place_holder;'
def close(self):
    """Finish parsing and return the accumulated output text.

    NOTE(review): html2text's close() (google_doc variant); relies on
    self.pbr/self.o/self.outtextlist and module-level `options`, which are
    not visible in this scrape.
    """
    HTMLParser.HTMLParser.close(self)
    self.pbr()
    # NOTE(review): upstream html2text calls self.o('', 0, 'end'); the middle
    # argument may have been dropped by the scraper -- confirm.
    self.o('', 'end')
    # Join all buffered output fragments into one string (outtext is '').
    self.outtext = self.outtext.join(self.outtextlist)
    if options.google_doc:
        # Swap the nbsp placeholder back to a plain space.
        self.outtext = self.outtext.replace(' _place_holder;', ' ');
    return self.outtext
def htmlparser_trace(data):
    """Print out the HTMLParser events that occur during parsing.
    This lets you see how HTMLParser parses a document when no
    Beautiful Soup code is running.

    :param data: markup string to parse.
    """
    parser = AnnouncingParser()
    # bug fix: html.parser's method is feed(); "Feed" raised AttributeError.
    parser.feed(data)
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。