    """Parse a string of HTML data into an Element tree using the
    BeautifulSoup parser.

    Returns the root ``<html>`` Element of the tree.

    You can pass a different BeautifulSoup parser through the
    `beautifulsoup` keyword, and a different Element factory function
    through the `makeelement` keyword.  By default, the standard
    ``BeautifulSoup`` class and the default factory of `lxml.html` are
    used.
    """
    # Delegate to the shared helper; any extra keyword arguments are
    # forwarded verbatim to the BeautifulSoup constructor.
    return _parse(data, beautifulsoup, makeelement, **bsargs)
37"""Parse a file into an ElemenTree using the BeautifulSoup parser. 38 39 You can pass a different BeautifulSoup parser through the 40 `beautifulsoup` keyword, and a diffent Element factory function 41 through the `makeelement` keyword. By default, the standard 42 ``BeautifulSoup`` class and the default factory of `lxml.html` are 43 used. 44 """ 45ifnothasattr(file,'read'): 46file=open(file) 47root=_parse(file,beautifulsoup,makeelement,**bsargs) 48returnetree.ElementTree(root)
52"""Convert a BeautifulSoup tree to a list of Element trees. 53 54 Returns a list instead of a single root Element to support 55 HTML-like soup with more than one root element. 56 57 You can pass a different Element factory through the `makeelement` 58 keyword. 59 """ 60root=_convert_tree(beautiful_soup_tree,makeelement) 61children=root.getchildren() 62forchildinchildren: 63root.remove(child) 64returnchildren
    # Fall back to the default BeautifulSoup class when the caller did
    # not supply a parser class/factory of their own.
    if beautifulsoup is None:
        beautifulsoup = BeautifulSoup
    # Feature-sniff the BeautifulSoup generation and inject sensible
    # defaults without overriding explicit caller arguments.
    if hasattr(beautifulsoup, "HTML_ENTITIES"):  # bs3
        if 'convertEntities' not in bsargs:
            bsargs['convertEntities'] = 'html'
    if hasattr(beautifulsoup, "DEFAULT_BUILDER_FEATURES"):  # bs4
        if 'features' not in bsargs:
            bsargs['features'] = 'html.parser'  # use Python html parser
    tree = beautifulsoup(source, **bsargs)
    root = _convert_tree(tree, makeelement)
    # from ET: wrap the document in a html root element, if necessary
    if len(root) == 1 and root[0].tag == "html":
        return root[0]
    root.tag = "html"
    return root
    if makeelement is None:
        makeelement = html.html_parser.makeelement

    # Split the tree into three parts:
    # i) everything before the root element: document type
    # declaration, comments, processing instructions, whitespace
    # ii) the root(s),
    # iii) everything after the root: comments, processing
    # instructions, whitespace
    first_element_idx = last_element_idx = None
    html_root = declaration = None
    for i, e in enumerate(beautiful_soup_tree):
        if isinstance(e, Tag):
            if first_element_idx is None:
                first_element_idx = i
            last_element_idx = i
            # Remember the first <html> Tag we see; it becomes the
            # single root below.
            if html_root is None and e.name and e.name.lower() == 'html':
                html_root = e
        elif declaration is None and isinstance(e, _DECLARATION_OR_DOCTYPE):
            declaration = e

    # For a nice, well-formatted document, the variable roots below is
    # a list consisting of a single <html> element. However, the document
    # may be a soup like '<meta><head><title>Hello</head><body>Hi
    # all<\p>'. In this example roots is a list containing meta, head
    # and body elements.
    if first_element_idx is None:
        pre_root = post_root = []
        roots = beautiful_soup_tree.contents
    else:
        pre_root = beautiful_soup_tree.contents[:first_element_idx]
        roots = beautiful_soup_tree.contents[first_element_idx:last_element_idx+1]
        post_root = beautiful_soup_tree.contents[last_element_idx+1:]

    # Reorganize so that there is one <html> root...
    if html_root is not None:
        # ... use existing one if possible, ...
        i = roots.index(html_root)
        html_root.contents = roots[:i] + html_root.contents + roots[i+1:]
    else:
        # ... otherwise create a new one.
        html_root = _PseudoTag(roots)

    convert_node = _init_node_converters(makeelement)

    # Process pre_root: walk it backwards so each converted node can be
    # inserted directly before the previously inserted one.
    res_root = convert_node(html_root)
    prev = res_root
    for e in reversed(pre_root):
        converted = convert_node(e)
        if converted is not None:
            prev.addprevious(converted)
            prev = converted

    # ditto for post_root (forwards, appending after the root)
    prev = res_root
    for e in post_root:
        converted = convert_node(e)
        if converted is not None:
            prev.addnext(converted)
            prev = converted

    if declaration is not None:
        try:
            # bs4 provides full Doctype string
            doctype_string = declaration.output_ready()
        except AttributeError:
            doctype_string = declaration.string

        match = _parse_doctype_declaration(doctype_string)
        if not match:
            # Something is wrong if we end up in here. Since soupparser should
            # tolerate errors, do not raise Exception, just let it pass.
            pass
        else:
            external_id, sys_uri = match.groups()
            docinfo = res_root.getroottree().docinfo
            # strip quotes and update DOCTYPE values (any of None, '', '...')
            docinfo.public_id = external_id and external_id[1:-1]
            docinfo.system_url = sys_uri and sys_uri[1:-1]

    return res_root
        return add

    def find_best_converter(node):
        # Linear scan in registration order: the first registered type
        # that matches via isinstance() wins.
        for t in ordered_node_types:
            if isinstance(node, t):
                return converters[t]
        return None

    def convert_node(bs_node, parent=None):
        # duplicated in convert_tag() below
        try:
            handler = converters[type(bs_node)]
        except KeyError:
            # Cache the isinstance()-based lookup for this exact type
            # so subsequent nodes hit the dict fast path.
            handler = converters[type(bs_node)] = find_best_converter(bs_node)
        if handler is None:
            return None
        return handler(bs_node, parent)

    def map_attrs(bs_attrs):
        if isinstance(bs_attrs, dict):  # bs4
            attribs = {}
            for k, v in bs_attrs.items():
                # bs4 represents multi-valued attributes (e.g. class)
                # as lists; rejoin them into a single string.
                if isinstance(v, list):
                    v = " ".join(v)
                attribs[k] = unescape(v)
        else:
            attribs = dict((k, unescape(v)) for k, v in bs_attrs)
        return attribs

    def append_text(parent, text):
        # Text goes into .text of an empty parent, or into .tail of the
        # parent's last child otherwise.
        if len(parent) == 0:
            parent.text = (parent.text or '') + text
        else:
            parent[-1].tail = (parent[-1].tail or '') + text

    # converters are tried in order of their definition

    @converter(Tag, _PseudoTag)
    def convert_tag(bs_node, parent):
        attrs = bs_node.attrs
        if parent is not None:
            attribs = map_attrs(attrs) if attrs else None
            res = etree.SubElement(parent, bs_node.name, attrib=attribs)
        else:
            attribs = map_attrs(attrs) if attrs else {}
            res = makeelement(bs_node.name, attrib=attribs)

        for child in bs_node:
            # avoid double recursion by inlining convert_node(), see above
            try:
                handler = converters[type(child)]
            except KeyError:
                pass
            else:
                if handler is not None:
                    handler(child, res)
                continue
            convert_node(child, res)
        return res

    @converter(Comment)
    def convert_comment(bs_node, parent):
        res = html.HtmlComment(bs_node)
        if parent is not None:
            parent.append(res)
        return res

    @converter(ProcessingInstruction)
    def convert_pi(bs_node, parent):
        if bs_node.endswith('?'):
            # The PI is of XML style (<?as df?>) but BeautifulSoup
            # interpreted it as being SGML style (<?as df>). Fix.
            bs_node = bs_node[:-1]
        res = etree.ProcessingInstruction(*bs_node.split(' ', 1))
        if parent is not None:
            parent.append(res)
        return res

    @converter(NavigableString)
    def convert_text(bs_node, parent):
        # Text never becomes an Element of its own; it is folded into
        # the parent, so there is nothing to return.
        if parent is not None:
            append_text(parent, unescape(bs_node))
        return None

    return convert_node


# copied from ET's ElementSoup

try:
    from html.entities import name2codepoint  # Python 3
except ImportError:
    from htmlentitydefs import name2codepoint


# Bound ``sub`` method: substitutes every "&name;" entity in a string.
handle_entities = re.compile(r"&(\w+);").sub


try:
    unichr
except NameError:
    # Python 3
    unichr = chr
    # Empty / None input: normalize to the empty string.
    if not string:
        return ''
    # work around oddities in BeautifulSoup's entity handling
    def unescape_entity(m):
        try:
            # Translate a named entity to its Unicode character.
            return unichr(name2codepoint[m.group(1)])
        except KeyError:
            return m.group(0)  # use as is