# NOTE(review): the ``def`` line is missing from the reviewed text; this
# signature is reconstructed from the body and from the call site in
# ``fromstring`` -- confirm parameter order and defaults.
def document_fromstring(html, guess_charset=None, parser=None):
    """
    Parse a whole HTML document held in a string and return its root element.

    `html` must be a text or byte string, otherwise ``TypeError`` is raised.
    `parser` defaults to the module-level ``html_parser``.

    `guess_charset` is forwarded to the parser as the ``useChardet``
    option.  When it is left as None and `html` is a byte string, charset
    guessing is switched on; when it is None and `html` is text, the
    option is not passed at all.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    active_parser = html_parser if parser is None else parser

    # html5lib does not accept useChardet when the input is already
    # unicode, so the option is only defaulted for byte strings.
    if guess_charset is None and isinstance(html, bytes):
        guess_charset = True

    parse_kwargs = {}
    if guess_charset is not None:
        parse_kwargs['useChardet'] = guess_charset

    tree = active_parser.parse(html, **parse_kwargs)
    return tree.getroot()
# NOTE(review): the ``def`` line is missing from the reviewed text; this
# signature is reconstructed from the body and from the keyword call in
# ``fragment_fromstring`` -- confirm parameter order and defaults.
def fragments_fromstring(html, no_leading_text=False,
                         guess_charset=None, parser=None):
    """Parse a string holding several HTML elements into a list.

    The fragment is parsed inside a ``div`` container.  The first item of
    the returned list may be a string (text preceding the first element).
    With `no_leading_text` true, non-whitespace leading text raises
    ``etree.ParserError`` and whitespace-only leading text is dropped, so
    the result contains elements only.

    `html` must be a text or byte string, otherwise ``TypeError`` is raised.
    `parser` defaults to the module-level ``html_parser``.

    `guess_charset` is forwarded to the parser as the ``useChardet``
    option.  When it is left as None and `html` is a byte string,
    ``useChardet=False`` is passed; when it is None and `html` is text,
    the option is not passed at all.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    active_parser = html_parser if parser is None else parser

    # html5lib does not accept useChardet when the input is already
    # unicode, so the option is only defaulted for byte strings.
    if guess_charset is None and isinstance(html, bytes):
        guess_charset = False

    parse_kwargs = {}
    if guess_charset is not None:
        parse_kwargs['useChardet'] = guess_charset

    nodes = active_parser.parseFragment(html, 'div', **parse_kwargs)

    starts_with_text = bool(nodes) and isinstance(nodes[0], _strings)
    if starts_with_text and no_leading_text:
        leading = nodes[0]
        if leading.strip():
            raise etree.ParserError('There is leading text: %r' % leading)
        # Whitespace-only leading text is silently discarded.
        del nodes[0]
    return nodes
# NOTE(review): the ``def`` line is missing from the reviewed text; this
# signature is reconstructed from the names the body uses -- confirm
# parameter order and defaults.
def fragment_fromstring(html, create_parent=False,
                        guess_charset=None, parser=None):
    """Parse a string holding exactly one HTML element and return it.

    Without `create_parent`, ``etree.ParserError`` is raised when no
    element is found, when more than one element is found, or when
    non-whitespace text precedes or follows the element.  The returned
    element's tail is reset to None.

    With `create_parent` true, a new parent element wraps whatever was
    parsed and is returned instead; leading text becomes the parent's
    text.  If `create_parent` is a string it is used as the tag name,
    otherwise the parent is a ``div``.

    `guess_charset` and `parser` are handed through to
    ``fragments_fromstring``.  `html` must be a text or byte string,
    otherwise ``TypeError`` is raised.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    wrap_result = bool(create_parent)

    # Leading text is only tolerated when a wrapper element will hold it.
    parsed = fragments_fromstring(
        html,
        guess_charset=guess_charset,
        parser=parser,
        no_leading_text=not wrap_result,
    )

    if create_parent:
        wrapper_tag = (create_parent
                       if isinstance(create_parent, _strings) else 'div')
        wrapper = Element(wrapper_tag)
        if parsed:
            if isinstance(parsed[0], _strings):
                wrapper.text = parsed[0]
                del parsed[0]
            wrapper.extend(parsed)
        return wrapper

    if not parsed:
        raise etree.ParserError('No elements found')
    if len(parsed) > 1:
        raise etree.ParserError('Multiple elements found')

    only_element = parsed[0]
    trailing = only_element.tail
    if trailing and trailing.strip():
        raise etree.ParserError('Element followed by text: %r' % trailing)
    only_element.tail = None
    return only_element
# NOTE(review): the ``def`` line is missing from the reviewed text; this
# signature is reconstructed from the names the body uses -- confirm
# parameter order and defaults.
def fromstring(html, guess_charset=None, parser=None):
    """Parse `html` and return a single element or the whole document root.

    The input is always parsed as a full document first; the result is
    then narrowed down as far as the input appears to warrant:

    * input starting with ``<html`` or ``<!doctype`` (case-insensitive,
      after leading whitespace) -> the document root;
    * a non-empty ``head`` -> the document root;
    * a ``body`` with exactly one child and no surrounding
      non-whitespace text -> that child;
    * otherwise -> the ``body`` element itself, renamed to ``div`` if it
      contains a block-level tag and to ``span`` if not.

    `guess_charset` and `parser` are handed through to
    ``document_fromstring``.  `html` must be a text or byte string,
    otherwise ``TypeError`` is raised.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    doc = document_fromstring(html, parser=parser,
                              guess_charset=guess_charset)

    # Only the first 50 characters are inspected for a document marker.
    prefix = html[:50]
    if isinstance(prefix, bytes):
        # Decode so text comparison works on Python 3; ASCII is enough
        # for the marker characters being looked for.
        prefix = prefix.decode('ascii', 'replace')
    prefix = prefix.lstrip().lower()
    if prefix.startswith('<html') or prefix.startswith('<!doctype'):
        return doc

    # A populated head means a full document was supplied.
    head = _find_tag(doc, 'head')
    if len(head):
        return doc

    body = _find_tag(doc, 'body')

    # Exactly one child with only whitespace around it: the caller most
    # likely passed a single element.
    if len(body) == 1:
        only_child = body[0]
        has_leading = body.text and body.text.strip()
        if not has_leading:
            has_trailing = only_child.tail and only_child.tail.strip()
            if not has_trailing:
                return only_child

    # Several nodes were passed in.  Reuse the body as a neutral container,
    # since <body> itself implies too much structure.
    body.tag = 'div' if _contains_block_level_tag(body) else 'span'
    return body
# NOTE(review): the ``def`` line is missing from the reviewed text; this
# signature is reconstructed from the names the body uses -- confirm
# parameter order and defaults.
def parse(filename_url_or_file, guess_charset=None, parser=None):
    """Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: this returns a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.

    If ``guess_charset`` is true, the ``useChardet`` option is passed into
    html5lib to enable character detection.  This option is on by default
    when parsing from URLs, off by default when parsing from file(-like)
    objects (which tend to return Unicode more often than not), and on by
    default when parsing from a file path (which is read in binary mode).

    ``parser`` defaults to the module-level ``html_parser``.

    A stream opened here (from a path or a URL) is closed before
    returning.  A file-like object supplied by the caller is left open.
    """
    if parser is None:
        parser = html_parser

    if not isinstance(filename_url_or_file, _strings):
        # Caller-owned stream: assume it yields Unicode more often than
        # bytes, and never close it on the caller's behalf.
        fp = filename_url_or_file
        owns_stream = False
        default_guess = False
    elif _looks_like_url(filename_url_or_file):
        # URLs are assumed to return bytes.
        fp = urlopen(filename_url_or_file)
        owns_stream = True
        default_guess = True
    else:
        fp = open(filename_url_or_file, 'rb')
        owns_stream = True
        default_guess = True

    if guess_charset is None:
        guess_charset = default_guess

    options = {}
    # html5lib does not accept useChardet as an argument, if it
    # detected the html argument would produce unicode objects.
    if guess_charset:
        options['useChardet'] = guess_charset

    # try/finally rather than ``with``: URL response objects are not
    # context managers on every supported Python version.
    # NOTE(review): assumes parser.parse() fully consumes the stream before
    # returning (true for html5lib-style parsers) -- confirm for custom
    # parsers.
    try:
        return parser.parse(fp, **options)
    finally:
        if owns_stream:
            fp.close()
248scheme=urlparse(str)[0]249ifnotscheme:250returnFalse251elif(sys.platform=='win32'and252schemeinstring.ascii_letters253andlen(scheme)==1):254# looks like a 'normal' absolute path255returnFalse256else:257returnTrue