1# Copyright (c) 2004 Ian Bicking. All rights reserved. 2# 3# Redistribution and use in source and binary forms, with or without 4# modification, are permitted provided that the following conditions are 5# met: 6# 7# 1. Redistributions of source code must retain the above copyright 8# notice, this list of conditions and the following disclaimer. 9# 10# 2. Redistributions in binary form must reproduce the above copyright 11# notice, this list of conditions and the following disclaimer in 12# the documentation and/or other materials provided with the 13# distribution. 14# 15# 3. Neither the name of Ian Bicking nor the names of its contributors may 16# be used to endorse or promote products derived from this software 17# without specific prior written permission. 18# 19# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IAN BICKING OR 23# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 24# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 25# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 26# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 27# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 28# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 29# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 31"""The ``lxml.html`` tool set for HTML handling. 
32""" 33 34from__future__importabsolute_import 35 36__all__=[ 37'document_fromstring','fragment_fromstring','fragments_fromstring','fromstring', 38'tostring','Element','defs','open_in_browser','submit_form', 39'find_rel_links','find_class','make_links_absolute', 40'resolve_base_href','iterlinks','rewrite_links','open_in_browser','parse'] 41 42 43importcopy 44importsys 45importre 46fromfunctoolsimportpartial 47 48try: 49# while unnecessary, importing from 'collections.abc' is the right way to do it 50fromcollections.abcimportMutableMapping,MutableSet 51exceptImportError: 52fromcollectionsimportMutableMapping,MutableSet 53 54from..importetree 55from.importdefs 56from._setmixinimportSetMixin 57 58try: 59fromurlparseimporturljoin 60exceptImportError: 61# Python 3 62fromurllib.parseimporturljoin 63 64try: 65unicode 66exceptNameError: 67# Python 3 68unicode=str 69try: 70basestring 71exceptNameError: 72# Python 3 73basestring=(str,bytes)
115"""Convert the result back into the input type. 116 """ 117ifissubclass(typ,bytes): 118returntostring(result,encoding='utf-8') 119elifissubclass(typ,unicode): 120returntostring(result,encoding='unicode') 121else: 122returnresult
154""" 155 Add a class. 156 157 This has no effect if the class is already present. 158 """ 159ifnotvalueorre.search(r'\s',value): 160raiseValueError("Invalid class name: %r"%value) 161classes=self._get_class_value().split() 162ifvalueinclasses: 163return 164classes.append(value) 165self._attributes['class']=' '.join(classes)
168""" 169 Remove a class if it is currently present. 170 171 If the class is not present, do nothing. 172 """ 173ifnotvalueorre.search(r'\s',value): 174raiseValueError("Invalid class name: %r"%value) 175classes=[namefornameinself._get_class_value().split() 176ifname!=value] 177ifclasses: 178self._attributes['class']=' '.join(classes) 179elif'class'inself._attributes: 180delself._attributes['class']
183""" 184 Remove a class; it must currently be present. 185 186 If the class is not present, raise a KeyError. 187 """ 188ifnotvalueorre.search(r'\s',value): 189raiseValueError("Invalid class name: %r"%value) 190super(Classes,self).remove(value)
218""" 219 Add a class name if it isn't there yet, or remove it if it exists. 220 221 Returns true if the class was added (and is now enabled) and 222 false if it was removed (and is now disabled). 223 """ 224ifnotvalueorre.search(r'\s',value): 225raiseValueError("Invalid class name: %r"%value) 226classes=self._get_class_value().split() 227try: 228classes.remove(value) 229enabled=False 230exceptValueError: 231classes.append(value) 232enabled=True 233ifclasses: 234self._attributes['class']=' '.join(classes) 235else: 236delself._attributes['class'] 237returnenabled
243"""set(self, key, value=None) 244 245 Sets an element attribute. If no value is provided, or if the value is None, 246 creates a 'boolean' attribute without value, e.g. "<form novalidate></form>" 247 for ``form.set('novalidate')``. 248 """ 249super(HtmlElement,self).set(key,value)
269""" 270 Returns the base URL, given when the page was parsed. 271 272 Use with ``urlparse.urljoin(el.base_url, href)`` to get 273 absolute URLs. 274 """ 275returnself.getroottree().docinfo.URL
286""" 287 Return the <body> element. Can be called from a child element 288 to get the document's body. 289 """ 290returnself.xpath('//body|//x:body',namespaces={'x':XHTML_NAMESPACE})[0]
294""" 295 Returns the <head> element. Can be called from a child 296 element to get the document's head. 297 """ 298returnself.xpath('//head|//x:head',namespaces={'x':XHTML_NAMESPACE})[0]
302""" 303 Get or set any <label> element associated with this element. 304 """ 305id=self.get('id') 306ifnotid: 307returnNone 308result=_label_xpath(self,id=id) 309ifnotresult: 310returnNone 311else: 312returnresult[0]
316id=self.get('id') 317ifnotid: 318raiseTypeError( 319"You cannot set a label for an element (%r) that has no id" 320%self) 321if_nons(label.tag)!='label': 322raiseTypeError( 323"You can only assign label to a label element (not %r)" 324%label) 325label.set('for',id)
334""" 335 Removes this element from the tree, including its children and 336 text. The tail text is joined to the previous element or 337 parent. 338 """ 339parent=self.getparent() 340assertparentisnotNone 341ifself.tail: 342previous=self.getprevious() 343ifpreviousisNone: 344parent.text=(parent.textor'')+self.tail 345else: 346previous.tail=(previous.tailor'')+self.tail 347parent.remove(self)
350""" 351 Remove the tag, but not its children or text. The children and text 352 are merged into the parent. 353 354 Example:: 355 356 >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>') 357 >>> h.find('.//b').drop_tag() 358 >>> print(tostring(h, encoding='unicode')) 359 <div>Hello World!</div> 360 """ 361parent=self.getparent() 362assertparentisnotNone 363previous=self.getprevious() 364ifself.textandisinstance(self.tag,basestring): 365# not a Comment, etc. 366ifpreviousisNone: 367parent.text=(parent.textor'')+self.text 368else: 369previous.tail=(previous.tailor'')+self.text 370ifself.tail: 371iflen(self): 372last=self[-1] 373last.tail=(last.tailor'')+self.tail 374elifpreviousisNone: 375parent.text=(parent.textor'')+self.tail 376else: 377previous.tail=(previous.tailor'')+self.tail 378index=parent.index(self) 379parent[index:index+1]=self[:]
382""" 383 Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements. 384 """ 385rel=rel.lower() 386return[elforelin_rel_links_xpath(self) 387ifel.get('rel').lower()==rel]
396""" 397 Get the first element in a document with the given id. If none is 398 found, return the default argument if provided or raise KeyError 399 otherwise. 400 401 Note that there can be more than one element with the same id, 402 and this isn't uncommon in HTML documents found in the wild. 403 Browsers return only the first match, and this function does 404 the same. 405 """ 406try: 407# FIXME: should this check for multiple matches? 408# browsers just return the first one 409return_id_xpath(self,id=id)[0] 410exceptIndexError: 411ifdefault: 412returndefault[0] 413else: 414raiseKeyError(id)
423""" 424 Run the CSS expression on this element and its children, 425 returning a list of the results. 426 427 Equivalent to lxml.cssselect.CSSSelector(expr, translator='html')(self) 428 -- note that pre-compiling the expression can provide a substantial 429 speedup. 430 """ 431# Do the import here to make the dependency optional. 432fromlxml.cssselectimportCSSSelector 433returnCSSSelector(expr,translator=translator)(self)
434 435######################################## 436## Link functions 437######################################## 438
441""" 442 Make all links in the document absolute, given the 443 ``base_url`` for the document (the full URL where the document 444 came from), or if no ``base_url`` is given, then the ``.base_url`` 445 of the document. 446 447 If ``resolve_base_href`` is true, then any ``<base href>`` 448 tags in the document are used *and* removed from the document. 449 If it is false then any such tag is ignored. 450 451 If ``handle_failures`` is None (default), a failure to process 452 a URL will abort the processing. If set to 'ignore', errors 453 are ignored. If set to 'discard', failing URLs will be removed. 454 """ 455ifbase_urlisNone: 456base_url=self.base_url 457ifbase_urlisNone: 458raiseTypeError( 459"No base_url given, and the document has no base_url") 460ifresolve_base_href: 461self.resolve_base_href() 462 463ifhandle_failures=='ignore': 464deflink_repl(href): 465try: 466returnurljoin(base_url,href) 467exceptValueError: 468returnhref
485""" 486 Find any ``<base href>`` tag in the document, and apply its 487 values to all links found in the document. Also remove the 488 tag once it has been applied. 489 490 If ``handle_failures`` is None (default), a failure to process 491 a URL will abort the processing. If set to 'ignore', errors 492 are ignored. If set to 'discard', failing URLs will be removed. 493 """ 494base_href=None 495basetags=self.xpath('//base[@href]|//x:base[@href]', 496namespaces={'x':XHTML_NAMESPACE}) 497forbinbasetags: 498base_href=b.get('href') 499b.drop_tree() 500ifnotbase_href: 501return 502self.make_links_absolute(base_href,resolve_base_href=False, 503handle_failures=handle_failures)
506""" 507 Yield (element, attribute, link, pos), where attribute may be None 508 (indicating the link is in the text). ``pos`` is the position 509 where the link occurs; often 0, but sometimes something else in 510 the case of links in stylesheets or style tags. 511 512 Note: <base href> is *not* taken into account in any way. The 513 link you get is exactly the link in the document. 514 515 Note: multiple links inside of a single text string or 516 attribute value are returned in reversed order. This makes it 517 possible to replace or delete them from the text string value 518 based on their reported text positions. Otherwise, a 519 modification at one text position can change the positions of 520 links reported later on. 521 """ 522link_attrs=defs.link_attrs 523forelinself.iter(etree.Element): 524attribs=el.attrib 525tag=_nons(el.tag) 526iftag=='object': 527codebase=None 528## <object> tags have attributes that are relative to 529## codebase 530if'codebase'inattribs: 531codebase=el.get('codebase') 532yield(el,'codebase',codebase,0) 533forattribin('classid','data'): 534ifattribinattribs: 535value=el.get(attrib) 536ifcodebaseisnotNone: 537value=urljoin(codebase,value) 538yield(el,attrib,value,0) 539if'archive'inattribs: 540formatchin_archive_re.finditer(el.get('archive')): 541value=match.group(0) 542ifcodebaseisnotNone: 543value=urljoin(codebase,value) 544yield(el,'archive',value,match.start()) 545else: 546forattribinlink_attrs: 547ifattribinattribs: 548yield(el,attrib,attribs[attrib],0) 549iftag=='meta': 550http_equiv=attribs.get('http-equiv','').lower() 551ifhttp_equiv=='refresh': 552content=attribs.get('content','') 553match=_parse_meta_refresh_url(content) 554url=(match.group('url')ifmatchelsecontent).strip() 555# unexpected content means the redirect won't work, but we might 556# as well be permissive and return the entire string. 
557ifurl: 558url,pos=_unquote_match( 559url,match.start('url')ifmatchelsecontent.find(url)) 560yield(el,'content',url,pos) 561eliftag=='param': 562valuetype=el.get('valuetype')or'' 563ifvaluetype.lower()=='ref': 564## FIXME: while it's fine we *find* this link, 565## according to the spec we aren't supposed to 566## actually change the value, including resolving 567## it. It can also still be a link, even if it 568## doesn't have a valuetype="ref" (which seems to be the norm) 569## http://www.w3.org/TR/html401/struct/objects.html#adef-valuetype 570yield(el,'value',el.get('value'),0) 571eliftag=='style'andel.text: 572urls=[ 573# (start_pos, url) 574_unquote_match(match.group(1),match.start(1))[::-1] 575formatchin_iter_css_urls(el.text) 576]+[ 577(match.start(1),match.group(1)) 578formatchin_iter_css_imports(el.text) 579] 580ifurls: 581# sort by start pos to bring both match sets back into order 582# and reverse the list to report correct positions despite 583# modifications 584urls.sort(reverse=True) 585forstart,urlinurls: 586yield(el,None,url,start) 587if'style'inattribs: 588urls=list(_iter_css_urls(attribs['style'])) 589ifurls: 590# return in reversed order to simplify in-place modifications 591formatchinurls[::-1]: 592url,start=_unquote_match(match.group(1),match.start(1)) 593yield(el,'style',url,start)
597""" 598 Rewrite all the links in the document. For each link 599 ``link_repl_func(link)`` will be called, and the return value 600 will replace the old link. 601 602 Note that links may not be absolute (unless you first called 603 ``make_links_absolute()``), and may be internal (e.g., 604 ``'#anchor'``). They can also be values like 605 ``'mailto:email'`` or ``'javascript:expr'``. 606 607 If you give ``base_href`` then all links passed to 608 ``link_repl_func()`` will take that into account. 609 610 If the ``link_repl_func`` returns None, the attribute or 611 tag text will be removed completely. 612 """ 613ifbase_hrefisnotNone: 614# FIXME: this can be done in one pass with a wrapper 615# around link_repl_func 616self.make_links_absolute( 617base_href,resolve_base_href=resolve_base_href) 618elifresolve_base_href: 619self.resolve_base_href() 620 621forel,attrib,link,posinself.iterlinks(): 622new_link=link_repl_func(link.strip()) 623ifnew_link==link: 624continue 625ifnew_linkisNone: 626# Remove the attribute or element content 627ifattribisNone: 628el.text='' 629else: 630delel.attrib[attrib] 631continue 632 633ifattribisNone: 634new=el.text[:pos]+new_link+el.text[pos+len(link):] 635el.text=new 636else: 637cur=el.get(attrib) 638ifnotposandlen(cur)==len(link): 639new=new_link# most common case 640else: 641new=cur[:pos]+new_link+cur[pos+len(link):] 642el.set(attrib,new)
646""" 647 An object that represents a method on an element as a function; 648 the function takes either an element or an HTML string. It 649 returns whatever the function normally returns, or if the function 650 works in-place (and so returns None) it returns a serialized form 651 of the resulting document. 652 """
658result_type=type(doc) 659ifisinstance(doc,basestring): 660if'copy'inkw: 661raiseTypeError( 662"The keyword 'copy' can only be used with element inputs to %s, not a string input"%self.name) 663doc=fromstring(doc,**kw) 664else: 665if'copy'inkw: 666make_a_copy=kw.pop('copy') 667else: 668make_a_copy=self.copy 669ifmake_a_copy: 670doc=copy.deepcopy(doc) 671meth=getattr(doc,self.name) 672result=meth(*args,**kw) 673# FIXME: this None test is a bit sloppy 674ifresultisNone: 675# Then return what we got in 676return_transform_result(result_type,doc) 677else: 678returnresult
708"""A lookup scheme for HTML Element classes. 709 710 To create a lookup instance with different Element classes, pass a tag 711 name mapping of Element classes in the ``classes`` keyword argument and/or 712 a tag name mapping of Mixin classes in the ``mixins`` keyword argument. 713 The special key '*' denotes a Mixin class that should be mixed into all 714 Element classes. 715 """ 716_default_element_classes={} 717
775"""Parses several HTML elements, returning a list of elements. 776 777 The first item in the list may be a string. 778 If no_leading_text is true, then it will be an error if there is 779 leading text, and it will always be a list of only elements. 780 781 base_url will set the document's base_url attribute 782 (and the tree's docinfo.URL). 783 """ 784ifparserisNone: 785parser=html_parser 786# FIXME: check what happens when you give html with a body, head, etc. 787ifisinstance(html,bytes): 788ifnot_looks_like_full_html_bytes(html): 789# can't use %-formatting in early Py3 versions 790html=('<html><body>'.encode('ascii')+html+ 791'</body></html>'.encode('ascii')) 792else: 793ifnot_looks_like_full_html_unicode(html): 794html='<html><body>%s</body></html>'%html 795doc=document_fromstring(html,parser=parser,base_url=base_url,**kw) 796assert_nons(doc.tag)=='html' 797bodies=[eforeindocif_nons(e.tag)=='body'] 798assertlen(bodies)==1,("too many bodies: %r in %r"%(bodies,html)) 799body=bodies[0] 800elements=[] 801ifno_leading_textandbody.textandbody.text.strip(): 802raiseetree.ParserError( 803"There is leading text: %r"%body.text) 804ifbody.textandbody.text.strip(): 805elements.append(body.text) 806elements.extend(body) 807# FIXME: removing the reference to the parent artificial document 808# would be nice 809returnelements
814""" 815 Parses a single HTML element; it is an error if there is more than 816 one element, or if anything but whitespace precedes or follows the 817 element. 818 819 If ``create_parent`` is true (or is a tag name) then a parent node 820 will be created to encapsulate the HTML in a single element. In this 821 case, leading or trailing text is also allowed, as are multiple elements 822 as result of the parsing. 823 824 Passing a ``base_url`` will set the document's ``base_url`` attribute 825 (and the tree's docinfo.URL). 826 """ 827ifparserisNone: 828parser=html_parser 829 830accept_leading_text=bool(create_parent) 831 832elements=fragments_fromstring( 833html,parser=parser,no_leading_text=notaccept_leading_text, 834base_url=base_url,**kw) 835 836ifcreate_parent: 837ifnotisinstance(create_parent,basestring): 838create_parent='div' 839new_root=Element(create_parent) 840ifelements: 841ifisinstance(elements[0],basestring): 842new_root.text=elements[0] 843delelements[0] 844new_root.extend(elements) 845returnnew_root 846 847ifnotelements: 848raiseetree.ParserError('No elements found') 849iflen(elements)>1: 850raiseetree.ParserError( 851"Multiple elements found (%s)" 852%', '.join([_element_name(e)foreinelements])) 853el=elements[0] 854ifel.tailandel.tail.strip(): 855raiseetree.ParserError( 856"Element followed by text: %r"%el.tail) 857el.tail=None 858returnel
862""" 863 Parse the html, returning a single element/document. 864 865 This tries to minimally parse the chunk of text, without knowing if it 866 is a fragment or a document. 867 868 base_url will set the document's base_url attribute (and the tree's docinfo.URL) 869 """ 870ifparserisNone: 871parser=html_parser 872ifisinstance(html,bytes): 873is_full_html=_looks_like_full_html_bytes(html) 874else: 875is_full_html=_looks_like_full_html_unicode(html) 876doc=document_fromstring(html,parser=parser,base_url=base_url,**kw) 877ifis_full_html: 878returndoc 879# otherwise, lets parse it out... 880bodies=doc.findall('body') 881ifnotbodies: 882bodies=doc.findall('{%s}body'%XHTML_NAMESPACE) 883ifbodies: 884body=bodies[0] 885iflen(bodies)>1: 886# Somehow there are multiple bodies, which is bad, but just 887# smash them into one body 888forother_bodyinbodies[1:]: 889ifother_body.text: 890iflen(body): 891body[-1].tail=(body[-1].tailor'')+other_body.text 892else: 893body.text=(body.textor'')+other_body.text 894body.extend(other_body) 895# We'll ignore tail 896# I guess we are ignoring attributes too 897other_body.drop_tree() 898else: 899body=None 900heads=doc.findall('head') 901ifnotheads: 902heads=doc.findall('{%s}head'%XHTML_NAMESPACE) 903ifheads: 904# Well, we have some sort of structure, so lets keep it all 905head=heads[0] 906iflen(heads)>1: 907forother_headinheads[1:]: 908head.extend(other_head) 909# We don't care about text or tail in a head 910other_head.drop_tree() 911returndoc 912ifbodyisNone: 913returndoc 914if(len(body)==1and(notbody.textornotbody.text.strip()) 915and(notbody[-1].tailornotbody[-1].tail.strip())): 916# The body has just one element, so it was probably a single 917# element passed in 918returnbody[0] 919# Now we have a body which represents a bunch of tags which have the 920# content that was passed in. We will create a fake container, which 921# is the body tag, except <body> implies too much structure. 
922if_contains_block_level_tag(body): 923body.tag='div' 924else: 925body.tag='span' 926returnbody
930""" 931 Parse a filename, URL, or file-like object into an HTML document 932 tree. Note: this returns a tree, not an element. Use 933 ``parse(...).getroot()`` to get the document root. 934 935 You can override the base URL with the ``base_url`` keyword. This 936 is most useful when parsing from a file-like object. 937 """ 938ifparserisNone: 939parser=html_parser 940returnetree.parse(filename_or_url,parser,base_url=base_url,**kw)
944# FIXME: I could do this with XPath, but would that just be 945# unnecessarily slow? 946forelinel.iter(etree.Element): 947if_nons(el.tag)indefs.block_tags: 948returnTrue 949returnFalse
972""" 973 Returns an accessor for all the input elements in the form. 974 975 See `InputGetter` for more information about the object. 976 """ 977returnInputGetter(self)
981""" 982 Dictionary-like object that represents all the fields in this 983 form. You can set values in this dictionary to affect the 984 form. 985 """ 986returnFieldsDict(self.inputs)
990fields=self.fields 991prev_keys=fields.keys() 992forkey,valueinvalue.items(): 993ifkeyinprev_keys: 994prev_keys.remove(key) 995fields[key]=value 996forkeyinprev_keys: 997ifkeyisNone: 998# Case of an unnamed input; these aren't really 999# expressed in form_values() anyway.1000continue1001fields[key]=None
1015"""1016 Return a list of tuples of the field values for the form.1017 This is suitable to be passed to ``urllib.urlencode()``.1018 """1019results=[]1020forelinself.inputs:1021name=el.name1022ifnotnameor'disabled'inel.attrib:1023continue1024tag=_nons(el.tag)1025iftag=='textarea':1026results.append((name,el.value))1027eliftag=='select':1028value=el.value1029ifel.multiple:1030forvinvalue:1031results.append((name,v))1032elifvalueisnotNone:1033results.append((name,el.value))1034else:1035asserttag=='input',(1036"Unexpected tag: %r"%el)1037ifel.checkableandnotel.checked:1038continue1039ifel.typein('submit','image','reset','file'):1040continue1041value=el.value1042ifvalueisnotNone:1043results.append((name,el.value))1044returnresults
1070"""1071 Get/set the form's method. Always returns an upper-cased1072 string, and defaults to ``'GET'``1073 """1074returnself.get('method','GET').upper()
1085"""1086 Helper function to submit a form. Returns a file-like object, as from1087 ``urllib.urlopen()``. This object also has a ``.geturl()`` function,1088 which shows the URL if there were any redirects.10891090 You can use this like::10911092 form = doc.forms[0]1093 form.inputs['foo'].value = 'bar' # etc1094 response = form.submit()1095 doc = parse(response)1096 doc.make_links_absolute(response.geturl())10971098 To change the HTTP requester, pass a function as ``open_http`` keyword1099 argument that opens the URL for you. The function must have the following1100 signature::11011102 open_http(method, URL, values)11031104 The method is one of 'GET' or 'POST', the URL is the target URL as a1105 string, and the values are a sequence of ``(name, value)`` tuples with the1106 form data.1107 """1108values=form.form_values()1109ifextra_values:1110ifhasattr(extra_values,'items'):1111extra_values=extra_values.items()1112values.extend(extra_values)1113ifopen_httpisNone:1114open_http=open_http_urllib1115ifform.action:1116url=form.action1117else:1118url=form.base_url1119returnopen_http(form.method,url,values)
1123ifnoturl:1124raiseValueError("cannot submit, no URL provided")1125## FIXME: should test that it's not a relative URL or something1126try:1127fromurllibimporturlencode,urlopen1128exceptImportError:# Python 31129fromurllib.requestimporturlopen1130fromurllib.parseimporturlencode1131ifmethod=='GET':1132if'?'inurl:1133url+='&'1134else:1135url+='?'1136url+=urlencode(values)1137data=None1138else:1139data=urlencode(values)1140ifnotisinstance(data,bytes):1141data=data.encode('ASCII')1142returnurlopen(url,data)
11721173"""1174 An accessor that represents all the input fields in a form.11751176 You can get fields by name from this, with1177 ``form.inputs['field_name']``. If there are a set of checkboxes1178 with the same name, they are returned as a list (a `CheckboxGroup`1179 which also allows value setting). Radio inputs are handled1180 similarly.11811182 You can also iterate over this to get all input elements. This1183 won't return the same thing as if you get all the names, as1184 checkboxes and radio elements are returned individually.1185 """11861187_name_xpath=etree.XPath(".//*[@name = $name and (local-name(.) = 'select' or local-name(.) = 'input' or local-name(.) = 'textarea')]")1188_all_xpath=etree.XPath(".//*[local-name() = 'select' or local-name() = 'input' or local-name() = 'textarea']")1189
1202results=self._name_xpath(self.form,name=name)1203ifresults:1204type=results[0].get('type')1205iftype=='radio'andlen(results)>1:1206group=RadioGroup(results)1207group.name=name1208returngroup1209eliftype=='checkbox'andlen(results)>1:1210group=CheckboxGroup(results)1211group.name=name1212returngroup1213else:1214# I don't like throwing away elements like this1215returnresults[0]1216else:1217raiseKeyError(1218"No input element with the name %r"%name)
1233## FIXME: kind of dumb to turn a list into an iterator, only1234## to have it likely turned back into a list again :(1235returniter(self._all_xpath(self.form))
1276"""1277 Get/set the value (which is the contents of this element)1278 """1279content=self.textor''1280ifself.tag.startswith("{%s}"%XHTML_NAMESPACE):1281serialisation_method='xml'1282else:1283serialisation_method='html'1284forelinself:1285# it's rare that we actually get here, so let's not use ''.join()1286content+=etree.tostring(1287el,method=serialisation_method,encoding='unicode')1288returncontent
1305"""1306 ``<select>`` element. You can get the name with ``.name``.13071308 ``.value`` will be the value of the selected option, unless this1309 is a multi-select element (``<select multiple>``), in which case1310 it will be a set-like object. In either case ``.value_options``1311 gives the possible values.13121313 The boolean attribute ``.multiple`` shows if this is a1314 multi-select.1315 """1316@property
1318"""1319 Get/set the value of this select (the selected option).13201321 If this is a multi-select, this is a set-like object that1322 represents all the selected options.1323 """1324ifself.multiple:1325returnMultipleSelectOptions(self)1326options=_options_xpath(self)13271328try:1329selected_option=next(elforelinreversed(options)ifel.get('selected')isnotNone)1330exceptStopIteration:1331try:1332selected_option=next(elforelinoptionsifel.get('disabled')isNone)1333exceptStopIteration:1334returnNone1335value=selected_option.get('value')1336ifvalueisNone:1337value=(selected_option.textor'').strip()1338returnvalue
1342ifself.multiple:1343ifisinstance(value,basestring):1344raiseTypeError("You must pass in a sequence")1345values=self.value1346values.clear()1347values.update(value)1348return1349checked_option=None1350ifvalueisnotNone:1351forelin_options_xpath(self):1352opt_value=el.get('value')1353ifopt_valueisNone:1354opt_value=(el.textor'').strip()1355ifopt_value==value:1356checked_option=el1357break1358else:1359raiseValueError(1360"There is no option with the value of %r"%value)1361forelin_options_xpath(self):1362if'selected'inel.attrib:1363delel.attrib['selected']1364ifchecked_optionisnotNone:1365checked_option.set('selected','')
1408"""1409 Represents all the selected options in a ``<select multiple>`` element.14101411 You can add to this set-like object to select an option, or remove1412 to unselect the option.1413 """1414
1434foroptioninself.options:1435opt_value=option.get('value')1436ifopt_valueisNone:1437opt_value=(option.textor'').strip()1438ifopt_value==item:1439option.set('selected','')1440break1441else:1442raiseValueError(1443"There is no option with the value %r"%item)
# Unselect the first option whose effective value equals ``item``.
for option in self.options:
    # An option without a ``value`` attribute is identified by its
    # stripped text content instead.
    opt_value = option.get('value')
    if opt_value is None:
        opt_value = (option.text or '').strip()
    if opt_value == item:
        if 'selected' in option.attrib:
            del option.attrib['selected']
        else:
            # The option exists but there is nothing to unselect.
            raise ValueError(
                "The option %r is not currently selected" % item)
        break
else:
    # The loop finished without ``break``: no option carries this value.
    raise ValueError(
        "There is no option with the value %r" % item)
1469"""1470 This object represents several ``<input type=radio>`` elements1471 that have the same name.14721473 You can use this like a list, but also use the property1474 ``.value`` to check/uncheck inputs. Also you can use1475 ``.value_options`` to get the possible values.1476 """1477@property
1479"""1480 Get/set the value, which checks the radio with that value (and1481 unchecks any other value).1482 """1483forelinself:1484if'checked'inel.attrib:1485returnel.get('value')1486returnNone
1490checked_option=None1491ifvalueisnotNone:1492forelinself:1493ifel.get('value')==value:1494checked_option=el1495break1496else:1497raiseValueError("There is no radio input with the value %r"%value)1498forelinself:1499if'checked'inel.attrib:1500delel.attrib['checked']1501ifchecked_optionisnotNone:1502checked_option.set('checked','')
1522"""1523 Represents a group of checkboxes (``<input type=checkbox>``) that1524 have the same name.15251526 In addition to using this like a list, the ``.value`` attribute1527 returns a set-like object that you can add to or remove from to1528 check and uncheck checkboxes. You can also use ``.value_options``1529 to get the possible values.1530 """1531@property
1533"""1534 Return a set-like object that can be modified to check or1535 uncheck individual checkboxes according to their value.1536 """1537returnCheckboxValues(self)
1589forelinself.group:1590ifel.get('value')==value:1591if'checked'inel.attrib:1592delel.attrib['checked']1593else:1594raiseKeyError(1595"The checkbox with value %r was already unchecked"%value)1596break1597else:1598raiseKeyError(1599"No checkbox with value %r"%value)
1609"""1610 Represents an ``<input>`` element.16111612 You can get the type with ``.type`` (which is lower-cased and1613 defaults to ``'text'``).16141615 Also you can get and set the value with ``.value``16161617 Checkboxes and radios have the attribute ``input.checkable ==1618 True`` (for all others it is false) and a boolean attribute1619 ``.checked``.16201621 """16221623## FIXME: I'm a little uncomfortable with the use of .checked1624@property
1626"""1627 Get/set the value of this element, using the ``value`` attribute.16281629 Also, if this is a checkbox and it has no value, this defaults1630 to ``'on'``. If it is a checkbox or radio that is not1631 checked, this returns None.1632 """1633ifself.checkable:1634ifself.checked:1635returnself.get('value')or'on'1636else:1637returnNone1638returnself.get('value')
1680"""1681 Boolean attribute to get/set the presence of the ``checked``1682 attribute.16831684 You can only use this on checkable input types.1685 """1686ifnotself.checkable:1687raiseAttributeError('Not a checkable input type')1688return'checked'inself.attrib
1692ifnotself.checkable:1693raiseAttributeError('Not a checkable input type')1694ifvalue:1695self.set('checked','')1696else:1697attrib=self.attrib1698if'checked'inattrib:1699delattrib['checked']
1706"""1707 Represents a ``<label>`` element.17081709 Label elements are linked to other elements with their ``for``1710 attribute. You can access this element with ``label.for_element``.1711 """1712@property
1714"""1715 Get/set the element this label points to. Return None if it1716 can't be found.1717 """1718id=self.get('for')1719ifnotid:1720returnNone1721returnself.body.get_element_by_id(id)
1761"""Convert all tags in an XHTML tree to HTML by removing their1762 XHTML namespace.1763 """1764try:1765xhtml=xhtml.getroot()1766exceptAttributeError:1767pass1768prefix="{%s}"%XHTML_NAMESPACE1769prefix_len=len(prefix)1770forelinxhtml.iter(prefix+"*"):1771el.tag=el.tag[prefix_len:]
# Substitution helpers that strip a ``<meta http-equiv="Content-Type" ...>``
# tag from serialised HTML.  This isn't a general match, but it's a match
# for what libxml2 specifically serialises.  Two variants are kept because
# serialisation can yield either text or bytes, and a compiled pattern only
# works on the string type it was compiled from.
__str_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>').sub
__bytes_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>'.encode('ASCII')).sub
1784"""Return an HTML string representation of the document.17851786 Note: if include_meta_content_type is true this will create a1787 ``<meta http-equiv="Content-Type" ...>`` tag in the head;1788 regardless of the value of include_meta_content_type any existing1789 ``<meta http-equiv="Content-Type" ...>`` tag will be removed.17901791 The ``encoding`` argument controls the output encoding (defaults to1792 ASCII, with &#...; character references for any characters outside1793 of ASCII). Note that you can pass the name ``'unicode'`` as1794 ``encoding`` argument to serialise to a Unicode string.17951796 The ``method`` argument defines the output method. It defaults to1797 'html', but can also be 'xml' for xhtml output, or 'text' to1798 serialise to plain text without markup.17991800 To leave out the tail text of the top-level element that is being1801 serialised, pass ``with_tail=False``.18021803 The ``doctype`` option allows passing in a plain string that will1804 be serialised before the XML tree. 
Note that passing in non1805 well-formed content here will make the XML output non well-formed.1806 Also, an existing doctype in the document tree will not be removed1807 when serialising an ElementTree instance.18081809 Example::18101811 >>> from lxml import html1812 >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')18131814 >>> html.tostring(root)1815 b'<p>Hello<br>world!</p>'1816 >>> html.tostring(root, method='html')1817 b'<p>Hello<br>world!</p>'18181819 >>> html.tostring(root, method='xml')1820 b'<p>Hello<br/>world!</p>'18211822 >>> html.tostring(root, method='text')1823 b'Helloworld!'18241825 >>> html.tostring(root, method='text', encoding='unicode')1826 u'Helloworld!'18271828 >>> root = html.fragment_fromstring('<div><p>Hello<br>world!</p>TAIL</div>')1829 >>> html.tostring(root[0], method='text', encoding='unicode')1830 u'Helloworld!TAIL'18311832 >>> html.tostring(root[0], method='text', encoding='unicode', with_tail=False)1833 u'Helloworld!'18341835 >>> doc = html.document_fromstring('<p>Hello<br>world!</p>')1836 >>> html.tostring(doc, method='html', encoding='unicode')1837 u'<html><body><p>Hello<br>world!</p></body></html>'18381839 >>> print(html.tostring(doc, method='html', encoding='unicode',1840 ... doctype='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'1841 ... ' "http://www.w3.org/TR/html4/strict.dtd">'))1842 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">1843 <html><body><p>Hello<br>world!</p></body></html>1844 """1845html=etree.tostring(doc,method=method,pretty_print=pretty_print,1846encoding=encoding,with_tail=with_tail,1847doctype=doctype)1848ifmethod=='html'andnotinclude_meta_content_type:1849ifisinstance(html,str):1850html=__str_replace_meta_content_type('',html)1851else:1852html=__bytes_replace_meta_content_type(bytes(),html)1853returnhtml
1860"""1861 Open the HTML document in a web browser, saving it to a temporary1862 file to open it. Note that this does not delete the file after1863 use. This is mainly meant for debugging.1864 """1865importos1866importwebbrowser1867importtempfile1868ifnotisinstance(doc,etree._ElementTree):1869doc=etree.ElementTree(doc)1870handle,fn=tempfile.mkstemp(suffix='.html')1871f=os.fdopen(handle,'wb')1872try:1873doc.write(f,method="html",encoding=encodingordoc.docinfo.encodingor"UTF-8")1874finally:1875# we leak the file itself here, but we should at least close it1876f.close()1877url='file://'+fn.replace(os.path.sep,'/')1878print(url)1879webbrowser.open(url)
1880
18811882################################################################################1883# configure Element class lookup1884################################################################################18851886-classHTMLParser(etree.HTMLParser):
1887"""An HTML parser that is configured to return lxml.html Element1888 objects.1889 """
1896"""An XML parser that is configured to return lxml.html Element1897 objects.18981899 Note that this parser is not really XHTML aware unless you let it1900 load a DTD that declares the HTML entities. To do this, make sure1901 you have the XHTML DTDs installed in your catalogs, and create the1902 parser like this::19031904 >>> parser = XHTMLParser(load_dtd=True)19051906 If you additionally want to validate the document, use this::19071908 >>> parser = XHTMLParser(dtd_validation=True)19091910 For catalog support, see http://www.xmlsoft.org/catalog.html.1911 """