lxml.tests.test_htmlparser
1
2
"""
HTML parser test cases for etree
"""
6
7 import unittest
8 import tempfile , os , os . path , sys
9
# Put the directory holding this file at the front of the import path
# (presumably so the ``common_imports`` import that follows resolves --
# confirm against the test layout).  Skipped when it is already listed.
this_dir = os.path.dirname(__file__)
if this_dir not in sys.path:
    sys.path.insert(0, this_dir)
13
14 from common_imports import etree , html , StringIO , BytesIO , fileInTestDir , _bytes , _str
15 from common_imports import SillyFileLike , HelperTestCase , write_to_file , next
16
# Bind ``unicode`` to ``str`` on interpreters where the bare name lookup
# raises NameError; where the name already exists it is left untouched.
try:
    unicode
except NameError:
    unicode = str
21
22
24 """HTML parser test cases
25 """
26 etree = etree
27
28 html_str = _bytes ( "<html><head><title>test</title></head><body><h1>page title</h1></body></html>" )
29 html_str_pretty = _bytes ( """\
30 <html>
31 <head><title>test</title></head>
32 <body><h1>page title</h1></body>
33 </html>
34 """ )
35 broken_html_str = _bytes ( "<html><head><title>test"
36 "<body><h1>page title</h3></p></html>" )
37 uhtml_str = _bytes (
38 "<html><head><title>test á</title></head>"
39 "<body><h1>page á title</h1></body></html>" ) . decode ( 'utf8' )
40
44
49
57
59 if sys . maxunicode < 1114111 :
60 return
61 element = self . etree . HTML ( _bytes (
62 '<html><body><p>\\U00026007</p></body></html>'
63 ) . decode ( 'unicode_escape' ) )
64 p_text = element . findtext ( './/p' )
65 self . assertEqual ( 1 , len ( p_text ) )
66 self . assertEqual ( _bytes ( '\\U00026007' ) . decode ( 'unicode_escape' ) ,
67 p_text )
68
76
84
89
96
98 parser = self . etree . HTMLParser ( )
99 Element = parser . makeelement
100
101 el = Element ( 'name' )
102 self . assertRaises ( ValueError , Element , '{}' )
103 self . assertRaises ( ValueError , setattr , el , 'tag' , '{}' )
104
105 self . assertRaises ( ValueError , Element , '{test}' )
106 self . assertRaises ( ValueError , setattr , el , 'tag' , '{test}' )
107
121
123 parser = self . etree . HTMLParser ( )
124 Element = parser . makeelement
125
126 self . assertRaises ( ValueError , Element , 'p"name' )
127 self . assertRaises ( ValueError , Element , "na'me" )
128 self . assertRaises ( ValueError , Element , '{test}"name' )
129 self . assertRaises ( ValueError , Element , "{test}name'" )
130
131 el = Element ( 'name' )
132 self . assertRaises ( ValueError , setattr , el , 'tag' , "pname'" )
133 self . assertRaises ( ValueError , setattr , el , 'tag' , '"pname' )
134 self . assertEqual ( el . tag , "name" )
135
137 parser = self . etree . HTMLParser ( )
138 Element = parser . makeelement
139
140 self . assertRaises ( ValueError , Element , ' name ' )
141 self . assertRaises ( ValueError , Element , 'na me' )
142 self . assertRaises ( ValueError , Element , '{test} name' )
143
144 el = Element ( 'name' )
145 self . assertRaises ( ValueError , setattr , el , 'tag' , ' name ' )
146 self . assertEqual ( el . tag , "name" )
147
157
169
171 parser = self . etree . HTMLParser ( )
172 Element = parser . makeelement
173 SubElement = self . etree . SubElement
174
175 el = Element ( 'name' )
176 self . assertRaises ( ValueError , SubElement , el , "name'" )
177 self . assertRaises ( ValueError , SubElement , el , 'na"me' )
178 self . assertRaises ( ValueError , SubElement , el , "{test}na'me" )
179 self . assertRaises ( ValueError , SubElement , el , '{test}"name' )
180
190
197
205
215
217 text = _str ( 'Søk på nettet' )
218 wrong_head = _str ( '''
219 <head>
220 <meta http-equiv="Content-Type"
221 content="text/html; charset=UTF-8" />
222 </head>''' )
223 html_latin1 = ( _str ( '<html>%s<body><p>%s</p></body></html>' ) % ( wrong_head ,
224 text )
225 ) . encode ( 'iso-8859-1' )
226
227 self . assertRaises ( self . etree . ParseError ,
228 self . etree . parse ,
229 BytesIO ( html_latin1 ) )
230
231 tree = self . etree . parse (
232 BytesIO ( html_latin1 ) ,
233 self . etree . HTMLParser ( encoding = "iso-8859-1" ) )
234 p = tree . find ( "//p" )
235 self . assertEqual ( p . text , text )
236
241
247
251
264
272
273
274
275
276
277
278
279
286
301
303 iterparse = self . etree . iterparse
304 f = BytesIO (
305 '<html><head><title>TITLE</title><body><p>P</p></body></html>' )
306
307 iterator = iterparse ( f , html = True )
308 self . assertEqual ( None , iterator . root )
309
310 events = list ( iterator )
311 root = iterator . root
312 self . assertTrue ( root is not None )
313 self . assertEqual (
314 [ ( 'end' , root [ 0 ] [ 0 ] ) , ( 'end' , root [ 0 ] ) , ( 'end' , root [ 1 ] [ 0 ] ) ,
315 ( 'end' , root [ 1 ] ) , ( 'end' , root ) ] ,
316 events )
317
319 iterparse = self . etree . iterparse
320 f = BytesIO (
321 '<html><head><title>TITLE</title><body><p>P</p></body></html>' )
322
323 iterator = iterparse ( f , html = True )
324 self . assertEqual ( None , iterator . root )
325
326 event , element = next ( iterator )
327 self . assertEqual ( 'end' , event )
328 self . assertEqual ( 'title' , element . tag )
329 self . assertEqual ( None , iterator . root )
330 del element
331
332 event , element = next ( iterator )
333 self . assertEqual ( 'end' , event )
334 self . assertEqual ( 'head' , element . tag )
335 self . assertEqual ( None , iterator . root )
336 del element
337 del iterator
338
340 iterparse = self . etree . iterparse
341 f = BytesIO ( '<head><title>TEST></head><p>P<br></div>' )
342
343 iterator = iterparse ( f , html = True )
344 self . assertEqual ( None , iterator . root )
345
346 events = list ( iterator )
347 root = iterator . root
348 self . assertTrue ( root is not None )
349 self . assertEqual ( 'html' , root . tag )
350 self . assertEqual ( 'head' , root [ 0 ] . tag )
351 self . assertEqual ( 'body' , root [ 1 ] . tag )
352 self . assertEqual ( 'p' , root [ 1 ] [ 0 ] . tag )
353 self . assertEqual ( 'br' , root [ 1 ] [ 0 ] [ 0 ] . tag )
354 self . assertEqual (
355 [ ( 'end' , root [ 0 ] [ 0 ] ) , ( 'end' , root [ 0 ] ) , ( 'end' , root [ 1 ] [ 0 ] [ 0 ] ) ,
356 ( 'end' , root [ 1 ] [ 0 ] ) , ( 'end' , root [ 1 ] ) , ( 'end' , root ) ] ,
357 events )
358
364
366 iterparse = self . etree . iterparse
367 iterator = iterparse ( fileInTestDir ( "shakespeare.html" ) ,
368 html = True )
369
370 self . assertEqual ( None , iterator . root )
371 events = list ( iterator )
372 root = iterator . root
373 self . assertTrue ( root is not None )
374 self . assertEqual ( 249 , len ( events ) )
375 self . assertFalse (
376 [ event for ( event , element ) in events if event != 'end' ] )
377
379 iterparse = self . etree . iterparse
380 f = BytesIO (
381 '<html><head><title>TITLE</title><body><p>P</p></body></html>' )
382
383 iterator = iterparse ( f , html = True , events = ( 'start' , ) )
384 self . assertEqual ( None , iterator . root )
385
386 events = list ( iterator )
387 root = iterator . root
388 self . assertNotEqual ( None , root )
389 self . assertEqual (
390 [ ( 'start' , root ) , ( 'start' , root [ 0 ] ) , ( 'start' , root [ 0 ] [ 0 ] ) ,
391 ( 'start' , root [ 1 ] ) , ( 'start' , root [ 1 ] [ 0 ] ) ] ,
392 events )
393
404
423
438
440 assertFalse = self . assertFalse
441 events = [ ]
class Target(object):
    """Parser target that logs start/end callbacks into ``events``.

    ``events`` and ``assertFalse`` are not attributes of this class; they
    are names bound in the enclosing scope just before the class.
    """

    def start(self, tag, attrib):
        """Record the opening of ``tag`` and check it has no attributes."""
        events.append(("start", tag))
        # The markup fed to the parser carries no attributes at all.
        assertFalse(attrib)

    def end(self, tag):
        """Record the closing of ``tag``."""
        events.append(("end", tag))

    def close(self):
        """Return the marker the test compares with ``parser.close()``."""
        return "DONE"
450
451 parser = self . etree . HTMLParser ( target = Target ( ) )
452
453 parser . feed ( "<html><body></body></html>" )
454 done = parser . close ( )
455
456 self . assertEqual ( "DONE" , done )
457 self . assertEqual ( [
458 ( "start" , "html" ) , ( "start" , "body" ) ,
459 ( "end" , "body" ) , ( "end" , "html" ) ] , events )
460
462 assertFalse = self . assertFalse
463 events = [ ]
class Target(object):
    """Parser target that logs start, end and doctype callbacks.

    Every callback appends a tuple to ``events``, a list bound in the
    enclosing scope; ``assertFalse`` comes from that scope as well.
    """

    def start(self, tag, attrib):
        """Log an opening tag; the fed markup has no attributes."""
        events.append(("start", tag))
        assertFalse(attrib)

    def end(self, tag):
        """Log a closing tag."""
        events.append(("end", tag))

    def doctype(self, *args):
        """Log the doctype callback with its arguments kept as a tuple."""
        # Kept as ``*args`` so the recorded tuple shows exactly what the
        # parser passed in.
        events.append(("doctype", args))

    def close(self):
        """Return the marker expected back from ``parser.close()``."""
        return "DONE"
474
475 parser = self . etree . HTMLParser ( target = Target ( ) )
476 parser . feed ( "<!DOCTYPE><html><body></body></html>" )
477 done = parser . close ( )
478
479 self . assertEqual ( "DONE" , done )
480 self . assertEqual ( [
481 ( "doctype" , ( None , None , None ) ) ,
482 ( "start" , "html" ) , ( "start" , "body" ) ,
483 ( "end" , "body" ) , ( "end" , "html" ) ] , events )
484
486 assertFalse = self . assertFalse
487 events = [ ]
class Target(object):
    """Event-recording parser target, including the doctype callback.

    Relies on ``events`` (a list) and ``assertFalse`` from the enclosing
    scope rather than on instance state.
    """

    def start(self, tag, attrib):
        """Append ``("start", tag)`` and require ``attrib`` to be empty."""
        events.append(("start", tag))
        assertFalse(attrib)

    def end(self, tag):
        """Append ``("end", tag)``."""
        events.append(("end", tag))

    def doctype(self, *args):
        """Append ``("doctype", args)`` with ``args`` left as received."""
        events.append(("doctype", args))

    def close(self):
        """Hand back ``"DONE"`` so the caller can check ``close()``."""
        return "DONE"
498
499 parser = self . etree . HTMLParser ( target = Target ( ) )
500 parser . feed ( "<!DOCTYPE html><html><body></body></html>" )
501 done = parser . close ( )
502
503 self . assertEqual ( "DONE" , done )
504 self . assertEqual ( [
505 ( "doctype" , ( "html" , None , None ) ) ,
506 ( "start" , "html" ) , ( "start" , "body" ) ,
507 ( "end" , "body" ) , ( "end" , "html" ) ] , events )
508
510 assertFalse = self . assertFalse
511 events = [ ]
class Target(object):
    """Parser target capturing tag events and the doctype arguments.

    The surrounding test compares the collected ``events`` list (bound in
    the enclosing scope) against the expected callback sequence.
    """

    def start(self, tag, attrib):
        """Capture a start event; no attributes are expected."""
        events.append(("start", tag))
        assertFalse(attrib)

    def end(self, tag):
        """Capture an end event."""
        events.append(("end", tag))

    def doctype(self, *args):
        """Capture whatever positional arguments the parser supplies."""
        events.append(("doctype", args))

    def close(self):
        """Return the fixed result checked after ``parser.close()``."""
        return "DONE"
522
523 parser = self . etree . HTMLParser ( target = Target ( ) )
524 parser . feed ( '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "sys.dtd">'
525 '<html><body></body></html>' )
526 done = parser . close ( )
527
528 self . assertEqual ( "DONE" , done )
529 self . assertEqual ( [
530 ( "doctype" , ( "html" , "-//W3C//DTD HTML 4.01//EN" , "sys.dtd" ) ) ,
531 ( "start" , "html" ) , ( "start" , "body" ) ,
532 ( "end" , "body" ) , ( "end" , "html" ) ] , events )
533
535 events = [ ]
class Target(object):
    """Parser target whose tag callbacks always raise.

    Each callback first appends to ``events`` (a list from the enclosing
    scope), so the test can see which callbacks ran before the error.
    """

    def start(self, tag, attrib):
        """Record the start event, then fail with ``ValueError("START")``."""
        events.append(("start", tag))
        raise ValueError("START")

    def end(self, tag):
        """Record the end event, then fail with ``TypeError("END")``."""
        events.append(("end", tag))
        raise TypeError("END")

    def close(self):
        """Return ``"DONE"``."""
        return "DONE"
545
546 parser = self . etree . HTMLParser ( target = Target ( ) )
547 try :
548 parser . feed ( '<html><body>' )
549 parser . feed ( '</body></html>' )
550 except ValueError as exc :
551 assert "START" in str ( exc )
552 except TypeError as exc :
553 assert "END" in str ( exc )
554 self . assertTrue ( False , "wrong exception raised" )
555 else :
556 self . assertTrue ( False , "no exception raised" )
557
558 self . assertTrue ( ( "start" , "html" ) in events , events )
559 self . assertTrue ( ( "end" , "html" ) not in events , events )
560
562 events = [ ]
class Target(object):
    """Failing parser target: ``start`` and ``end`` both raise.

    The event is appended to the enclosing scope's ``events`` list before
    the exception is raised, which the test inspects afterwards.
    """

    def start(self, tag, attrib):
        """Log the start event and raise ``ValueError("START")``."""
        events.append(("start", tag))
        raise ValueError("START")

    def end(self, tag):
        """Log the end event and raise ``TypeError("END")``."""
        events.append(("end", tag))
        raise TypeError("END")

    def close(self):
        """Return ``"DONE"``."""
        return "DONE"
572
573 parser = self . etree . HTMLParser ( target = Target ( ) )
574 try :
575 self . etree . fromstring ( '<html><body></body></html>' , parser )
576 except ValueError as exc :
577 assert "START" in str ( exc ) , str ( exc )
578 except TypeError as exc :
579 assert "END" in str ( exc ) , str ( exc )
580 self . assertTrue ( False , "wrong exception raised" )
581 else :
582 self . assertTrue ( False , "no exception raised" )
583
584 self . assertTrue ( ( "start" , "html" ) in events , events )
585 self . assertTrue ( ( "end" , "html" ) not in events , events )
586
588 doc = html . Element ( 'html' ) . getroottree ( )
589 doc . docinfo . public_id = "-//W3C//DTD XHTML 1.0 Strict//EN"
590 doc . docinfo . system_url = \
591 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"
592 self . assertEqual ( doc . docinfo . doctype ,
593 '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">' )
594 self . assertEqual ( self . etree . tostring ( doc ) ,
595 _bytes ( '''<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
596 <html xmlns="http://www.w3.org/1999/xhtml"></html>''' ) )
597
608
618
628
634
640
641
643 suite = unittest . TestSuite ( )
644 suite . addTests ( [ unittest . makeSuite ( HtmlParserTestCase ) ] )
645 return suite
646
647
if __name__ == '__main__':
    # Executing this file directly runs no tests; it only prints a hint
    # pointing at the ``test.py`` runner.
    print('to test use test.py %s' % __file__)
650