lxml.html.clean
Package lxml :: Package html :: Module clean
[hide private]
[frames] | no frames]

Source Code for Module lxml.html.clean

  1  """A cleanup tool for HTML. 
  2   
  3  Removes unwanted tags and content.  See the `Cleaner` class for 
  4  details. 
  5  """ 
  6   
  7  import re 
  8  import copy 
  9  try: 
 10      from urlparse import urlsplit 
 11      from urllib import unquote_plus 
 12  except ImportError: 
 13      # Python 3 
 14      from urllib.parse import urlsplit, unquote_plus 
 15  from lxml import etree 
 16  from lxml.html import defs 
 17  from lxml.html import fromstring, XHTML_NAMESPACE 
 18  from lxml.html import xhtml_to_html, _transform_result 
 19   
 20  try: 
 21      unichr 
 22  except NameError: 
 23      # Python 3 
 24      unichr = chr 
 25  try: 
 26      unicode 
 27  except NameError: 
 28      # Python 3 
 29      unicode = str 
 30  try: 
 31      bytes 
 32  except NameError: 
 33      # Python < 2.6 
 34      bytes = str 
 35  try: 
 36      basestring 
 37  except NameError: 
 38      basestring = (str, bytes) 
 39   
 40   
 41  __all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html', 
 42             'word_break', 'word_break_html'] 
 43   
 44  # Look at http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl 
 45  #   Particularly the CSS cleaning; most of the tag cleaning is integrated now 
 46  # I have multiple kinds of schemes searched; but should schemes be 
 47  #   whitelisted instead? 
 48  # max height? 
 49  # remove images?  Also in CSS?  background attribute? 
 50  # Some way to whitelist object, iframe, etc (e.g., if you want to 
 51  #   allow *just* embedded YouTube movies) 
 52  # Log what was deleted and why? 
 53  # style="behavior: ..." might be bad in IE? 
 54  # Should we have something for just <meta http-equiv>?  That's the worst of the 
 55  #   metas. 
 56  # UTF-7 detections?  Example: 
 57  #     <HEAD><META HTTP-EQUIV="CONTENT-TYPE" CONTENT="text/html; charset=UTF-7"> </HEAD>+ADw-SCRIPT+AD4-alert('XSS');+ADw-/SCRIPT+AD4- 
 58  #   you don't always have to have the charset set, if the page has no charset 
 59  #   and there's UTF7-like code in it. 
 60  # Look at these tests: http://htmlpurifier.org/live/smoketests/xssAttacks.php 
 61   
 62   
# This is an IE-specific construct you can have in a stylesheet to
# run some Javascript:
# (matches ``expression(...)`` non-greedily, across newlines, any case)
_css_javascript_re = re.compile(
    r'expression\s*\(.*?\)', re.S|re.I)

# Do I have to worry about @\nimport?
# (``\s*`` already allows whitespace, including newlines, after the ``@``)
_css_import_re = re.compile(
    r'@\s*import', re.I)

# All kinds of schemes besides just javascript: that can cause
# execution:
# ``_is_image_dataurl`` is anchored at the start of the string and matches
# base64 ``data:image/...`` URLs; ``_is_possibly_malicious_scheme`` is an
# unanchored search, so it matches a listed scheme anywhere in the string.
# Both names are bound ``search`` methods and return a match object or None.
_is_image_dataurl = re.compile(
    r'^data:image/.+;base64', re.I).search
_is_possibly_malicious_scheme = re.compile(
    r'(?:javascript|jscript|livescript|vbscript|data|about|mocha):',
    re.I).search
def _is_javascript_scheme(s):
    """
    Report whether ``s`` contains a URL scheme that could cause execution.

    Strings recognised by ``_is_image_dataurl`` (base64 ``data:image/...``
    URLs) are exempt and yield ``None``.  Otherwise the result of
    ``_is_possibly_malicious_scheme`` is returned unchanged: a match
    object when a listed scheme is found, ``None`` when it is not.
    """
    return None if _is_image_dataurl(s) else _is_possibly_malicious_scheme(s)

# Bound ``sub`` that rewrites runs of whitespace and the listed control
# characters; callers pass '' as the replacement to delete them.
_substitute_whitespace = re.compile(r'[\s\x00-\x08\x0B\x0C\x0E-\x19]+').sub
# FIXME: should data: be blocked?

# FIXME: check against: http://msdn2.microsoft.com/en-us/library/ms537512.aspx
# Matches the ``[if ...]>`` opener of an IE conditional comment inside
# comment text.
_conditional_comment_re = re.compile(
    r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S)

# Every element (the context node included) that has a ``style`` attribute.
_find_styled_elements = etree.XPath(
    "descendant-or-self::*[@style]")

# Every <a> (plain or XHTML-namespaced) whose href is non-blank and does
# not start with '#', i.e. is not a same-page fragment link.
_find_external_links = etree.XPath(
    ("descendant-or-self::a [normalize-space(@href) and substring(normalize-space(@href),1,1) != '#'] |"
     "descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']"),
    namespaces={'x':XHTML_NAMESPACE})


class Cleaner(object):
    """
    Instances clean the document of each of the possible offending
    elements.  The cleaning is controlled by attributes; you can
    override attributes in a subclass, or set them in the constructor.

    ``scripts``:
        Removes any ``<script>`` tags.

    ``javascript``:
        Removes any Javascript, like an ``onclick`` attribute. Also removes stylesheets
        as they could contain Javascript.

    ``comments``:
        Removes any comments.

    ``style``:
        Removes any style tags.

    ``inline_style``:
        Removes any style attributes.  Defaults to the value of the ``style`` option.

    ``links``:
        Removes any ``<link>`` tags

    ``meta``:
        Removes any ``<meta>`` tags

    ``page_structure``:
        Structural parts of a page: ``<head>``, ``<html>``, ``<title>``.

    ``processing_instructions``:
        Removes any processing instructions.

    ``embedded``:
        Removes any embedded objects (flash, iframes)

    ``frames``:
        Removes any frame-related tags

    ``forms``:
        Removes any form tags

    ``annoying_tags``:
        Tags that aren't *wrong*, but are annoying.  ``<blink>`` and ``<marquee>``

    ``remove_tags``:
        A list of tags to remove.  Only the tags will be removed,
        their content will get pulled up into the parent tag.

    ``kill_tags``:
        A list of tags to kill.  Killing also removes the tag's content,
        i.e. the whole subtree, not just the tag itself.

    ``allow_tags``:
        A list of tags to include (default include all).

    ``remove_unknown_tags``:
        Remove any tags that aren't standard parts of HTML.

    ``safe_attrs_only``:
        If true, only include 'safe' attributes (specifically the list
        from the feedparser HTML sanitisation web site).

    ``safe_attrs``:
        A set of attribute names to override the default list of attributes
        considered 'safe' (when safe_attrs_only=True).

    ``add_nofollow``:
        If true, then any <a> tags will have ``rel="nofollow"`` added to them.

    ``host_whitelist``:
        A list or set of hosts that you can use for embedded content
        (for content like ``<object>``, ``<link rel="stylesheet">``, etc).
        You can also implement/override the method
        ``allow_embedded_url(el, url)`` or ``allow_element(el)`` to
        implement more complex rules for what can be embedded.
        Anything that passes this test will be shown, regardless of
        the value of (for instance) ``embedded``.

        Note that this parameter might not work as intended if you do not
        make the links absolute before doing the cleaning.

        Note that you may also need to set ``whitelist_tags``.

    ``whitelist_tags``:
        A set of tags that can be included with ``host_whitelist``.
        The default is ``iframe`` and ``embed``; you may wish to
        include other tags like ``script``, or you may want to
        implement ``allow_embedded_url`` for more control.  Set to None to
        include all tags.

    This modifies the document *in place*.
    """

    # Class-level defaults for the options described above; the constructor
    # overrides them per instance from its keyword arguments.
    scripts = True
    javascript = True
    comments = True
    style = False
    # None means "follow ``style``"; resolved in __init__.
    inline_style = None
    links = True
    meta = True
    page_structure = True
    processing_instructions = True
    embedded = True
    frames = True
    forms = True
    annoying_tags = True
    remove_tags = None
    allow_tags = None
    kill_tags = None
    remove_unknown_tags = True
    safe_attrs_only = True
    safe_attrs = defs.safe_attrs
    add_nofollow = False
    host_whitelist = ()
    whitelist_tags = set(['iframe', 'embed'])

218 - def __init__(self, **kw):
219 for name, value in kw.items(): 220 if not hasattr(self, name): 221 raise TypeError( 222 "Unknown parameter: %s=%r" % (name, value)) 223 setattr(self, name, value) 224 if self.inline_style is None and 'inline_style' not in kw: 225 self.inline_style = self.style

    # Used to lookup the primary URL for a given tag that is up for
    # removal:
    # (a value is either one attribute name or a list of attribute names)
    _tag_link_attrs = dict(
        script='src',
        link='href',
        # From: http://java.sun.com/j2se/1.4.2/docs/guide/misc/applet.html
        # From what I can tell, both attributes can contain a link:
        applet=['code', 'object'],
        iframe='src',
        embed='src',
        layer='src',
        # FIXME: there doesn't really seem like a general way to figure out what
        # links an <object> tag uses; links often go in <param> tags with values
        # that we don't really know.  You'd have to have knowledge about specific
        # kinds of plugins (probably keyed off classid), and match against those.
        ##object=?,
        # FIXME: not looking at the action currently, because it is more complex
        # than that -- if you keep the form, you should keep the form controls.
        ##form='action',
        a='href',
        )

249 - def __call__(self, doc):
250 """ 251 Cleans the document. 252 """ 253 if hasattr(doc, 'getroot'): 254 # ElementTree instance, instead of an element 255 doc = doc.getroot() 256 # convert XHTML to HTML 257 xhtml_to_html(doc) 258 # Normalize a case that IE treats <image> like <img>, and that 259 # can confuse either this step or later steps. 260 for el in doc.iter('image'): 261 el.tag = 'img' 262 if not self.comments: 263 # Of course, if we were going to kill comments anyway, we don't 264 # need to worry about this 265 self.kill_conditional_comments(doc) 266 267 kill_tags = set(self.kill_tags or ()) 268 remove_tags = set(self.remove_tags or ()) 269 allow_tags = set(self.allow_tags or ()) 270 271 if self.scripts: 272 kill_tags.add('script') 273 if self.safe_attrs_only: 274 safe_attrs = set(self.safe_attrs) 275 for el in doc.iter(etree.Element): 276 attrib = el.attrib 277 for aname in attrib.keys(): 278 if aname not in safe_attrs: 279 del attrib[aname] 280 if self.javascript: 281 if not (self.safe_attrs_only and 282 self.safe_attrs == defs.safe_attrs): 283 # safe_attrs handles events attributes itself 284 for el in doc.iter(etree.Element): 285 attrib = el.attrib 286 for aname in attrib.keys(): 287 if aname.startswith('on'): 288 del attrib[aname] 289 doc.rewrite_links(self._remove_javascript_link, 290 resolve_base_href=False) 291 # If we're deleting style then we don't have to remove JS links 292 # from styles, otherwise... 293 if not self.inline_style: 294 for el in _find_styled_elements(doc): 295 old = el.get('style') 296 new = _css_javascript_re.sub('', old) 297 new = _css_import_re.sub('', new) 298 if self._has_sneaky_javascript(new): 299 # Something tricky is going on... 
300 del el.attrib['style'] 301 elif new != old: 302 el.set('style', new) 303 if not self.style: 304 for el in list(doc.iter('style')): 305 if el.get('type', '').lower().strip() == 'text/javascript': 306 el.drop_tree() 307 continue 308 old = el.text or '' 309 new = _css_javascript_re.sub('', old) 310 # The imported CSS can do anything; we just can't allow: 311 new = _css_import_re.sub('', old) 312 if self._has_sneaky_javascript(new): 313 # Something tricky is going on... 314 el.text = '/* deleted */' 315 elif new != old: 316 el.text = new 317 if self.comments or self.processing_instructions: 318 # FIXME: why either? I feel like there's some obscure reason 319 # because you can put PIs in comments...? But I've already 320 # forgotten it 321 kill_tags.add(etree.Comment) 322 if self.processing_instructions: 323 kill_tags.add(etree.ProcessingInstruction) 324 if self.style: 325 kill_tags.add('style') 326 if self.inline_style: 327 etree.strip_attributes(doc, 'style') 328 if self.links: 329 kill_tags.add('link') 330 elif self.style or self.javascript: 331 # We must get rid of included stylesheets if Javascript is not 332 # allowed, as you can put Javascript in them 333 for el in list(doc.iter('link')): 334 if 'stylesheet' in el.get('rel', '').lower(): 335 # Note this kills alternate stylesheets as well 336 if not self.allow_element(el): 337 el.drop_tree() 338 if self.meta: 339 kill_tags.add('meta') 340 if self.page_structure: 341 remove_tags.update(('head', 'html', 'title')) 342 if self.embedded: 343 # FIXME: is <layer> really embedded? 344 # We should get rid of any <param> tags not inside <applet>; 345 # These are not really valid anyway. 
346 for el in list(doc.iter('param')): 347 found_parent = False 348 parent = el.getparent() 349 while parent is not None and parent.tag not in ('applet', 'object'): 350 parent = parent.getparent() 351 if parent is None: 352 el.drop_tree() 353 kill_tags.update(('applet',)) 354 # The alternate contents that are in an iframe are a good fallback: 355 remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param')) 356 if self.frames: 357 # FIXME: ideally we should look at the frame links, but 358 # generally frames don't mix properly with an HTML 359 # fragment anyway. 360 kill_tags.update(defs.frame_tags) 361 if self.forms: 362 remove_tags.add('form') 363 kill_tags.update(('button', 'input', 'select', 'textarea')) 364 if self.annoying_tags: 365 remove_tags.update(('blink', 'marquee')) 366 367 _remove = [] 368 _kill = [] 369 for el in doc.iter(): 370 if el.tag in kill_tags: 371 if self.allow_element(el): 372 continue 373 _kill.append(el) 374 elif el.tag in remove_tags: 375 if self.allow_element(el): 376 continue 377 _remove.append(el) 378 379 if _remove and _remove[0] == doc: 380 # We have to drop the parent-most tag, which we can't 381 # do. Instead we'll rewrite it: 382 el = _remove.pop(0) 383 el.tag = 'div' 384 el.attrib.clear() 385 elif _kill and _kill[0] == doc: 386 # We have to drop the parent-most element, which we can't 387 # do. 
Instead we'll clear it: 388 el = _kill.pop(0) 389 if el.tag != 'html': 390 el.tag = 'div' 391 el.clear() 392 393 _kill.reverse() # start with innermost tags 394 for el in _kill: 395 el.drop_tree() 396 for el in _remove: 397 el.drop_tag() 398 399 if self.remove_unknown_tags: 400 if allow_tags: 401 raise ValueError( 402 "It does not make sense to pass in both allow_tags and remove_unknown_tags") 403 allow_tags = set(defs.tags) 404 if allow_tags: 405 bad = [] 406 for el in doc.iter(): 407 if el.tag not in allow_tags: 408 bad.append(el) 409 if bad: 410 if bad[0] is doc: 411 el = bad.pop(0) 412 el.tag = 'div' 413 el.attrib.clear() 414 for el in bad: 415 el.drop_tag() 416 if self.add_nofollow: 417 for el in _find_external_links(doc): 418 if not self.allow_follow(el): 419 rel = el.get('rel') 420 if rel: 421 if ('nofollow' in rel 422 and ' nofollow ' in (' %s ' % rel)): 423 continue 424 rel = '%s nofollow' % rel 425 else: 426 rel = 'nofollow' 427 el.set('rel', rel)
428
    def allow_follow(self, anchor):
        """
        Override to suppress rel="nofollow" on some anchors.

        Called by ``__call__`` for each external link when
        ``add_nofollow`` is set; a true result leaves the anchor's
        ``rel`` untouched.  This default returns False for every anchor.
        """
        return False
434
435 - def allow_element(self, el):
436 if el.tag not in self._tag_link_attrs: 437 return False 438 attr = self._tag_link_attrs[el.tag] 439 if isinstance(attr, (list, tuple)): 440 for one_attr in attr: 441 url = el.get(one_attr) 442 if not url: 443 return False 444 if not self.allow_embedded_url(el, url): 445 return False 446 return True 447 else: 448 url = el.get(attr) 449 if not url: 450 return False 451 return self.allow_embedded_url(el, url)
452
453 - def allow_embedded_url(self, el, url):
454 if (self.whitelist_tags is not None 455 and el.tag not in self.whitelist_tags): 456 return False 457 scheme, netloc, path, query, fragment = urlsplit(url) 458 netloc = netloc.lower().split(':', 1)[0] 459 if scheme not in ('http', 'https'): 460 return False 461 if netloc in self.host_whitelist: 462 return True 463 return False
464
465 - def kill_conditional_comments(self, doc):
466 """ 467 IE conditional comments basically embed HTML that the parser 468 doesn't normally see. We can't allow anything like that, so 469 we'll kill any comments that could be conditional. 470 """ 471 bad = [] 472 self._kill_elements( 473 doc, lambda el: _conditional_comment_re.search(el.text), 474 etree.Comment)
475
476 - def _kill_elements(self, doc, condition, iterate=None):
477 bad = [] 478 for el in doc.iter(iterate): 479 if condition(el): 480 bad.append(el) 481 for el in bad: 482 el.drop_tree()

    # NOTE(review): listing lines 484-490 are collapsed here; the
    # ``_remove_javascript_link`` method used by ``__call__`` is presumably
    # defined there -- confirm against the full module.

    # Bound ``sub`` that rewrites CSS comments (``/* ... */``); the match is
    # non-greedy and, because of re.S, may span newlines.
    _substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub

494 - def _has_sneaky_javascript(self, style):
495 """ 496 Depending on the browser, stuff like ``e x p r e s s i o n(...)`` 497 can get interpreted, or ``expre/* stuff */ssion(...)``. This 498 checks for attempt to do stuff like this. 499 500 Typically the response will be to kill the entire style; if you 501 have just a bit of Javascript in the style another rule will catch 502 that and remove only the Javascript from the style; this catches 503 more sneaky attempts. 504 """ 505 style = self._substitute_comments('', style) 506 style = style.replace('\\', '') 507 style = _substitute_whitespace('', style) 508 style = style.lower() 509 if 'javascript:' in style: 510 return True 511 if 'expression(' in style: 512 return True 513 return False
514
515 - def clean_html(self, html):
516 result_type = type(html) 517 if isinstance(html, basestring): 518 doc = fromstring(html) 519 else: 520 doc = copy.deepcopy(html) 521 self(doc) 522 return _transform_result(result_type, doc)

# Module-level cleaner with the default options, and its bound
# ``clean_html`` exposed as a plain function.
clean = Cleaner()
clean_html = clean.clean_html

############################################################
## Autolinking
############################################################

# Each pattern exposes a ``body`` group and a ``host`` group.
_link_regexes = [
    re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?(?:\([/\-_.,a-z0-9%&?;=~]*\))?)', re.I),
    # This is conservative, but autolinking can be a bit conservative:
    re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_.-]+[a-z]))', re.I),
    ]

_avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a']

_avoid_hosts = [
    re.compile(r'^localhost', re.I),
    re.compile(r'\bexample\.(?:com|org|net)$', re.I),
    re.compile(r'^127\.0\.0\.1$'),
    ]

_avoid_classes = ['nolink']

# NOTE(review): ``autolink`` and ``autolink_html`` are listed in ``__all__``
# but their definitions (listing lines 547-658) are collapsed in this view --
# confirm they are defined before the next statement runs.
autolink_html.__doc__ = autolink.__doc__

############################################################
## Word wrapping
############################################################

# Defaults for ``word_break``: tags and CSS classes that are never broken.
_avoid_word_break_elements = ['pre', 'textarea', 'code']
_avoid_word_break_classes = ['nobreak']

def word_break(el, max_width=40,
               avoid_elements=_avoid_word_break_elements,
               avoid_classes=_avoid_word_break_classes,
               break_character=unichr(0x200b)):
    """
    Breaks any long words found in the body of the text (not attributes).

    ``el`` is modified in place, recursively: the text of the element
    and of its descendants, and the tail text of each descendant.

    Doesn't affect any of the tags in ``avoid_elements`` (by default
    ``<pre>``, ``<textarea>`` and ``<code>``), nor any element that has
    one of ``avoid_classes`` (by default ``nobreak``) in its ``class``
    attribute.  Such an element is skipped together with its children;
    its tail text is still processed by the parent.

    Breaks words by inserting &#8203;, which is a unicode character
    for Zero Width Space character.  This generally takes up no space
    in rendering, but does copy as a space, and in monospace contexts
    usually takes up space.

    See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion
    """
    # Character suggestion of &#8203 comes from:
    #   http://www.cs.tut.fi/~jkorpela/html/nobr.html
    # Honour the caller's ``avoid_elements`` rather than the module default.
    if el.tag in avoid_elements:
        return
    class_name = el.get('class')
    if class_name:
        class_names = class_name.split()
        for avoid in avoid_classes:
            if avoid in class_names:
                return
    if el.text:
        el.text = _break_text(el.text, max_width, break_character)
    for child in el:
        word_break(child, max_width=max_width,
                   avoid_elements=avoid_elements,
                   avoid_classes=avoid_classes,
                   break_character=break_character)
        if child.tail:
            child.tail = _break_text(child.tail, max_width, break_character)
708
def word_break_html(html, *args, **kw):
    """
    Parse ``html``, apply ``word_break`` to it and return the result in
    the same type as the input.

    Extra positional and keyword arguments are passed on to
    ``word_break``.
    """
    doc = fromstring(html)
    word_break(doc, *args, **kw)
    return _transform_result(type(html), doc)
714
def _break_text(text, max_width, break_character):
    """
    Return ``text`` with every whitespace-delimited word longer than
    ``max_width`` rewritten by ``_insert_break``.

    Whitespace between words is kept exactly as it was.  Each word is
    rewritten at its own position, so a long word that also occurs
    inside another word does not disturb how that other word is broken.
    """
    pieces = []
    pos = 0
    for word in text.split():
        # Only whitespace lies between ``pos`` and the next word, so the
        # first occurrence at or after ``pos`` is the word itself.
        start = text.index(word, pos)
        end = start + len(word)
        if len(word) > max_width:
            pieces.append(text[pos:start])
            pieces.append(_insert_break(word, max_width, break_character))
        else:
            pieces.append(text[pos:end])
        pos = end
    pieces.append(text[pos:])
    return ''.join(pieces)
722 723 _break_prefer_re = re.compile(r'[^a-z]', re.I) 724
725 -def _insert_break(word, width, break_character):
726 orig_word = word 727 result = '' 728 while len(word) > width: 729 start = word[:width] 730 breaks = list(_break_prefer_re.finditer(start)) 731 if breaks: 732 last_break = breaks[-1] 733 # Only walk back up to 10 characters to find a nice break: 734 if last_break.end() > width-10: 735 # FIXME: should the break character be at the end of the 736 # chunk, or the beginning of the next chunk? 737 start = word[:last_break.end()] 738 result += start + break_character 739 word = word[len(start):] 740 result += word 741 return result
742