lxml.html.clean

101 """ 102 Instances cleans the document of each of the possible offending 103 elements. The cleaning is controlled by attributes; you can 104 override attributes in a subclass, or set them in the constructor. 105 106 ``scripts``: 107 Removes any ``<script>`` tags. 108 109 ``javascript``: 110 Removes any Javascript, like an ``onclick`` attribute. Also removes stylesheets 111 as they could contain Javascript. 112 113 ``comments``: 114 Removes any comments. 115 116 ``style``: 117 Removes any style tags. 118 119 ``inline_style`` 120 Removes any style attributes. Defaults to the value of the ``style`` option. 121 122 ``links``: 123 Removes any ``<link>`` tags 124 125 ``meta``: 126 Removes any ``<meta>`` tags 127 128 ``page_structure``: 129 Structural parts of a page: ``<head>``, ``<html>``, ``<title>``. 130 131 ``processing_instructions``: 132 Removes any processing instructions. 133 134 ``embedded``: 135 Removes any embedded objects (flash, iframes) 136 137 ``frames``: 138 Removes any frame-related tags 139 140 ``forms``: 141 Removes any form tags 142 143 ``annoying_tags``: 144 Tags that aren't *wrong*, but are annoying. ``<blink>`` and ``<marquee>`` 145 146 ``remove_tags``: 147 A list of tags to remove. Only the tags will be removed, 148 their content will get pulled up into the parent tag. 149 150 ``kill_tags``: 151 A list of tags to kill. Killing also removes the tag's content, 152 i.e. the whole subtree, not just the tag itself. 153 154 ``allow_tags``: 155 A list of tags to include (default include all). 156 157 ``remove_unknown_tags``: 158 Remove any tags that aren't standard parts of HTML. 159 160 ``safe_attrs_only``: 161 If true, only include 'safe' attributes (specifically the list 162 from the feedparser HTML sanitisation web site). 163 164 ``safe_attrs``: 165 A set of attribute names to override the default list of attributes 166 considered 'safe' (when safe_attrs_only=True). 
167 168 ``add_nofollow``: 169 If true, then any <a> tags will have ``rel="nofollow"`` added to them. 170 171 ``host_whitelist``: 172 A list or set of hosts that you can use for embedded content 173 (for content like ``<object>``, ``<link rel="stylesheet">``, etc). 174 You can also implement/override the method 175 ``allow_embedded_url(el, url)`` or ``allow_element(el)`` to 176 implement more complex rules for what can be embedded. 177 Anything that passes this test will be shown, regardless of 178 the value of (for instance) ``embedded``. 179 180 Note that this parameter might not work as intended if you do not 181 make the links absolute before doing the cleaning. 182 183 Note that you may also need to set ``whitelist_tags``. 184 185 ``whitelist_tags``: 186 A set of tags that can be included with ``host_whitelist``. 187 The default is ``iframe`` and ``embed``; you may wish to 188 include other tags like ``script``, or you may want to 189 implement ``allow_embedded_url`` for more control. Set to None to 190 include all tags. 191 192 This modifies the document *in place*. 193 """ 194 195 scripts = True 196 javascript = True 197 comments = True 198 style = False 199 inline_style = None 200 links = True 201 meta = True 202 page_structure = True 203 processing_instructions = True 204 embedded = True 205 frames = True 206 forms = True 207 annoying_tags = True 208 remove_tags = None 209 allow_tags = None 210 kill_tags = None 211 remove_unknown_tags = True 212 safe_attrs_only = True 213 safe_attrs = defs.safe_attrs 214 add_nofollow = False 215 host_whitelist = () 216 whitelist_tags = set(['iframe', 'embed']) 217

def __init__(self, **kw):
    """
    Configure the cleaner from keyword options.

    Every keyword must name an attribute that already exists on the
    instance (normally one of the class-level defaults); anything else
    raises ``TypeError``.  When ``inline_style`` is not passed and is
    still ``None`` afterwards, it takes the value of ``style``.
    """
    for option, setting in kw.items():
        if hasattr(self, option):
            setattr(self, option, setting)
        else:
            raise TypeError(
                "Unknown parameter: %s=%r" % (option, setting))
    inherit_from_style = (
        self.inline_style is None and 'inline_style' not in kw)
    if inherit_from_style:
        self.inline_style = self.style

# Used to lookup the primary URL for a given tag that is up for
# removal.  A value is either a single attribute name or a list of
# attribute names (see ``allow_element``, which handles both forms):
_tag_link_attrs = dict(
    script='src',
    link='href',
    # From: http://java.sun.com/j2se/1.4.2/docs/guide/misc/applet.html
    # From what I can tell, both attributes can contain a link:
    applet=['code', 'object'],
    iframe='src',
    embed='src',
    layer='src',
    # FIXME: there doesn't really seem like a general way to figure out what
    # links an <object> tag uses; links often go in <param> tags with values
    # that we don't really know.  You'd have to have knowledge about specific
    # kinds of plugins (probably keyed off classid), and match against those.
    ##object=?,
    # FIXME: not looking at the action currently, because it is more complex
    # than that -- if you keep the form, you should keep the form controls.
    ##form='action',
    a='href',
    )

def __call__(self, doc):
    """
    Cleans the document, modifying it in place.

    ``doc`` is either an element or an object with a ``getroot()``
    method (an ElementTree), in which case its root element is cleaned.
    Which constructs are dropped is controlled by the option attributes
    on ``self``.

    Raises ``ValueError`` when both ``allow_tags`` and
    ``remove_unknown_tags`` are set.  Note that this check happens late,
    after most of the cleaning has already been applied to ``doc``.
    """
    if hasattr(doc, 'getroot'):
        # ElementTree instance, instead of an element
        doc = doc.getroot()
    # convert XHTML to HTML
    xhtml_to_html(doc)
    # Normalize a case that IE treats <image> like <img>, and that
    # can confuse either this step or later steps.
    for el in doc.iter('image'):
        el.tag = 'img'
    if not self.comments:
        # Of course, if we were going to kill comments anyway, we don't
        # need to worry about this
        self.kill_conditional_comments(doc)

    # Work on copies so the configured options are never mutated.
    kill_tags = set(self.kill_tags or ())
    remove_tags = set(self.remove_tags or ())
    allow_tags = set(self.allow_tags or ())

    if self.scripts:
        kill_tags.add('script')
    if self.safe_attrs_only:
        safe_attrs = set(self.safe_attrs)
        for el in doc.iter(etree.Element):
            attrib = el.attrib
            for aname in attrib.keys():
                if aname not in safe_attrs:
                    del attrib[aname]
    if self.javascript:
        if not (self.safe_attrs_only and
                self.safe_attrs == defs.safe_attrs):
            # safe_attrs handles events attributes itself
            for el in doc.iter(etree.Element):
                attrib = el.attrib
                for aname in attrib.keys():
                    if aname.startswith('on'):
                        del attrib[aname]
        doc.rewrite_links(self._remove_javascript_link,
                          resolve_base_href=False)
        # If we're deleting style then we don't have to remove JS links
        # from styles, otherwise...
        if not self.inline_style:
            for el in _find_styled_elements(doc):
                old = el.get('style')
                new = _css_javascript_re.sub('', old)
                new = _css_import_re.sub('', new)
                if self._has_sneaky_javascript(new):
                    # Something tricky is going on...
                    del el.attrib['style']
                elif new != old:
                    el.set('style', new)
        if not self.style:
            for el in list(doc.iter('style')):
                if el.get('type', '').lower().strip() == 'text/javascript':
                    el.drop_tree()
                    continue
                old = el.text or ''
                new = _css_javascript_re.sub('', old)
                # The imported CSS can do anything; we just can't allow.
                # Chain on ``new`` (not ``old``) so the javascript
                # stripping done on the previous line is kept.
                new = _css_import_re.sub('', new)
                if self._has_sneaky_javascript(new):
                    # Something tricky is going on...
                    el.text = '/* deleted */'
                elif new != old:
                    el.text = new
    if self.comments or self.processing_instructions:
        # FIXME: why either?  I feel like there's some obscure reason
        # because you can put PIs in comments...?  But I've already
        # forgotten it
        kill_tags.add(etree.Comment)
    if self.processing_instructions:
        kill_tags.add(etree.ProcessingInstruction)
    if self.style:
        kill_tags.add('style')
    if self.inline_style:
        etree.strip_attributes(doc, 'style')
    if self.links:
        kill_tags.add('link')
    elif self.style or self.javascript:
        # We must get rid of included stylesheets if Javascript is not
        # allowed, as you can put Javascript in them
        for el in list(doc.iter('link')):
            if 'stylesheet' in el.get('rel', '').lower():
                # Note this kills alternate stylesheets as well
                if not self.allow_element(el):
                    el.drop_tree()
    if self.meta:
        kill_tags.add('meta')
    if self.page_structure:
        remove_tags.update(('head', 'html', 'title'))
    if self.embedded:
        # FIXME: is <layer> really embedded?
        # We should get rid of any <param> tags not inside <applet>;
        # These are not really valid anyway.
        for el in list(doc.iter('param')):
            # Walk up until an <applet>/<object> ancestor is found; running
            # off the top of the tree means the <param> is orphaned.
            parent = el.getparent()
            while parent is not None and parent.tag not in ('applet', 'object'):
                parent = parent.getparent()
            if parent is None:
                el.drop_tree()
        kill_tags.update(('applet',))
        # The alternate contents that are in an iframe are a good fallback:
        remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))
    if self.frames:
        # FIXME: ideally we should look at the frame links, but
        # generally frames don't mix properly with an HTML
        # fragment anyway.
        kill_tags.update(defs.frame_tags)
    if self.forms:
        remove_tags.add('form')
        kill_tags.update(('button', 'input', 'select', 'textarea'))
    if self.annoying_tags:
        remove_tags.update(('blink', 'marquee'))

    # Collect first, mutate afterwards, so the tree is not modified
    # while it is being iterated.
    _remove = []
    _kill = []
    for el in doc.iter():
        if el.tag in kill_tags:
            if self.allow_element(el):
                continue
            _kill.append(el)
        elif el.tag in remove_tags:
            if self.allow_element(el):
                continue
            _remove.append(el)

    if _remove and _remove[0] == doc:
        # We have to drop the parent-most tag, which we can't
        # do.  Instead we'll rewrite it:
        el = _remove.pop(0)
        el.tag = 'div'
        el.attrib.clear()
    elif _kill and _kill[0] == doc:
        # We have to drop the parent-most element, which we can't
        # do.  Instead we'll clear it:
        el = _kill.pop(0)
        if el.tag != 'html':
            el.tag = 'div'
        el.clear()

    _kill.reverse()  # start with innermost tags
    for el in _kill:
        el.drop_tree()
    for el in _remove:
        el.drop_tag()

    if self.remove_unknown_tags:
        if allow_tags:
            raise ValueError(
                "It does not make sense to pass in both allow_tags and remove_unknown_tags")
        allow_tags = set(defs.tags)
    if allow_tags:
        bad = []
        for el in doc.iter():
            if el.tag not in allow_tags:
                bad.append(el)
        if bad:
            if bad[0] is doc:
                # The root cannot be dropped, so it is rewritten instead.
                el = bad.pop(0)
                el.tag = 'div'
                el.attrib.clear()
            for el in bad:
                el.drop_tag()
    if self.add_nofollow:
        for el in _find_external_links(doc):
            if not self.allow_follow(el):
                rel = el.get('rel')
                if rel:
                    # Cheap substring test first, then an exact
                    # whitespace-delimited token match.
                    if ('nofollow' in rel
                            and ' nofollow ' in (' %s ' % rel)):
                        continue
                    rel = '%s nofollow' % rel
                else:
                    rel = 'nofollow'
                el.set('rel', rel)

428

def allow_follow(self, anchor):
    """
    Hook consulted for each external link when ``add_nofollow`` is set.

    The base implementation answers ``False`` for every anchor, so all
    of them receive rel="nofollow"; override it and return a true value
    to suppress rel="nofollow" on selected anchors.
    """
    return False

434

def allow_element(self, el):
    """
    Decide whether ``el`` may stay because its link target is acceptable.

    The link attribute(s) for the tag come from ``_tag_link_attrs``.
    Tags without an entry are never allowed.  For a single attribute the
    result of ``allow_embedded_url`` is returned as is; for a list of
    attributes every one must be present and accepted.
    """
    try:
        link_attrs = self._tag_link_attrs[el.tag]
    except KeyError:
        return False
    if not isinstance(link_attrs, (list, tuple)):
        target = el.get(link_attrs)
        if target:
            return self.allow_embedded_url(el, target)
        return False
    for attr_name in link_attrs:
        target = el.get(attr_name)
        if not target:
            return False
        if not self.allow_embedded_url(el, target):
            return False
    return True

452

def allow_embedded_url(self, el, url):
    """
    Return True if ``url`` may be embedded through element ``el``.

    The element's tag must be listed in ``whitelist_tags`` (unless that
    is ``None``), the URL scheme must be ``http`` or ``https``, and the
    URL's host must be in ``host_whitelist``.  URLs that cannot be
    parsed are rejected.
    """
    if (self.whitelist_tags is not None
            and el.tag not in self.whitelist_tags):
        return False
    try:
        parts = urlsplit(url)
    except ValueError:
        # Malformed URL (e.g. an unbalanced IPv6 bracket): never allow it.
        return False
    if parts.scheme not in ('http', 'https'):
        return False
    # Use the parsed hostname rather than cutting the netloc at the first
    # colon: a netloc such as "good.example:pw@evil.example" would
    # otherwise be taken for the host "good.example".
    return parts.hostname in self.host_whitelist

464

def kill_conditional_comments(self, doc):
    """
    IE conditional comments basically embed HTML that the parser
    doesn't normally see.  We can't allow anything like that, so
    we'll kill any comments that could be conditional.

    Every comment in ``doc`` whose text matches
    ``_conditional_comment_re`` is dropped, in place.
    """
    self._kill_elements(
        doc, lambda el: _conditional_comment_re.search(el.text),
        etree.Comment)

475

def _kill_elements(self, doc, condition, iterate=None):
    """
    Drop (with ``drop_tree``) every element of ``doc`` for which
    ``condition(el)`` is true.

    ``iterate`` is passed through to ``doc.iter()``.  Matches are
    gathered in full before any of them is dropped, so the tree is not
    modified while it is being walked.
    """
    doomed = [node for node in doc.iter(iterate) if condition(node)]
    for node in doomed:
        node.drop_tree()

483

def _remove_javascript_link(self, link):
    """
    Link-rewriting callback: return ``''`` for links that resolve to a
    javascript scheme, and the link unchanged otherwise.
    """
    # links like "j a v a s c r i p t:" might be interpreted in IE, so
    # the check runs on the unquoted link with whitespace substituted away
    squeezed = _substitute_whitespace('', unquote_plus(link))
    # FIXME: should this be None to delete?
    return '' if _is_javascript_scheme(squeezed) else link

# Bound ``sub`` method of a pattern matching ``/* ... */`` spans
# (non-greedy; ``re.S`` lets ``.`` match newlines too).  Used by
# ``_has_sneaky_javascript`` to strip such spans from style text.
_substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub

def _has_sneaky_javascript(self, style):
    """
    Report whether ``style`` hides Javascript behind obfuscation.

    Depending on the browser, stuff like ``e x p r e s s i o n(...)``
    can get interpreted, or ``expre/* stuff */ssion(...)``.  The text is
    therefore normalized first (``/* */`` comments and backslashes
    removed, whitespace substituted away, lower-cased) and then searched
    for ``javascript:`` and ``expression(``.

    Typically the response will be to kill the entire style; plain
    Javascript in a style is caught and removed by another rule, this
    one catches the more sneaky attempts.
    """
    normalized = self._substitute_comments('', style).replace('\\', '')
    normalized = _substitute_whitespace('', normalized).lower()
    return 'javascript:' in normalized or 'expression(' in normalized

514

def clean_html(self, html):
    """
    Clean ``html`` without modifying the caller's object.

    A string is parsed with ``fromstring``; anything else is
    deep-copied.  The cleaned document is then converted back with
    ``_transform_result`` according to the type of the input.
    """
    result_type = type(html)
    doc = (fromstring(html) if isinstance(html, basestring)
           else copy.deepcopy(html))
    self(doc)
    return _transform_result(result_type, doc)

Source Code for Module lxml.html.clean