"""A cleanup tool for HTML.

Removes unwanted tags and content.  See the `Cleaner` class for
details.
"""

import re
import copy
try:
    # Python 2 module layout
    from urlparse import urlsplit
    from urllib import unquote_plus
except ImportError:
    # Python 3
    from urllib.parse import urlsplit, unquote_plus
from lxml import etree
from lxml.html import defs
from lxml.html import fromstring, XHTML_NAMESPACE
from lxml.html import xhtml_to_html, _transform_result

# Compatibility aliases: each name is probed and, when the running
# interpreter does not provide it, bound to the closest equivalent.
try:
    unichr
except NameError:
    # Python 3
    unichr = chr
try:
    unicode
except NameError:
    # Python 3
    unicode = str
try:
    bytes
except NameError:
    # Python < 2.6
    bytes = str
try:
    basestring
except NameError:
    basestring = (str, bytes)


__all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html',
           'word_break', 'word_break_html']

# Look at http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl
#   Particularly the CSS cleaning; most of the tag cleaning is integrated now
# I have multiple kinds of schemes searched; but should schemes be
#   whitelisted instead?
# max height?
# remove images?  Also in CSS?  background attribute?
# Some way to whitelist object, iframe, etc (e.g., if you want to
#   allow *just* embedded YouTube movies)
# Log what was deleted and why?
# style="behavior: ..." might be bad in IE?
# Should we have something for just <meta http-equiv>?  That's the worst of the
#   metas.
# UTF-7 detections?  Example:
#     <HEAD><META HTTP-EQUIV="CONTENT-TYPE" CONTENT="text/html; charset=UTF-7"> </HEAD>+ADw-SCRIPT+AD4-alert('XSS');+ADw-/SCRIPT+AD4-
#   you don't always have to have the charset set, if the page has no charset
#   and there's UTF7-like code in it.
# Look at these tests: http://htmlpurifier.org/live/smoketests/xssAttacks.php


# This is an IE-specific construct you can have in a stylesheet to
# run some Javascript:
_css_javascript_re = re.compile(
    r'expression\s*\(.*?\)', re.S|re.I)

# Do I have to worry about @\nimport?
# Matches a CSS ``@import`` rule, allowing whitespace after the ``@``.
_css_import_re = re.compile(
    r'@\s*import', re.I)

# All kinds of schemes besides just javascript: that can cause
# execution:
#
# ``_is_image_dataurl`` recognises base64 ``data:image/...`` URLs.  XML-based
# image types (``image/svg+xml`` and friends) are deliberately excluded:
# an SVG document can carry script, so such a URL must not be mistaken for
# an inert bitmap.  The negative lookahead only inspects the media subtype
# (everything up to the first ``;`` or ``,``); all other image types match
# exactly as before.
# NOTE(review): presumably callers use this predicate to exempt image data
# URLs from the scheme check below -- confirm against the call sites.
_is_image_dataurl = re.compile(
    r'^data:image/(?![^;,]*(?:svg|xml)).+;base64', re.I).search
# Matches any URL scheme listed here anywhere in the string (case-insensitive).
_is_possibly_malicious_scheme = re.compile(
    r'(?:javascript|jscript|livescript|vbscript|data|about|mocha):',
    re.I).search
101"""102 Instances cleans the document of each of the possible offending103 elements. The cleaning is controlled by attributes; you can104 override attributes in a subclass, or set them in the constructor.105106 ``scripts``:107 Removes any ``<script>`` tags.108109 ``javascript``:110 Removes any Javascript, like an ``onclick`` attribute. Also removes stylesheets111 as they could contain Javascript.112113 ``comments``:114 Removes any comments.115116 ``style``:117 Removes any style tags.118119 ``inline_style``120 Removes any style attributes. Defaults to the value of the ``style`` option.121122 ``links``:123 Removes any ``<link>`` tags124125 ``meta``:126 Removes any ``<meta>`` tags127128 ``page_structure``:129 Structural parts of a page: ``<head>``, ``<html>``, ``<title>``.130131 ``processing_instructions``:132 Removes any processing instructions.133134 ``embedded``:135 Removes any embedded objects (flash, iframes)136137 ``frames``:138 Removes any frame-related tags139140 ``forms``:141 Removes any form tags142143 ``annoying_tags``:144 Tags that aren't *wrong*, but are annoying. ``<blink>`` and ``<marquee>``145146 ``remove_tags``:147 A list of tags to remove. Only the tags will be removed,148 their content will get pulled up into the parent tag.149150 ``kill_tags``:151 A list of tags to kill. Killing also removes the tag's content,152 i.e. 
the whole subtree, not just the tag itself.153154 ``allow_tags``:155 A list of tags to include (default include all).156157 ``remove_unknown_tags``:158 Remove any tags that aren't standard parts of HTML.159160 ``safe_attrs_only``:161 If true, only include 'safe' attributes (specifically the list162 from the feedparser HTML sanitisation web site).163164 ``safe_attrs``:165 A set of attribute names to override the default list of attributes166 considered 'safe' (when safe_attrs_only=True).167168 ``add_nofollow``:169 If true, then any <a> tags will have ``rel="nofollow"`` added to them.170171 ``host_whitelist``:172 A list or set of hosts that you can use for embedded content173 (for content like ``<object>``, ``<link rel="stylesheet">``, etc).174 You can also implement/override the method175 ``allow_embedded_url(el, url)`` or ``allow_element(el)`` to176 implement more complex rules for what can be embedded.177 Anything that passes this test will be shown, regardless of178 the value of (for instance) ``embedded``.179180 Note that this parameter might not work as intended if you do not181 make the links absolute before doing the cleaning.182183 Note that you may also need to set ``whitelist_tags``.184185 ``whitelist_tags``:186 A set of tags that can be included with ``host_whitelist``.187 The default is ``iframe`` and ``embed``; you may wish to188 include other tags like ``script``, or you may want to189 implement ``allow_embedded_url`` for more control. 
Set to None to190 include all tags.191192 This modifies the document *in place*.193 """194195scripts=True196javascript=True197comments=True198style=False199inline_style=None200links=True201meta=True202page_structure=True203processing_instructions=True204embedded=True205frames=True206forms=True207annoying_tags=True208remove_tags=None209allow_tags=None210kill_tags=None211remove_unknown_tags=True212safe_attrs_only=True213safe_attrs=defs.safe_attrs214add_nofollow=False215host_whitelist=()216whitelist_tags=set(['iframe','embed'])217

# Used to lookup the primary URL for a given tag that is up for
# removal.  Each value is the attribute name holding the URL, or a list
# of attribute names when more than one attribute can hold it.
_tag_link_attrs = {
    'script': 'src',
    'link': 'href',
    # From: http://java.sun.com/j2se/1.4.2/docs/guide/misc/applet.html
    # From what I can tell, both attributes can contain a link:
    'applet': ['code', 'object'],
    'iframe': 'src',
    'embed': 'src',
    'layer': 'src',
    # FIXME: there doesn't really seem like a general way to figure out what
    # links an <object> tag uses; links often go in <param> tags with values
    # that we don't really know.  You'd have to have knowledge about specific
    # kinds of plugins (probably keyed off classid), and match against those.
    ##object=?,
    # FIXME: not looking at the action currently, because it is more complex
    # than that -- if you keep the form, you should keep the form controls.
    ##form='action',
    'a': 'href',
}

466"""467 IE conditional comments basically embed HTML that the parser468 doesn't normally see. We can't allow anything like that, so469 we'll kill any comments that could be conditional.470 """471bad=[]472self._kill_elements(473doc,lambdael:_conditional_comment_re.search(el.text),474etree.Comment)
485# links like "j a v a s c r i p t:" might be interpreted in IE486new=_substitute_whitespace('',unquote_plus(link))487if_is_javascript_scheme(new):488# FIXME: should this be None to delete?489return''490returnlink
495"""496 Depending on the browser, stuff like ``e x p r e s s i o n(...)``497 can get interpreted, or ``expre/* stuff */ssion(...)``. This498 checks for attempt to do stuff like this.499500 Typically the response will be to kill the entire style; if you501 have just a bit of Javascript in the style another rule will catch502 that and remove only the Javascript from the style; this catches503 more sneaky attempts.504 """505style=self._substitute_comments('',style)506style=style.replace('\\','')507style=_substitute_whitespace('',style)508style=style.lower()509if'javascript:'instyle:510returnTrue511if'expression('instyle:512returnTrue513returnFalse

# Module-level convenience API: one Cleaner built with no arguments, and
# its bound ``clean_html`` method exposed as a plain function.
clean = Cleaner()
clean_html = clean.clean_html

############################################################
## Autolinking
############################################################

# Patterns for text that should become a link.  Each pattern exposes a
# ``body`` group and a ``host`` group.
_link_regexes = [
    re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?(?:\([/\-_.,a-z0-9%&?;=~]*\))?)', re.I),
    # This is conservative, but autolinking can be a bit conservative:
    re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_.-]+[a-z]))', re.I),
    ]

# Tag names to be avoided when autolinking.
_avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a']

# Host patterns to be avoided when autolinking.
_avoid_hosts = [
    re.compile(r'^localhost', re.I),
    re.compile(r'\bexample\.(?:com|org|net)$', re.I),
    re.compile(r'^127\.0\.0\.1$'),
    ]

# CSS class names to be avoided when autolinking.
_avoid_classes = ['nolink']

551"""552 Turn any URLs into links.553554 It will search for links identified by the given regular555 expressions (by default mailto and http(s) links).556557 It won't link text in an element in avoid_elements, or an element558 with a class in avoid_classes. It won't link to anything with a559 host that matches one of the regular expressions in avoid_hosts560 (default localhost and 127.0.0.1).561562 If you pass in an element, the element's tail will not be563 substituted, only the contents of the element.564 """565ifel.taginavoid_elements:566return567class_name=el.get('class')568ifclass_name:569class_name=class_name.split()570formatch_classinavoid_classes:571ifmatch_classinclass_name:572return573forchildinlist(el):574autolink(child,link_regexes=link_regexes,575avoid_elements=avoid_elements,576avoid_hosts=avoid_hosts,577avoid_classes=avoid_classes)578ifchild.tail:579text,tail_children=_link_text(580child.tail,link_regexes,avoid_hosts,factory=el.makeelement)581iftail_children:582child.tail=text583index=el.index(child)584el[index+1:index+1]=tail_children585ifel.text:586text,pre_children=_link_text(587el.text,link_regexes,avoid_hosts,factory=el.makeelement)588ifpre_children:589el.text=text590el[:0]=pre_children
593leading_text=''594links=[]595last_pos=0596while1:597best_match,best_pos=None,None598forregexinlink_regexes:599regex_pos=last_pos600while1:601match=regex.search(text,pos=regex_pos)602ifmatchisNone:603break604host=match.group('host')605forhost_regexinavoid_hosts:606ifhost_regex.search(host):607regex_pos=match.end()608break609else:610break611ifmatchisNone:612continue613ifbest_posisNoneormatch.start()<best_pos:614best_match=match615best_pos=match.start()616ifbest_matchisNone:617# No more matches618iflinks:619assertnotlinks[-1].tail620links[-1].tail=text621else:622assertnotleading_text623leading_text=text624break625link=best_match.group(0)626end=best_match.end()627iflink.endswith('.')orlink.endswith(','):628# These punctuation marks shouldn't end a link629end-=1630link=link[:-1]631prev_text=text[:best_match.start()]632iflinks:633assertnotlinks[-1].tail634links[-1].tail=prev_text635else:636assertnotleading_text637leading_text=prev_text638anchor=factory('a')639anchor.set('href',link)640body=best_match.group('body')641ifnotbody:642body=link643ifbody.endswith('.')orbody.endswith(','):644body=body[:-1]645anchor.text=body646links.append(anchor)647text=text[end:]648returnleading_text,links
672"""673 Breaks any long words found in the body of the text (not attributes).674675 Doesn't effect any of the tags in avoid_elements, by default676 ``<textarea>`` and ``<pre>``677678 Breaks words by inserting ​, which is a unicode character679 for Zero Width Space character. This generally takes up no space680 in rendering, but does copy as a space, and in monospace contexts681 usually takes up space.682683 See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion684 """685# Character suggestion of ​ comes from:686# http://www.cs.tut.fi/~jkorpela/html/nobr.html687ifel.tagin_avoid_word_break_elements:688return689class_name=el.get('class')690ifclass_name:691dont_break=False692class_name=class_name.split()693foravoidinavoid_classes:694ifavoidinclass_name:695dont_break=True696break697ifdont_break:698return699ifel.text:700el.text=_break_text(el.text,max_width,break_character)701forchildinel:702word_break(child,max_width=max_width,703avoid_elements=avoid_elements,704avoid_classes=avoid_classes,705break_character=break_character)706ifchild.tail:707child.tail=_break_text(child.tail,max_width,break_character)
726orig_word=word727result=''728whilelen(word)>width:729start=word[:width]730breaks=list(_break_prefer_re.finditer(start))731ifbreaks:732last_break=breaks[-1]733# Only walk back up to 10 characters to find a nice break:734iflast_break.end()>width-10:735# FIXME: should the break character be at the end of the736# chunk, or the beginning of the next chunk?737start=word[:last_break.end()]738result+=start+break_character739word=word[len(start):]740result+=word741returnresult