32""" 33 doclist should be ordered from oldest to newest, like:: 34 35 >>> version1 = 'Hello World' 36 >>> version2 = 'Goodbye World' 37 >>> print(html_annotate([(version1, 'version 1'), 38 ... (version2, 'version 2')])) 39 <span title="version 2">Goodbye</span> <span title="version 1">World</span> 40 41 The documents must be *fragments* (str/UTF8 or unicode), not 42 complete documents 43 44 The markup argument is a function to markup the spans of words. 45 This function is called like markup('Hello', 'version 2'), and 46 returns HTML. The first argument is text and never includes any 47 markup. The default uses a span with a title: 48 49 >>> print(default_markup('Some Text', 'by Joe')) 50 <span title="by Joe">Some Text</span> 51 """ 52# The basic strategy we have is to split the documents up into 53# logical tokens (which are words with attached markup). We then 54# do diffs of each of the versions to track when a token first 55# appeared in the document; the annotation attached to the token 56# is the version where it first appeared. 57tokenlist=[tokenize_annotated(doc,version) 58fordoc,versionindoclist] 59cur_tokens=tokenlist[0] 60fortokensintokenlist[1:]: 61html_annotate_merge_annotations(cur_tokens,tokens) 62cur_tokens=tokens 63 64# After we've tracked all the tokens, we can combine spans of text 65# that are adjacent and have the same annotation 66cur_tokens=compress_tokens(cur_tokens) 67# And finally add markup 68result=markup_serialize_tokens(cur_tokens,markup) 69return''.join(result).strip()
72"""Tokenize a document and add an annotation attribute to each token 73 """ 74tokens=tokenize(doc,include_hrefs=False) 75fortokintokens: 76tok.annotation=annotation 77returntokens 78
80"""Merge the annotations from tokens_old into tokens_new, when the 81 tokens in the new document already existed in the old document. 82 """ 83s=InsensitiveSequenceMatcher(a=tokens_old,b=tokens_new) 84commands=s.get_opcodes() 85 86forcommand,i1,i2,j1,j2incommands: 87ifcommand=='equal': 88eq_old=tokens_old[i1:i2] 89eq_new=tokens_new[j1:j2] 90copy_annotations(eq_old,eq_new) 91
93""" 94 Copy annotations from the tokens listed in src to the tokens in dest 95 """ 96assertlen(src)==len(dest) 97forsrc_tok,dest_tokinzip(src,dest): 98dest_tok.annotation=src_tok.annotation 99
101"""102 Combine adjacent tokens when there is no HTML between the tokens, 103 and they share an annotation104 """105result=[tokens[0]]106fortokintokens[1:]:107if(notresult[-1].post_tagsand108nottok.pre_tagsand109result[-1].annotation==tok.annotation):110compress_merge_back(result,tok)111else:112result.append(tok)113returnresult
116""" Merge tok into the last element of tokens (modifying the list of117 tokens in-place). """118last=tokens[-1]119iftype(last)isnottokenortype(tok)isnottoken:120tokens.append(tok)121else:122text=_unicode(last)123iflast.trailing_whitespace:124text+=last.trailing_whitespace125text+=tok126merged=token(text,127pre_tags=last.pre_tags,128post_tags=tok.post_tags,129trailing_whitespace=tok.trailing_whitespace)130merged.annotation=last.annotation131tokens[-1]=merged132
134"""135 Serialize the list of tokens into a list of text chunks, calling136 markup_func around text to add annotations.137 """138fortokenintokens:139forpreintoken.pre_tags:140yieldpre141html=token.html()142html=markup_func(html,token.annotation)143iftoken.trailing_whitespace:144html+=token.trailing_whitespace145yieldhtml146forpostintoken.post_tags:147yieldpost

############################################################
## HTML Diffs
############################################################

## FIXME: this should take parsed documents too, and use their body
## or other content.
def htmldiff(old_html, new_html):
    """ Do a diff of the old and new document.  The documents are HTML
    *fragments* (str/UTF8 or unicode), they are not complete documents
    (i.e., no <html> tag).

    Returns HTML with <ins> and <del> tags added around the
    appropriate text.

    Markup is generally ignored, with the markup from new_html
    preserved, and possibly some markup from old_html (though it is
    considered acceptable to lose some of the old markup).  Only the
    words in the HTML are diffed.  The exception is <img> tags, which
    are treated like words, and the href attribute of <a> tags, which
    are noted inside the tag itself when there are changes.
    """
    old_html_tokens = tokenize(old_html)
    new_html_tokens = tokenize(new_html)
    result = htmldiff_tokens(old_html_tokens, new_html_tokens)
    result = ''.join(result).strip()
    return fixup_ins_del_tags(result)
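
# Editor's usage sketch for htmldiff.  The exact whitespace and markup can
# vary between lxml versions; the expected shape is shown in the comment.
def _demo_htmldiff():
    result = htmldiff('<p>Hello World</p>', '<p>Goodbye World</p>')
    # Expected to look something like:
    #   <p><ins>Goodbye</ins> <del>Hello</del> World</p>
    return result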
178""" Does a diff on the tokens themselves, returning a list of text179 chunks (not tokens).180 """181# There are several passes as we do the differences. The tokens182# isolate the portion of the content we care to diff; difflib does183# all the actual hard work at that point. 184#185# Then we must create a valid document from pieces of both the old186# document and the new document. We generally prefer to take187# markup from the new document, and only do a best effort attempt188# to keep markup from the old document; anything that we can't189# resolve we throw away. Also we try to put the deletes as close190# to the location where we think they would have been -- because191# we are only keeping the markup from the new document, it can be192# fuzzy where in the new document the old text would have gone.193# Again we just do a best effort attempt.194s=InsensitiveSequenceMatcher(a=html1_tokens,b=html2_tokens)195commands=s.get_opcodes()196result=[]197forcommand,i1,i2,j1,j2incommands:198ifcommand=='equal':199result.extend(expand_tokens(html2_tokens[j1:j2],equal=True))200continue201ifcommand=='insert'orcommand=='replace':202ins_tokens=expand_tokens(html2_tokens[j1:j2])203merge_insert(ins_tokens,result)204ifcommand=='delete'orcommand=='replace':205del_tokens=expand_tokens(html1_tokens[i1:i2])206merge_delete(del_tokens,result)207# If deletes were inserted directly as <del> then we'd have an208# invalid document at this point. Instead we put in special209# markers, and when the complete diffed document has been created210# we try to move the deletes around and resolve any problems.211result=cleanup_delete(result)212213returnresult
216"""Given a list of tokens, return a generator of the chunks of217 text for the data in the tokens.218 """219fortokenintokens:220forpreintoken.pre_tags:221yieldpre222ifnotequalornottoken.hide_when_equal:223iftoken.trailing_whitespace:224yieldtoken.html()+token.trailing_whitespace225else:226yieldtoken.html()227forpostintoken.post_tags:228yieldpost
231""" doc is the already-handled document (as a list of text chunks);232 here we add <ins>ins_chunks</ins> to the end of that. """233# Though we don't throw away unbalanced_start or unbalanced_end234# (we assume there is accompanying markup later or earlier in the235# document), we only put <ins> around the balanced portion.236unbalanced_start,balanced,unbalanced_end=split_unbalanced(ins_chunks)237doc.extend(unbalanced_start)238ifdocandnotdoc[-1].endswith(' '):239# Fix up the case where the word before the insert didn't end with 240# a space241doc[-1]+=' '242doc.append('<ins>')243ifbalancedandbalanced[-1].endswith(' '):244# We move space outside of </ins>245balanced[-1]=balanced[-1][:-1]246doc.extend(balanced)247doc.append('</ins> ')248doc.extend(unbalanced_end)

# These are sentinels to represent the start and end of a <del>
# segment, until we do the cleanup phase to turn them into proper
# markup:
class DEL_START:
    pass
class DEL_END:
    pass

class NoDeletes(Exception):
    """ Raised when the document no longer contains any pending deletes
    (DEL_START/DEL_END) """
263""" Adds the text chunks in del_chunks to the document doc (another264 list of text chunks) with marker to show it is a delete.265 cleanup_delete later resolves these markers into <del> tags."""266doc.append(DEL_START)267doc.extend(del_chunks)268doc.append(DEL_END)
271""" Cleans up any DEL_START/DEL_END markers in the document, replacing272 them with <del></del>. To do this while keeping the document273 valid, it may need to drop some tags (either start or end tags).274275 It may also move the del into adjacent tags to try to move it to a276 similar location where it was originally located (e.g., moving a277 delete into preceding <div> tag, if the del looks like (DEL_START,278 'Text</div>', DEL_END)"""279while1:280# Find a pending DEL_START/DEL_END, splitting the document281# into stuff-preceding-DEL_START, stuff-inside, and282# stuff-following-DEL_END283try:284pre_delete,delete,post_delete=split_delete(chunks)285exceptNoDeletes:286# Nothing found, we've cleaned up the entire doc287break288# The stuff-inside-DEL_START/END may not be well balanced289# markup. First we figure out what unbalanced portions there are:290unbalanced_start,balanced,unbalanced_end=split_unbalanced(delete)291# Then we move the span forward and/or backward based on these292# unbalanced portions:293locate_unbalanced_start(unbalanced_start,pre_delete,post_delete)294locate_unbalanced_end(unbalanced_end,pre_delete,post_delete)295doc=pre_delete296ifdocandnotdoc[-1].endswith(' '):297# Fix up case where the word before us didn't have a trailing space298doc[-1]+=' '299doc.append('<del>')300ifbalancedandbalanced[-1].endswith(' '):301# We move space outside of </del>302balanced[-1]=balanced[-1][:-1]303doc.extend(balanced)304doc.append('</del> ')305doc.extend(post_delete)306chunks=doc307returnchunks
310"""Return (unbalanced_start, balanced, unbalanced_end), where each is311 a list of text and tag chunks.312313 unbalanced_start is a list of all the tags that are opened, but314 not closed in this span. Similarly, unbalanced_end is a list of315 tags that are closed but were not opened. Extracting these might316 mean some reordering of the chunks."""317start=[]318end=[]319tag_stack=[]320balanced=[]321forchunkinchunks:322ifnotchunk.startswith('<'):323balanced.append(chunk)324continue325endtag=chunk[1]=='/'326name=chunk.split()[0].strip('<>/')327ifnameinempty_tags:328balanced.append(chunk)329continue330ifendtag:331iftag_stackandtag_stack[-1][0]==name:332balanced.append(chunk)333name,pos,tag=tag_stack.pop()334balanced[pos]=tag335eliftag_stack:336start.extend([tagforname,pos,tagintag_stack])337tag_stack=[]338end.append(chunk)339else:340end.append(chunk)341else:342tag_stack.append((name,len(balanced),chunk))343balanced.append(None)344start.extend(345[chunkforname,pos,chunkintag_stack])346balanced=[chunkforchunkinbalancedifchunkisnotNone]347returnstart,balanced,end
350""" Returns (stuff_before_DEL_START, stuff_inside_DEL_START_END,351 stuff_after_DEL_END). Returns the first case found (there may be352 more DEL_STARTs in stuff_after_DEL_END). Raises NoDeletes if353 there's no DEL_START found. """354try:355pos=chunks.index(DEL_START)356exceptValueError:357raiseNoDeletes358pos2=chunks.index(DEL_END)359returnchunks[:pos],chunks[pos+1:pos2],chunks[pos2+1:]
362""" pre_delete and post_delete implicitly point to a place in the363 document (where the two were split). This moves that point (by364 popping items from one and pushing them onto the other). It moves365 the point to try to find a place where unbalanced_start applies.366367 As an example::368369 >>> unbalanced_start = ['<div>']370 >>> doc = ['<p>', 'Text', '</p>', '<div>', 'More Text', '</div>']371 >>> pre, post = doc[:3], doc[3:]372 >>> pre, post373 (['<p>', 'Text', '</p>'], ['<div>', 'More Text', '</div>'])374 >>> locate_unbalanced_start(unbalanced_start, pre, post)375 >>> pre, post376 (['<p>', 'Text', '</p>', '<div>'], ['More Text', '</div>'])377378 As you can see, we moved the point so that the dangling <div> that379 we found will be effectively replaced by the div in the original380 document. If this doesn't work out, we just throw away381 unbalanced_start without doing anything.382 """383while1:384ifnotunbalanced_start:385# We have totally succeeded in finding the position386break387finding=unbalanced_start[0]388finding_name=finding.split()[0].strip('<>')389ifnotpost_delete:390break391next=post_delete[0]392ifnextisDEL_STARTornotnext.startswith('<'):393# Reached a word, we can't move the delete text forward394break395ifnext[1]=='/':396# Reached a closing tag, can we go further? Maybe not...397break398name=next.split()[0].strip('<>')399ifname=='ins':400# Can't move into an insert401break402assertname!='del',(403"Unexpected delete tag: %r"%next)404ifname==finding_name:405unbalanced_start.pop(0)406pre_delete.append(post_delete.pop(0))407else:408# Found a tag that doesn't match409break
412""" like locate_unbalanced_start, except handling end tags and413 possibly moving the point earlier in the document. """414while1:415ifnotunbalanced_end:416# Success417break418finding=unbalanced_end[-1]419finding_name=finding.split()[0].strip('<>/')420ifnotpre_delete:421break422next=pre_delete[-1]423ifnextisDEL_ENDornotnext.startswith('</'):424# A word or a start tag425break426name=next.split()[0].strip('<>/')427ifname=='ins'orname=='del':428# Can't move into an insert or delete429break430ifname==finding_name:431unbalanced_end.pop()432post_delete.insert(0,pre_delete.pop())433else:434# Found a tag that doesn't match435break
438""" Represents a diffable token, generally a word that is displayed to439 the user. Opening tags are attached to this token when they are440 adjacent (pre_tags) and closing tags that follow the word441 (post_tags). Some exceptions occur when there are empty tags442 adjacent to a word, so there may be close tags in pre_tags, or443 open tags in post_tags.444445 We also keep track of whether the word was originally followed by446 whitespace, even though we do not want to treat the word as447 equivalent to a similar word that does not have a trailing448 space."""449450# When this is true, the token will be eliminated from the451# displayed diff if no change has occurred:452hide_when_equal=False453
479480""" Represents a token that is actually a tag. Currently this is just481 the <img> tag, which takes up visible space just like a word but482 is only represented in a document by a tag. """483
517"""518 Parse the given HTML and returns token objects (words with attached tags).519520 This parses only the content of a page; anything in the head is521 ignored, and the <head> and <body> elements are themselves522 optional. The content is then parsed by lxml, which ensures the523 validity of the resulting parsed document (though lxml may make524 incorrect guesses when the markup is particular bad).525526 <ins> and <del> tags are also eliminated from the document, as527 that gets confusing.528529 If include_hrefs is true, then the href attribute of <a> tags is530 included as a special kind of diffable token."""531ifetree.iselement(html):532body_el=html533else:534body_el=parse_html(html,cleanup=True)535# Then we split the document into text chunks for each tag, word, and end tag:536chunks=flatten_el(body_el,skip_tag=True,include_hrefs=include_hrefs)537# Finally re-joining them into token objects:538returnfixup_chunks(chunks)
541"""542 Parses an HTML fragment, returning an lxml element. Note that the HTML will be543 wrapped in a <div> tag that was not in the original document.544545 If cleanup is true, make sure there's no <head> or <body>, and get546 rid of any <ins> and <del> tags.547 """548ifcleanup:549# This removes any extra markup or structure like <head>:550html=cleanup_html(html)551returnfragment_fromstring(html,create_parent=True)
558""" This 'cleans' the HTML, meaning that any page structure is removed559 (only the contents of <body> are used, if there is any <body).560 Also <ins> and <del> tags are removed. """561match=_body_re.search(html)562ifmatch:563html=html[match.end():]564match=_end_body_re.search(html)565ifmatch:566html=html[:match.start()]567html=_ins_del_re.sub('',html)568returnhtml
574"""575 This function takes a word, such as 'test\n\n' and returns ('test','\n\n')576 """577stripped_length=len(word.rstrip())578returnword[0:stripped_length],word[stripped_length:]
582"""583 This function takes a list of chunks and produces a list of tokens.584 """585tag_accum=[]586cur_word=None587result=[]588forchunkinchunks:589ifisinstance(chunk,tuple):590ifchunk[0]=='img':591src=chunk[1]592tag,trailing_whitespace=split_trailing_whitespace(chunk[2])593cur_word=tag_token('img',src,html_repr=tag,594pre_tags=tag_accum,595trailing_whitespace=trailing_whitespace)596tag_accum=[]597result.append(cur_word)598599elifchunk[0]=='href':600href=chunk[1]601cur_word=href_token(href,pre_tags=tag_accum,trailing_whitespace=" ")602tag_accum=[]603result.append(cur_word)604continue605606ifis_word(chunk):607chunk,trailing_whitespace=split_trailing_whitespace(chunk)608cur_word=token(chunk,pre_tags=tag_accum,trailing_whitespace=trailing_whitespace)609tag_accum=[]610result.append(cur_word)611612elifis_start_tag(chunk):613tag_accum.append(chunk)614615elifis_end_tag(chunk):616iftag_accum:617tag_accum.append(chunk)618else:619assertcur_word,(620"Weird state, cur_word=%r, result=%r, chunks=%r of %r"621%(cur_word,result,chunk,chunks))622cur_word.post_tags.append(chunk)623else:624assert(0)625626ifnotresult:627return[token('',pre_tags=tag_accum)]628else:629result[-1].post_tags.extend(tag_accum)630631returnresult

# All the tags in HTML that don't require end tags:
empty_tags = (
    'param', 'img', 'area', 'br', 'basefont', 'input',
    'base', 'meta', 'link', 'col')

block_level_tags = (
    'address',
    'blockquote',
    'center',
    'dir',
    'div',
    'dl',
    'fieldset',
    'form',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'hr',
    'isindex',
    'menu',
    'noframes',
    'noscript',
    'ol',
    'p',
    'pre',
    'table',
    'ul',
    )

block_level_container_tags = (
    'dd',
    'dt',
    'frameset',
    'li',
    'tbody',
    'td',
    'tfoot',
    'th',
    'thead',
    'tr',
    )
681""" Takes an lxml element el, and generates all the text chunks for682 that tag. Each start tag is a chunk, each word is a chunk, and each683 end tag is a chunk.684685 If skip_tag is true, then the outermost container tag is686 not returned (just its contents)."""687ifnotskip_tag:688ifel.tag=='img':689yield('img',el.get('src'),start_tag(el))690else:691yieldstart_tag(el)692ifel.taginempty_tagsandnotel.textandnotlen(el)andnotel.tail:693return694start_words=split_words(el.text)695forwordinstart_words:696yieldhtml_escape(word)697forchildinel:698foriteminflatten_el(child,include_hrefs=include_hrefs):699yielditem700ifel.tag=='a'andel.get('href')andinclude_hrefs:701yield('href',el.get('href'))702ifnotskip_tag:703yieldend_tag(el)704end_words=split_words(el.tail)705forwordinend_words:706yieldhtml_escape(word)
711""" Splits some text into words. Includes trailing whitespace712 on each word when appropriate. """713ifnottextornottext.strip():714return[]715716words=split_words_re.findall(text)717returnwords
722"""723 The text representation of the start tag for a tag.724 """725return'<%s%s>'%(726el.tag,''.join([' %s="%s"'%(name,html_escape(value,True))727forname,valueinel.attrib.items()]))
730""" The text representation of an end tag for a tag. Includes731 trailing whitespace when appropriate. """732ifel.tailandstart_whitespace_re.search(el.tail):733extra=' '734else:735extra=''736return'</%s>%s'%(el.tag,extra)
748""" Given an html string, move any <ins> or <del> tags inside of any749 block-level elements, e.g. transform <ins><p>word</p></ins> to750 <p><ins>word</ins></p> """751doc=parse_html(html,cleanup=False)752_fixup_ins_del_tags(doc)753html=serialize_html_fragment(doc,skip_outer=True)754returnhtml
757""" Serialize a single lxml element as HTML. The serialized form758 includes the elements tail. 759760 If skip_outer is true, then don't serialize the outermost tag761 """762assertnotisinstance(el,basestring),(763"You should pass in an element, not a string like %r"%el)764html=etree.tostring(el,method="html",encoding=_unicode)765ifskip_outer:766# Get rid of the extra starting tag:767html=html[html.find('>')+1:]768# Get rid of the extra end tag:769html=html[:html.rfind('<')]770returnhtml.strip()771else:772returnhtml
829"""830 Removes an element, but merges its contents into its place, e.g.,831 given <p>Hi <i>there!</i></p>, if you remove the <i> element you get832 <p>Hi there!</p>833 """834parent=el.getparent()835text=el.textor''836ifel.tail:837ifnotlen(el):838text+=el.tail839else:840ifel[-1].tail:841el[-1].tail+=el.tail842else:843el[-1].tail=el.tail844index=parent.index(el)845iftext:846ifindex==0:847previous=None848else:849previous=parent[index-1]850ifpreviousisNone:851ifparent.text:852parent.text+=text853else:854parent.text=text855else:856ifprevious.tail:857previous.tail+=text858else:859previous.tail=text860parent[index:index+1]=el.getchildren()