src/cm/utils/spannifier.py
author Simon Descarpentries <sid@sopinspace.com>
Tue, 06 May 2014 13:52:01 +0200
changeset 651 9bbc657f6837
parent 516 c6105d922ac6
permissions -rw-r--r--
Replace DISABLE_TRACKING and TRACKING_HTML by a TRACKING_ID variable in configuration files
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
0
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
     1
import uuid
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
     2
import xml.dom.minidom
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
     3
import re
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
     4
from BeautifulSoup import BeautifulSoup, Comment
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
     5
 
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
     6
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
     7
def get_text_nodes(soup):
    """
    Return the text nodes of ``soup`` that are not ``Comment`` instances.

    ``soup`` is called with a ``text=`` predicate and whatever it returns is
    handed back unchanged.
    """
    def _is_not_comment(text):
        # Keep every text node except the ones that are Comment objects.
        return not isinstance(text, Comment)

    return soup(text=_is_not_comment)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
     9
502
8ec189cc214d do not skip span for newline textnodes otherwise compute_new_comment_positions() will return bad results for pandoc texts.
gibus
parents: 473
diff changeset
    10
def is_real_text_node(textNode, nolinefeed=True):
    """
    Tell whether ``textNode`` should be treated as real text.

    A node whose string is exactly "\\n" is rejected when ``nolinefeed`` is
    true. Otherwise the node is accepted unless it has a 'style' parent.
    """
    if nolinefeed:
        # Only inspect the node's string when newline filtering is requested.
        if textNode.string == "\n":
            return False
    enclosing_style = textNode.findParent('style')
    return not enclosing_style
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    14
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    15
def get_the_soup(input):
    """
    Parse ``input`` with BeautifulSoup and return the resulting soup.

    The parser is asked to convert all entities and to read the input as
    UTF-8.
    """
    parser_options = {
        'convertEntities': BeautifulSoup.ALL_ENTITIES,
        'fromEncoding': 'UTF-8',
    }
    return BeautifulSoup(input, **parser_options)
270
05a602160c58 cache spannify function
raph
parents: 0
diff changeset
    17
     
05a602160c58 cache spannify function
raph
parents: 0
diff changeset
    18
from cm.utils.cache import memoize, dj_memoize
05a602160c58 cache spannify function
raph
parents: 0
diff changeset
    19
@dj_memoize
def spannify(input, nolinefeed=True):
    """
    Wrap the text nodes of an HTML string in spans.

    Trailing whitespace is stripped from ``input`` before it is parsed with
    ``get_the_soup``. Every text node returned by ``get_text_nodes`` that
    ``is_real_text_node`` accepts is replaced by two nested spans whose ids
    are ``sv_<i>`` (outer) and ``sv-<i>`` (inner), where ``i`` is the index of
    the node in the ``get_text_nodes`` result. Rejected nodes are left as
    they are but still use up an index.

    ``nolinefeed`` is forwarded to ``is_real_text_node``.

    Returns a tuple ``(output, textualized, span_starts)``:

    * ``output``: the serialized soup, with ``&quot;``, ``&amp;``, ``&gt;``
      and ``&lt;`` expanded back to their characters;
    * ``textualized``: the concatenation of the wrapped text nodes;
    * ``span_starts``: dict mapping each wrapped node index ``i`` to the
      offset at which that node's text starts in ``textualized``.
    """
    input = re.sub(r"\s*$", "", input)

    soup = get_the_soup(input)

    span_starts = {}
    wrapped_texts = []
    # Running length of the text collected so far; avoids re-joining the
    # whole list on every iteration just to measure it.
    offset = 0
    for i, textNode in enumerate(get_text_nodes(soup)):
        if not is_real_text_node(textNode, nolinefeed):
            continue
        text = textNode.string
        textNode.replaceWith('<span id="sv_' + str(i) + '" class="c-s"><span id="sv-' + str(i) + '" class="c-count-0 c-c">' + text + '</span></span>')
        span_starts[i] = offset
        wrapped_texts.append(text)
        offset += len(text)

    output = unicode(soup)
    # Soup has introduced HTML entities, which should be expanded.
    # NOTE(review): "&amp;" is expanded before "&gt;" and "&lt;", so a doubly
    # escaped sequence such as "&amp;lt;" ends up as "<" -- confirm that this
    # is intended before changing the order.
    for entity, char in (("&quot;", '"'), ("&amp;", '&'), ("&gt;", '>'), ("&lt;", '<')):
        output = output.replace(entity, char)

    textualized = ''.join(wrapped_texts)
    return output, textualized, span_starts