src/cm/utils/spannifier.py
author raph
Mon, 23 Nov 2009 15:14:29 +0100
changeset 0 40c8f766c9b8
child 270 05a602160c58
permissions -rw-r--r--
import from internal svn r 4007
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
0
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
     1
import uuid
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
     2
import xml.dom.minidom
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
     3
import re
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
     4
from BeautifulSoup import BeautifulSoup, Comment
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
     5
 
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
     6
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
     7
def get_text_nodes(soup):
    """Return the text nodes of *soup*, leaving comment nodes out.

    *soup* is called with a ``text`` filter that accepts every node which
    is not a ``Comment`` instance; the result of that call is returned
    unchanged.
    """
    def _not_a_comment(node):
        return not isinstance(node, Comment)

    return soup(text=_not_a_comment)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
     9
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    10
def is_real_text_node(textNode):
    """Tell whether *textNode* should be treated as genuine text.

    A node qualifies when ``textNode.findParent('style')`` yields nothing
    truthy, i.e. no enclosing ``style`` element was found for it.
    """
    enclosing_style = textNode.findParent('style')
    if enclosing_style:
        return False
    return True
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    12
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    13
def get_the_soup(input):
    """Parse *input* with ``BeautifulSoup`` and return the resulting soup.

    The parser is asked to convert both ``"xml"`` and ``"html"`` entities.
    """
    entity_kinds = ["xml", "html"]
    return BeautifulSoup(input, convertEntities=entity_kinds)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    15
                     
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    16
def spannify(input):
    """Wrap the text nodes of a markup string in marker spans.

    Trailing whitespace is stripped from *input*, the result is parsed with
    ``get_the_soup`` and every text node accepted by ``is_real_text_node``
    is replaced by its own text enclosed in two nested ``<span>`` elements
    (ids ``sv_<i>`` and ``sv-<i>``, where ``<i>`` is the node's index in the
    list returned by ``get_text_nodes``).

    Returns a 3-tuple ``(output, textualized, span_starts)``:

    * ``output`` -- the modified soup converted with ``unicode()``;
    * ``textualized`` -- the text of the wrapped nodes, concatenated in
      order;
    * ``span_starts`` -- dict mapping a wrapped node's index to the offset
      at which its text begins inside ``textualized``.
    """
    # Raw string so that ``\s`` reaches the regex engine untouched instead
    # of relying on Python passing an unknown string escape through.
    input = re.sub(r"\s*$", "", input)

    soup = get_the_soup(input)

    collected = []
    span_starts = {}
    # Running length of the text gathered so far; avoids re-joining the
    # whole list on every iteration just to measure it.
    offset = 0
    for i, textNode in enumerate(get_text_nodes(soup)):
        if not is_real_text_node(textNode):
            continue
        text = textNode.string
        index = str(i)
        textNode.replaceWith(
            '<span id="sv_' + index + '" class="c-s">'
            '<span id="sv-' + index + '" class="c-count-0 c-c">'
            + text + '</span></span>')
        span_starts[i] = offset
        collected.append(text)
        offset += len(text)

    output = unicode(soup)
    textualized = ''.join(collected)
    return output, textualized, span_starts