src/cm/utils/spannifier.py
changeset 0 40c8f766c9b8
child 270 05a602160c58
equal deleted inserted replaced
-1:000000000000 0:40c8f766c9b8
       
     1 import uuid
       
     2 import xml.dom.minidom
       
     3 import re
       
     4 from BeautifulSoup import BeautifulSoup, Comment
       
     5  
       
     6 
       
     7 def get_text_nodes(soup):
       
     8     return soup(text=lambda text:not isinstance(text, Comment))
       
     9 
       
    10 def is_real_text_node(textNode):
       
    11     return not textNode.findParent('style') 
       
    12 
       
    13 def get_the_soup(input):
       
    14     return BeautifulSoup(input, convertEntities=["xml", "html"])
       
    15                      
       
    16 def spannify(input):
       
    17     """ 
       
    18     wrap textNodes in spans 
       
    19     """
       
    20     
       
    21     input = re.sub("\s*$","",input)
       
    22         
       
    23     soup = get_the_soup(input)
       
    24     
       
    25     textNodes = get_text_nodes(soup)
       
    26     textNodes_content = []
       
    27     
       
    28     span_starts = {}
       
    29     for i in xrange(len(textNodes)):
       
    30         textNode = textNodes[i]
       
    31         if is_real_text_node(textNode) :
       
    32             textNode.replaceWith('<span id="sv_' + str(i) + '" class="c-s"><span id="sv-' + str(i) + '" class="c-count-0 c-c">' + textNode.string + '</span></span>')
       
    33             span_starts[i] = len(''.join(textNodes_content))
       
    34             textNodes_content.append(textNode.string)
       
    35     output = unicode(soup)
       
    36          
       
    37     textualized = ''.join(textNodes_content)
       
    38     return output, textualized, span_starts