# src/cm/utils/spannifier.py (added in changeset 40c8f766c9b8)
import uuid
import xml.dom.minidom
import re
from BeautifulSoup import BeautifulSoup, Comment


def get_text_nodes(soup):
    """Return the text nodes found in ``soup``, leaving out ``Comment`` nodes."""
    return soup(text=lambda text: not isinstance(text, Comment))


def is_real_text_node(textNode):
    """Return True unless ``textNode`` has a ``style`` element as an ancestor."""
    return not textNode.findParent('style')


def get_the_soup(input):
    """Parse ``input`` with BeautifulSoup, converting XML and HTML entities."""
    return BeautifulSoup(input, convertEntities=["xml", "html"])


def spannify(input):
    """
    wrap textNodes in spans

    Trailing whitespace is stripped from ``input`` before parsing.

    Returns a 3-tuple ``(output, textualized, span_starts)``:

    * ``output`` -- the parsed document serialized back with ``unicode()``.
    * ``textualized`` -- the strings of all "real" text nodes (see
      ``is_real_text_node``) concatenated in document order.
    * ``span_starts`` -- dict mapping a text node's index to the offset in
      ``textualized`` at which its string begins.  The index counts every
      node returned by ``get_text_nodes``, including the skipped ones, so
      the keys may have gaps.
    """
    input = re.sub(r"\s*$", "", input)

    soup = get_the_soup(input)
    text_nodes = get_text_nodes(soup)

    text_chunks = []
    span_starts = {}
    # Running length of everything collected so far; avoids re-joining the
    # accumulated chunks on every iteration just to measure them.
    offset = 0
    for index, text_node in enumerate(text_nodes):
        if not is_real_text_node(text_node):
            continue
        # NOTE(review): the prefix and suffix around the text are empty
        # strings here; presumably span markup is expected given the
        # docstring -- confirm against the original file.
        text_node.replaceWith('' + text_node.string + '')
        span_starts[index] = offset
        text_chunks.append(text_node.string)
        offset += len(text_node.string)

    output = unicode(soup)
    textualized = ''.join(text_chunks)
    return output, textualized, span_starts