comt: comparison src/cm/utils/spannifier.py

equal deleted inserted replaced

--1:000000000000
+:40c8f766c9b8
+import uuid
+import xml.dom.minidom
+import re
+from BeautifulSoup import BeautifulSoup, Comment
def get_text_nodes(soup):
    """Return the text nodes of ``soup`` that are not ``Comment`` instances.

    ``soup`` is called with a ``text`` filter; the result is whatever that
    call returns for the nodes accepted by the filter.
    """
    def _is_not_comment(node):
        # Comments are text-like nodes too, but they must not be wrapped.
        return not isinstance(node, Comment)

    return soup(text=_is_not_comment)
def is_real_text_node(textNode):
    """Return True when ``textNode`` has no enclosing ``style`` parent.

    The lookup is delegated to ``textNode.findParent('style')``; any truthy
    result means the node sits inside a style element and is not "real" text.
    """
    enclosing_style = textNode.findParent('style')
    if enclosing_style:
        return False
    return True
def get_the_soup(input):
    """Parse ``input`` with ``BeautifulSoup`` and return the resulting tree.

    The parser is given ``convertEntities=["xml", "html"]``.
    """
    entity_kinds = ["xml", "html"]
    return BeautifulSoup(input, convertEntities=entity_kinds)
def spannify(input):
    """Wrap the text nodes of a markup string in nested ``<span>`` elements.

    Trailing whitespace is removed from ``input`` before it is parsed. Each
    text node that is not a comment and has no ``style`` parent is replaced by
    ``<span id="sv_N" class="c-s"><span id="sv-N" class="c-count-0 c-c">``
    followed by the node's text and ``</span></span>``. ``N`` is the node's
    index among all non-comment text nodes, so skipped nodes still consume
    an index.

    Returns a tuple ``(output, textualized, span_starts)``:

    * ``output`` -- the modified tree converted with ``unicode``;
    * ``textualized`` -- the wrapped nodes' text, concatenated in order;
    * ``span_starts`` -- dict mapping ``N`` to the offset inside
      ``textualized`` at which that node's text starts.
    """
    input = re.sub(r"\s*$", "", input)
    soup = get_the_soup(input)
    textNodes = get_text_nodes(soup)
    textNodes_content = []
    span_starts = {}
    # Running length of the text collected so far; avoids re-joining the
    # whole list on every iteration just to measure it.
    offset = 0
    for i, textNode in enumerate(textNodes):
        if not is_real_text_node(textNode):
            continue
        text = textNode.string
        # NOTE(review): the text is concatenated into the markup without
        # escaping -- confirm this is safe for the inputs callers provide.
        textNode.replaceWith(
            '<span id="sv_' + str(i) + '" class="c-s">'
            '<span id="sv-' + str(i) + '" class="c-count-0 c-c">'
            + text + '</span></span>'
        )
        span_starts[i] = offset
        textNodes_content.append(text)
        offset += len(text)
    output = unicode(soup)
    textualized = ''.join(textNodes_content)
    return output, textualized, span_starts

changeset 0	40c8f766c9b8
child 270	05a602160c58