src/cm/utils/spannifier.py
changeset 450 81fa74c112b8
parent 270 05a602160c58
child 464 5a02bfc8aae8
--- a/src/cm/utils/spannifier.py	Tue Jun 12 14:00:32 2012 +0200
+++ b/src/cm/utils/spannifier.py	Thu Jun 14 11:43:46 2012 +0200
@@ -35,6 +35,11 @@
             span_starts[i] = len(''.join(textNodes_content))
             textNodes_content.append(textNode.string)
     output = unicode(soup)
+    # Soup has introduced HTML entities, which should be expanded
+    output =re.sub(r"&quot;", '"', output)
+    output =re.sub(r"&amp;", '&', output)
+    output =re.sub(r"&gt;", '>', output)
+    output =re.sub(r"&lt;", '<', output)
          
     textualized = ''.join(textNodes_content)
     return output, textualized, span_starts