diff -r f62f7f0bcaa4 -r ba7e05582435 src/cm/converters/pandoc_converters.py --- a/src/cm/converters/pandoc_converters.py Thu Aug 09 11:06:46 2012 +0200 +++ b/src/cm/converters/pandoc_converters.py Thu Aug 09 13:05:11 2012 +0200 @@ -11,7 +11,7 @@ import StringIO import tidy from cm.utils.string_utils import to_unicode -from xml.dom.minidom import parseString +from BeautifulSoup import BeautifulSoup import re from distutils.version import LooseVersion @@ -221,22 +221,15 @@ if from_format==to_format=='html': # get body content stdoutdata = (content.encode('utf8')) - #stdoutdata = re.sub(r".*
]*>", '', stdoutdata) - #stdoutdata = re.sub(r".*", '', stdoutdata) - # if for some reason, tidy has not guess the doctype, make xml.dom.minidom happy with HTML entities ( ) - stdoutdata = re.sub(r" ", '\xc2\xa0', stdoutdata) - dom = parseString(stdoutdata) - body = dom.getElementsByTagName("body")[0].toxml() - stdoutdata = body[body.find('>')+1:body.rfind('')] + soup = BeautifulSoup(stdoutdata) + body = soup.body + stdoutdata = body.renderContents() # strip leading spaces stdoutdata = re.sub(r"^\s+", '', stdoutdata) # add new line before closing bracket stdoutdata = re.sub(r"(\/?)>", r"\n\1>", stdoutdata) # do not split closing tag with following opening tag stdoutdata = re.sub(r">\n<", r"><", stdoutdata) - # nest headers tags - #stdoutdata = re.sub(r'