When extracting the body of a document in HTML source format (i.e. without pandoc conversion before display), use BeautifulSoup, which is more tolerant, instead of xml.dom.minidom, which crashes too often.
--- a/src/cm/converters/pandoc_converters.py Thu Aug 09 11:06:46 2012 +0200
+++ b/src/cm/converters/pandoc_converters.py Thu Aug 09 13:05:11 2012 +0200
@@ -11,7 +11,7 @@
import StringIO
import tidy
from cm.utils.string_utils import to_unicode
-from xml.dom.minidom import parseString
+from BeautifulSoup import BeautifulSoup
import re
from distutils.version import LooseVersion
@@ -221,22 +221,15 @@
if from_format==to_format=='html':
# get body content
stdoutdata = (content.encode('utf8'))
- #stdoutdata = re.sub(r".*<body[^>]*>", '', stdoutdata)
- #stdoutdata = re.sub(r"</body>.*", '', stdoutdata)
- # if for some reason, tidy has not guess the doctype, make xml.dom.minidom happy with HTML entities (&nbsp;)
- stdoutdata = re.sub(r" ", '\xc2\xa0', stdoutdata)
- dom = parseString(stdoutdata)
- body = dom.getElementsByTagName("body")[0].toxml()
- stdoutdata = body[body.find('>')+1:body.rfind('</')]
+ soup = BeautifulSoup(stdoutdata)
+ body = soup.body
+ stdoutdata = body.renderContents()
# strip leading spaces
stdoutdata = re.sub(r"^\s+", '', stdoutdata)
# add new line before closing bracket
stdoutdata = re.sub(r"(\/?)>", r"\n\1>", stdoutdata)
# do not split closing tag with following opening tag
stdoutdata = re.sub(r">\n<", r"><", stdoutdata)
- # nest headers tags
- #stdoutdata = re.sub(r'<h(\d) id="([^"]+)"\n>', r'<div id="\2"><h\1>', stdoutdata)
- #stdoutdata = re.sub(r'<\/h(\d)\n>', r'</h\1></div>', stdoutdata)
return stdoutdata
cmd_args = ' %s -o %s ' %(p_options,output_temp_name)
@@ -248,8 +241,6 @@
cmd_args += ' %s ' % input_temp_name
cmd = PANDOC_BIN + ' ' + cmd_args
- #from socommons.converters.new_conv import controlled_Popen
- #controlled_Popen(cmd, stderr=file(error_temp_name,'w'))
fp_error = file(error_temp_name,'w')
retcode = call(cmd, shell=True, stderr=fp_error)
fp_error.close()