src/cm/converters/pandoc_converters.py
changeset 393 8806f683d0be
parent 358 3e58bf6b3f3e
child 418 8e3c6d8c39f8
equal deleted inserted replaced
392:1e449e11efc8 393:8806f683d0be
   207                 
   207                 
   208     # do not use pandoc to convert from html to html
   208     # do not use pandoc to convert from html to html
   209     if from_format==to_format=='html':
   209     if from_format==to_format=='html':
   210       # get body content
   210       # get body content
   211       stdoutdata = (content.encode('utf8'))
   211       stdoutdata = (content.encode('utf8'))
       
   212       #stdoutdata = re.sub(r".*<body[^>]*>", '', stdoutdata)
       
   213       #stdoutdata = re.sub(r"</body>.*", '', stdoutdata)
   212       # if for some reason, tidy has not guessed the doctype, make xml.dom.minidom happy with HTML entities (&nbsp;)
   214       # if for some reason, tidy has not guessed the doctype, make xml.dom.minidom happy with HTML entities (&nbsp;)
   213       stdoutdata = re.sub(r"&nbsp;", '\xc2\xa0', stdoutdata)
   215       stdoutdata = re.sub(r"&nbsp;", '\xc2\xa0', stdoutdata)
   214       dom = parseString(stdoutdata)
   216       dom = parseString(stdoutdata)
   215       body = dom.getElementsByTagName("body")[0].toxml()
   217       body = dom.getElementsByTagName("body")[0].toxml()
   216       stdoutdata = body[body.find('>')+1:body.rfind('</')]
   218       stdoutdata = body[body.find('>')+1:body.rfind('</')]