src/cm/converters/pandoc_converters.py
changeset 119 5e8dda1b7631
parent 0 40c8f766c9b8
child 149 0f2c5744b39b
--- a/src/cm/converters/pandoc_converters.py	Mon Jan 25 09:23:32 2010 +0100
+++ b/src/cm/converters/pandoc_converters.py	Mon Jan 25 11:34:22 2010 +0100
@@ -10,7 +10,7 @@
 from tempfile import mkstemp
 import StringIO
 import tidy
-
+from cm.utils.string import to_unicode
 
 PANDOC_BIN = "pandoc"
 PANDOC_OPTIONS = "--sanitize-html "
@@ -46,7 +46,11 @@
     # pandoc does not react well when html is not valid
     # use tidy to clean html  
     if from_format == 'html':
-        content = do_tidy(content)
+        try:
+            content = do_tidy(content)
+        except:
+            # tidy fails ... try pandoc anyway...
+            content = to_unicode(content)
     # if to_format is pdf: use markdown2pdf
     if to_format == 'pdf':        
         if from_format != 'markdown':
@@ -83,7 +87,7 @@
                         input_encoding='utf8',
                         output_encoding='utf8',
                         )
-    tidyied_content = tidy.parseString(content.encode('utf8'), **tidy_options)
+    tidyied_content = tidy.parseString(to_unicode(content).encode('utf8'), **tidy_options)
     tidyied_content = str(tidyied_content)
     if content and not tidyied_content.strip():
         raise Exception('Content could not be tidyfied')