recover when tidy trashes: try markdown anyway
author raph
Mon, 25 Jan 2010 11:34:22 +0100
changeset 119 5e8dda1b7631
parent 118 75d94dd14511
child 120 5afc61269b10
recover when tidy trashes: try markdown anyway
src/cm/converters/__init__.py
src/cm/converters/pandoc_converters.py
src/cm/utils/string.py
--- a/src/cm/converters/__init__.py	Mon Jan 25 09:23:32 2010 +0100
+++ b/src/cm/converters/__init__.py	Mon Jan 25 11:34:22 2010 +0100
@@ -1,5 +1,6 @@
 from pandoc_converters import pandoc_convert
 import chardet 
+from cm.utils.string import to_unicode 
 import re
 
 # TODO: move that in text_base: save images
@@ -7,19 +8,6 @@
     input = open(file_name, 'r').read()
     return _convert_from_mimetype(input, mime_type, format)
 
-def to_unicode(input):
-    if type(input) == str:
-        res = None
-        for encoding in [chardet.detect(input)['encoding'], 'utf8', 'latin1']:
-            try:
-                res = unicode(input, encoding)
-                break;
-            except UnicodeDecodeError:
-                pass
-        if not res:
-            raise Exception('UnicodeDecodeError: could not decode')
-        return res
-    return input
 
 def _convert_from_mimetype(input, mime_type, format):
     #input = to_unicode(input)
--- a/src/cm/converters/pandoc_converters.py	Mon Jan 25 09:23:32 2010 +0100
+++ b/src/cm/converters/pandoc_converters.py	Mon Jan 25 11:34:22 2010 +0100
@@ -10,7 +10,7 @@
 from tempfile import mkstemp
 import StringIO
 import tidy
-
+from cm.utils.string import to_unicode
 
 PANDOC_BIN = "pandoc"
 PANDOC_OPTIONS = "--sanitize-html "
@@ -46,7 +46,11 @@
     # pandoc does not react well when html is not valid
     # use tidy to clean html  
     if from_format == 'html':
-        content = do_tidy(content)
+        try:
+            content = do_tidy(content)
+        except:
+            # tidy fails ... try pandoc anyway...
+            content = to_unicode(content)
     # if to_format is pdf: use markdown2pdf
     if to_format == 'pdf':        
         if from_format != 'markdown':
@@ -83,7 +87,7 @@
                         input_encoding='utf8',
                         output_encoding='utf8',
                         )
-    tidyied_content = tidy.parseString(content.encode('utf8'), **tidy_options)
+    tidyied_content = tidy.parseString(to_unicode(content).encode('utf8'), **tidy_options)
     tidyied_content = str(tidyied_content)
     if content and not tidyied_content.strip():
         raise Exception('Content could not be tidyfied') 
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/cm/utils/string.py	Mon Jan 25 11:34:22 2010 +0100
@@ -0,0 +1,15 @@
+import chardet
+
+def to_unicode(input):
+    """Return input decoded to unicode; values that are not str pass through."""
+    if not isinstance(input, str):
+        return input
+    # chardet reports None when it has no guess (e.g. empty input): skip it
+    for encoding in [chardet.detect(input)['encoding'], 'utf8', 'latin1']:
+        try:
+            if encoding:
+                return unicode(input, encoding)
+        except (UnicodeDecodeError, LookupError):
+            pass  # wrong or unknown codec: try the next candidate
+    # only reached if every candidate failed
+    raise Exception('UnicodeDecodeError: could not decode')
\ No newline at end of file