# HG changeset patch # User raph # Date 1264415662 -3600 # Node ID 5e8dda1b7631dde327fc2cc762427f0ed5207b9b # Parent 75d94dd1451189328614ca6fe473453992020b14 recover when tidy trashes: try markdown anyway diff -r 75d94dd14511 -r 5e8dda1b7631 src/cm/converters/__init__.py --- a/src/cm/converters/__init__.py Mon Jan 25 09:23:32 2010 +0100 +++ b/src/cm/converters/__init__.py Mon Jan 25 11:34:22 2010 +0100 @@ -1,5 +1,6 @@ from pandoc_converters import pandoc_convert import chardet +from cm.utils.string import to_unicode import re # TODO: move that in text_base: save images @@ -7,19 +8,6 @@ input = open(file_name, 'r').read() return _convert_from_mimetype(input, mime_type, format) -def to_unicode(input): - if type(input) == str: - res = None - for encoding in [chardet.detect(input)['encoding'], 'utf8', 'latin1']: - try: - res = unicode(input, encoding) - break; - except UnicodeDecodeError: - pass - if not res: - raise Exception('UnicodeDecodeError: could not decode') - return res - return input def _convert_from_mimetype(input, mime_type, format): #input = to_unicode(input) diff -r 75d94dd14511 -r 5e8dda1b7631 src/cm/converters/pandoc_converters.py --- a/src/cm/converters/pandoc_converters.py Mon Jan 25 09:23:32 2010 +0100 +++ b/src/cm/converters/pandoc_converters.py Mon Jan 25 11:34:22 2010 +0100 @@ -10,7 +10,7 @@ from tempfile import mkstemp import StringIO import tidy - +from cm.utils.string import to_unicode PANDOC_BIN = "pandoc" PANDOC_OPTIONS = "--sanitize-html " @@ -46,7 +46,11 @@ # pandoc does not react well when html is not valid # use tidy to clean html if from_format == 'html': - content = do_tidy(content) + try: + content = do_tidy(content) + except: + # tidy fails ... try pandoc anyway... + content = to_unicode(content) # if to_format is pdf: use markdown2pdf if to_format == 'pdf': if from_format != 'markdown': @@ -83,7 +87,7 @@ input_encoding='utf8', output_encoding='utf8', ) - tidyied_content = tidy.parseString(content.encode('utf8'), **tidy_options) + tidyied_content = tidy.parseString(to_unicode(content).encode('utf8'), **tidy_options) tidyied_content = str(tidyied_content) if content and not tidyied_content.strip(): raise Exception('Content could not be tidyfied') diff -r 75d94dd14511 -r 5e8dda1b7631 src/cm/utils/string.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/cm/utils/string.py Mon Jan 25 11:34:22 2010 +0100 @@ -0,0 +1,15 @@ +import chardet + +def to_unicode(input): + if type(input) == str: + res = None + for encoding in [chardet.detect(input)['encoding'], 'utf8', 'latin1']: + try: + res = unicode(input, encoding) + break; + except UnicodeDecodeError: + pass + if not res: + raise Exception('UnicodeDecodeError: could not decode') + return res + return input \ No newline at end of file