--- a/src/cm/converters/__init__.py Mon Jan 25 09:23:32 2010 +0100
+++ b/src/cm/converters/__init__.py Mon Jan 25 11:34:22 2010 +0100
@@ -1,5 +1,6 @@
from pandoc_converters import pandoc_convert
import chardet
+from cm.utils.string import to_unicode
import re
# TODO: move that in text_base: save images
@@ -7,19 +8,6 @@
input = open(file_name, 'r').read()
return _convert_from_mimetype(input, mime_type, format)
-def to_unicode(input):
- if type(input) == str:
- res = None
- for encoding in [chardet.detect(input)['encoding'], 'utf8', 'latin1']:
- try:
- res = unicode(input, encoding)
- break;
- except UnicodeDecodeError:
- pass
- if not res:
- raise Exception('UnicodeDecodeError: could not decode')
- return res
- return input
def _convert_from_mimetype(input, mime_type, format):
#input = to_unicode(input)
--- a/src/cm/converters/pandoc_converters.py Mon Jan 25 09:23:32 2010 +0100
+++ b/src/cm/converters/pandoc_converters.py Mon Jan 25 11:34:22 2010 +0100
@@ -10,7 +10,7 @@
from tempfile import mkstemp
import StringIO
import tidy
-
+from cm.utils.string import to_unicode
PANDOC_BIN = "pandoc"
PANDOC_OPTIONS = "--sanitize-html "
@@ -46,7 +46,11 @@
# pandoc does not react well when html is not valid
# use tidy to clean html
if from_format == 'html':
- content = do_tidy(content)
+ try:
+ content = do_tidy(content)
+ except:
+ # tidy failed: skip the cleanup, decode the raw content to unicode and try pandoc anyway
+ content = to_unicode(content)
# if to_format is pdf: use markdown2pdf
if to_format == 'pdf':
if from_format != 'markdown':
@@ -83,7 +87,7 @@
input_encoding='utf8',
output_encoding='utf8',
)
- tidyied_content = tidy.parseString(content.encode('utf8'), **tidy_options)
+ tidyied_content = tidy.parseString(to_unicode(content).encode('utf8'), **tidy_options)
tidyied_content = str(tidyied_content)
if content and not tidyied_content.strip():
raise Exception('Content could not be tidyfied')
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/cm/utils/string.py Mon Jan 25 11:34:22 2010 +0100
@@ -0,0 +1,27 @@
+import chardet
+
+def to_unicode(input):
+    """Return ``input`` decoded to ``unicode``.
+
+    Byte strings (``str``) are decoded with the first candidate encoding
+    that works: the encoding guessed by chardet, then 'utf8', then 'latin1'.
+    Any value that is not a ``str`` is returned unchanged.
+
+    Raises ``Exception`` if no candidate encoding can decode the input.
+    """
+    if not isinstance(input, str):
+        return input
+    # chardet may report an encoding of None when it cannot guess (for
+    # example on empty input); unicode() rejects None with a TypeError.
+    guessed = chardet.detect(input)['encoding']
+    for encoding in [guessed, 'utf8', 'latin1']:
+        if not encoding:
+            continue
+        try:
+            # Return on the first success so that a legitimately empty
+            # result (u'') is not mistaken for a decoding failure.
+            return unicode(input, encoding)
+        except (UnicodeDecodeError, LookupError):
+            # LookupError: chardet named a codec this Python does not know.
+            pass
+    raise Exception('UnicodeDecodeError: could not decode')
\ No newline at end of file