# HG changeset patch # User raph # Date 1271342325 -7200 # Node ID 0f0a79f7f213637673e48365537c94d9cf4c2f3c # Parent 3eb5299e8085d772e459a90f8a5831a74f5fa715 do not use pandoc for html content diff -r 3eb5299e8085 -r 0f0a79f7f213 src/cm/converters/__init__.py --- a/src/cm/converters/__init__.py Thu Apr 15 14:35:44 2010 +0200 +++ b/src/cm/converters/__init__.py Thu Apr 15 16:38:45 2010 +0200 @@ -2,6 +2,8 @@ import chardet from cm.utils.string_utils import to_unicode import re +from cm.converters.oo_converters import extract_css_body + # TODO: move that in text_base: save images def convert_from_mimetype(file_name, mime_type, format): @@ -19,8 +21,12 @@ 'application/msword', ]: - xhtml_input, attachs = convert_oo_to_html(input) - converted_input = pandoc_convert(xhtml_input, 'html', format) + html_input, xhtml_input, attachs = convert_oo_to_html_and_xhtml(input) + if format == 'html': + _not_used_css, converted_input = extract_css_body(xhtml_input) + #converted_input = xhtml_input + else: + converted_input = pandoc_convert(html_input, 'html', format) ############################## # latex @@ -88,7 +94,6 @@ enc = chardet.detect(html_input)['encoding'] try_encodings = [enc, 'utf8', 'latin1'] - res_content = None for encoding in try_encodings: try: res_content_html = unicode(html_input, encoding) @@ -99,14 +104,13 @@ raise Exception('UnicodeDecodeError: could not decode') return res_content_html, images -def old_convert_oo_to_html(input): +def convert_oo_to_html_and_xhtml(input): from oo_converters import convert html_input, images = convert(input, 'html') xhtml_input, _not_used_ = convert(input, 'xhtml') enc = chardet.detect(xhtml_input)['encoding'] try_encodings = [enc, 'utf8', 'latin1'] - res_content = None for encoding in try_encodings: try: # TODO: fix path and manage images @@ -120,8 +124,11 @@ pass if not res_content_html or not res_content_xhtml: raise Exception('UnicodeDecodeError: could not decode') - return res_content_html, res_content_xhtml, images + return res_content_html, cleanup(res_content_xhtml), images +def cleanup(string): + return string.replace(u'\xc2\xa0',u'') + def markdown_from_code(code): CODE_INDICATOR = " " # 4 spaces return '\n'.join([CODE_INDICATOR + line for line in code.split('\n')]) diff -r 3eb5299e8085 -r 0f0a79f7f213 src/cm/converters/pandoc_converters.py --- a/src/cm/converters/pandoc_converters.py Thu Apr 15 14:35:44 2010 +0200 +++ b/src/cm/converters/pandoc_converters.py Thu Apr 15 16:38:45 2010 +0200 @@ -13,7 +13,7 @@ from cm.utils.string_utils import to_unicode PANDOC_BIN = "pandoc" -PANDOC_OPTIONS = "--sanitize-html " +PANDOC_OPTIONS = " -R " MARKDOWN2PDF_BIN = "markdown2pdf" diff -r 3eb5299e8085 -r 0f0a79f7f213 src/cm/models.py --- a/src/cm/models.py Thu Apr 15 14:35:44 2010 +0200 +++ b/src/cm/models.py Thu Apr 15 16:38:45 2010 +0200 @@ -189,7 +189,10 @@ objects = TextVersionManager() def get_content(self, format='html'): - return pandoc_convert(self.content, self.format, format) + if format == self.format: + return self.content + else: + return pandoc_convert(self.content, self.format, format) # def _get_comments(self, user = None, filter_reply = 0): # """ # get comments viewable by this user (user = None or user = AnonymousUser => everyone)