do not use pandoc for html content [branch: preserve_html]
author raph
Thu, 15 Apr 2010 16:38:45 +0200
branch preserve_html
changeset 252 0f0a79f7f213
parent 251 3eb5299e8085
child 253 a844469257b0
do not use pandoc for html content
src/cm/converters/__init__.py
src/cm/converters/pandoc_converters.py
src/cm/models.py
--- a/src/cm/converters/__init__.py	Thu Apr 15 14:35:44 2010 +0200
+++ b/src/cm/converters/__init__.py	Thu Apr 15 16:38:45 2010 +0200
@@ -2,6 +2,8 @@
 import chardet 
 from cm.utils.string_utils import to_unicode 
 import re
+from cm.converters.oo_converters import extract_css_body
+
 
 # TODO: move that in text_base: save images
 def convert_from_mimetype(file_name, mime_type, format):
@@ -19,8 +21,12 @@
                      'application/msword',
                      ]:
         
-        xhtml_input, attachs = convert_oo_to_html(input)
-        converted_input = pandoc_convert(xhtml_input, 'html', format)
+        html_input, xhtml_input, attachs = convert_oo_to_html_and_xhtml(input)
+        if format == 'html':
+                _not_used_css, converted_input = extract_css_body(xhtml_input)
+                #converted_input = xhtml_input
+        else:
+            converted_input = pandoc_convert(html_input, 'html', format)
         
     ##############################
     # latex
@@ -88,7 +94,6 @@
     
     enc = chardet.detect(html_input)['encoding']
     try_encodings = [enc, 'utf8', 'latin1']
-    res_content = None
     for encoding in try_encodings:
         try:
             res_content_html = unicode(html_input, encoding)
@@ -99,14 +104,13 @@
         raise Exception('UnicodeDecodeError: could not decode')
     return res_content_html, images
 
-def old_convert_oo_to_html(input): 
+def convert_oo_to_html_and_xhtml(input): 
     from oo_converters import convert   
     html_input, images = convert(input, 'html')
     xhtml_input, _not_used_ = convert(input, 'xhtml')
     
     enc = chardet.detect(xhtml_input)['encoding']
     try_encodings = [enc, 'utf8', 'latin1']
-    res_content = None
     for encoding in try_encodings:
         try:
             # TODO: fix path and manage images
@@ -120,8 +124,11 @@
             pass
     if not res_content_html or not res_content_xhtml:
         raise Exception('UnicodeDecodeError: could not decode')
-    return res_content_html, res_content_xhtml, images
+    return res_content_html, cleanup(res_content_xhtml), images
         
+def cleanup(string):
+    return string.replace(u'\xc2\xa0',u'')
+
 def markdown_from_code(code):
     CODE_INDICATOR = "    " # 4 spaces
     return '\n'.join([CODE_INDICATOR + line for line in code.split('\n')])
--- a/src/cm/converters/pandoc_converters.py	Thu Apr 15 14:35:44 2010 +0200
+++ b/src/cm/converters/pandoc_converters.py	Thu Apr 15 16:38:45 2010 +0200
@@ -13,7 +13,7 @@
 from cm.utils.string_utils import to_unicode
 
 PANDOC_BIN = "pandoc"
-PANDOC_OPTIONS = "--sanitize-html "
+PANDOC_OPTIONS = " -R "
 
 MARKDOWN2PDF_BIN = "markdown2pdf"
 
--- a/src/cm/models.py	Thu Apr 15 14:35:44 2010 +0200
+++ b/src/cm/models.py	Thu Apr 15 16:38:45 2010 +0200
@@ -189,7 +189,10 @@
     objects = TextVersionManager()
     
     def get_content(self, format='html'):
-        return pandoc_convert(self.content, self.format, format)
+        if format == self.format:
+            return self.content
+        else:
+            return pandoc_convert(self.content, self.format, format)
 #    def _get_comments(self, user = None, filter_reply = 0):        
 #        """
 #        get comments viewable by this user (user = None or user = AnonymousUser => everyone)