--- a/src/cm/converters/__init__.py Thu Apr 15 14:35:44 2010 +0200
+++ b/src/cm/converters/__init__.py Thu Apr 15 16:38:45 2010 +0200
@@ -2,6 +2,8 @@
import chardet
from cm.utils.string_utils import to_unicode
import re
+from cm.converters.oo_converters import extract_css_body
+
# TODO: move that in text_base: save images
def convert_from_mimetype(file_name, mime_type, format):
@@ -19,8 +21,12 @@
'application/msword',
]:
- xhtml_input, attachs = convert_oo_to_html(input)
- converted_input = pandoc_convert(xhtml_input, 'html', format)
+ html_input, xhtml_input, attachs = convert_oo_to_html_and_xhtml(input)
+ if format == 'html':
+ _not_used_css, converted_input = extract_css_body(xhtml_input)
+ #converted_input = xhtml_input
+ else:
+ converted_input = pandoc_convert(html_input, 'html', format)
##############################
# latex
@@ -88,7 +94,6 @@
enc = chardet.detect(html_input)['encoding']
try_encodings = [enc, 'utf8', 'latin1']
- res_content = None
for encoding in try_encodings:
try:
res_content_html = unicode(html_input, encoding)
@@ -99,14 +104,13 @@
raise Exception('UnicodeDecodeError: could not decode')
return res_content_html, images
-def old_convert_oo_to_html(input):
+def convert_oo_to_html_and_xhtml(input):
from oo_converters import convert
html_input, images = convert(input, 'html')
xhtml_input, _not_used_ = convert(input, 'xhtml')
enc = chardet.detect(xhtml_input)['encoding']
try_encodings = [enc, 'utf8', 'latin1']
- res_content = None
for encoding in try_encodings:
try:
# TODO: fix path and manage images
@@ -120,8 +124,11 @@
pass
if not res_content_html or not res_content_xhtml:
raise Exception('UnicodeDecodeError: could not decode')
- return res_content_html, res_content_xhtml, images
+ return res_content_html, cleanup(res_content_xhtml), images
+def cleanup(string):  # returns `string` with every occurrence of u'\xc2\xa0' removed
+ return string.replace(u'\xc2\xa0',u'')  # NOTE(review): in a unicode literal this is the two code points U+00C2 U+00A0, not a lone U+00A0 no-break space - confirm which sequence is meant to be stripped
+
def markdown_from_code(code):
CODE_INDICATOR = " " # 4 spaces
return '\n'.join([CODE_INDICATOR + line for line in code.split('\n')])
--- a/src/cm/converters/pandoc_converters.py Thu Apr 15 14:35:44 2010 +0200
+++ b/src/cm/converters/pandoc_converters.py Thu Apr 15 16:38:45 2010 +0200
@@ -13,7 +13,7 @@
from cm.utils.string_utils import to_unicode
PANDOC_BIN = "pandoc"
-PANDOC_OPTIONS = "--sanitize-html "
+PANDOC_OPTIONS = " -R "  # NOTE(review): this replaces "--sanitize-html"; presumably -R is pandoc's --parse-raw - confirm converted HTML is sanitized elsewhere if input is untrusted
MARKDOWN2PDF_BIN = "markdown2pdf"
--- a/src/cm/models.py Thu Apr 15 14:35:44 2010 +0200
+++ b/src/cm/models.py Thu Apr 15 16:38:45 2010 +0200
@@ -189,7 +189,10 @@
objects = TextVersionManager()
def get_content(self, format='html'):
- return pandoc_convert(self.content, self.format, format)
+ if format == self.format:  # requested format equals the stored format
+ return self.content  # return the stored content unchanged, without calling pandoc_convert
+ else:
+ return pandoc_convert(self.content, self.format, format)  # convert from the stored format to the requested one
# def _get_comments(self, user = None, filter_reply = 0):
# """
# get comments viewable by this user (user = None or user = AnonymousUser => everyone)