--- a/src/cm/converters/__init__.py Tue Apr 20 10:47:04 2010 +0200
+++ b/src/cm/converters/__init__.py Tue Apr 20 11:37:33 2010 +0200
@@ -2,6 +2,9 @@
import chardet
from cm.utils.string_utils import to_unicode
import re
+import os
+from cm.converters.oo_converters import extract_css_body
+
# TODO: move that in text_base: save images
def convert_from_mimetype(file_name, mime_type, format):
@@ -19,8 +22,12 @@
'application/msword',
]:
- xhtml_input, attachs = convert_oo_to_html(input)
- converted_input = pandoc_convert(xhtml_input, 'html', format)
+        html_input, xhtml_input, attachs = convert_oo_to_html_and_xhtml(input)
+        if format == 'html':
+            # direct html output: keep the body of the xhtml rendering
+            _not_used_css, converted_input = extract_css_body(xhtml_input)
+        else:
+            converted_input = pandoc_convert(html_input, 'html', format)
##############################
# latex
@@ -37,8 +44,8 @@
elif mime_type in ['text/html', 'application/xhtml+xml']:
-        if format == 'html':
-            converted_input = input
-        else:
-            converted_input = pandoc_convert(input, 'html', format)
+        # html/xhtml input is now always normalized through pandoc
+        # (raw mode is picked internally for the html -> html case)
+        converted_input = pandoc_convert(input, 'html', format)
##############################
# anything looks like text -> markdown
elif mime_type in ['text/plain',
@@ -71,7 +78,7 @@
match_html = res_html.next()
if match_html:
img_name = match_html.group(1)
- img_path = imgs[img_name]
+                img_path = os.path.split(img_name)[-1] # keep only the image file name
except StopIteration:
# TODO : report pb
pass
@@ -88,7 +95,6 @@
enc = chardet.detect(html_input)['encoding']
try_encodings = [enc, 'utf8', 'latin1']
- res_content = None
for encoding in try_encodings:
try:
res_content_html = unicode(html_input, encoding)
@@ -99,29 +105,33 @@
raise Exception('UnicodeDecodeError: could not decode')
return res_content_html, images
-def old_convert_oo_to_html(input):
+def fix_html_img_path(html):
+    # strip the converter's '../outdir/' prefix so only the image file name is left
+    return html.replace('IMG SRC="../outdir/', 'IMG SRC="')
+
+def convert_oo_to_html_and_xhtml(input):
from oo_converters import convert
html_input, images = convert(input, 'html')
xhtml_input, _not_used_ = convert(input, 'xhtml')
-
enc = chardet.detect(xhtml_input)['encoding']
try_encodings = [enc, 'utf8', 'latin1']
-    res_content = None
+    res_content_html = res_content_xhtml = None
for encoding in try_encodings:
try:
- # TODO: fix path and manage images
- #res_content = fix_img_path(unicode(html_res_content,encoding),
- # unicode(xhtml_res_content,encoding),
- # iimg)
res_content_html = unicode(html_input, encoding)
res_content_xhtml = unicode(xhtml_input, encoding)
break;
except UnicodeDecodeError:
pass
+
    if not res_content_html or not res_content_xhtml:
        raise Exception('UnicodeDecodeError: could not decode')
+
+    # rewrite image references only once both renderings are decoded
+    res_content_xhtml = fix_img_path(res_content_html, res_content_xhtml, images)
+    res_content_html = fix_html_img_path(res_content_html)
-    return res_content_html, res_content_xhtml, images
+    return res_content_html, cleanup(res_content_xhtml), images
+
+def cleanup(string):
+    # drop non-breaking spaces (a utf-8 nbsp decoded as latin1 shows up as u'\xc2\xa0')
+    return string.replace(u'\xc2\xa0', u'')
+
def markdown_from_code(code):
    CODE_INDICATOR = "    " # 4 spaces
return '\n'.join([CODE_INDICATOR + line for line in code.split('\n')])
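
For orientation, a minimal sketch of how the reworked OpenOffice branch of convert_from_mimetype composes after this change; the wrapper name convert_oo and its return tuple are illustrative, the helpers it calls are the ones touched above:

    from cm.converters import convert_oo_to_html_and_xhtml
    from cm.converters.oo_converters import extract_css_body
    from cm.converters.pandoc_converters import pandoc_convert

    def convert_oo(input, format):
        # one OO pass yields both renderings: html feeds pandoc, xhtml (image
        # paths fixed, nbsp artefacts cleaned) serves direct html output
        html_input, xhtml_input, attachs = convert_oo_to_html_and_xhtml(input)
        if format == 'html':
            _css, body = extract_css_body(xhtml_input)  # css part is discarded
            return body, attachs
        return pandoc_convert(html_input, 'html', format), attachs
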
--- a/src/cm/converters/oo_converters.py Tue Apr 20 10:47:04 2010 +0200
+++ b/src/cm/converters/oo_converters.py Tue Apr 20 11:37:33 2010 +0200
@@ -223,35 +223,6 @@
THE_INDIR = "indir"
THE_INFILE = "infile"
-
-def fix_img_path(html,xhtml,imgs):
- """
- imgs : name --> path
- """
- finder_re = 'src[\s]*=[\s]*\"((?:(?!https?))[^\"]*)\"'
- len_res_html = len(re.findall(finder_re,html,re.IGNORECASE))
- len_res_xhtml = len(re.findall(finder_re,xhtml,re.IGNORECASE))
- res_html = re.finditer(finder_re,html,re.IGNORECASE)
- res_xhtml = re.finditer(finder_re,xhtml,re.IGNORECASE)
- result = []
- last_index = 0
- for match_xhtml in res_xhtml:
- img_path = ''
- try:
- match_html = res_html.next()
- if match_html:
- img_name = match_html.group(1)
- img_path = imgs[img_name]
- except StopIteration:
- # TODO : report pb
- pass
- offset = len(match_xhtml.group(0)) - len(match_xhtml.group(1))
- result.append(xhtml[last_index:match_xhtml.start() + offset - 1])
- result.append(img_path)
- last_index = match_xhtml.end() - 1 # -1 because trailing "
- result.append(xhtml[last_index:len(xhtml)])
- return u''.join(result)
-
def extract_css_body(xhtml):
dom = parseString(xhtml.encode('utf8'))
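
To see what the image-path rewriting does, a small illustration (the sample markup is made up; the regex and the replacements are the ones now living in __init__.py rather than here):

    import os
    import re
    from cm.converters import fix_html_img_path

    finder_re = 'src[\s]*=[\s]*\"((?:(?!https?))[^\"]*)\"'

    xhtml = u'<p><img src="../outdir/report_html_1.png"/></p>'
    img_name = re.search(finder_re, xhtml, re.IGNORECASE).group(1)
    print os.path.split(img_name)[-1]   # -> report_html_1.png (directory dropped)

    html = u'<IMG SRC="../outdir/report_html_1.png">'
    print fix_html_img_path(html)       # -> <IMG SRC="report_html_1.png">
    # absolute http(s) urls never match the regex, so external images stay untouched
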
--- a/src/cm/converters/pandoc_converters.py Tue Apr 20 10:47:04 2010 +0200
+++ b/src/cm/converters/pandoc_converters.py Tue Apr 20 11:37:33 2010 +0200
@@ -13,7 +13,8 @@
from cm.utils.string_utils import to_unicode
PANDOC_BIN = "pandoc"
-PANDOC_OPTIONS = "--sanitize-html "
+PANDOC_OPTIONS = " --sanitize-html "
+PANDOC_OPTIONS_RAW = " -R "
MARKDOWN2PDF_BIN = "markdown2pdf"
@@ -37,7 +38,7 @@
_PANDOC_ENCODING = 'utf8'
@memoize
-def pandoc_convert(content, from_format, to_format, full=False):
+def pandoc_convert(content, from_format, to_format, full=False, raw=False):
"""
Convert markdown content to pdf
@@ -56,7 +57,7 @@
if from_format != 'markdown':
content = pandoc_convert(content, from_format, 'markdown', True)
return pandoc_markdown2pdf(content)
- return pandoc_pandoc(content, from_format, to_format, full)
+    return pandoc_pandoc(content, from_format, to_format, full, raw or from_format == to_format == 'html') # use raw pandoc conversion for html -> html
def content_or_file_name(content, file_name):
if not content and not file_name:
@@ -154,7 +155,7 @@
# TODO: use tidy to cleanup html
@memoize
-def pandoc_pandoc(content, from_format, to_format, full=False):
+def pandoc_pandoc(content, from_format, to_format, full=False, raw=False):
"""
Convert content (should be unicode) from from_format to to_format
(if full: includes header & co [html, latex])
@@ -189,7 +190,11 @@
input_file.close()
# pandoc arguments and command line
- cmd_args = ' %s -o %s ' %(PANDOC_OPTIONS,output_temp_name)
+    p_options = PANDOC_OPTIONS
+    if raw:
+        # -R keeps markup pandoc cannot translate as raw instead of dropping/sanitizing it
+        p_options = PANDOC_OPTIONS_RAW
+
+    cmd_args = ' %s -o %s ' % (p_options, output_temp_name)
if full:
cmd_args += ' -s '
cmd_args += ' -f %s ' % from_format
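
And a quick usage sketch of the raw mode added to pandoc_converters.py (the sample string is made up; this assumes a pandoc 1.x that understands the --sanitize-html and -R flags referenced by the constants above):

    from cm.converters.pandoc_converters import pandoc_convert

    html = u'<h1>Title</h1><p>some <span style="color: red">inline</span> html</p>'

    # html -> html now runs through pandoc with PANDOC_OPTIONS_RAW (-R), so
    # markup pandoc cannot translate is parsed as raw and kept in the output
    normalized = pandoc_convert(html, 'html', 'html')

    # any other target keeps the default " --sanitize-html " options
    as_latex = pandoc_convert(html, 'html', 'latex')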