# HG changeset patch # User raph # Date 1271756253 -7200 # Node ID 9075dc2fb93c7d2a915f2f0cbe3b382bef63d98c # Parent 7b7ba5e472685e124441dcd8acef17ea9b8d80f2# Parent 0371caf8bcc6864a0f2250933431d3a913c9b379 Merge with 0371caf8bcc6864a0f2250933431d3a913c9b379 diff -r 7b7ba5e47268 -r 9075dc2fb93c src/cm/converters/__init__.py --- a/src/cm/converters/__init__.py Tue Apr 20 10:47:04 2010 +0200 +++ b/src/cm/converters/__init__.py Tue Apr 20 11:37:33 2010 +0200 @@ -2,6 +2,9 @@ import chardet from cm.utils.string_utils import to_unicode import re +import os +from cm.converters.oo_converters import extract_css_body + # TODO: move that in text_base: save images def convert_from_mimetype(file_name, mime_type, format): @@ -19,8 +22,12 @@ 'application/msword', ]: - xhtml_input, attachs = convert_oo_to_html(input) - converted_input = pandoc_convert(xhtml_input, 'html', format) + html_input, xhtml_input, attachs = convert_oo_to_html_and_xhtml(input) + if format == 'html': + _not_used_css, converted_input = extract_css_body(xhtml_input) + #converted_input = xhtml_input + + converted_input = pandoc_convert(html_input, 'html', format) ############################## # latex @@ -37,8 +44,8 @@ elif mime_type in ['text/html', 'application/xhtml+xml']: if format == 'html': converted_input = input - else: - converted_input = pandoc_convert(input, 'html', format) + + converted_input = pandoc_convert(input, 'html', format) ############################## # anything looks like text -> markdown elif mime_type in ['text/plain', @@ -71,7 +78,7 @@ match_html = res_html.next() if match_html: img_name = match_html.group(1) - img_path = imgs[img_name] + img_path = os.path.split(img_name)[-1] except StopIteration: # TODO : report pb pass @@ -88,7 +95,6 @@ enc = chardet.detect(html_input)['encoding'] try_encodings = [enc, 'utf8', 'latin1'] - res_content = None for encoding in try_encodings: try: res_content_html = unicode(html_input, encoding) @@ -99,29 +105,33 @@ raise Exception('UnicodeDecodeError: could not decode') return res_content_html, images -def old_convert_oo_to_html(input): +def fix_html_img_path(html): + return html.replace('IMG SRC="../outdir/','IMG SRC="') + +def convert_oo_to_html_and_xhtml(input): from oo_converters import convert html_input, images = convert(input, 'html') xhtml_input, _not_used_ = convert(input, 'xhtml') - enc = chardet.detect(xhtml_input)['encoding'] try_encodings = [enc, 'utf8', 'latin1'] - res_content = None for encoding in try_encodings: try: - # TODO: fix path and manage images - #res_content = fix_img_path(unicode(html_res_content,encoding), - # unicode(xhtml_res_content,encoding), - # iimg) res_content_html = unicode(html_input, encoding) res_content_xhtml = unicode(xhtml_input, encoding) break; except UnicodeDecodeError: pass + + res_content_xhtml = fix_img_path(res_content_html, res_content_xhtml, images) + res_content_html = fix_html_img_path(res_content_html) + if not res_content_html or not res_content_xhtml: raise Exception('UnicodeDecodeError: could not decode') - return res_content_html, res_content_xhtml, images + return res_content_html, cleanup(res_content_xhtml), images +def cleanup(string): + return string.replace(u'\xc2\xa0',u'') + def markdown_from_code(code): CODE_INDICATOR = " " # 4 spaces return '\n'.join([CODE_INDICATOR + line for line in code.split('\n')]) diff -r 7b7ba5e47268 -r 9075dc2fb93c src/cm/converters/oo_converters.py --- a/src/cm/converters/oo_converters.py Tue Apr 20 10:47:04 2010 +0200 +++ b/src/cm/converters/oo_converters.py Tue Apr 20 11:37:33 2010 +0200 @@ -223,35 +223,6 @@ THE_INDIR = "indir" THE_INFILE = "infile" - -def fix_img_path(html,xhtml,imgs): - """ - imgs : name --> path - """ - finder_re = 'src[\s]*=[\s]*\"((?:(?!https?))[^\"]*)\"' - len_res_html = len(re.findall(finder_re,html,re.IGNORECASE)) - len_res_xhtml = len(re.findall(finder_re,xhtml,re.IGNORECASE)) - res_html = re.finditer(finder_re,html,re.IGNORECASE) - res_xhtml = re.finditer(finder_re,xhtml,re.IGNORECASE) - result = [] - last_index = 0 - for match_xhtml in res_xhtml: - img_path = '' - try: - match_html = res_html.next() - if match_html: - img_name = match_html.group(1) - img_path = imgs[img_name] - except StopIteration: - # TODO : report pb - pass - offset = len(match_xhtml.group(0)) - len(match_xhtml.group(1)) - result.append(xhtml[last_index:match_xhtml.start() + offset - 1]) - result.append(img_path) - last_index = match_xhtml.end() - 1 # -1 because trailing " - result.append(xhtml[last_index:len(xhtml)]) - return u''.join(result) - def extract_css_body(xhtml): dom = parseString(xhtml.encode('utf8')) diff -r 7b7ba5e47268 -r 9075dc2fb93c src/cm/converters/pandoc_converters.py --- a/src/cm/converters/pandoc_converters.py Tue Apr 20 10:47:04 2010 +0200 +++ b/src/cm/converters/pandoc_converters.py Tue Apr 20 11:37:33 2010 +0200 @@ -13,7 +13,8 @@ from cm.utils.string_utils import to_unicode PANDOC_BIN = "pandoc" -PANDOC_OPTIONS = "--sanitize-html " +PANDOC_OPTIONS = " --sanitize-html " +PANDOC_OPTIONS_RAW = " -R " MARKDOWN2PDF_BIN = "markdown2pdf" @@ -37,7 +38,7 @@ _PANDOC_ENCODING = 'utf8' @memoize -def pandoc_convert(content, from_format, to_format, full=False): +def pandoc_convert(content, from_format, to_format, full=False, raw=False): """ Convert markdown content to pdf @@ -56,7 +57,7 @@ if from_format != 'markdown': content = pandoc_convert(content, from_format, 'markdown', True) return pandoc_markdown2pdf(content) - return pandoc_pandoc(content, from_format, to_format, full) + return pandoc_pandoc(content, from_format, to_format, full, from_format==to_format=='html') # use raw pandoc convertion if html->html def content_or_file_name(content, file_name): if not content and not file_name: @@ -154,7 +155,7 @@ # TODO: use tidy to cleanup html @memoize -def pandoc_pandoc(content, from_format, to_format, full=False): +def pandoc_pandoc(content, from_format, to_format, full=False, raw=False): """ Convert content (should be unicode) from from_format to to_format (if full: includes header & co [html, latex]) @@ -189,7 +190,11 @@ input_file.close() # pandoc arguments and command line - cmd_args = ' %s -o %s ' %(PANDOC_OPTIONS,output_temp_name) + p_options = PANDOC_OPTIONS + if raw: + p_options = PANDOC_OPTIONS_RAW + + cmd_args = ' %s -o %s ' %(p_options,output_temp_name) if full: cmd_args += ' -s ' cmd_args += ' -f %s ' % from_format diff -r 7b7ba5e47268 -r 9075dc2fb93c src/cm/models.py