# HG changeset patch # User raph # Date 1271344919 -7200 # Node ID a844469257b0d98caa28a8f26f6643eb1bd489d0 # Parent 0f0a79f7f213637673e48365537c94d9cf4c2f3c fix img path diff -r 0f0a79f7f213 -r a844469257b0 src/cm/converters/__init__.py --- a/src/cm/converters/__init__.py Thu Apr 15 16:38:45 2010 +0200 +++ b/src/cm/converters/__init__.py Thu Apr 15 17:21:59 2010 +0200 @@ -2,6 +2,7 @@ import chardet from cm.utils.string_utils import to_unicode import re +import os from cm.converters.oo_converters import extract_css_body @@ -77,7 +78,7 @@ match_html = res_html.next() if match_html: img_name = match_html.group(1) - img_path = imgs[img_name] + img_path = os.path.split(img_name)[-1] except StopIteration: # TODO : report pb pass @@ -104,24 +105,26 @@ raise Exception('UnicodeDecodeError: could not decode') return res_content_html, images +def fix_html_img_path(html): + return html.replace('IMG SRC="../outdir/','IMG SRC="') + def convert_oo_to_html_and_xhtml(input): from oo_converters import convert html_input, images = convert(input, 'html') xhtml_input, _not_used_ = convert(input, 'xhtml') - enc = chardet.detect(xhtml_input)['encoding'] try_encodings = [enc, 'utf8', 'latin1'] for encoding in try_encodings: try: - # TODO: fix path and manage images - #res_content = fix_img_path(unicode(html_res_content,encoding), - # unicode(xhtml_res_content,encoding), - # iimg) res_content_html = unicode(html_input, encoding) res_content_xhtml = unicode(xhtml_input, encoding) break; except UnicodeDecodeError: pass + + res_content_xhtml = fix_img_path(res_content_html, res_content_xhtml, images) + res_content_html = fix_html_img_path(res_content_html) + if not res_content_html or not res_content_xhtml: raise Exception('UnicodeDecodeError: could not decode') return res_content_html, cleanup(res_content_xhtml), images diff -r 0f0a79f7f213 -r a844469257b0 src/cm/converters/oo_converters.py --- a/src/cm/converters/oo_converters.py Thu Apr 15 16:38:45 2010 +0200 +++ b/src/cm/converters/oo_converters.py Thu Apr 15 17:21:59 2010 +0200 @@ -223,35 +223,6 @@ THE_INDIR = "indir" THE_INFILE = "infile" - -def fix_img_path(html,xhtml,imgs): - """ - imgs : name --> path - """ - finder_re = 'src[\s]*=[\s]*\"((?:(?!https?))[^\"]*)\"' - len_res_html = len(re.findall(finder_re,html,re.IGNORECASE)) - len_res_xhtml = len(re.findall(finder_re,xhtml,re.IGNORECASE)) - res_html = re.finditer(finder_re,html,re.IGNORECASE) - res_xhtml = re.finditer(finder_re,xhtml,re.IGNORECASE) - result = [] - last_index = 0 - for match_xhtml in res_xhtml: - img_path = '' - try: - match_html = res_html.next() - if match_html: - img_name = match_html.group(1) - img_path = imgs[img_name] - except StopIteration: - # TODO : report pb - pass - offset = len(match_xhtml.group(0)) - len(match_xhtml.group(1)) - result.append(xhtml[last_index:match_xhtml.start() + offset - 1]) - result.append(img_path) - last_index = match_xhtml.end() - 1 # -1 because trailing " - result.append(xhtml[last_index:len(xhtml)]) - return u''.join(result) - def extract_css_body(xhtml): dom = parseString(xhtml.encode('utf8'))