"""Converters turning document content into a target text format.

Module ``src/cm/converters/__init__.py``.  The conversion applied is chosen
from the MIME type of the input; see ``_convert_from_mimetype``.

NOTE: this is Python 2 code (it relies on the ``unicode`` builtin and on
implicit relative imports of the sibling converter modules).
"""
import re

import chardet

from oo_converters import convert
from pandoc_converters import pandoc_convert

# Encodings tried, in this order, after the one guessed by chardet.
_FALLBACK_ENCODINGS = ['utf8', 'latin1']

# Matches a quoted ``src="..."`` attribute whose value does not start with
# ``http``/``https``; group 1 is the attribute value without the quotes.
_IMG_SRC_RE = re.compile(r'src[\s]*=[\s]*"((?:(?!https?))[^"]*)"',
                         re.IGNORECASE)


def _candidate_encodings(raw):
    """Return the encodings to try for ``raw``, most likely first.

    chardet's guess comes first when it has one, followed by the fixed
    fallbacks.  chardet reports ``None`` when it cannot guess (e.g. for empty
    input); that value is skipped because ``unicode(raw, None)`` raises
    ``TypeError``.
    """
    guessed = chardet.detect(raw)['encoding']
    candidates = []
    if guessed:
        candidates.append(guessed)
    candidates.extend(_FALLBACK_ENCODINGS)
    return candidates


def _decode(raw, encodings):
    """Decode ``raw`` with the first encoding of ``encodings`` that works.

    Raises ``Exception`` when none of them can decode ``raw``.
    """
    for encoding in encodings:
        try:
            return unicode(raw, encoding)
        except (UnicodeDecodeError, LookupError):
            # LookupError: chardet may name a codec Python does not ship.
            pass
    raise Exception('UnicodeDecodeError: could not decode')


# TODO: move that in text_base: save images
def convert_from_mimetype(file_name, mime_type, format):
    """Read ``file_name`` and convert its content according to ``mime_type``.

    Returns what ``_convert_from_mimetype`` returns: a
    ``(converted_input, attachs)`` pair.
    """
    # Binary mode: office documents are not text, and text mode may alter
    # the bytes on some platforms.  try/finally (rather than ``with``) keeps
    # the module importable on old Python 2 interpreters.
    source = open(file_name, 'rb')
    try:
        content = source.read()
    finally:
        source.close()
    return _convert_from_mimetype(content, mime_type, format)


def to_unicode(input):
    """Return ``input`` decoded to ``unicode`` when it is a byte string.

    Byte strings are decoded with chardet's guess, then utf8, then latin1.
    Any other value (already ``unicode``, ``None``, ...) is returned as is.

    Raises ``Exception`` when no candidate encoding can decode ``input``.
    """
    if not isinstance(input, str):
        return input
    # An empty string decodes to u'' and is a valid result, so success is
    # signalled by _decode returning, not by the truthiness of the result.
    return _decode(input, _candidate_encodings(input))


def _convert_from_mimetype(input, mime_type, format):
    """Convert ``input`` to ``format`` according to ``mime_type``.

    ``input`` is handed to the converters as received (it is not decoded
    here).  Returns ``(converted_input, attachs)``; ``attachs`` is whatever
    ``convert_oo_to_html`` returned as its second value for office documents
    and an empty list otherwise.

    Raises ``ValueError`` when ``mime_type`` is not handled.
    """
    attachs = []

    ##############################
    # office documents: to html first, then to the requested format
    if mime_type in ['application/vnd.oasis.opendocument.text',
                     'application/msword',
                     ]:
        xhtml_input, attachs = convert_oo_to_html(input)
        converted_input = pandoc_convert(xhtml_input, 'html', format)

    ##############################
    # anything that looks like text is kept unchanged (``format`` is not
    # looked at in this branch)
    elif mime_type in ['text/plain',
                       'text/english',
                       'text/enriched'
                       ]:
        converted_input = input

    ##############################
    # anything that looks like code: wrap it in a markdown code block
    elif mime_type.startswith('text/x-') or mime_type in [
            'application/x-latex',
            'application/x-ruby',
            ]:
        converted_input = markdown_from_code(input)

    ##############################
    # html
    elif mime_type in ['text/html', 'application/xhtml+xml']:
        if format == 'html':
            converted_input = input
        else:
            converted_input = pandoc_convert(input, 'html', format)

    else:
        # Previously fell through and failed with an UnboundLocalError on
        # the return statement; fail with an explicit message instead.
        raise ValueError('unsupported mime type: %r' % (mime_type,))

    return converted_input, attachs


def fix_img_path(html, xhtml, imgs):
    """Replace the local ``src`` values of ``xhtml`` using ``imgs``.

    imgs : name --> path

    The n-th non-http ``src`` attribute of ``xhtml`` is replaced by
    ``imgs[name]``, where ``name`` is the value of the n-th non-http ``src``
    attribute of ``html``.  When ``html`` has fewer such attributes than
    ``xhtml``, the extra ones get an empty value.

    Raises ``KeyError`` when a name found in ``html`` is missing from
    ``imgs``.
    """
    html_names = [match.group(1) for match in _IMG_SRC_RE.finditer(html)]
    pieces = []
    last_index = 0
    for index, match in enumerate(_IMG_SRC_RE.finditer(xhtml)):
        img_path = ''
        if index < len(html_names):
            img_path = imgs[html_names[index]]
        # TODO : report pb when html has no counterpart for this match
        # Copy everything up to and including the opening quote, then the
        # new path; the closing quote is copied on the next round.
        pieces.append(xhtml[last_index:match.start(1)])
        pieces.append(img_path)
        last_index = match.end(1)
    pieces.append(xhtml[last_index:])
    return u''.join(pieces)


def convert_oo_to_html(input):
    """Convert an office document to html and decode the result.

    Returns ``(html, images)`` where ``html`` is ``unicode`` and ``images``
    is the second value returned by ``convert(input, 'html')``.

    Raises ``Exception`` when the html cannot be decoded.
    """
    html_input, images = convert(input, 'html')
    res_content_html = _decode(html_input, _candidate_encodings(html_input))
    return res_content_html, images


def old_convert_oo_to_html(input):
    """Convert an office document to both html and xhtml, decoded.

    The encoding is guessed from the xhtml output and the same encoding is
    used to decode both outputs.  Returns ``(html, xhtml, images)``.

    Raises ``Exception`` when no candidate encoding decodes both outputs.
    """
    html_input, images = convert(input, 'html')
    xhtml_input, _not_used_ = convert(input, 'xhtml')

    for encoding in _candidate_encodings(xhtml_input):
        try:
            # TODO: fix path and manage images (see fix_img_path)
            res_content_html = unicode(html_input, encoding)
            res_content_xhtml = unicode(xhtml_input, encoding)
        except (UnicodeDecodeError, LookupError):
            continue
        return res_content_html, res_content_xhtml, images
    raise Exception('UnicodeDecodeError: could not decode')


def markdown_from_code(code):
    """Return ``code`` with every line indented as a markdown code block."""
    CODE_INDICATOR = "    "  # 4 spaces
    return '\n'.join([CODE_INDICATOR + line for line in code.split('\n')])