src/cm/converters/__init__.py
author raph
Mon, 11 Jan 2010 16:29:48 +0100
changeset 77 fe91eb717a96
parent 50 6db6c011a310
child 78 dda94db1149a
permissions -rw-r--r--
import oo_converters locally (not at module level) to avoid weird uno imports
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
0
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
     1
from pandoc_converters import pandoc_convert
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
     2
import chardet 
77
fe91eb717a96 import oo_converters locally (not at module level) to avoid weird uno imports
raph
parents: 50
diff changeset
     3
import re
0
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
     4
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
     5
# TODO: move that in text_base: save images
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
     6
def convert_from_mimetype(file_name, mime_type, format):
    """Read ``file_name`` and convert its content to ``format``.

    The raw file content is passed, together with ``mime_type`` and
    ``format``, to ``_convert_from_mimetype``; that call's result is
    returned unchanged.
    """
    # Binary mode: the content may be an office document, so the bytes must
    # reach the converters untouched whatever the platform.
    source = open(file_name, 'rb')
    try:
        content = source.read()
    finally:
        # Release the file handle even if reading fails.
        source.close()
    return _convert_from_mimetype(content, mime_type, format)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
     9
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    10
def to_unicode(input):
    """Decode a byte string to unicode, trying several encodings in turn.

    The encoding guessed by ``chardet`` is tried first, then ``utf8``, then
    ``latin1``; the first successful decoding is returned.  A value that is
    not a ``str`` is returned unchanged.

    Raises ``Exception`` when none of the candidate encodings can decode
    the input.
    """
    if not isinstance(input, str):
        return input

    candidates = [chardet.detect(input)['encoding'], 'utf8', 'latin1']
    for encoding in candidates:
        if not encoding:
            # No usable guess from chardet (its 'encoding' value can be
            # None): skip it instead of letting unicode() fail on None.
            continue
        try:
            # Return right away so that an empty (falsy) decoded string is
            # not mistaken for a decoding failure.
            return unicode(input, encoding)
        except (UnicodeDecodeError, LookupError):
            # LookupError: no codec is registered under the guessed name.
            pass
    raise Exception('UnicodeDecodeError: could not decode')
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    23
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    24
def _convert_from_mimetype(input, mime_type, format):
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    25
    #input = to_unicode(input)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    26
        
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    27
    attachs = []
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    28
    attachs_dir = None
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    29
    ##############################
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    30
    if mime_type in ['application/vnd.oasis.opendocument.text',
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    31
                     'application/msword',
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    32
                     ]:
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    33
        
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    34
        xhtml_input, attachs = convert_oo_to_html(input)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    35
        converted_input = pandoc_convert(xhtml_input, 'html', format)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    36
        
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    37
    ##############################
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    38
    # anything looks like text -> markdown
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    39
    elif mime_type in ['text/plain',
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    40
                       'text/english',
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    41
                       'text/enriched'
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    42
                      ]:
50
6db6c011a310 add unicode detection for txt format in upload
raph
parents: 0
diff changeset
    43
        converted_input = to_unicode(input)
0
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    44
        
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    45
    ##############################
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    46
    # anything looks like code: put them into markdown citation
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    47
    elif mime_type.startswith('text/x-') or mime_type in ['application/x-latex',
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    48
                                                          'application/x-ruby',
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    49
                       ]:
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    50
        converted_input = markdown_from_code(input)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    51
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    52
    ##############################
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    53
    # html
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    54
    elif mime_type in ['text/html', 'application/xhtml+xml']:
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    55
        if format == 'html':
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    56
            converted_input = input
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    57
        else:
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    58
            converted_input = pandoc_convert(input, 'html', format)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    59
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    60
    return converted_input, attachs
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    61
    
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    62
def fix_img_path(html, xhtml, imgs):
    """Rewrite the local image ``src`` attributes of ``xhtml``.

    imgs : name --> path

    Local ``src="..."`` attributes (those whose value does not start with
    ``http``) are paired by position between ``html`` and ``xhtml``: the
    value found in ``html`` is the name looked up in ``imgs``, and the
    resulting path replaces the value of the matching attribute in
    ``xhtml``.  When ``html`` has fewer such attributes than ``xhtml``, the
    extra ``xhtml`` attributes get an empty value.

    Returns the rewritten ``xhtml`` as a unicode string.

    Raises ``KeyError`` when a name found in ``html`` is not a key of
    ``imgs``.
    """
    finder = re.compile(r'src[\s]*=[\s]*\"((?:(?!https?))[^\"]*)\"',
                        re.IGNORECASE)
    # The pattern has a single group, so findall() yields the src values.
    html_names = finder.findall(html)

    result = []
    last_index = 0
    for position, match_xhtml in enumerate(finder.finditer(xhtml)):
        if position < len(html_names):
            img_path = imgs[html_names[position]]
        else:
            # TODO : report pb (no counterpart in html for this image)
            img_path = ''
        # Keep everything up to and including the opening quote, then
        # substitute the path for the old attribute value (group 1).
        result.append(xhtml[last_index:match_xhtml.start(1)])
        result.append(img_path)
        # Resume on the closing quote so that it is copied with the rest.
        last_index = match_xhtml.end(1)
    result.append(xhtml[last_index:])
    return u''.join(result)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    89
77
fe91eb717a96 import oo_converters locally (not at module level) to avoid weird uno imports
raph
parents: 50
diff changeset
    90
def convert_oo_to_html(input):
    """Convert an office document to html and decode the html to unicode.

    ``input`` is handed to ``oo_converters.convert`` with the ``'html'``
    target.  The html it returns is decoded with the first encoding that
    works among the one guessed by ``chardet``, ``utf8`` and ``latin1``.

    Returns a ``(html, images)`` tuple: the decoded html and the second
    value returned by ``convert``.

    Raises ``Exception`` when none of the candidate encodings can decode
    the html.
    """
    # Imported locally (not at module level) to avoid weird uno imports.
    from oo_converters import convert
    html_input, images = convert(input, 'html')

    enc = chardet.detect(html_input)['encoding']
    try_encodings = [enc, 'utf8', 'latin1']
    res_content_html = None
    for encoding in try_encodings:
        if not encoding:
            # No usable guess from chardet (its 'encoding' value can be
            # None): skip it instead of letting unicode() fail on None.
            continue
        try:
            res_content_html = unicode(html_input, encoding)
            break
        except (UnicodeDecodeError, LookupError):
            # LookupError: no codec is registered under the guessed name.
            pass
    # Compare with None: an empty document decodes to a falsy empty string,
    # which is not a decoding failure.
    if res_content_html is None:
        raise Exception('UnicodeDecodeError: could not decode')
    return res_content_html, images
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   106
77
fe91eb717a96 import oo_converters locally (not at module level) to avoid weird uno imports
raph
parents: 50
diff changeset
   107
def old_convert_oo_to_html(input):
    """Convert an office document to both html and xhtml, decoded to unicode.

    ``input`` is handed twice to ``oo_converters.convert``, once with the
    ``'html'`` target and once with ``'xhtml'``.  Both outputs are decoded
    with the same encoding: the first that works for both among the one
    guessed by ``chardet`` on the xhtml output, ``utf8`` and ``latin1``.

    Returns a ``(html, xhtml, images)`` tuple; ``images`` is the second
    value returned by the html conversion.

    Raises ``Exception`` when none of the candidate encodings can decode
    both outputs.
    """
    # Imported locally (not at module level) to avoid weird uno imports.
    from oo_converters import convert
    html_input, images = convert(input, 'html')
    xhtml_input, _not_used_ = convert(input, 'xhtml')

    enc = chardet.detect(xhtml_input)['encoding']
    try_encodings = [enc, 'utf8', 'latin1']
    res_content_html = None
    res_content_xhtml = None
    for encoding in try_encodings:
        if not encoding:
            # No usable guess from chardet (its 'encoding' value can be
            # None): skip it instead of letting unicode() fail on None.
            continue
        try:
            # TODO: fix path and manage images (fix_img_path is not applied
            # to the decoded documents yet)
            decoded_html = unicode(html_input, encoding)
            decoded_xhtml = unicode(xhtml_input, encoding)
        except (UnicodeDecodeError, LookupError):
            # LookupError: no codec is registered under the guessed name.
            continue
        # Publish the pair only once both documents decoded with the same
        # encoding.
        res_content_html = decoded_html
        res_content_xhtml = decoded_xhtml
        break
    # Compare with None: an empty document decodes to a falsy empty string,
    # which is not a decoding failure.
    if res_content_html is None or res_content_xhtml is None:
        raise Exception('UnicodeDecodeError: could not decode')
    return res_content_html, res_content_xhtml, images
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   129
        
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   130
def markdown_from_code(code):
    """Indent every line of ``code`` by four spaces.

    ``code`` is split on newline characters, each resulting line (empty
    ones included) is prefixed with four spaces, and the lines are joined
    back with newline characters.
    """
    indent = ' ' * 4
    indented_lines = []
    for raw_line in code.split('\n'):
        indented_lines.append(indent + raw_line)
    return '\n'.join(indented_lines)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   133
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   134