src/cm/converters/__init__.py
author Yves-Marie Haussonne <ymh.work+github@gmail.com>
Fri, 09 May 2014 18:35:26 +0200
changeset 656 a84519031134
parent 555 5d79dc4e50a3
permissions -rw-r--r--
add link to "privacy policy" in the header test
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
0
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
     1
from pandoc_converters import pandoc_convert
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
     2
import chardet 
149
0f2c5744b39b cleanup diff files / add experimental diff
raph
parents: 119
diff changeset
     3
from cm.utils.string_utils import to_unicode 
77
fe91eb717a96 import oo_converters locally (not at module level) to avoid weird uno imports
raph
parents: 50
diff changeset
     4
import re
253
a844469257b0 fix img path
raph
parents: 252
diff changeset
     5
import os
360
bfaab8740995 Add abiword as an alternative to open office for conversions
gibus
parents: 259
diff changeset
     6
from oo_converters import extract_css_body
252
0f0a79f7f213 do not use pandoc for html content
raph
parents: 149
diff changeset
     7
0
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
     8
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
     9
# TODO: move that in text_base: save images
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    10
def convert_from_mimetype(file_name, mime_type, format):
    """
    Read the file `file_name` and convert its content to `format`.

    file_name : path of the file to convert
    mime_type : mime type of the file, selects the conversion path
    format : target format, passed through to _convert_from_mimetype

    returns whatever _convert_from_mimetype returns for the file content
    """
    # Binary mode: the content may be an office document, and text mode
    # would alter the bytes on platforms that translate line endings.
    # The 'with' block makes sure the file handle is closed after reading.
    with open(file_name, 'rb') as input_file:
        content = input_file.read()
    return _convert_from_mimetype(content, mime_type, format)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    13
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    14
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    15
def _convert_from_mimetype(input, mime_type, format):
    """
    Convert the raw content `input` to `format`, dispatching on `mime_type`.

    input : raw content of the document
    mime_type : mime type of `input`
    format : target format name, handed to pandoc_convert

    returns (converted_input, attachs); attachs is only filled by the
    OO/MS-Word/RTF conversion path and is an empty list otherwise
    """
    attachs = []

    ##############################
    # OO/MS-Word
    if mime_type in ['application/vnd.oasis.opendocument.text',
                     'application/msword',
                     'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
                     'application/rtf',
                     'text/rtf',
                     ]:
        from cm.cm_settings import USE_ABI
        if USE_ABI:
            from abi_converters import AbiFileConverter
            converter = AbiFileConverter()
            try:
                html_input, attachs = converter.convert_to_html(input)
                html_input = re.sub(r' awml:style="[^"]*"', '', html_input)
                converted_input = pandoc_convert(html_input, 'html', format)
            except Exception:
                # If Abiword fails for any reason, try libreoffice.
                # 'except Exception' (not a bare except) lets
                # KeyboardInterrupt/SystemExit propagate, and the failure
                # is logged instead of being silently dropped.
                import logging
                logging.getLogger(__name__).warning(
                    'abiword conversion failed, falling back to the OO converter',
                    exc_info=True)
                converted_input, attachs = _convert_office_input(input, format)
        else:
            converted_input, attachs = _convert_office_input(input, format)

    ##############################
    # latex
    elif mime_type in ['application/x-latex', 'text/x-tex']:
        converted_input = pandoc_convert(to_unicode(input), 'latex', format)

    ##############################
    # anything looks like code: put them into markdown citation
    elif mime_type.startswith('text/x-') or mime_type in ['application/x-ruby']:
        converted_input = markdown_from_code(input)

    ##############################
    # html: always goes through pandoc, even for html -> html
    elif mime_type in ['text/html', 'application/xhtml+xml']:
        converted_input = pandoc_convert(input, 'html', format)

    ##############################
    # anything looks like text ('text/plain', 'text/english',
    # 'text/enriched') and default case: assume it's text
    else:
        converted_input = to_unicode(input)

    return converted_input, attachs


def _convert_office_input(input, format):
    """
    Convert an office document with the OO converter, then with pandoc.

    returns (converted_input, attachs)
    """
    # Only the html rendering is fed to pandoc; the xhtml one is not used.
    html_input, _xhtml_input, attachs = convert_oo_to_html_and_xhtml(input)
    return pandoc_convert(html_input, 'html', format), attachs
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    84
    
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    85
def fix_img_path(html, xhtml, imgs):
    """
    Rewrite the local image sources of `xhtml` with the ones found in `html`.

    The n-th src="..." value of `xhtml` that does not start with http(s) is
    replaced with the file name (last path component) of the n-th such value
    of `html`. When `html` has fewer of them than `xhtml`, the remaining
    values are replaced with an empty string.

    html : html rendering of the document
    xhtml : xhtml rendering of the same document
    imgs : name --> path (not used by this function)

    returns the rewritten xhtml
    """
    finder_re = re.compile(r'src[\s]*=[\s]*"((?:(?!https?))[^"]*)"',
                           re.IGNORECASE)
    res_html = finder_re.finditer(html)
    result = []
    last_index = 0
    for match_xhtml in finder_re.finditer(xhtml):
        # next(iterator, default) instead of iterator.next(): same pairing,
        # without relying on the python 2 only method
        match_html = next(res_html, None)
        if match_html is None:
            # TODO : report pb
            img_path = ''
        else:
            img_path = os.path.basename(match_html.group(1))
        # keep everything up to (and including) the opening quote ...
        result.append(xhtml[last_index:match_xhtml.start(1)])
        result.append(img_path)
        # ... and resume at the closing quote
        last_index = match_xhtml.end(1)
    result.append(xhtml[last_index:])
    return u''.join(result)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   112
77
fe91eb717a96 import oo_converters locally (not at module level) to avoid weird uno imports
raph
parents: 50
diff changeset
   113
def convert_oo_to_html(input):
    """
    Convert an office document to html with the OO converter.

    input : raw content of the document

    returns (html as unicode, images), images being the second value
    returned by oo_converters.convert

    raises Exception when the converted html could not be decoded (or
    is empty)
    """
    # imported locally (not at module level) to avoid weird uno imports
    from oo_converters import convert
    html_input, images = convert(input, 'html')

    enc = chardet.detect(html_input)['encoding']
    res_content_html = None
    for encoding in [enc, 'utf8', 'latin1']:
        if not encoding:
            # chardet gives None when it cannot guess: skip to the fallbacks
            continue
        try:
            res_content_html = unicode(html_input, encoding)
            break
        except (UnicodeDecodeError, LookupError):
            # wrong guess, or encoding name unknown to python: try the next
            pass
    if not res_content_html:
        raise Exception('UnicodeDecodeError: could not decode')
    return res_content_html, images
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   128
253
a844469257b0 fix img path
raph
parents: 252
diff changeset
   129
def fix_html_img_path(html):
    """
    Drop the '../outdir/' prefix from the IMG SRC attributes of `html`.

    Only the exact (upper case) 'IMG SRC="../outdir/' spelling is replaced.
    """
    prefixed_src = 'IMG SRC="../outdir/'
    bare_src = 'IMG SRC="'
    return html.replace(prefixed_src, bare_src)
a844469257b0 fix img path
raph
parents: 252
diff changeset
   131
    
252
0f0a79f7f213 do not use pandoc for html content
raph
parents: 149
diff changeset
   132
def convert_oo_to_html_and_xhtml(input):
    """
    Convert an office document to both html and xhtml with the OO converter.

    input : raw content of the document

    returns (html, xhtml, images): both renderings as unicode with their
    image paths fixed, and images being the second value returned by
    oo_converters.convert for the html rendering

    raises Exception when the renderings could not be decoded (or one
    of them is empty)
    """
    # imported locally (not at module level) to avoid weird uno imports
    from oo_converters import convert
    html_input, images = convert(input, 'html')
    xhtml_input, _not_used_ = convert(input, 'xhtml')

    # the encoding is guessed on the xhtml rendering and used for both
    enc = chardet.detect(xhtml_input)['encoding']
    res_content_html = None
    res_content_xhtml = None
    for encoding in [enc, 'utf8', 'latin1']:
        if not encoding:
            # chardet gives None when it cannot guess: skip to the fallbacks
            continue
        try:
            decoded_html = unicode(html_input, encoding)
            decoded_xhtml = unicode(xhtml_input, encoding)
        except (UnicodeDecodeError, LookupError):
            # wrong guess, or encoding name unknown to python: try the next
            continue
        res_content_html = decoded_html
        res_content_xhtml = decoded_xhtml
        break

    # check before using the decoded content, so that a failure is reported
    # with this message and not with a NameError
    if not res_content_html or not res_content_xhtml:
        raise Exception('UnicodeDecodeError: could not decode')

    res_content_xhtml = fix_img_path(res_content_html, res_content_xhtml, images)
    res_content_html = fix_html_img_path(res_content_html)
    return res_content_html, cleanup(res_content_xhtml), images
0
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   152
        
252
0f0a79f7f213 do not use pandoc for html content
raph
parents: 149
diff changeset
   153
def cleanup(string):
    """
    Remove every occurrence of the two characters u'\\xc2\\xa0' from `string`.
    """
    # NOTE(review): this looks like the utf8 bytes of a non-breaking space
    # written as unicode code points; a lone u'\xa0' is left untouched.
    # Confirm this is the intent.
    unwanted = u'\xc2\xa0'
    return string.replace(unwanted, u'')
0f0a79f7f213 do not use pandoc for html content
raph
parents: 149
diff changeset
   155
0
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   156
def markdown_from_code(code):
    """
    Turn `code` into a markdown code block: every line (as split on
    newlines) is prefixed with 4 spaces.
    """
    indent = "    " # 4 spaces
    # prefix the first line, then every line that follows a newline
    return indent + code.replace('\n', '\n' + indent)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   159
360
bfaab8740995 Add abiword as an alternative to open office for conversions
gibus
parents: 259
diff changeset
   160