src/cm/converters/__init__.py
changeset 260 9075dc2fb93c
parent 259 0371caf8bcc6
child 360 bfaab8740995
equal deleted inserted replaced
257:7b7ba5e47268 260:9075dc2fb93c
     1 from pandoc_converters import pandoc_convert
     1 from pandoc_converters import pandoc_convert
     2 import chardet 
     2 import chardet 
     3 from cm.utils.string_utils import to_unicode 
     3 from cm.utils.string_utils import to_unicode 
     4 import re
     4 import re
       
     5 import os
       
     6 from cm.converters.oo_converters import extract_css_body
       
     7 
     5 
     8 
     6 # TODO: move that in text_base: save images
     9 # TODO: move that in text_base: save images
     7 def convert_from_mimetype(file_name, mime_type, format):
    10 def convert_from_mimetype(file_name, mime_type, format):
     8     input = open(file_name, 'r').read()
    11     input = open(file_name, 'r').read()
     9     return _convert_from_mimetype(input, mime_type, format)
    12     return _convert_from_mimetype(input, mime_type, format)
    17     ##############################
    20     ##############################
    18     if mime_type in ['application/vnd.oasis.opendocument.text',
    21     if mime_type in ['application/vnd.oasis.opendocument.text',
    19                      'application/msword',
    22                      'application/msword',
    20                      ]:
    23                      ]:
    21         
    24         
    22         xhtml_input, attachs = convert_oo_to_html(input)
    25         html_input, xhtml_input, attachs = convert_oo_to_html_and_xhtml(input)
    23         converted_input = pandoc_convert(xhtml_input, 'html', format)
    26         if format == 'html':
       
    27                 _not_used_css, converted_input = extract_css_body(xhtml_input)
       
    28                 #converted_input = xhtml_input
       
    29         
       
    30         converted_input = pandoc_convert(html_input, 'html', format)
    24         
    31         
    25     ##############################
    32     ##############################
    26     # latex
    33     # latex
    27     elif mime_type in ['application/x-latex','text/x-tex',]:
    34     elif mime_type in ['application/x-latex','text/x-tex',]:
    28         converted_input = pandoc_convert(to_unicode(input), 'latex', format)
    35         converted_input = pandoc_convert(to_unicode(input), 'latex', format)
    35     ##############################
    42     ##############################
    36     # html
    43     # html
    37     elif mime_type in ['text/html', 'application/xhtml+xml']:
    44     elif mime_type in ['text/html', 'application/xhtml+xml']:
    38         if format == 'html':
    45         if format == 'html':
    39             converted_input = input
    46             converted_input = input
    40         else:
    47         
    41             converted_input = pandoc_convert(input, 'html', format)
    48         converted_input = pandoc_convert(input, 'html', format)
    42     ##############################
    49     ##############################
    43     # anything looks like text -> markdown
    50     # anything looks like text -> markdown
    44     elif mime_type in ['text/plain',
    51     elif mime_type in ['text/plain',
    45                        'text/english',
    52                        'text/english',
    46                        'text/enriched'
    53                        'text/enriched'
    69         img_path = '' 
    76         img_path = '' 
    70         try:
    77         try:
    71             match_html = res_html.next()
    78             match_html = res_html.next()
    72             if match_html:
    79             if match_html:
    73                 img_name = match_html.group(1)
    80                 img_name = match_html.group(1)
    74                 img_path = imgs[img_name]
    81                 img_path = os.path.split(img_name)[-1]
    75         except StopIteration:
    82         except StopIteration:
    76             # TODO : report pb
    83             # TODO : report pb
    77             pass 
    84             pass 
    78         offset = len(match_xhtml.group(0)) - len(match_xhtml.group(1))
    85         offset = len(match_xhtml.group(0)) - len(match_xhtml.group(1))
    79         result.append(xhtml[last_index:match_xhtml.start() + offset - 1])
    86         result.append(xhtml[last_index:match_xhtml.start() + offset - 1])
    86     from oo_converters import convert    
    93     from oo_converters import convert    
    87     html_input, images = convert(input, 'html')
    94     html_input, images = convert(input, 'html')
    88     
    95     
    89     enc = chardet.detect(html_input)['encoding']
    96     enc = chardet.detect(html_input)['encoding']
    90     try_encodings = [enc, 'utf8', 'latin1']
    97     try_encodings = [enc, 'utf8', 'latin1']
    91     res_content = None
       
    92     for encoding in try_encodings:
    98     for encoding in try_encodings:
    93         try:
    99         try:
    94             res_content_html = unicode(html_input, encoding)
   100             res_content_html = unicode(html_input, encoding)
    95             break;
   101             break;
    96         except UnicodeDecodeError:
   102         except UnicodeDecodeError:
    97             pass
   103             pass
    98     if not res_content_html:
   104     if not res_content_html:
    99         raise Exception('UnicodeDecodeError: could not decode')
   105         raise Exception('UnicodeDecodeError: could not decode')
   100     return res_content_html, images
   106     return res_content_html, images
   101 
   107 
   102 def old_convert_oo_to_html(input): 
   108 def fix_html_img_path(html):
       
   109     return html.replace('IMG SRC="../outdir/','IMG SRC="')
       
   110     
       
   111 def convert_oo_to_html_and_xhtml(input): 
   103     from oo_converters import convert   
   112     from oo_converters import convert   
   104     html_input, images = convert(input, 'html')
   113     html_input, images = convert(input, 'html')
   105     xhtml_input, _not_used_ = convert(input, 'xhtml')
   114     xhtml_input, _not_used_ = convert(input, 'xhtml')
   106     
       
   107     enc = chardet.detect(xhtml_input)['encoding']
   115     enc = chardet.detect(xhtml_input)['encoding']
   108     try_encodings = [enc, 'utf8', 'latin1']
   116     try_encodings = [enc, 'utf8', 'latin1']
   109     res_content = None
       
   110     for encoding in try_encodings:
   117     for encoding in try_encodings:
   111         try:
   118         try:
   112             # TODO: fix path and manage images
       
   113             #res_content = fix_img_path(unicode(html_res_content,encoding),
       
   114             #                           unicode(xhtml_res_content,encoding),
       
   115             #                           iimg)
       
   116             res_content_html = unicode(html_input, encoding)
   119             res_content_html = unicode(html_input, encoding)
   117             res_content_xhtml = unicode(xhtml_input, encoding)
   120             res_content_xhtml = unicode(xhtml_input, encoding)
   118             break;
   121             break;
   119         except UnicodeDecodeError:
   122         except UnicodeDecodeError:
   120             pass
   123             pass
       
   124 
       
   125     res_content_xhtml = fix_img_path(res_content_html, res_content_xhtml, images)
       
   126     res_content_html = fix_html_img_path(res_content_html)
       
   127     
   121     if not res_content_html or not res_content_xhtml:
   128     if not res_content_html or not res_content_xhtml:
   122         raise Exception('UnicodeDecodeError: could not decode')
   129         raise Exception('UnicodeDecodeError: could not decode')
   123     return res_content_html, res_content_xhtml, images
   130     return res_content_html, cleanup(res_content_xhtml), images
   124         
   131         
       
   132 def cleanup(string):
       
   133     return string.replace(u'\xc2\xa0',u'')
       
   134 
   125 def markdown_from_code(code):
   135 def markdown_from_code(code):
   126     CODE_INDICATOR = "    " # 4 spaces
   136     CODE_INDICATOR = "    " # 4 spaces
   127     return '\n'.join([CODE_INDICATOR + line for line in code.split('\n')])
   137     return '\n'.join([CODE_INDICATOR + line for line in code.split('\n')])
   128 
   138 
   129         
   139