src/cm/converters/__init__.py
changeset 0 40c8f766c9b8
child 50 6db6c011a310
equal deleted inserted replaced
-1:000000000000 0:40c8f766c9b8
       
     1 from oo_converters import convert
       
     2 from pandoc_converters import pandoc_convert
       
     3 import chardet 
       
     4 
       
     5 # TODO: move that in text_base: save images
       
     6 def convert_from_mimetype(file_name, mime_type, format):
       
     7     input = open(file_name, 'r').read()
       
     8     return _convert_from_mimetype(input, mime_type, format)
       
     9 
       
    10 def to_unicode(input):
       
    11     if type(input) == str:
       
    12         res = None
       
    13         for encoding in [chardet.detect(input)['encoding'], 'utf8', 'latin1']:
       
    14             try:
       
    15                 res = unicode(input, encoding)
       
    16                 break;
       
    17             except UnicodeDecodeError:
       
    18                 pass
       
    19         if not res:
       
    20             raise Exception('UnicodeDecodeError: could not decode')
       
    21         return res
       
    22     return input
       
    23 
       
    24 def _convert_from_mimetype(input, mime_type, format):
       
    25     #input = to_unicode(input)
       
    26         
       
    27     attachs = []
       
    28     attachs_dir = None
       
    29     ##############################
       
    30     if mime_type in ['application/vnd.oasis.opendocument.text',
       
    31                      'application/msword',
       
    32                      ]:
       
    33         
       
    34         xhtml_input, attachs = convert_oo_to_html(input)
       
    35         converted_input = pandoc_convert(xhtml_input, 'html', format)
       
    36         
       
    37     ##############################
       
    38     # anything looks like text -> markdown
       
    39     elif mime_type in ['text/plain',
       
    40                        'text/english',
       
    41                        'text/enriched'
       
    42                       ]:
       
    43         converted_input = input
       
    44         
       
    45     ##############################
       
    46     # anything looks like code: put them into markdown citation
       
    47     elif mime_type.startswith('text/x-') or mime_type in ['application/x-latex',
       
    48                                                           'application/x-ruby',
       
    49                        ]:
       
    50         converted_input = markdown_from_code(input)
       
    51 
       
    52     ##############################
       
    53     # html
       
    54     elif mime_type in ['text/html', 'application/xhtml+xml']:
       
    55         if format == 'html':
       
    56             converted_input = input
       
    57         else:
       
    58             converted_input = pandoc_convert(input, 'html', format)
       
    59 
       
    60     return converted_input, attachs
       
    61     
       
    62 def fix_img_path(html, xhtml, imgs):
       
    63     """
       
    64     imgs : name --> path
       
    65     """
       
    66     finder_re = 'src[\s]*=[\s]*\"((?:(?!https?))[^\"]*)\"'
       
    67     len_res_html = len(re.findall(finder_re, html, re.IGNORECASE))
       
    68     len_res_xhtml = len(re.findall(finder_re, xhtml, re.IGNORECASE))
       
    69     res_html = re.finditer(finder_re, html, re.IGNORECASE)
       
    70     res_xhtml = re.finditer(finder_re, xhtml, re.IGNORECASE)
       
    71     result = []
       
    72     last_index = 0
       
    73     for match_xhtml in res_xhtml:
       
    74         img_path = '' 
       
    75         try:
       
    76             match_html = res_html.next()
       
    77             if match_html:
       
    78                 img_name = match_html.group(1)
       
    79                 img_path = imgs[img_name]
       
    80         except StopIteration:
       
    81             # TODO : report pb
       
    82             pass 
       
    83         offset = len(match_xhtml.group(0)) - len(match_xhtml.group(1))
       
    84         result.append(xhtml[last_index:match_xhtml.start() + offset - 1])
       
    85         result.append(img_path)
       
    86         last_index = match_xhtml.end() - 1 # -1 because trailing "
       
    87     result.append(xhtml[last_index:len(xhtml)])
       
    88     return u''.join(result)
       
    89 
       
    90 def convert_oo_to_html(input):    
       
    91     html_input, images = convert(input, 'html')
       
    92     
       
    93     enc = chardet.detect(html_input)['encoding']
       
    94     try_encodings = [enc, 'utf8', 'latin1']
       
    95     res_content = None
       
    96     for encoding in try_encodings:
       
    97         try:
       
    98             res_content_html = unicode(html_input, encoding)
       
    99             break;
       
   100         except UnicodeDecodeError:
       
   101             pass
       
   102     if not res_content_html:
       
   103         raise Exception('UnicodeDecodeError: could not decode')
       
   104     return res_content_html, images
       
   105 
       
   106 def old_convert_oo_to_html(input):    
       
   107     html_input, images = convert(input, 'html')
       
   108     xhtml_input, _not_used_ = convert(input, 'xhtml')
       
   109     
       
   110     enc = chardet.detect(xhtml_input)['encoding']
       
   111     try_encodings = [enc, 'utf8', 'latin1']
       
   112     res_content = None
       
   113     for encoding in try_encodings:
       
   114         try:
       
   115             # TODO: fix path and manage images
       
   116             #res_content = fix_img_path(unicode(html_res_content,encoding),
       
   117             #                           unicode(xhtml_res_content,encoding),
       
   118             #                           iimg)
       
   119             res_content_html = unicode(html_input, encoding)
       
   120             res_content_xhtml = unicode(xhtml_input, encoding)
       
   121             break;
       
   122         except UnicodeDecodeError:
       
   123             pass
       
   124     if not res_content_html or not res_content_xhtml:
       
   125         raise Exception('UnicodeDecodeError: could not decode')
       
   126     return res_content_html, res_content_xhtml, images
       
   127         
       
   128 def markdown_from_code(code):
       
   129     CODE_INDICATOR = "    " # 4 spaces
       
   130     return '\n'.join([CODE_INDICATOR + line for line in code.split('\n')])
       
   131 
       
   132