src/cm/converters/__init__.py
branch: preserve_html
changeset 252 0f0a79f7f213
parent 149 0f2c5744b39b
child 253 a844469257b0
equal deleted inserted replaced
251:3eb5299e8085 252:0f0a79f7f213
     1 from pandoc_converters import pandoc_convert
     1 from pandoc_converters import pandoc_convert
     2 import chardet 
     2 import chardet 
     3 from cm.utils.string_utils import to_unicode 
     3 from cm.utils.string_utils import to_unicode 
     4 import re
     4 import re
       
     5 from cm.converters.oo_converters import extract_css_body
       
     6 
     5 
     7 
     6 # TODO: move that in text_base: save images
     8 # TODO: move that in text_base: save images
     7 def convert_from_mimetype(file_name, mime_type, format):
     9 def convert_from_mimetype(file_name, mime_type, format):
     8     input = open(file_name, 'r').read()
    10     input = open(file_name, 'r').read()
     9     return _convert_from_mimetype(input, mime_type, format)
    11     return _convert_from_mimetype(input, mime_type, format)
    17     ##############################
    19     ##############################
    18     if mime_type in ['application/vnd.oasis.opendocument.text',
    20     if mime_type in ['application/vnd.oasis.opendocument.text',
    19                      'application/msword',
    21                      'application/msword',
    20                      ]:
    22                      ]:
    21         
    23         
    22         xhtml_input, attachs = convert_oo_to_html(input)
    24         html_input, xhtml_input, attachs = convert_oo_to_html_and_xhtml(input)
    23         converted_input = pandoc_convert(xhtml_input, 'html', format)
    25         if format == 'html':
       
    26                 _not_used_css, converted_input = extract_css_body(xhtml_input)
       
    27                 #converted_input = xhtml_input
       
    28         else:
       
    29             converted_input = pandoc_convert(html_input, 'html', format)
    24         
    30         
    25     ##############################
    31     ##############################
    26     # latex
    32     # latex
    27     elif mime_type in ['application/x-latex','text/x-tex',]:
    33     elif mime_type in ['application/x-latex','text/x-tex',]:
    28         converted_input = pandoc_convert(to_unicode(input), 'latex', format)
    34         converted_input = pandoc_convert(to_unicode(input), 'latex', format)
    86     from oo_converters import convert    
    92     from oo_converters import convert    
    87     html_input, images = convert(input, 'html')
    93     html_input, images = convert(input, 'html')
    88     
    94     
    89     enc = chardet.detect(html_input)['encoding']
    95     enc = chardet.detect(html_input)['encoding']
    90     try_encodings = [enc, 'utf8', 'latin1']
    96     try_encodings = [enc, 'utf8', 'latin1']
    91     res_content = None
       
    92     for encoding in try_encodings:
    97     for encoding in try_encodings:
    93         try:
    98         try:
    94             res_content_html = unicode(html_input, encoding)
    99             res_content_html = unicode(html_input, encoding)
    95             break;
   100             break;
    96         except UnicodeDecodeError:
   101         except UnicodeDecodeError:
    97             pass
   102             pass
    98     if not res_content_html:
   103     if not res_content_html:
    99         raise Exception('UnicodeDecodeError: could not decode')
   104         raise Exception('UnicodeDecodeError: could not decode')
   100     return res_content_html, images
   105     return res_content_html, images
   101 
   106 
   102 def old_convert_oo_to_html(input): 
   107 def convert_oo_to_html_and_xhtml(input): 
   103     from oo_converters import convert   
   108     from oo_converters import convert   
   104     html_input, images = convert(input, 'html')
   109     html_input, images = convert(input, 'html')
   105     xhtml_input, _not_used_ = convert(input, 'xhtml')
   110     xhtml_input, _not_used_ = convert(input, 'xhtml')
   106     
   111     
   107     enc = chardet.detect(xhtml_input)['encoding']
   112     enc = chardet.detect(xhtml_input)['encoding']
   108     try_encodings = [enc, 'utf8', 'latin1']
   113     try_encodings = [enc, 'utf8', 'latin1']
   109     res_content = None
       
   110     for encoding in try_encodings:
   114     for encoding in try_encodings:
   111         try:
   115         try:
   112             # TODO: fix path and manage images
   116             # TODO: fix path and manage images
   113             #res_content = fix_img_path(unicode(html_res_content,encoding),
   117             #res_content = fix_img_path(unicode(html_res_content,encoding),
   114             #                           unicode(xhtml_res_content,encoding),
   118             #                           unicode(xhtml_res_content,encoding),
   118             break;
   122             break;
   119         except UnicodeDecodeError:
   123         except UnicodeDecodeError:
   120             pass
   124             pass
   121     if not res_content_html or not res_content_xhtml:
   125     if not res_content_html or not res_content_xhtml:
   122         raise Exception('UnicodeDecodeError: could not decode')
   126         raise Exception('UnicodeDecodeError: could not decode')
   123     return res_content_html, res_content_xhtml, images
   127     return res_content_html, cleanup(res_content_xhtml), images
   124         
   128         
       
   129 def cleanup(string):
       
   130     return string.replace(u'\xc2\xa0',u'')
       
   131 
   125 def markdown_from_code(code):
   132 def markdown_from_code(code):
   126     CODE_INDICATOR = "    " # 4 spaces
   133     CODE_INDICATOR = "    " # 4 spaces
   127     return '\n'.join([CODE_INDICATOR + line for line in code.split('\n')])
   134     return '\n'.join([CODE_INDICATOR + line for line in code.split('\n')])
   128 
   135 
   129         
   136