src/cm/converters/__init__.py
changeset 0 40c8f766c9b8
child 50 6db6c011a310
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/cm/converters/__init__.py	Mon Nov 23 15:14:29 2009 +0100
@@ -0,0 +1,132 @@
+from oo_converters import convert
+from pandoc_converters import pandoc_convert
+import chardet 
+
+# TODO: move that in text_base: save images
+def convert_from_mimetype(file_name, mime_type, format):
+    input = open(file_name, 'r').read()
+    return _convert_from_mimetype(input, mime_type, format)
+
+def to_unicode(input):
+    if type(input) == str:
+        res = None
+        for encoding in [chardet.detect(input)['encoding'], 'utf8', 'latin1']:
+            try:
+                res = unicode(input, encoding)
+                break;
+            except UnicodeDecodeError:
+                pass
+        if not res:
+            raise Exception('UnicodeDecodeError: could not decode')
+        return res
+    return input
+
+def _convert_from_mimetype(input, mime_type, format):
+    #input = to_unicode(input)
+        
+    attachs = []
+    attachs_dir = None
+    ##############################
+    if mime_type in ['application/vnd.oasis.opendocument.text',
+                     'application/msword',
+                     ]:
+        
+        xhtml_input, attachs = convert_oo_to_html(input)
+        converted_input = pandoc_convert(xhtml_input, 'html', format)
+        
+    ##############################
+    # anything looks like text -> markdown
+    elif mime_type in ['text/plain',
+                       'text/english',
+                       'text/enriched'
+                      ]:
+        converted_input = input
+        
+    ##############################
+    # anything looks like code: put them into markdown citation
+    elif mime_type.startswith('text/x-') or mime_type in ['application/x-latex',
+                                                          'application/x-ruby',
+                       ]:
+        converted_input = markdown_from_code(input)
+
+    ##############################
+    # html
+    elif mime_type in ['text/html', 'application/xhtml+xml']:
+        if format == 'html':
+            converted_input = input
+        else:
+            converted_input = pandoc_convert(input, 'html', format)
+
+    return converted_input, attachs
+    
+def fix_img_path(html, xhtml, imgs):
+    """
+    imgs : name --> path
+    """
+    finder_re = 'src[\s]*=[\s]*\"((?:(?!https?))[^\"]*)\"'
+    len_res_html = len(re.findall(finder_re, html, re.IGNORECASE))
+    len_res_xhtml = len(re.findall(finder_re, xhtml, re.IGNORECASE))
+    res_html = re.finditer(finder_re, html, re.IGNORECASE)
+    res_xhtml = re.finditer(finder_re, xhtml, re.IGNORECASE)
+    result = []
+    last_index = 0
+    for match_xhtml in res_xhtml:
+        img_path = '' 
+        try:
+            match_html = res_html.next()
+            if match_html:
+                img_name = match_html.group(1)
+                img_path = imgs[img_name]
+        except StopIteration:
+            # TODO : report pb
+            pass 
+        offset = len(match_xhtml.group(0)) - len(match_xhtml.group(1))
+        result.append(xhtml[last_index:match_xhtml.start() + offset - 1])
+        result.append(img_path)
+        last_index = match_xhtml.end() - 1 # -1 because trailing "
+    result.append(xhtml[last_index:len(xhtml)])
+    return u''.join(result)
+
+def convert_oo_to_html(input):    
+    html_input, images = convert(input, 'html')
+    
+    enc = chardet.detect(html_input)['encoding']
+    try_encodings = [enc, 'utf8', 'latin1']
+    res_content = None
+    for encoding in try_encodings:
+        try:
+            res_content_html = unicode(html_input, encoding)
+            break;
+        except UnicodeDecodeError:
+            pass
+    if not res_content_html:
+        raise Exception('UnicodeDecodeError: could not decode')
+    return res_content_html, images
+
+def old_convert_oo_to_html(input):    
+    html_input, images = convert(input, 'html')
+    xhtml_input, _not_used_ = convert(input, 'xhtml')
+    
+    enc = chardet.detect(xhtml_input)['encoding']
+    try_encodings = [enc, 'utf8', 'latin1']
+    res_content = None
+    for encoding in try_encodings:
+        try:
+            # TODO: fix path and manage images
+            #res_content = fix_img_path(unicode(html_res_content,encoding),
+            #                           unicode(xhtml_res_content,encoding),
+            #                           iimg)
+            res_content_html = unicode(html_input, encoding)
+            res_content_xhtml = unicode(xhtml_input, encoding)
+            break;
+        except UnicodeDecodeError:
+            pass
+    if not res_content_html or not res_content_xhtml:
+        raise Exception('UnicodeDecodeError: could not decode')
+    return res_content_html, res_content_xhtml, images
+        
+def markdown_from_code(code):
+    CODE_INDICATOR = "    " # 4 spaces
+    return '\n'.join([CODE_INDICATOR + line for line in code.split('\n')])
+
+        
\ No newline at end of file