comt: comparison src/cm/converters/__init__.py

equal deleted inserted replaced

--1:000000000000
+:40c8f766c9b8
+from oo_converters import convert
+from pandoc_converters import pandoc_convert
+import chardet
+# TODO: move that in text_base: save images
def convert_from_mimetype(file_name, mime_type, format):
    """Read the file ``file_name`` and convert its content to ``format``.

    The file is opened in binary mode because the mime types handled by
    ``_convert_from_mimetype`` include binary office documents
    (OpenDocument text, MS Word); text mode could alter such content on
    platforms that translate line endings.

    Args:
        file_name: path of the file to read.
        mime_type: mime type describing the content of the file.
        format: target format, passed through to ``_convert_from_mimetype``.

    Returns:
        The value returned by ``_convert_from_mimetype`` for the file content.
    """
    # The context manager guarantees the handle is closed even if read() fails.
    with open(file_name, 'rb') as source:
        content = source.read()
    return _convert_from_mimetype(content, mime_type, format)
def to_unicode(input):
    """Decode a byte string to unicode, trying several encodings in turn.

    The encoding guessed by ``chardet`` is tried first, then ``utf8``, then
    ``latin1``. Anything that is not a byte string is returned unchanged.

    Args:
        input: the value to decode.

    Returns:
        The decoded text, or ``input`` itself when it is not a byte string.

    Raises:
        Exception: if none of the candidate encodings can decode ``input``.
    """
    # ``bytes`` is an alias of ``str`` on Python 2, so this is the same check
    # as the historical ``type(input) == str`` (but also accepts subclasses).
    if not isinstance(input, bytes):
        return input
    # Empty input is valid: there is nothing to detect or decode.
    if not input:
        return u''
    candidates = [chardet.detect(input)['encoding'], 'utf8', 'latin1']
    for encoding in candidates:
        # chardet reports ``None`` when it cannot guess an encoding.
        if not encoding:
            continue
        try:
            return input.decode(encoding)
        except (UnicodeDecodeError, LookupError):
            # Wrong guess or codec unknown to Python: try the next one.
            continue
    raise Exception('UnicodeDecodeError: could not decode')
+def _convert_from_mimetype(input, mime_type, format):
+#input = to_unicode(input)
+attachs = []
+attachs_dir = None
+##############################
+if mime_type in ['application/vnd.oasis.opendocument.text',
+'application/msword',
+]:
+xhtml_input, attachs = convert_oo_to_html(input)
+converted_input = pandoc_convert(xhtml_input, 'html', format)
+##############################
+# anything looks like text -> markdown
+elif mime_type in ['text/plain',
+'text/english',
+'text/enriched'
+]:
+converted_input = input
+##############################
+# anything looks like code: put them into markdown citation
+elif mime_type.startswith('text/x-') or mime_type in ['application/x-latex',
+'application/x-ruby',
+]:
+converted_input = markdown_from_code(input)
+##############################
+# html
+elif mime_type in ['text/html', 'application/xhtml+xml']:
+if format == 'html':
+converted_input = input
+else:
+converted_input = pandoc_convert(input, 'html', format)
+return converted_input, attachs
def fix_img_path(html, xhtml, imgs):
    """Rewrite the local ``src`` attributes of ``xhtml`` using ``imgs``.

    The n-th ``src="..."`` attribute found in ``xhtml`` is paired with the
    n-th one found in ``html``. The value found in ``html`` is looked up in
    ``imgs`` and the result replaces the value in ``xhtml``. Attribute values
    starting with ``http`` are not matched and are therefore left untouched.
    When ``html`` has fewer matches than ``xhtml``, the remaining ``xhtml``
    values are replaced by an empty string.

    Args:
        html: HTML text whose ``src`` values are the keys of ``imgs``.
        xhtml: XHTML text whose ``src`` values are rewritten.
        imgs: mapping of image name --> path.

    Returns:
        ``xhtml`` with the matched ``src`` values replaced.

    Raises:
        KeyError: if a ``src`` value found in ``html`` is not a key of ``imgs``.
    """
    # Local import: the module does not import ``re`` at top level.
    import re

    finder = re.compile(r'src[\s]*=[\s]*"((?:(?!https?))[^"]*)"',
                        re.IGNORECASE)
    html_matches = finder.finditer(html)

    pieces = []
    last_index = 0
    for match_xhtml in finder.finditer(xhtml):
        match_html = next(html_matches, None)
        if match_html is None:
            # TODO: report the problem (fewer images in html than in xhtml).
            img_path = ''
        else:
            img_path = imgs[match_html.group(1)]
        # Keep everything up to the opening quote, then substitute the value;
        # the closing quote is emitted with the next slice.
        pieces.append(xhtml[last_index:match_xhtml.start(1)])
        pieces.append(img_path)
        last_index = match_xhtml.end(1)
    pieces.append(xhtml[last_index:])
    return u''.join(pieces)
def convert_oo_to_html(input):
    """Convert an office document to HTML text.

    ``convert(input, 'html')`` is unpacked into the HTML output and the
    images; the HTML output is then decoded using, in order, the encoding
    guessed by ``chardet``, ``utf8`` and ``latin1``.

    Args:
        input: content of the office document, passed to ``convert``.

    Returns:
        A ``(html, images)`` tuple where ``html`` is the decoded HTML and
        ``images`` is the second value returned by ``convert``.

    Raises:
        Exception: if the HTML output cannot be decoded with any candidate
            encoding.
    """
    # NOTE(review): assumes convert() returns the HTML as a byte string;
    # confirm against oo_converters.
    html_input, images = convert(input, 'html')
    detected = chardet.detect(html_input)['encoding']

    res_content_html = None
    for encoding in (detected, 'utf8', 'latin1'):
        # chardet reports ``None`` when it cannot guess an encoding.
        if not encoding:
            continue
        try:
            res_content_html = html_input.decode(encoding)
            break
        except (UnicodeDecodeError, LookupError):
            # Wrong guess or codec unknown to Python: try the next one.
            pass

    # Compare against None so that an empty (but decodable) output is valid.
    if res_content_html is None:
        raise Exception('UnicodeDecodeError: could not decode')
    return res_content_html, images
def old_convert_oo_to_html(input):
    """Convert an office document to both HTML and XHTML text.

    ``convert`` is called once for ``'html'`` and once for ``'xhtml'``. Both
    outputs are decoded with the same encoding, chosen from, in order, the
    encoding ``chardet`` guesses for the XHTML output, ``utf8`` and
    ``latin1``.

    Args:
        input: content of the office document, passed to ``convert``.

    Returns:
        A ``(html, xhtml, images)`` tuple; ``images`` is the second value
        returned by the ``'html'`` conversion.

    Raises:
        Exception: if the outputs cannot be decoded with any candidate
            encoding.
    """
    # NOTE(review): assumes convert() returns byte strings; confirm against
    # oo_converters.
    html_input, images = convert(input, 'html')
    xhtml_input, _not_used_ = convert(input, 'xhtml')
    detected = chardet.detect(xhtml_input)['encoding']

    res_content_html = None
    res_content_xhtml = None
    for encoding in (detected, 'utf8', 'latin1'):
        # chardet reports ``None`` when it cannot guess an encoding.
        if not encoding:
            continue
        try:
            # TODO: fix path and manage images (see fix_img_path).
            decoded_html = html_input.decode(encoding)
            decoded_xhtml = xhtml_input.decode(encoding)
        except (UnicodeDecodeError, LookupError):
            # Wrong guess or codec unknown to Python: try the next one.
            continue
        # Only keep the results once both documents decoded successfully.
        res_content_html = decoded_html
        res_content_xhtml = decoded_xhtml
        break

    # Compare against None so that empty (but decodable) outputs are valid.
    if res_content_html is None or res_content_xhtml is None:
        raise Exception('UnicodeDecodeError: could not decode')
    return res_content_html, res_content_xhtml, images
def markdown_from_code(code, indent="    "):
    """Turn ``code`` into a markdown code block by indenting every line.

    Args:
        code: the source text; lines are separated on ``'\\n'``.
        indent: prefix added to each line. Defaults to four spaces, the
            indentation markdown uses for code blocks.

    Returns:
        ``code`` with ``indent`` prepended to every line, including empty
        ones.
    """
    return '\n'.join(indent + line for line in code.split('\n'))

changeset 0	40c8f766c9b8
child 50	6db6c011a310