Merge with 0371caf8bcc6864a0f2250933431d3a913c9b379
author raph
Tue, 20 Apr 2010 11:37:33 +0200
changeset 260 9075dc2fb93c
parent 257 7b7ba5e47268 (current diff)
parent 259 0371caf8bcc6 (diff)
child 261 b60ab54b6782
Merge with 0371caf8bcc6864a0f2250933431d3a913c9b379
--- a/src/cm/converters/__init__.py	Tue Apr 20 10:47:04 2010 +0200
+++ b/src/cm/converters/__init__.py	Tue Apr 20 11:37:33 2010 +0200
@@ -2,6 +2,9 @@
 import chardet 
 from cm.utils.string_utils import to_unicode 
 import re
+import os
+from cm.converters.oo_converters import extract_css_body
+
 
 # TODO: move that in text_base: save images
 def convert_from_mimetype(file_name, mime_type, format):
@@ -19,8 +22,12 @@
                      'application/msword',
                      ]:
         
-        xhtml_input, attachs = convert_oo_to_html(input)
-        converted_input = pandoc_convert(xhtml_input, 'html', format)
+        html_input, xhtml_input, attachs = convert_oo_to_html_and_xhtml(input)
+        if format == 'html':
+                _not_used_css, converted_input = extract_css_body(xhtml_input)
+                #converted_input = xhtml_input
+        
+        converted_input = pandoc_convert(html_input, 'html', format)
         
     ##############################
     # latex
@@ -37,8 +44,8 @@
     elif mime_type in ['text/html', 'application/xhtml+xml']:
         if format == 'html':
             converted_input = input
-        else:
-            converted_input = pandoc_convert(input, 'html', format)
+        
+        converted_input = pandoc_convert(input, 'html', format)
     ##############################
     # anything looks like text -> markdown
     elif mime_type in ['text/plain',
@@ -71,7 +78,7 @@
             match_html = res_html.next()
             if match_html:
                 img_name = match_html.group(1)
-                img_path = imgs[img_name]
+                img_path = os.path.split(img_name)[-1]
         except StopIteration:
             # TODO : report pb
             pass 
@@ -88,7 +95,6 @@
     
     enc = chardet.detect(html_input)['encoding']
     try_encodings = [enc, 'utf8', 'latin1']
-    res_content = None
     for encoding in try_encodings:
         try:
             res_content_html = unicode(html_input, encoding)
@@ -99,29 +105,33 @@
         raise Exception('UnicodeDecodeError: could not decode')
     return res_content_html, images
 
-def old_convert_oo_to_html(input): 
+def fix_html_img_path(html):
+    return html.replace('IMG SRC="../outdir/','IMG SRC="')
+    
+def convert_oo_to_html_and_xhtml(input): 
     from oo_converters import convert   
     html_input, images = convert(input, 'html')
     xhtml_input, _not_used_ = convert(input, 'xhtml')
-    
     enc = chardet.detect(xhtml_input)['encoding']
     try_encodings = [enc, 'utf8', 'latin1']
-    res_content = None
     for encoding in try_encodings:
         try:
-            # TODO: fix path and manage images
-            #res_content = fix_img_path(unicode(html_res_content,encoding),
-            #                           unicode(xhtml_res_content,encoding),
-            #                           iimg)
             res_content_html = unicode(html_input, encoding)
             res_content_xhtml = unicode(xhtml_input, encoding)
             break;
         except UnicodeDecodeError:
             pass
+
+    res_content_xhtml = fix_img_path(res_content_html, res_content_xhtml, images)
+    res_content_html = fix_html_img_path(res_content_html)
+    
     if not res_content_html or not res_content_xhtml:
         raise Exception('UnicodeDecodeError: could not decode')
-    return res_content_html, res_content_xhtml, images
+    return res_content_html, cleanup(res_content_xhtml), images
         
+def cleanup(string):
+    return string.replace(u'\xc2\xa0',u'')
+
 def markdown_from_code(code):
     CODE_INDICATOR = "    " # 4 spaces
     return '\n'.join([CODE_INDICATOR + line for line in code.split('\n')])
--- a/src/cm/converters/oo_converters.py	Tue Apr 20 10:47:04 2010 +0200
+++ b/src/cm/converters/oo_converters.py	Tue Apr 20 11:37:33 2010 +0200
@@ -223,35 +223,6 @@
 
 THE_INDIR = "indir"
 THE_INFILE = "infile"
-
-def fix_img_path(html,xhtml,imgs):
-    """
-    imgs : name --> path
-    """
-    finder_re = 'src[\s]*=[\s]*\"((?:(?!https?))[^\"]*)\"'
-    len_res_html = len(re.findall(finder_re,html,re.IGNORECASE))
-    len_res_xhtml = len(re.findall(finder_re,xhtml,re.IGNORECASE))
-    res_html = re.finditer(finder_re,html,re.IGNORECASE)
-    res_xhtml = re.finditer(finder_re,xhtml,re.IGNORECASE)
-    result = []
-    last_index = 0
-    for match_xhtml in res_xhtml:
-        img_path = '' 
-        try:
-            match_html = res_html.next()
-            if match_html:
-                img_name = match_html.group(1)
-                img_path = imgs[img_name]
-        except StopIteration:
-            # TODO : report pb
-            pass 
-        offset = len(match_xhtml.group(0)) - len(match_xhtml.group(1))
-        result.append(xhtml[last_index:match_xhtml.start() + offset - 1])
-        result.append(img_path)
-        last_index = match_xhtml.end() - 1 # -1 because trailing "
-    result.append(xhtml[last_index:len(xhtml)])
-    return u''.join(result)
-
   
 def extract_css_body(xhtml):
     dom = parseString(xhtml.encode('utf8'))
--- a/src/cm/converters/pandoc_converters.py	Tue Apr 20 10:47:04 2010 +0200
+++ b/src/cm/converters/pandoc_converters.py	Tue Apr 20 11:37:33 2010 +0200
@@ -13,7 +13,8 @@
 from cm.utils.string_utils import to_unicode
 
 PANDOC_BIN = "pandoc"
-PANDOC_OPTIONS = "--sanitize-html "
+PANDOC_OPTIONS = " --sanitize-html "
+PANDOC_OPTIONS_RAW = " -R "
 
 MARKDOWN2PDF_BIN = "markdown2pdf"
 
@@ -37,7 +38,7 @@
 _PANDOC_ENCODING = 'utf8'
 
 @memoize
-def pandoc_convert(content, from_format, to_format, full=False):
+def pandoc_convert(content, from_format, to_format, full=False, raw=False):
     """
     Convert markdown content to pdf
     
@@ -56,7 +57,7 @@
         if from_format != 'markdown':
             content = pandoc_convert(content, from_format, 'markdown', True)
         return pandoc_markdown2pdf(content)
-    return pandoc_pandoc(content, from_format, to_format, full)
+    return pandoc_pandoc(content, from_format, to_format, full, from_format==to_format=='html') # use raw pandoc convertion if html->html
 
 def content_or_file_name(content, file_name):
     if not content and not file_name:
@@ -154,7 +155,7 @@
 # TODO: use tidy to cleanup html
 
 @memoize
-def pandoc_pandoc(content, from_format, to_format, full=False):
+def pandoc_pandoc(content, from_format, to_format, full=False, raw=False):
     """
     Convert content (should be unicode) from from_format to to_format
     (if full: includes header & co [html, latex])
@@ -189,7 +190,11 @@
     input_file.close()
     
     # pandoc arguments and command line
-    cmd_args = ' %s -o %s ' %(PANDOC_OPTIONS,output_temp_name) 
+    p_options = PANDOC_OPTIONS
+    if raw:
+        p_options = PANDOC_OPTIONS_RAW
+                
+    cmd_args = ' %s -o %s ' %(p_options,output_temp_name) 
     if full:
         cmd_args += ' -s '
     cmd_args += ' -f %s ' % from_format