src/cm/converters/pandoc_converters.py
changeset 442 b6e443be2a9b
parent 428 9591c651391d
child 443 cacd524f5279
equal deleted inserted replaced
441:d5d3bcd26a0b 442:b6e443be2a9b
    11 import StringIO
    11 import StringIO
    12 import tidy
    12 import tidy
    13 from cm.utils.string_utils import to_unicode
    13 from cm.utils.string_utils import to_unicode
    14 from xml.dom.minidom import parseString
    14 from xml.dom.minidom import parseString
    15 import re
    15 import re
       
    16 from distutils.version import LooseVersion
    16 
    17 
    17 PANDOC_BIN = "pandoc"
    18 PANDOC_BIN = "pandoc"
    18 PANDOC_OPTIONS = " --sanitize-html --email-obfuscation=none "
    19 import commands
       
    20 PANDOC_VERSION = commands.getstatusoutput(PANDOC_BIN + " -v|head -n 1|awk '{print $2;}'")[1]
       
    21 if LooseVersion(PANDOC_VERSION) < '1.8':
       
    22   PANDOC_OPTIONS = " --sanitize-html --email-obfuscation=none "
       
    23 else:
       
    24   PANDOC_OPTIONS = " --email-obfuscation=none "
       
    25 
    19 PANDOC_OPTIONS_RAW = " -R --email-obfuscation=none "
    26 PANDOC_OPTIONS_RAW = " -R --email-obfuscation=none "
    20 
    27 
    21 MARKDOWN2PDF_BIN = "markdown2pdf"
    28 if LooseVersion(PANDOC_VERSION) < '1.9':
       
    29   MARKDOWN2PDF_BIN = "markdown2pdf"
       
    30 else:
       
    31   MARKDOWN2PDF_BIN = None
    22 
    32 
    23 # make sure binaries are available
    33 # make sure binaries are available
    24 from cm.utils.system import bin_search
    34 from cm.utils.system import bin_search
    25 bin_search(PANDOC_BIN)
    35 bin_search(PANDOC_BIN)
    26 bin_search(MARKDOWN2PDF_BIN)
    36 if MARKDOWN2PDF_BIN:
       
    37   bin_search(MARKDOWN2PDF_BIN)
    27 
    38 
    28 # pandoc capabilities
    39 # pandoc capabilities
    29 INPUT_FORMATS = ['native', 'markdown', 'rst', 'html', 'latex']
    40 INPUT_FORMATS = ['native', 'markdown', 'rst', 'html', 'latex']
    30 OUTPUT_FORMATS = ['native', 'html', 's5', 'docbook', 'opendocument', 'odt', 'latex', 'context', 'texinfo', 'man', 'markdown', 'rst', 'mediawiki', 'rtf']
    41 OUTPUT_FORMATS = ['native', 'html', 's5', 'docbook', 'opendocument', 'odt', 'latex', 'context', 'texinfo', 'man', 'markdown', 'rst', 'mediawiki', 'rtf', 'pdf']
    31 
       
    32 # add pdf output using markdown2pdf
       
    33 OUTPUT_FORMATS.append('pdf')
       
    34 
    42 
    35 # input formats
    43 # input formats
    36 CHOICES_INPUT_FORMATS = [(f, f) for f in ['markdown', 'rst', 'html']]
    44 CHOICES_INPUT_FORMATS = [(f, f) for f in ['markdown', 'rst', 'html']]
    37 
    45 
    38 DEFAULT_INPUT_FORMAT = 'markdown'
    46 DEFAULT_INPUT_FORMAT = 'markdown'
    53             content = do_tidy(content)
    61             content = do_tidy(content)
    54         except:
    62         except:
    55             # tidy fails ... try pandoc anyway...
    63             # tidy fails ... try pandoc anyway...
    56             content = to_unicode(content)
    64             content = to_unicode(content)
    57     # if to_format is pdf: use markdown2pdf
    65     # if to_format is pdf: use markdown2pdf
    58     if to_format == 'pdf':        
    66     if MARKDOWN2PDF_BIN and to_format == 'pdf':        
    59         if from_format != 'markdown':
    67         if from_format != 'markdown':
    60             content = pandoc_convert(content, from_format, 'markdown', True)
    68             content = pandoc_convert(content, from_format, 'markdown', True)
    61         return pandoc_markdown2pdf(content)
    69         return pandoc_markdown2pdf(content)
    62     return pandoc_pandoc(content, from_format, to_format, full, from_format==to_format=='html') # use raw pandoc conversion if html->html
    70     return pandoc_pandoc(content, from_format, to_format, full, from_format==to_format=='html') # use raw pandoc conversion if html->html
    63 
    71 
   189     if type(content) != unicode:
   197     if type(content) != unicode:
   190         raise Exception('Content is not in unicode format!')
   198         raise Exception('Content is not in unicode format!')
   191 
   199 
   192     # temp file
   200     # temp file
   193     input_file, input_temp_name = get_filetemp('w', 'input')
   201     input_file, input_temp_name = get_filetemp('w', 'input')
   194     output_temp_fp, output_temp_name = get_filetemp('r', 'output')
   202     # When pandoc >= 1.9 converts to PDF, '-t' must not be used; instead the output file name has to end with the '.pdf' extension
       
   203     if to_format != 'pdf':
       
   204       output_temp_fp, output_temp_name = get_filetemp('r', 'output')
       
   205     else:
       
   206       output_temp_fp, output_temp_name = get_filetemp('r', 'output.pdf')
   195     output_temp_fp.close()
   207     output_temp_fp.close()
   196     
   208     
   197     error_temp_fp, error_temp_name = get_filetemp('w', 'err')
   209     error_temp_fp, error_temp_name = get_filetemp('w', 'err')
   198     error_temp_fp.close()
   210     error_temp_fp.close()
   199     
   211     
   229 
   241 
   230     cmd_args = ' %s -o %s ' %(p_options,output_temp_name) 
   242     cmd_args = ' %s -o %s ' %(p_options,output_temp_name) 
   231     if full:
   243     if full:
   232         cmd_args += ' -s '
   244         cmd_args += ' -s '
   233     cmd_args += ' -f %s ' % from_format
   245     cmd_args += ' -f %s ' % from_format
   234     cmd_args += ' -t %s ' % to_format
   246     if to_format != 'pdf':
       
   247       cmd_args += ' -t %s ' % to_format
   235     cmd_args += ' %s ' % input_temp_name
   248     cmd_args += ' %s ' % input_temp_name
   236     cmd = PANDOC_BIN + ' ' + cmd_args
   249     cmd = PANDOC_BIN + ' ' + cmd_args
   237 
   250 
   238     #from socommons.converters.new_conv import controlled_Popen 
   251     #from socommons.converters.new_conv import controlled_Popen 
   239     #controlled_Popen(cmd, stderr=file(error_temp_name,'w'))
   252     #controlled_Popen(cmd, stderr=file(error_temp_name,'w'))