src/cm/converters/pandoc_converters.py
author gibus
Thu, 09 Aug 2012 16:55:25 +0200
changeset 459 a69421197502
parent 458 ba7e05582435
child 517 e7bc083fa9dc
permissions -rw-r--r--
Fixed renderContents on BeautifulSoup if there is no body in source html
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
0
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
     1
# python 2.5 compat
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
     2
from __future__ import with_statement
261
b60ab54b6782 fix usage of dj caching
raph
parents: 259
diff changeset
     3
from cm.utils.cache import memoize, dj_memoize
0
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
     4
######
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
     5
## This module requires pandoc v > 1.0 (pandoc & markdown executables) 
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
     6
######
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
     7
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
     8
from subprocess import Popen, PIPE, call
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
     9
import os
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    10
from tempfile import mkstemp
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    11
import StringIO
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    12
import tidy
149
0f2c5744b39b cleanup diff files / add experimental diff
raph
parents: 119
diff changeset
    13
from cm.utils.string_utils import to_unicode
458
ba7e05582435 When extracting body of a document in HTML source format (i.e. without pandoc conversion before display), use BeautifulSoup, which is more tolerant, instead of xml.dom.minidom, which crashes too often.
gibus
parents: 443
diff changeset
    14
from BeautifulSoup import BeautifulSoup
352
07a1fba18fff do not use pandoc to convert from html to html
Production Moz <dev@sopinspace.com>
parents: 351
diff changeset
    15
import re
442
b6e443be2a9b Takes into account various releases of pandoc.
gibus
parents: 428
diff changeset
    16
from distutils.version import LooseVersion
0
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    17
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    18
import commands
from cm.utils.system import bin_search

PANDOC_BIN = "pandoc"

# Version string of the installed pandoc: second field of the first line
# printed by `pandoc -v`.
PANDOC_VERSION = commands.getstatusoutput(PANDOC_BIN + " -v|head -n 1|awk '{print $2;}'")[1]

# --sanitize-html is only passed to pandoc releases older than 1.8.
PANDOC_OPTIONS = (" --sanitize-html --email-obfuscation=none "
                  if LooseVersion(PANDOC_VERSION) < '1.8'
                  else " --email-obfuscation=none ")

# Options used instead of PANDOC_OPTIONS for "raw" conversions.
PANDOC_OPTIONS_RAW = " -R --email-obfuscation=none "

# The separate markdown2pdf executable is only used with pandoc older
# than 1.9; None disables that code path.
MARKDOWN2PDF_BIN = "markdown2pdf" if LooseVersion(PANDOC_VERSION) < '1.9' else None

# make sure binaries are available
bin_search(PANDOC_BIN)
if MARKDOWN2PDF_BIN:
  bin_search(MARKDOWN2PDF_BIN)
0
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    38
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    39
# pandoc capabilities: formats accepted as conversion source / target
INPUT_FORMATS = ['native', 'markdown', 'rst', 'html', 'latex']
OUTPUT_FORMATS = ['native', 'html', 's5', 'docbook', 'opendocument', 'odt', 'latex', 'context', 'texinfo', 'man', 'markdown', 'rst', 'mediawiki', 'rtf', 'pdf', 'epub']

# input formats offered as (value, label) choices
# Spelled out literally: under Python 2 a module-level list comprehension
# would leak its loop variable into the module namespace.
CHOICES_INPUT_FORMATS = [
    ('markdown', 'markdown'),
    ('rst', 'rst'),
    ('html', 'html'),
]

DEFAULT_INPUT_FORMAT = 'markdown'

# encoding used when writing content to the temp files handed to pandoc
_PANDOC_ENCODING = 'utf8'
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    49
261
b60ab54b6782 fix usage of dj caching
raph
parents: 259
diff changeset
    50
@dj_memoize
def pandoc_convert(content, from_format, to_format, full=False, raw=False):
    """
    Convert content from from_format to to_format.

    html input is first cleaned with do_tidy; if tidy fails the content is
    only converted to unicode and the conversion is attempted anyway.

    When MARKDOWN2PDF_BIN is set and to_format is 'pdf', the content is
    converted to markdown (if needed) and handed to pandoc_markdown2pdf.
    Every other conversion is delegated to pandoc_pandoc.

    full is forwarded to pandoc_pandoc.
    raw is accepted but not forwarded: the raw flag given to pandoc_pandoc
    is true only for html -> html conversions.

    >>> res = pandoc_convert('<span>dssd', 'html', 'pdf')
    """
    # pandoc does not react well when html is not valid
    # use tidy to clean html
    if from_format == 'html':
        try:
            content = do_tidy(content)
        except Exception:
            # tidy fails ... try pandoc anyway...
            # (Exception, not a bare except, so that KeyboardInterrupt and
            # SystemExit are not swallowed)
            content = to_unicode(content)

    # if to_format is pdf: use markdown2pdf
    if MARKDOWN2PDF_BIN and to_format == 'pdf':
        if from_format != 'markdown':
            content = pandoc_convert(content, from_format, 'markdown', True)
        return pandoc_markdown2pdf(content)

    # use raw pandoc conversion if html->html
    use_raw = from_format == to_format == 'html'
    return pandoc_pandoc(content, from_format, to_format, full, use_raw)
0
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    71
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    72
def content_or_file_name(content, file_name):
    """
    Return the content to work on, given either a content or a file name.

    Exactly one of the two arguments has to be truthy:
    - content: returned unchanged
    - file_name: path of a file whose whole content is read and returned

    Raises Exception when neither or both arguments are provided.
    """
    if not content and not file_name:
        raise Exception('You should provide either a content or a file_name')
    if content and file_name:
        raise Exception('You should not provide a content AND a file_name')

    if file_name:
        fp = open(file_name)
        try:
            content = fp.read()
        finally:
            # close the handle even when read() raises
            fp.close()

    return content
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    84
261
b60ab54b6782 fix usage of dj caching
raph
parents: 259
diff changeset
    85
@dj_memoize
def do_tidy(content=None, file_name=None):
    """
    Clean up (html) content with tidy and return the result as unicode.

    The input is given either directly (content) or as the path of a file
    to read (file_name); see content_or_file_name.

    Raises Exception when tidy returns nothing but whitespace for a
    non-empty input.

    >>> res = do_tidy('<span>sdd')
    """
    content = content_or_file_name(content, file_name)

    options = {
        'output_xhtml': 1,
        'add_xml_decl': 0,
        'indent': 0,
        'tidy_mark': 0,
        'logical_emphasis': 1,
        'wrap': 0,
        'input_encoding': 'utf8',
        'output_encoding': 'utf8',
    }

    # tidy is fed utf8 bytes, matching the input_encoding option above
    source_bytes = to_unicode(content).encode('utf8')
    cleaned = str(tidy.parseString(source_bytes, **options))

    # blank output for a non-empty input is treated as a failure
    if content and not cleaned.strip():
        raise Exception('Content could not be tidyfied')

    return cleaned.decode('utf8')
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   108
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   109
351
9245a73f5787 use different tempfiles for in and out + use xtex for markdown2pdf
Production Moz <dev@sopinspace.com>
parents: 261
diff changeset
   110
def get_filetemp(mode="r", suffix=''):
    """
    Create a temporary file and return (file object, file name).

    mode: mode used to open the file object on the new file
    suffix: suffix of the temporary file name (passed to mkstemp)

    The caller is responsible for closing the file object and removing
    the file. If the file object cannot be created (e.g. invalid mode),
    the descriptor is closed and the temporary file removed before the
    error is re-raised.
    """
    (fd, fname) = mkstemp(suffix)
    try:
        return (os.fdopen(fd, mode), fname)
    except Exception:
        # do not leak the descriptor nor the file created by mkstemp
        os.close(fd)
        os.remove(fname)
        raise
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   113
261
b60ab54b6782 fix usage of dj caching
raph
parents: 259
diff changeset
   114
@dj_memoize
def pandoc_markdown2pdf(content=None, file_name=None):
    """
    Convert markdown content to pdf with the markdown2pdf executable.

    The markdown is given either directly (content, unicode) or as the
    path of a file to read (file_name); see content_or_file_name.

    markdown2pdf is first run with --xetex; if that fails it is run again
    with a custom latex header (latex_header.txt, located next to this
    module) instead.

    Returns the content of the generated pdf file.
    Raises Exception with the collected stderr output when both attempts
    fail, or when the latex header file is missing.

    >>> pdf_content = pandoc_markdown2pdf('# dssd')
    """
    content = content_or_file_name(content, file_name)

    # write file to disk
    input_fp, input_temp_name = get_filetemp('w', 'input')
    error_fp, error_temp_name = get_filetemp('w', 'err')
    # the generated pdf is read from <input file name>.pdf
    output_temp_name = input_temp_name + '.pdf'

    try:
        try:
            input_fp.write(content.encode(_PANDOC_ENCODING))
        finally:
            input_fp.close()

        try:
            # use markdown2pdf (argument list: no shell quoting issues)
            retcode = call([MARKDOWN2PDF_BIN, '--xetex', input_temp_name],
                           stderr=error_fp)

            # xetex seems to randomly cause "Invalid or incomplete multibyte
            # or wide character" errors, try without it
            if retcode:
                # build absolute address for latex header file
                # (abspath: also correct when __file__ is a relative path)
                module_dir = os.path.dirname(os.path.abspath(__file__))
                LATEX_HEADER_PATH = os.path.join(module_dir, 'latex_header.txt')

                if not os.path.isfile(LATEX_HEADER_PATH):
                    raise Exception('LATEX_HEADER_PATH is not a file!')

                # custom latex header
                retcode = call([MARKDOWN2PDF_BIN,
                                '--custom-header=%s' % LATEX_HEADER_PATH,
                                input_temp_name],
                               stderr=error_fp)
        finally:
            error_fp.close()

        if retcode:
            fp_error = open(error_temp_name)
            try:
                error = fp_error.read()
            finally:
                fp_error.close()
            raise Exception(error)

        # pdf is binary data: do not read it in text mode
        fp_output = open(output_temp_name, 'rb')
        try:
            pdf_content = fp_output.read()
        finally:
            fp_output.close()

        return pdf_content
    finally:
        # always clean up, whatever happened above
        for temp_name in (input_temp_name, error_temp_name, output_temp_name):
            if os.path.exists(temp_name):
                os.remove(temp_name)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   172
    
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   173
# TODO: manage images in pandoc (?)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   174
# TODO: use tidy to cleanup html
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   175
261
b60ab54b6782 fix usage of dj caching
raph
parents: 259
diff changeset
   176
@dj_memoize
259
0371caf8bcc6 always use pandoc but in raw mode for html->html convert
raph
parents: 252
diff changeset
   177
def pandoc_pandoc(content, from_format, to_format, full=False, raw=False):
0
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   178
    """
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   179
    Convert content (should be unicode) from from_format to to_format
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   180
    (if full: includes header & co [html, latex])
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   181
    Returns out (unicode), err
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   182
    
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   183
    >>> res, err = pandoc_pandoc(u'# sdsd', 'markdown', 'html', False)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   184
    >>> print err
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   185
    None
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   186
    >>> res.replace("\\n","")
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   187
    u'<h1 id="sdsd">sdsd</h1>'
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   188
    >>> res, err = pandoc_pandoc(u'# sdsd', 'markdown', 'html', True)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   189
    >>> print err
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   190
    None
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   191
    """
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   192
    # verify formats
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   193
    if from_format not in INPUT_FORMATS:
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   194
        raise Exception("Input format [%s] is not a supported format [%s]" % (from_format, ' '.join(INPUT_FORMATS)))
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   195
    if to_format not in OUTPUT_FORMATS:
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   196
        raise Exception("Output format [%s] is not a supported format [%s]" % (to_format, ' '.join(OUTPUT_FORMATS)))
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   197
    if type(content) != unicode:
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   198
        raise Exception('Content is not in unicode format!')
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   199
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   200
    # temp file
351
9245a73f5787 use different tempfiles for in and out + use xtex for markdown2pdf
Production Moz <dev@sopinspace.com>
parents: 261
diff changeset
   201
    input_file, input_temp_name = get_filetemp('w', 'input')
442
b6e443be2a9b Takes into account various releases of pandoc.
gibus
parents: 428
diff changeset
   202
    # For some reason when pandoc > 1.9 converts to PDF, '-t' shouldn't be used but output file name extension has to be '.pdf'
b6e443be2a9b Takes into account various releases of pandoc.
gibus
parents: 428
diff changeset
   203
    if to_format != 'pdf':
b6e443be2a9b Takes into account various releases of pandoc.
gibus
parents: 428
diff changeset
   204
      output_temp_fp, output_temp_name = get_filetemp('r', 'output')
b6e443be2a9b Takes into account various releases of pandoc.
gibus
parents: 428
diff changeset
   205
    else:
b6e443be2a9b Takes into account various releases of pandoc.
gibus
parents: 428
diff changeset
   206
      output_temp_fp, output_temp_name = get_filetemp('r', 'output.pdf')
0
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   207
    output_temp_fp.close()
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   208
    
351
9245a73f5787 use different tempfiles for in and out + use xtex for markdown2pdf
Production Moz <dev@sopinspace.com>
parents: 261
diff changeset
   209
    error_temp_fp, error_temp_name = get_filetemp('w', 'err')
0
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   210
    error_temp_fp.close()
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   211
    
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   212
    input_file.write(content.encode(_PANDOC_ENCODING))
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   213
    input_file.close()
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   214
    
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   215
    # pandoc arguments and command line
259
0371caf8bcc6 always use pandoc but in raw mode for html->html convert
raph
parents: 252
diff changeset
   216
    p_options = PANDOC_OPTIONS
0371caf8bcc6 always use pandoc but in raw mode for html->html convert
raph
parents: 252
diff changeset
   217
    if raw:
0371caf8bcc6 always use pandoc but in raw mode for html->html convert
raph
parents: 252
diff changeset
   218
        p_options = PANDOC_OPTIONS_RAW
0371caf8bcc6 always use pandoc but in raw mode for html->html convert
raph
parents: 252
diff changeset
   219
                
352
07a1fba18fff do not use pandoc to convert from html to html
Production Moz <dev@sopinspace.com>
parents: 351
diff changeset
   220
    # do not use pandoc to convert from html to html
07a1fba18fff do not use pandoc to convert from html to html
Production Moz <dev@sopinspace.com>
parents: 351
diff changeset
   221
    if from_format==to_format=='html':
07a1fba18fff do not use pandoc to convert from html to html
Production Moz <dev@sopinspace.com>
parents: 351
diff changeset
   222
      # get body content
07a1fba18fff do not use pandoc to convert from html to html
Production Moz <dev@sopinspace.com>
parents: 351
diff changeset
   223
      stdoutdata = (content.encode('utf8'))
458
ba7e05582435 When extracting body of a document in HTML source format (i.e. without pandoc conversion before display), use BeautifilSoup, which is more tolerant, instead of xml.dom.minidom, which crashes too often.
gibus
parents: 443
diff changeset
   224
      soup = BeautifulSoup(stdoutdata)
ba7e05582435 When extracting body of a document in HTML source format (i.e. without pandoc conversion before display), use BeautifilSoup, which is more tolerant, instead of xml.dom.minidom, which crashes too often.
gibus
parents: 443
diff changeset
   225
      body = soup.body
459
a69421197502 Fixed renderContents on BeautifilSoup if there is no body in source html
gibus
parents: 458
diff changeset
   226
      if body:
a69421197502 Fixed renderContents on BeautifilSoup if there is no body in source html
gibus
parents: 458
diff changeset
   227
        stdoutdata = body.renderContents()
352
07a1fba18fff do not use pandoc to convert from html to html
Production Moz <dev@sopinspace.com>
parents: 351
diff changeset
   228
      # strip leading spaces
07a1fba18fff do not use pandoc to convert from html to html
Production Moz <dev@sopinspace.com>
parents: 351
diff changeset
   229
      stdoutdata = re.sub(r"^\s+", '', stdoutdata)
07a1fba18fff do not use pandoc to convert from html to html
Production Moz <dev@sopinspace.com>
parents: 351
diff changeset
   230
      # add new line before closing bracket
07a1fba18fff do not use pandoc to convert from html to html
Production Moz <dev@sopinspace.com>
parents: 351
diff changeset
   231
      stdoutdata = re.sub(r"(\/?)>", r"\n\1>", stdoutdata)
07a1fba18fff do not use pandoc to convert from html to html
Production Moz <dev@sopinspace.com>
parents: 351
diff changeset
   232
      # do not split closing tag with following opening tag
07a1fba18fff do not use pandoc to convert from html to html
Production Moz <dev@sopinspace.com>
parents: 351
diff changeset
   233
      stdoutdata = re.sub(r">\n<", r"><", stdoutdata)
07a1fba18fff do not use pandoc to convert from html to html
Production Moz <dev@sopinspace.com>
parents: 351
diff changeset
   234
      return stdoutdata
07a1fba18fff do not use pandoc to convert from html to html
Production Moz <dev@sopinspace.com>
parents: 351
diff changeset
   235
259
0371caf8bcc6 always use pandoc but in raw mode for html->html convert
raph
parents: 252
diff changeset
   236
    cmd_args = ' %s -o %s ' %(p_options,output_temp_name) 
0
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   237
    if full:
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   238
        cmd_args += ' -s '
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   239
    cmd_args += ' -f %s ' % from_format
442
b6e443be2a9b Takes into account various releases of pandoc.
gibus
parents: 428
diff changeset
   240
    if to_format != 'pdf':
b6e443be2a9b Takes into account various releases of pandoc.
gibus
parents: 428
diff changeset
   241
      cmd_args += ' -t %s ' % to_format
0
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   242
    cmd_args += ' %s ' % input_temp_name
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   243
    cmd = PANDOC_BIN + ' ' + cmd_args
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   244
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   245
    fp_error = file(error_temp_name,'w')
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   246
    retcode = call(cmd, shell=True, stderr=fp_error)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   247
    fp_error.close()
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   248
    
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   249
    fp_error = file(error_temp_name)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   250
    error = fp_error.read()
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   251
    fp_error.close()
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   252
    
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   253
    fp_output = file(output_temp_name)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   254
    stdoutdata = fp_output.read()
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   255
    fp_output.close()
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   256
    
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   257
    
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   258
    # cleanup
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   259
    os.remove(output_temp_name)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   260
    os.remove(input_temp_name)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   261
    os.remove(error_temp_name)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   262
    
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   263
    if retcode:
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   264
        raise Exception(error)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   265
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   266
    # try converting to unicode
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   267
    try:
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   268
        stdoutdata = stdoutdata.decode(_PANDOC_ENCODING)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   269
    except UnicodeDecodeError:
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   270
        # this will fail for binary output formats such as odt
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   271
        # return result without conversion then
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   272
        pass
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   273
    
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   274
    return stdoutdata
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   275
    
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   276
# Script entry point: only when this module is executed directly (not when it
# is imported), import doctest and run doctest.testmod() on this module.
if __name__ == "__main__":
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   277
    import doctest
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   278
    doctest.testmod()
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   279