comt: src/cm/converters/pandoc_converters.py@b6e443be2a9b (annotated)

0 40c8f766c9b8 import from internal svn r 4007 raph parents: diff changeset	1	# python 2.5 compat
40c8f766c9b8 import from internal svn r 4007 raph parents: diff changeset	2	from __future__ import with_statement
261 b60ab54b6782 fix usage of dj caching raph parents: 259 diff changeset	3	from cm.utils.cache import memoize, dj_memoize
0 40c8f766c9b8 import from internal svn r 4007 raph parents: diff changeset	4	######
40c8f766c9b8 import from internal svn r 4007 raph parents: diff changeset	5	## This module requires pandoc v > 1.0 (pandoc & markdown executables)
40c8f766c9b8 import from internal svn r 4007 raph parents: diff changeset	6	######
40c8f766c9b8 import from internal svn r 4007 raph parents: diff changeset	7
40c8f766c9b8 import from internal svn r 4007 raph parents: diff changeset	8	from subprocess import Popen, PIPE, call
40c8f766c9b8 import from internal svn r 4007 raph parents: diff changeset	9	import os
40c8f766c9b8 import from internal svn r 4007 raph parents: diff changeset	10	from tempfile import mkstemp
40c8f766c9b8 import from internal svn r 4007 raph parents: diff changeset	11	import StringIO
40c8f766c9b8 import from internal svn r 4007 raph parents: diff changeset	12	import tidy
149 0f2c5744b39b cleanup diff files / add experimental diff raph parents: 119 diff changeset	13	from cm.utils.string_utils import to_unicode
352 07a1fba18fff do not use pandoc to convert from html to html Production Moz <dev@sopinspace.com> parents: 351 diff changeset	14	from xml.dom.minidom import parseString
07a1fba18fff do not use pandoc to convert from html to html Production Moz <dev@sopinspace.com> parents: 351 diff changeset	15	import re
442 b6e443be2a9b Takes into account various releases of pandoc. gibus parents: 428 diff changeset	16	from distutils.version import LooseVersion
0 40c8f766c9b8 import from internal svn r 4007 raph parents: diff changeset	17
PANDOC_BIN = "pandoc"
import commands

# Probe the installed pandoc release: keep the second whitespace-separated
# field of the first line printed by `pandoc -v`.
# The stages must be joined by real shell pipes; an escaped "\|" would be
# handed to pandoc as a literal argument and no version would be extracted.
PANDOC_VERSION = commands.getstatusoutput(PANDOC_BIN + " -v|head -n 1|awk '{print $2;}'")[1]

# --sanitize-html is only passed to pandoc releases older than 1.8
if LooseVersion(PANDOC_VERSION) < '1.8':
    PANDOC_OPTIONS = " --sanitize-html --email-obfuscation=none "
else:
    PANDOC_OPTIONS = " --email-obfuscation=none "

# options used by pandoc_pandoc() when it is called with raw=True
PANDOC_OPTIONS_RAW = " -R --email-obfuscation=none "

# markdown2pdf is only used with pandoc releases older than 1.9; when it is
# None, pdf output is produced by pandoc itself (see pandoc_convert)
if LooseVersion(PANDOC_VERSION) < '1.9':
    MARKDOWN2PDF_BIN = "markdown2pdf"
else:
    MARKDOWN2PDF_BIN = None

# make sure binaries are available
from cm.utils.system import bin_search
bin_search(PANDOC_BIN)
if MARKDOWN2PDF_BIN:
    bin_search(MARKDOWN2PDF_BIN)
0 40c8f766c9b8 import from internal svn r 4007 raph parents: diff changeset	38
# formats pandoc_pandoc() accepts as source / target
INPUT_FORMATS = [
    'native',
    'markdown',
    'rst',
    'html',
    'latex',
]
OUTPUT_FORMATS = [
    'native',
    'html',
    's5',
    'docbook',
    'opendocument',
    'odt',
    'latex',
    'context',
    'texinfo',
    'man',
    'markdown',
    'rst',
    'mediawiki',
    'rtf',
    'pdf',
]

# (value, label) pairs for the selectable input formats
CHOICES_INPUT_FORMATS = [(f, f) for f in ['markdown', 'rst', 'html']]

DEFAULT_INPUT_FORMAT = 'markdown'

# encoding used for the files exchanged with pandoc
_PANDOC_ENCODING = 'utf8'
40c8f766c9b8 import from internal svn r 4007 raph parents: diff changeset	49
@dj_memoize
def pandoc_convert(content, from_format, to_format, full=False, raw=False):
    """
    Convert content from from_format to to_format

    html input is first cleaned with do_tidy(); if tidy fails, the content
    is only passed through to_unicode() and the conversion is tried anyway.

    When MARKDOWN2PDF_BIN is set and to_format is 'pdf', the content is
    converted to markdown first (unless it already is markdown) and rendered
    with pandoc_markdown2pdf(). Every other conversion is delegated to
    pandoc_pandoc().

    full: passed on to pandoc_pandoc()
    raw: accepted for backward compatibility but not used here; raw mode is
         requested from pandoc_pandoc() only for html -> html conversions

    >>> res = pandoc_convert('<span>dssd', 'html', 'pdf')
    """
    # pandoc does not react well when html is not valid
    # use tidy to clean html
    if from_format == 'html':
        try:
            content = do_tidy(content)
        except Exception:
            # tidy fails ... try pandoc anyway...
            # (only Exception is caught so that KeyboardInterrupt and
            # SystemExit are not swallowed)
            content = to_unicode(content)
    # if to_format is pdf: use markdown2pdf
    if MARKDOWN2PDF_BIN and to_format == 'pdf':
        if from_format != 'markdown':
            content = pandoc_convert(content, from_format, 'markdown', True)
        return pandoc_markdown2pdf(content)
    # use raw pandoc conversion if html->html
    use_raw = (from_format == to_format == 'html')
    return pandoc_pandoc(content, from_format, to_format, full, use_raw)
0 40c8f766c9b8 import from internal svn r 4007 raph parents: diff changeset	71
def content_or_file_name(content, file_name):
    """
    Return the content to work on, given exactly one of content / file_name

    content: the content itself (returned unchanged when provided)
    file_name: path of a file whose whole content is read and returned

    Raises Exception when neither or both arguments are provided
    (empty values count as "not provided").
    """
    if not content and not file_name:
        raise Exception('You should provide either a content or a file_name')
    if content and file_name:
        raise Exception('You should not provide a content AND a file_name')

    if file_name:
        # the with block closes the file even if read() fails
        with open(file_name) as fp:
            content = fp.read()

    return content
40c8f766c9b8 import from internal svn r 4007 raph parents: diff changeset	84
@dj_memoize
def do_tidy(content=None, file_name=None):
    """
    Run tidy on (html) content, given directly or through a file name,
    and return the cleaned result decoded from utf8

    Raises Exception when tidy produces nothing but whitespace.

    >>> res = do_tidy('<span>sdd')
    """
    content = content_or_file_name(content, file_name)

    options = {
        'output_xhtml': 1,
        'add_xml_decl': 0,
        'indent': 0,
        'tidy_mark': 0,
        'logical_emphasis': 1,
        'wrap': 0,
        'input_encoding': 'utf8',
        'output_encoding': 'utf8',
    }
    # tidy is fed utf8 encoded bytes
    utf8_input = to_unicode(content).encode('utf8')
    tidied = str(tidy.parseString(utf8_input, **options))
    if content and not tidied.strip():
        raise Exception('Content could not be tidyfied')
    return tidied.decode('utf8')
40c8f766c9b8 import from internal svn r 4007 raph parents: diff changeset	108
40c8f766c9b8 import from internal svn r 4007 raph parents: diff changeset	109
def get_filetemp(mode="r", suffix=''):
    """
    Create a temporary file and return (file object, file name)

    mode: mode used to open the file object
    suffix: suffix of the temporary file name

    The caller is responsible for closing the file object and for removing
    the file. If the file object cannot be created, the descriptor is closed
    and the temporary file removed before the error is re-raised.
    """
    (fd, fname) = mkstemp(suffix)
    try:
        return (os.fdopen(fd, mode), fname)
    except Exception:
        # do not leak the descriptor nor the file when fdopen fails
        # (e.g. invalid mode)
        try:
            os.close(fd)
        except OSError:
            # descriptor already closed
            pass
        os.remove(fname)
        raise
40c8f766c9b8 import from internal svn r 4007 raph parents: diff changeset	113
@dj_memoize
def pandoc_markdown2pdf(content=None, file_name=None):
    """
    Convert markdown content to pdf

    content / file_name: the markdown source, provided either directly or
    as the name of a file to read (see content_or_file_name)

    markdown2pdf is first run with --xetex; if that fails it is run again
    with a custom latex header (latex_header.txt, located next to this
    module) instead.

    Returns the content of the generated pdf file.
    Raises Exception if the latex header file is missing when the fallback
    is needed, or with the captured stderr output if both runs fail.
    Temporary files are removed in every case.

    >>> pdf_content = pandoc_markdown2pdf('# dssd')
    """
    content = content_or_file_name(content, file_name)

    # write file to disk
    temp_file, input_temp_name = get_filetemp('w', 'input')
    # file written by markdown2pdf: input name + '.pdf'
    output_temp_name = input_temp_name + '.pdf'
    error_temp_name = None
    try:
        try:
            temp_file.write(content.encode(_PANDOC_ENCODING))
        finally:
            temp_file.close()

        fp_error, error_temp_name = get_filetemp('w', 'err')
        try:
            # use markdown2pdf
            # (argument lists instead of a shell string: paths containing
            # spaces or shell metacharacters are passed untouched)
            retcode = call([MARKDOWN2PDF_BIN, '--xetex', input_temp_name], stderr=fp_error)

            # xetex seems to randomly cause "Invalid or incomplete multibyte or wide character" errors, try without it
            if retcode:
                # build absolute address for latex header file
                module_dir = os.path.dirname(os.path.abspath(__file__))
                latex_header_path = os.path.join(module_dir, 'latex_header.txt')

                if not os.path.isfile(latex_header_path):
                    raise Exception('LATEX_HEADER_PATH is not a file!')

                # custom latex header
                cust_head_tex = '--custom-header=%s' % latex_header_path

                retcode = call([MARKDOWN2PDF_BIN, cust_head_tex, input_temp_name], stderr=fp_error)
        finally:
            fp_error.close()

        if retcode:
            fp_error = open(error_temp_name)
            try:
                error = fp_error.read()
            finally:
                fp_error.close()
            raise Exception(error)

        # pdf is binary data
        fp_output = open(output_temp_name, 'rb')
        try:
            pdf_content = fp_output.read()
        finally:
            fp_output.close()

        return pdf_content
    finally:
        # cleanup, whatever happened above
        for temp_name in (input_temp_name, error_temp_name, output_temp_name):
            if temp_name and os.path.exists(temp_name):
                os.remove(temp_name)
40c8f766c9b8 import from internal svn r 4007 raph parents: diff changeset	172
40c8f766c9b8 import from internal svn r 4007 raph parents: diff changeset	173	# TODO: manage images in pandoc (?)
40c8f766c9b8 import from internal svn r 4007 raph parents: diff changeset	174	# TODO: use tidy to cleanup html
40c8f766c9b8 import from internal svn r 4007 raph parents: diff changeset	175
def _html_body_fragment(content):
    """
    Return the markup found inside the <body> element of an (x)html
    document (unicode), with a new line inserted before each closing bracket
    """
    stdoutdata = content.encode('utf8')
    # if for some reason, tidy has not guessed the doctype, make
    # xml.dom.minidom happy with HTML entities (&nbsp;): replace the entity
    # with the utf8 encoded non-breaking space
    stdoutdata = re.sub(r"&nbsp;", '\xc2\xa0', stdoutdata)
    dom = parseString(stdoutdata)
    body = dom.getElementsByTagName("body")[0].toxml()
    # keep what lies between the opening and the closing body tags
    stdoutdata = body[body.find('>')+1:body.rfind('</')]
    # strip leading spaces
    stdoutdata = re.sub(r"^\s+", '', stdoutdata)
    # add new line before closing bracket
    stdoutdata = re.sub(r"(\/?)>", r"\n\1>", stdoutdata)
    # do not split closing tag with following opening tag
    stdoutdata = re.sub(r">\n<", r"><", stdoutdata)
    return stdoutdata

@dj_memoize
def pandoc_pandoc(content, from_format, to_format, full=False, raw=False):
    """
    Convert content (should be unicode) from from_format to to_format
    (if full: includes header & co [html, latex])
    (if raw: PANDOC_OPTIONS_RAW is used instead of PANDOC_OPTIONS)

    html -> html conversions do not run pandoc: the content of the <body>
    element is extracted and returned.

    Returns the converted content, as unicode when it can be decoded from
    utf8 and undecoded otherwise (binary output formats such as odt).
    Raises Exception if a format is not supported, if content is not
    unicode, or with pandoc's stderr output if pandoc fails.
    Temporary files are removed in every case.

    >>> res = pandoc_pandoc(u'# sdsd', 'markdown', 'html', False)
    >>> res.replace("\\n","")
    u'<h1 id="sdsd">sdsd</h1>'
    >>> res = pandoc_pandoc(u'# sdsd', 'markdown', 'html', True)
    """
    # verify formats
    if from_format not in INPUT_FORMATS:
        raise Exception("Input format [%s] is not a supported format [%s]" % (from_format, ' '.join(INPUT_FORMATS)))
    if to_format not in OUTPUT_FORMATS:
        raise Exception("Output format [%s] is not a supported format [%s]" % (to_format, ' '.join(OUTPUT_FORMATS)))
    # isinstance also accepts subclasses of unicode
    if not isinstance(content, unicode):
        raise Exception('Content is not in unicode format!')

    # do not use pandoc to convert from html to html
    # (handled before any temporary file is created, so none is left behind)
    if from_format == to_format == 'html':
        return _html_body_fragment(content)

    # pandoc arguments and command line
    p_options = PANDOC_OPTIONS
    if raw:
        p_options = PANDOC_OPTIONS_RAW

    # For some reason when pandoc > 1.9 converts to PDF, '-t' shouldn't be used but output file name extension has to be '.pdf'
    if to_format != 'pdf':
        output_suffix = 'output'
    else:
        output_suffix = 'output.pdf'

    temp_names = []
    try:
        # temp files
        input_file, input_temp_name = get_filetemp('w', 'input')
        temp_names.append(input_temp_name)
        try:
            input_file.write(content.encode(_PANDOC_ENCODING))
        finally:
            input_file.close()

        output_temp_fp, output_temp_name = get_filetemp('r', output_suffix)
        temp_names.append(output_temp_name)
        output_temp_fp.close()

        fp_error, error_temp_name = get_filetemp('w', 'err')
        temp_names.append(error_temp_name)
        try:
            # argument list instead of a shell string: temp file paths are
            # passed untouched whatever characters they contain
            cmd = [PANDOC_BIN] + p_options.split() + ['-o', output_temp_name]
            if full:
                cmd.append('-s')
            cmd += ['-f', from_format]
            if to_format != 'pdf':
                cmd += ['-t', to_format]
            cmd.append(input_temp_name)

            retcode = call(cmd, stderr=fp_error)
        finally:
            fp_error.close()

        if retcode:
            fp_error = open(error_temp_name)
            try:
                error = fp_error.read()
            finally:
                fp_error.close()
            raise Exception(error)

        # output may be binary (odt, pdf)
        fp_output = open(output_temp_name, 'rb')
        try:
            stdoutdata = fp_output.read()
        finally:
            fp_output.close()
    finally:
        # cleanup, whatever happened above
        for temp_name in temp_names:
            if os.path.exists(temp_name):
                os.remove(temp_name)

    # try converting to unicode
    try:
        stdoutdata = stdoutdata.decode(_PANDOC_ENCODING)
    except UnicodeDecodeError:
        # this will fail for binary output formats such as odt
        # return result without conversion then
        pass

    return stdoutdata
40c8f766c9b8 import from internal svn r 4007 raph parents: diff changeset	283
if __name__ == "__main__":
    # run the doctests embedded in the docstrings of this module
    from doctest import testmod
    testmod()
40c8f766c9b8 import from internal svn r 4007 raph parents: diff changeset	287

author	gibus
	Thu, 24 May 2012 12:48:39 +0200
changeset 442	b6e443be2a9b
parent 428	9591c651391d
child 443	cacd524f5279
permissions	-rw-r--r--