From 3a8ad4ba4224f3a6e3ae3f38380df996c0859bb2 Mon Sep 17 00:00:00 2001
From: Benjamin Dauvergne
Date: Thu, 7 Jun 2018 18:00:45 +0200
Subject: [PATCH] workflows: add pdf filing in export-to-model action (#24364)

Using pdftk and vendoring small interface lib pypdftk.
---
 debian/control               |   3 +-
 wcs/qommon/vendor/pypdftk.py | 267 +++++++++++++++++++++++++++++++++++
 wcs/wf/export_to_model.py    |  93 ++++++++++++-
 3 files changed, 360 insertions(+), 3 deletions(-)
 create mode 100644 wcs/qommon/vendor/pypdftk.py

diff --git a/debian/control b/debian/control
index 207a294c..5cc79cc0 100644
--- a/debian/control
+++ b/debian/control
@@ -24,7 +24,8 @@ Recommends: python-dns,
  python-xlwt,
  python-qrcode,
  libjs-leaflet,
- python-magic
+ python-magic,
+ pdftk
 Suggests: python-libxml2,
  python-lasso,
  python-psycopg2
diff --git a/wcs/qommon/vendor/pypdftk.py b/wcs/qommon/vendor/pypdftk.py
new file mode 100644
index 00000000..ccafcd90
--- /dev/null
+++ b/wcs/qommon/vendor/pypdftk.py
@@ -0,0 +1,267 @@
+# -*- encoding: UTF-8 -*-
+
+''' pypdftk
+
+Python module to drive the awesome pdftk binary.
+See http://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/
+
+'''
+
+import logging
+import os
+import shutil
+import subprocess
+import tempfile
+import itertools
+
+log = logging.getLogger(__name__)
+
+if os.getenv('PDFTK_PATH'):
+    PDFTK_PATH = os.getenv('PDFTK_PATH')
+else:
+    PDFTK_PATH = '/usr/bin/pdftk'
+    if not os.path.isfile(PDFTK_PATH):
+        PDFTK_PATH = 'pdftk'
+
+
+def check_output(*popenargs, **kwargs):
+    if 'stdout' in kwargs:
+        raise ValueError('stdout argument not allowed, it will be overridden.')
+    process = subprocess.Popen(stdout=subprocess.PIPE, *popenargs, **kwargs)
+    output, unused_err = process.communicate()
+    retcode = process.poll()
+    if retcode:
+        cmd = kwargs.get("args")
+        if cmd is None:
+            cmd = popenargs[0]
+        raise subprocess.CalledProcessError(retcode, cmd, output=output)
+    return output
+
+
+def run_command(command, shell=False):
+    ''' run a system command and yield output '''
+    p = check_output(command, shell=shell)
+    return p.split('\n')
+
+def check_pdftk():
+    try:
+        run_command([PDFTK_PATH])
+        return True
+    except OSError:
+        log.warning('pdftk test call failed (PDFTK_PATH=%r).', PDFTK_PATH)
+        return False
+
+
+def get_num_pages(pdf_path):
+    ''' return number of pages in a given PDF file '''
+    for line in run_command([PDFTK_PATH, pdf_path, 'dump_data']):
+        if line.lower().startswith('numberofpages'):
+            return int(line.split(':')[1])
+    return 0
+
+
+def fill_form(pdf_path, datas={}, out_file=None, flatten=True):
+    '''
+    Fills a PDF form with given dict input data.
+    Return temp file if no out_file provided.
+    '''
+    cleanOnFail = False
+    tmp_fdf = gen_xfdf(datas)
+    handle = None
+    if not out_file:
+        cleanOnFail = True
+        handle, out_file = tempfile.mkstemp()
+
+    cmd = "%s %s fill_form %s output %s" % (PDFTK_PATH, pdf_path, tmp_fdf, out_file)
+    if flatten:
+        cmd += ' flatten'
+    try:
+        run_command(cmd, True)
+    except:
+        if cleanOnFail:
+            # remove our own half-written result, not just the XFDF input
+            os.remove(out_file)
+        raise
+    finally:
+        if handle is not None:
+            os.close(handle)
+        # the intermediate XFDF file is only needed while pdftk runs
+        os.remove(tmp_fdf)
+    return out_file
+
+def dump_data_fields(pdf_path):
+    '''
+    Return list of dicts of all fields in a PDF.
+    '''
+    cmd = "%s %s dump_data_fields" % (PDFTK_PATH, pdf_path)
+    field_data = map(lambda x: x.split(': ', 1), run_command(cmd, True))
+
+    fields = [list(group) for k, group in itertools.groupby(field_data, lambda x: len(x) == 1) if not k]
+
+    return map(dict, fields)
+
+def concat(files, out_file=None):
+    '''
+    Merge multiples PDF files.
+    Return temp file if no out_file provided.
+    '''
+    cleanOnFail = False
+    if not out_file:
+        cleanOnFail = True
+        handle, out_file = tempfile.mkstemp()
+        os.close(handle)  # only the path is used; do not leak the descriptor
+    if len(files) == 1:
+        shutil.copyfile(files[0], out_file)
+    args = [PDFTK_PATH]
+    args += files
+    args += ['cat', 'output', out_file]
+    try:
+        run_command(args)
+    except:
+        if cleanOnFail:
+            os.remove(out_file)
+        raise
+    return out_file
+
+
+def split(pdf_path, out_dir=None):
+    '''
+    Split a single PDF file into pages.
+    Use a temp directory if no out_dir provided.
+    '''
+    cleanOnFail = False
+    if not out_dir:
+        cleanOnFail = True
+        out_dir = tempfile.mkdtemp()
+    out_pattern = '%s/page_%%06d.pdf' % out_dir
+    try:
+        run_command((PDFTK_PATH, pdf_path, 'burst', 'output', out_pattern))
+    except:
+        if cleanOnFail:
+            shutil.rmtree(out_dir)
+        raise
+    out_files = os.listdir(out_dir)
+    out_files.sort()
+    return [os.path.join(out_dir, filename) for filename in out_files]
+
+
+def gen_xfdf(datas={}):
+    ''' Generates a temp XFDF file suited for fill_form function, based on dict input data '''
+    from xml.sax.saxutils import escape, quoteattr
+    fields = []
+    for key, value in datas.items():
+        # escape names and values so '<', '&' or quotes cannot break the XML
+        fields.append(u"""        <field name=%s><value>%s</value></field>""" % (
+            quoteattr(u'%s' % key), escape(u'%s' % value)))
+    tpl = u"""<?xml version="1.0" encoding="UTF-8"?>
+<xfdf xmlns="http://ns.adobe.com/xfdf/" xml:space="preserve">
+    <fields>
+%s
+    </fields>
+</xfdf>""" % "\n".join(fields)
+    handle, out_file = tempfile.mkstemp()
+    with os.fdopen(handle, 'w') as f:
+        f.write(tpl.encode('UTF-8'))
+    return out_file
+
+def replace_page(pdf_path, page_number, pdf_to_insert_path):
+    '''
+    Replace a page in a PDF (pdf_path) by the PDF pointed by pdf_to_insert_path.
+    page_number is the number of the page in pdf_path to be replaced. It is 1-based.
+    '''
+    A = 'A=' + pdf_path
+    B = 'B=' + pdf_to_insert_path
+    output_temp = tempfile.mktemp(suffix='.pdf')
+
+    if page_number == 1:  # At begin
+        upper_bound = 'A' + str(page_number + 1) + '-end'
+        args = (
+            PDFTK_PATH, A, B, 'cat', 'B', upper_bound, 'output', output_temp)
+    elif page_number == get_num_pages(pdf_path):  # At end
+        lower_bound = 'A1-' + str(page_number - 1)
+        args = (PDFTK_PATH, A, B, 'cat', lower_bound, 'B', 'output', output_temp)
+    else:  # At middle
+        lower_bound = 'A1-' + str(page_number - 1)
+        upper_bound = 'A' + str(page_number + 1) + '-end'
+        args = (
+            PDFTK_PATH, A, B, 'cat', lower_bound, 'B', upper_bound, 'output',
+            output_temp)
+
+    run_command(args)
+    shutil.copy(output_temp, pdf_path)
+    os.remove(output_temp)
+
+def stamp(pdf_path, stamp_pdf_path, output_pdf_path=None):
+    '''
+    Applies a stamp (from stamp_pdf_path) to the PDF file in pdf_path. Useful for watermark purposes.
+    If not output_pdf_path is provided, it returns a temporary file with the result PDF.
+    '''
+    output = output_pdf_path or tempfile.mktemp(suffix='.pdf')
+    args = [PDFTK_PATH, pdf_path, 'multistamp', stamp_pdf_path, 'output', output]
+    run_command(args)
+    return output
+
+def pdftk_cmd_util(pdf_path, action="compress",out_file=None, flatten=True):
+    '''
+    :type action: should valid action, in string format. Eg: "uncompress"
+    :param pdf_path: input PDF file
+    :param out_file: (default=auto) : output PDF path. will use tempfile if not provided
+    :param flatten: (default=True) : flatten the final PDF
+    :return: name of the output file.
+    '''
+    actions = ["compress", "uncompress"]
+    assert action in actions, "Unknown action. Failed to perform given action '%s'." % action
+
+    handle = None
+    cleanOnFail = False
+    if not out_file:
+        cleanOnFail = True
+        handle, out_file = tempfile.mkstemp()
+
+    cmd = "%s %s output %s %s" % (PDFTK_PATH, pdf_path, out_file, action)
+
+    if flatten:
+        cmd += ' flatten'
+    try:
+        run_command(cmd, True)
+    except:
+        if cleanOnFail:
+            os.remove(out_file)
+        raise
+    finally:
+        if handle:
+            os.close(handle)
+    return out_file
+
+
+
+def compress(pdf_path, out_file=None, flatten=True):
+    '''
+    These are only useful when you want to edit PDF code in a text
+    editor like vim or emacs. Remove PDF page stream compression by
+    applying the uncompress filter. Use the compress filter to
+    restore compression.
+
+    :param pdf_path: input PDF file
+    :param out_file: (default=auto) : output PDF path. will use tempfile if not provided
+    :param flatten: (default=True) : flatten the final PDF
+    :return: name of the output file.
+    '''
+
+    return pdftk_cmd_util(pdf_path, "compress", out_file, flatten)
+
+
+def uncompress(pdf_path, out_file=None, flatten=True):
+    '''
+    These are only useful when you want to edit PDF code in a text
+    editor like vim or emacs. Remove PDF page stream compression by
+    applying the uncompress filter. Use the compress filter to
+    restore compression.
+
+    :param pdf_path: input PDF file
+    :param out_file: (default=auto) : output PDF path. will use tempfile if not provided
+    :param flatten: (default=True) : flatten the final PDF
+    :return: name of the output file.
+    '''
+
+    return pdftk_cmd_util(pdf_path, "uncompress", out_file, flatten)
diff --git a/wcs/wf/export_to_model.py b/wcs/wf/export_to_model.py
index 08f7d4d0..7ff5ef32 100644
--- a/wcs/wf/export_to_model.py
+++ b/wcs/wf/export_to_model.py
@@ -14,6 +14,7 @@
 # You should have received a copy of the GNU General Public License
 # along with this program; if not, see <http://www.gnu.org/licenses/>.
 
+import os
 import base64
 import collections
 from StringIO import StringIO
@@ -33,10 +34,12 @@ from qommon import get_logger
 from qommon.form import (SingleSelectWidget, WidgetList, CheckboxWidget,
                          StringWidget, UploadWidget, WysiwygTextWidget, Upload,
                          UploadedFile, UploadValidationError, VarnameWidget,
-                         RadiobuttonsWidget, PicklableUpload, ComputedExpressionWidget)
+                         RadiobuttonsWidget, PicklableUpload, ComputedExpressionWidget, HtmlWidget)
 from qommon.errors import PublishError
 from qommon.template import TemplateError
 import qommon
+from qommon.vendor import pypdftk
+
 
 from wcs.fields import SubtitleField, TitleField, CommentField, PageField
 from wcs.workflows import (WorkflowStatusItem, AttachmentEvolutionPart,
@@ -44,6 +47,8 @@ from wcs.workflows import (WorkflowStatusItem, AttachmentEvolutionPart,
                            get_formdata_template_context, template_on_context)
 from wcs.portfolio import has_portfolio, push_document
 
+has_pdftk = pypdftk.check_pdftk()  # probed once, when this module is imported
+
 OO_TEXT_NS = 'urn:oasis:names:tc:opendocument:xmlns:text:1.0'
 OO_OFFICE_NS = 'urn:oasis:names:tc:opendocument:xmlns:office:1.0'
 OO_DRAW_NS = 'urn:oasis:names:tc:opendocument:xmlns:drawing:1.0'
@@ -193,6 +198,20 @@ def rtf_process(value):
     return str2rtf(unicode(str(value), get_publisher().site_charset))
 
 
+class PDFFormDirectory(Directory):
+    _q_exports = ['']
+
+    def __init__(self, item):
+        self.item = item
+
+    def _q_index(self):
+        response = get_response()
+        response.content_type = 'application/pdf'
+        response.set_header('location', '..')
+        data = {field: field for field in self.item.get_pdf_fields()}  # preview: each field shows its own name
+        return self.item.fill_pdf_form(data).read()
+
+
 class ExportToModel(WorkflowStatusItem):
     description = N_('Document Creation')
     key = 'export_to_model'
@@ -249,13 +268,17 @@ class ExportToModel(WorkflowStatusItem):
         base_url = formdata.get_url(backoffice=in_backoffice)
         return base_url + self.get_directory_name()
 
-    def model_file_validation(self, upload):
+    def model_file_validation(self, upload, ignore_error=False):
         if hasattr(upload, 'fp'):
             fp = upload.fp
         elif hasattr(upload, 'get_file'):
             fp = upload.get_file()
         else:
+            if ignore_error:
+                return None
             raise UploadValidationError('unknown upload object %r' % upload)
+        if upload.content_type and upload.content_type == 'application/pdf' and has_pdftk:
+            return 'pdf'
         if upload.content_type and upload.content_type == 'application/rtf':
             return 'rtf'
         if (upload.content_type and upload.content_type == 'application/octet-stream') or \
@@ -274,6 +297,8 @@ class ExportToModel(WorkflowStatusItem):
                 return 'opendocument'
         if is_opendocument(fp):
             return 'opendocument'
+        if ignore_error:
+            return None
         raise UploadValidationError(_('Only RTF and OpenDocument files can be used'))
 
     def get_parameters(self):
@@ -284,6 +309,10 @@ class ExportToModel(WorkflowStatusItem):
         if has_portfolio():
             parameters += ('push_to_portfolio',)
         parameters += ('method', 'by', 'label', 'backoffice_info_text', 'filename', 'condition')
+        if self.has_pdf_model():
+            parameters += ('pdffield',)
+            for field in self.get_pdf_fields():
+                parameters += ('pdffield_%s' % field,)  # one admin parameter per PDF form field
         return parameters
 
     def add_parameters_widgets(self, form, parameters, prefix='',
@@ -388,6 +417,40 @@ class ExportToModel(WorkflowStatusItem):
         if 'filename' in parameters:
             form.add(ComputedExpressionWidget, name='%sfilename' % prefix,
                      title=_('File name'), value=self.filename)
+        if 'pdffield' in parameters:
+            form.add(HtmlWidget, htmltext(''))  # NOTE(review): markup looks lost in transit - confirm
+            pdf_fields_expressions = getattr(self, 'pdf_fields', {})
+            pdf_fields = self.get_pdf_fields()
+            for parameter in parameters:
+                if parameter.startswith('pdffield_'):
+                    name = parameter[9:]  # strip the 'pdffield_' prefix
+                    form.add(ComputedExpressionWidget,
+                             name=parameter,
+                             title=_('PDF Field %s (%s)') % (name, pdf_fields.get(name, '')),
+                             value=pdf_fields_expressions.get(name, ''))
+
+    def has_pdf_model(self):
+        return self.model_file and self.model_file_validation(self.model_file, ignore_error=True) == 'pdf'
+
+    def submit_admin_form(self, form):
+        super(ExportToModel, self).submit_admin_form(form)
+        if self.has_pdf_model():
+            pdf_fields = {}
+            for field in self.get_pdf_fields():
+                widget = form.get_widget('pdffield_%s' % field)
+                if widget:
+                    pdf_fields[field] = widget.parse()
+            self.pdf_fields = pdf_fields  # maps PDF field name -> expression entered in the admin form
+
+    def get_pdf_fields(self):
+        pdf_fields = {}
+        if self.model_file_validation(self.model_file, ignore_error=True) == 'pdf':
+            if hasattr(self.model_file, '_pdf_fields'):
+                return self.model_file._pdf_fields  # cached by a previous call
+            for field_def in pypdftk.dump_data_fields(self.model_file.build_file_path()):
+                pdf_fields[field_def.get('FieldName')] = field_def.get('FieldType', 'Text')
+            self.model_file._pdf_fields = pdf_fields  # cache on the upload to avoid re-running pdftk
+        return pdf_fields
 
     def get_filename(self):
         filename = None
@@ -408,6 +471,8 @@ class ExportToModel(WorkflowStatusItem):
             outstream = self.apply_rtf_template_to_formdata(formdata)
         elif kind == 'opendocument':
             outstream = self.apply_od_template_to_formdata(formdata)
+        elif kind == 'pdf':
+            outstream = self.apply_pdf_template_to_formdata(formdata)
         else:
             raise Exception('unsupported model kind %r' % kind)
         if self.convert_to_pdf:
@@ -416,6 +481,24 @@ class ExportToModel(WorkflowStatusItem):
             return transform_to_pdf(outstream)
         return outstream
 
+    def fill_pdf_form(self, data):
+        outfile = pypdftk.fill_form(self.model_file.build_file_path(), data)
+        try:
+            with open(outfile, 'rb') as f:  # a PDF is binary data
+                return StringIO(f.read())
+        finally:
+            os.unlink(outfile)  # fill_form() returned a temp file; its removal is ours
+
+    def apply_pdf_template_to_formdata(self, formdata):
+        pdf_fields = self.get_pdf_fields()
+        data = {}
+        for field in pdf_fields:
+            expression = getattr(self, 'pdf_fields', {}).get(field)
+            if not expression:
+                continue  # no expression configured: leave the PDF field untouched
+            data[field] = self.compute(expression)  # evaluate the expression, not the result dict
+        return self.fill_pdf_form(data)
+
     def apply_rtf_template_to_formdata(self, formdata):
         try:
             # force ezt_only=True because an RTF file may contain {{ characters
@@ -554,4 +637,10 @@ class ExportToModel(WorkflowStatusItem):
                         content_type,
                         outstream.read())
 
+    def q_admin_lookup(self, workflow, status, component, html_top):
+        if component == 'pdf-form' and self.has_pdf_model():
+            directory = PDFFormDirectory(self)
+            directory.html_top = html_top
+            return directory
+
 register_item_class(ExportToModel)
-- 
2.17.0