From 3a8ad4ba4224f3a6e3ae3f38380df996c0859bb2 Mon Sep 17 00:00:00 2001
From: Benjamin Dauvergne <bdauvergne@entrouvert.com>
Date: Thu, 7 Jun 2018 18:00:45 +0200
Subject: [PATCH] workflows: add pdf form filling in export-to-model action (#24364)

Uses pdftk, and vendors the small pypdftk interface library.
---
 debian/control               |   3 +-
 wcs/qommon/vendor/pypdftk.py | 261 +++++++++++++++++++++++++++++++++++
 wcs/wf/export_to_model.py    |  93 ++++++++++++-
 3 files changed, 354 insertions(+), 3 deletions(-)
 create mode 100644 wcs/qommon/vendor/pypdftk.py

diff --git a/debian/control b/debian/control
index 207a294c..5cc79cc0 100644
--- a/debian/control
+++ b/debian/control
@@ -24,7 +24,8 @@ Recommends: python-dns,
     python-xlwt,
     python-qrcode,
     libjs-leaflet,
-    python-magic
+    python-magic,
+    pdftk
 Suggests: python-libxml2,
     python-lasso,
     python-psycopg2
diff --git a/wcs/qommon/vendor/pypdftk.py b/wcs/qommon/vendor/pypdftk.py
new file mode 100644
index 00000000..ccafcd90
--- /dev/null
+++ b/wcs/qommon/vendor/pypdftk.py
@@ -0,0 +1,261 @@
+# -*- encoding: UTF-8 -*-
+
+''' pypdftk
+
+Python module to drive the awesome pdftk binary.
+See http://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/
+
+'''
+
+import logging
+import os
+import shutil
+import subprocess
+import tempfile
+import itertools
+
+log = logging.getLogger(__name__)
+
+# the PDFTK_PATH environment variable wins; else /usr/bin/pdftk; else rely on $PATH
+PDFTK_PATH = os.getenv('PDFTK_PATH')
+if not PDFTK_PATH:
+    PDFTK_PATH = '/usr/bin/pdftk'
+    if not os.path.isfile(PDFTK_PATH):
+        PDFTK_PATH = 'pdftk'
+
+
+def check_output(*popenargs, **kwargs):
+    ''' Run a command and return its stdout; raise CalledProcessError on non-zero exit '''
+    if 'stdout' in kwargs:
+        raise ValueError('stdout argument not allowed, it will be overridden.')
+    child = subprocess.Popen(*popenargs, stdout=subprocess.PIPE, **kwargs)
+    captured = child.communicate()[0]
+    status = child.poll()
+    if not status:
+        return captured
+    # report the command that was run: the "args" keyword if given, else the first positional
+    command = kwargs['args'] if kwargs.get('args') is not None else popenargs[0]
+    raise subprocess.CalledProcessError(status, command, output=captured)
+
+
+def run_command(command, shell=False):
+    ''' run a system command and return its output as a list of lines '''
+    raw_output = check_output(command, shell=shell)
+    return raw_output.split('\n')
+
+def check_pdftk():
+    ''' Probe the pdftk binary: True if it ran, False (after a warning) otherwise '''
+    try:
+        run_command([PDFTK_PATH])
+    except (OSError, subprocess.CalledProcessError):
+        log.warning('pdftk test call failed (PDFTK_PATH=%r).', PDFTK_PATH)
+        return False
+    return True
+
+def get_num_pages(pdf_path):
+    ''' return the page count reported by `pdftk dump_data` (0 if not reported) '''
+    rows = run_command([PDFTK_PATH, pdf_path, 'dump_data'])
+    matches = (row for row in rows if row.lower().startswith('numberofpages'))
+    first = next(matches, None)
+    return int(first.split(':')[1]) if first is not None else 0
+
+
+def fill_form(pdf_path, datas={}, out_file=None, flatten=True):
+    '''
+        Fills a PDF form with given dict input data.
+        Return temp file if no out_file provided.
+
+        The command is run without a shell, so paths with spaces or shell
+        metacharacters are safe; the temporary XFDF file is always removed.
+    '''
+    tmp_fdf = gen_xfdf(datas)
+    cleanOnFail = not out_file
+    if cleanOnFail:
+        handle, out_file = tempfile.mkstemp()
+        os.close(handle)  # only the name is needed: pdftk reopens the file by name
+    cmd = [PDFTK_PATH, pdf_path, 'fill_form', tmp_fdf, 'output', out_file]
+    if flatten:
+        cmd.append('flatten')
+    try:
+        run_command(cmd)
+    except:
+        if cleanOnFail:
+            os.remove(out_file)  # drop the temp output we created, then re-raise
+        raise
+    finally:
+        os.remove(tmp_fdf)
+    return out_file
+
+def dump_data_fields(pdf_path):
+    '''
+        Return list of dicts of all fields in a PDF.
+    '''
+    # argument list, no shell: pdf_path may contain spaces or shell metacharacters
+    lines = run_command([PDFTK_PATH, pdf_path, 'dump_data_fields'])
+    field_data = [line.split(': ', 1) for line in lines]
+    # lines that do not split on ': ' act as separators between two fields
+    groups = itertools.groupby(field_data, lambda x: len(x) == 1)
+    return [dict(group) for is_separator, group in groups if not is_separator]
+
+def concat(files, out_file=None):
+    '''
+        Merge multiple PDF files.
+        Return temp file if no out_file provided.
+        The temporary output is removed again if pdftk fails.
+    '''
+    cleanOnFail = not out_file
+    if cleanOnFail:
+        handle, out_file = tempfile.mkstemp()
+        os.close(handle)  # only the name is needed; do not leak the descriptor
+    if len(files) == 1:
+        # NOTE(review): pdftk rewrites out_file below, so this copy looks redundant
+        shutil.copyfile(files[0], out_file)
+    args = [PDFTK_PATH] + list(files) + ['cat', 'output', out_file]
+    try:
+        run_command(args)
+    except:
+        if cleanOnFail:
+            os.remove(out_file)
+        raise
+    return out_file
+
+
+def split(pdf_path, out_dir=None):
+    '''
+        Burst a PDF into one file per page, using the page_%06d.pdf pattern.
+        A temp directory is created (and removed on failure) if no out_dir is given.
+        Return the sorted paths of every entry found in the output directory.
+    '''
+    created_here = not out_dir
+    if created_here:
+        out_dir = tempfile.mkdtemp()
+    page_pattern = '%s/page_%%06d.pdf' % out_dir
+    burst = (PDFTK_PATH, pdf_path, 'burst', 'output', page_pattern)
+    try:
+        run_command(burst)
+    except:
+        if created_here:
+            shutil.rmtree(out_dir)
+        raise
+    entries = sorted(os.listdir(out_dir))
+    return [os.path.join(out_dir, entry) for entry in entries]
+
+
+def gen_xfdf(datas={}):
+    ''' Write datas as an XML-escaped temp XFDF file for fill_form; caller removes the returned path '''
+    from xml.sax.saxutils import escape, quoteattr
+    fields = [u"""        <field name=%s><value>%s</value></field>"""
+              % (quoteattr(u'%s' % key), escape(u'%s' % value))
+              for key, value in datas.items()]
+    tpl = u"""<?xml version="1.0" encoding="UTF-8"?>
+<xfdf xmlns="http://ns.adobe.com/xfdf/" xml:space="preserve">
+    <fields>
+%s
+    </fields>
+</xfdf>""" % "\n".join(fields)
+    handle, out_file = tempfile.mkstemp()
+    with os.fdopen(handle, 'wb') as f:  # reuse the mkstemp descriptor instead of leaking it
+        f.write(tpl.encode('UTF-8'))
+    return out_file
+
+def replace_page(pdf_path, page_number, pdf_to_insert_path):
+    '''
+    Replace a page in a PDF (pdf_path) by the PDF pointed by pdf_to_insert_path.
+    page_number is the number of the page in pdf_path to be replaced. It is 1-based.
+    pdf_path is overwritten in place; a one-page PDF is replaced entirely.
+    '''
+    A = 'A=' + pdf_path
+    B = 'B=' + pdf_to_insert_path
+
+    # pages of A kept before and after the replaced one (pdftk "cat" ranges)
+    ranges = []
+    if page_number != 1:
+        ranges.append('A1-' + str(page_number - 1))
+    ranges.append('B')
+    if page_number != get_num_pages(pdf_path):
+        ranges.append('A' + str(page_number + 1) + '-end')
+
+    # mkstemp rather than the race-prone mktemp; pdftk reopens the file by name
+    handle, output_temp = tempfile.mkstemp(suffix='.pdf')
+    os.close(handle)
+    try:
+        run_command([PDFTK_PATH, A, B, 'cat'] + ranges + ['output', output_temp])
+        shutil.copy(output_temp, pdf_path)
+    finally:
+        # never leave the scratch file behind, even when pdftk fails
+        os.remove(output_temp)
+
+def stamp(pdf_path, stamp_pdf_path, output_pdf_path=None):
+    ''' Apply a stamp (from stamp_pdf_path) to the PDF in pdf_path, e.g. for watermarking.
+    Return output_pdf_path, or a new temporary file when it is not provided. '''
+    output = output_pdf_path
+    if not output:
+        handle, output = tempfile.mkstemp(suffix='.pdf')  # not the race-prone mktemp
+        os.close(handle)
+    run_command([PDFTK_PATH, pdf_path, 'multistamp', stamp_pdf_path, 'output', output])
+    return output
+
+def pdftk_cmd_util(pdf_path, action="compress",out_file=None, flatten=True):
+    '''
+    :type action: should be a valid action, in string format. Eg: "uncompress"
+    :param pdf_path: input PDF file
+    :param out_file: (default=auto) : output PDF path. will use tempfile if not provided
+    :param flatten: (default=True) : flatten the final PDF
+    :return: name of the output file.
+    '''
+    actions = ["compress", "uncompress"]
+    assert action in actions, "Unknown action. Failed to perform given action '%s'." % action
+
+    handle = None
+    cleanOnFail = False
+    if not out_file:
+        cleanOnFail = True
+        handle, out_file = tempfile.mkstemp()
+
+    # argument list, no shell: paths may contain spaces or shell metacharacters
+    cmd = [PDFTK_PATH, pdf_path, 'output', out_file, action]
+    if flatten:
+        cmd.append('flatten')
+    try:
+        run_command(cmd)
+    except:
+        if cleanOnFail:
+            os.remove(out_file)
+        raise
+    finally:
+        if handle:
+            os.close(handle)
+    return out_file
+
+
+
+def compress(pdf_path, out_file=None, flatten=True):
+    '''
+    Restore PDF page stream compression by applying pdftk's compress
+    filter, e.g. once an uncompressed PDF has been edited in a text
+    editor like vim or emacs.
+
+    :param pdf_path: input PDF file
+    :param out_file: (default=auto) : output PDF path. will use tempfile if not provided
+    :param flatten: (default=True) : flatten the final PDF
+    :return: name of the output file.
+    '''
+
+    return pdftk_cmd_util(
+        pdf_path, action="compress", out_file=out_file, flatten=flatten)
+
+
+def uncompress(pdf_path, out_file=None, flatten=True):
+    '''
+    Remove PDF page stream compression by applying pdftk's uncompress
+    filter, which is useful when you want to edit PDF code in a text
+    editor like vim or emacs. Use compress() to restore compression.
+
+    :param pdf_path: input PDF file
+    :param out_file: (default=auto) : output PDF path. will use tempfile if not provided
+    :param flatten: (default=True) : flatten the final PDF
+    :return: name of the output file.
+    '''
+
+    return pdftk_cmd_util(
+        pdf_path, action="uncompress", out_file=out_file, flatten=flatten)
diff --git a/wcs/wf/export_to_model.py b/wcs/wf/export_to_model.py
index 08f7d4d0..7ff5ef32 100644
--- a/wcs/wf/export_to_model.py
+++ b/wcs/wf/export_to_model.py
@@ -14,6 +14,7 @@
 # You should have received a copy of the GNU General Public License
 # along with this program; if not, see <http://www.gnu.org/licenses/>.
 
+import os
 import base64
 import collections
 from StringIO import StringIO
@@ -33,10 +34,12 @@ from qommon import get_logger
 from qommon.form import (SingleSelectWidget, WidgetList, CheckboxWidget,
                          StringWidget, UploadWidget, WysiwygTextWidget, Upload,
                          UploadedFile, UploadValidationError, VarnameWidget,
-                         RadiobuttonsWidget, PicklableUpload, ComputedExpressionWidget)
+                         RadiobuttonsWidget, PicklableUpload, ComputedExpressionWidget, HtmlWidget)
 from qommon.errors import PublishError
 from qommon.template import TemplateError
 import qommon
+from qommon.vendor import pypdftk
+
 
 from wcs.fields import SubtitleField, TitleField, CommentField, PageField
 from wcs.workflows import (WorkflowStatusItem, AttachmentEvolutionPart,
@@ -44,6 +47,8 @@ from wcs.workflows import (WorkflowStatusItem, AttachmentEvolutionPart,
                            get_formdata_template_context, template_on_context)
 from wcs.portfolio import has_portfolio, push_document
 
+has_pdftk = pypdftk.check_pdftk()
+
 OO_TEXT_NS = 'urn:oasis:names:tc:opendocument:xmlns:text:1.0'
 OO_OFFICE_NS = 'urn:oasis:names:tc:opendocument:xmlns:office:1.0'
 OO_DRAW_NS = 'urn:oasis:names:tc:opendocument:xmlns:drawing:1.0'
@@ -193,6 +198,20 @@ def rtf_process(value):
     return str2rtf(unicode(str(value), get_publisher().site_charset))
 
 
+class PDFFormDirectory(Directory):
+    '''Admin preview of a PDF model: each form field is filled with its own name.'''
+    _q_exports = ['']
+
+    def __init__(self, item):
+        self.item = item
+
+    def _q_index(self):
+        pdf_response = get_response()
+        pdf_response.content_type = 'application/pdf'
+        pdf_response.set_header('location', '..')
+        sample = dict((name, name) for name in self.item.get_pdf_fields())
+        return self.item.fill_pdf_form(sample).read()
+
 class ExportToModel(WorkflowStatusItem):
     description = N_('Document Creation')
     key = 'export_to_model'
@@ -249,13 +268,17 @@ class ExportToModel(WorkflowStatusItem):
             base_url = formdata.get_url(backoffice=in_backoffice)
             return base_url + self.get_directory_name()
 
-    def model_file_validation(self, upload):
+    def model_file_validation(self, upload, ignore_error=False):
         if hasattr(upload, 'fp'):
             fp = upload.fp
         elif hasattr(upload, 'get_file'):
             fp = upload.get_file()
         else:
+            if ignore_error:
+                return None
             raise UploadValidationError('unknown upload object %r' % upload)
+        if upload.content_type and upload.content_type == 'application/pdf' and has_pdftk:
+            return 'pdf'
         if upload.content_type and upload.content_type == 'application/rtf':
             return 'rtf'
         if (upload.content_type and upload.content_type == 'application/octet-stream') or \
@@ -274,6 +297,8 @@ class ExportToModel(WorkflowStatusItem):
                 return 'opendocument'
         if is_opendocument(fp):
             return 'opendocument'
+        if ignore_error:
+            return None
         raise UploadValidationError(_('Only RTF and OpenDocument files can be used'))
 
     def get_parameters(self):
@@ -284,6 +309,10 @@ class ExportToModel(WorkflowStatusItem):
         if has_portfolio():
             parameters += ('push_to_portfolio',)
         parameters += ('method', 'by', 'label', 'backoffice_info_text', 'filename', 'condition')
+        if self.has_pdf_model():
+            parameters += ('pdffield',)
+            for field in self.get_pdf_fields():
+                parameters += ('pdffield_%s' % field,)
         return parameters
 
     def add_parameters_widgets(self, form, parameters, prefix='',
@@ -388,6 +417,40 @@ class ExportToModel(WorkflowStatusItem):
         if 'filename' in parameters:
             form.add(ComputedExpressionWidget, name='%sfilename' % prefix, title=_('File name'),
                      value=self.filename)
+        if 'pdffield' in parameters:
+            form.add(HtmlWidget, htmltext('<label><a href="pdf-form/">PDF form example</a></label>'))
+        pdf_fields_expressions = getattr(self, 'pdf_fields', {})
+        pdf_fields = self.get_pdf_fields()
+        for parameter in parameters:
+            if parameter.startswith('pdffield_'):
+                name = parameter[9:]
+                form.add(ComputedExpressionWidget,
+                         name=parameter,
+                         title=_('PDF Field %s (%s)') % (name, pdf_fields.get(name, '')),
+                         value=pdf_fields_expressions.get(name, ''))
+
+    def has_pdf_model(self):  # falsy unless a model file is set and it validates as 'pdf'
+        return self.model_file and self.model_file_validation(self.model_file, ignore_error=True) == 'pdf'
+
+    def submit_admin_form(self, form):
+        '''Run the parent handling, then store the parsed expression of each PDF field widget.'''
+        super(ExportToModel, self).submit_admin_form(form)
+        if self.has_pdf_model():
+            # fields for which the form has no widget are left out
+            widgets = ((name, form.get_widget('pdffield_%s' % name))
+                       for name in self.get_pdf_fields())
+            self.pdf_fields = dict(
+                (name, widget.parse()) for name, widget in widgets if widget)
+
+    def get_pdf_fields(self):
+        '''Map PDF field names to their type ('Text' by default), cached on the model file.'''
+        model = self.model_file
+        if self.model_file_validation(model, ignore_error=True) != 'pdf':
+            return {}
+        if not hasattr(model, '_pdf_fields'):
+            dumped = pypdftk.dump_data_fields(model.build_file_path())
+            model._pdf_fields = dict((d.get('FieldName'), d.get('FieldType', 'Text')) for d in dumped)
+        return model._pdf_fields
 
     def get_filename(self):
         filename = None
@@ -408,6 +471,8 @@ class ExportToModel(WorkflowStatusItem):
             outstream = self.apply_rtf_template_to_formdata(formdata)
         elif kind == 'opendocument':
             outstream = self.apply_od_template_to_formdata(formdata)
+        elif kind == 'pdf':
+            outstream = self.apply_pdf_template_to_formdata(formdata)
         else:
             raise Exception('unsupported model kind %r' % kind)
         if self.convert_to_pdf:
@@ -416,6 +481,24 @@ class ExportToModel(WorkflowStatusItem):
             return transform_to_pdf(outstream)
         return outstream
 
+    def fill_pdf_form(self, data):  # data: PDF field name -> value; returns a StringIO
+        outfile = pypdftk.fill_form(self.model_file.build_file_path(), data)
+        try:
+            with open(outfile, 'rb') as f:  # a PDF is binary data, read it as bytes
+                return StringIO(f.read())
+        finally:
+            os.unlink(outfile)
+
+    def apply_pdf_template_to_formdata(self, formdata):
+        '''Fill the PDF model with the computed value of each configured field expression.'''
+        expressions = getattr(self, 'pdf_fields', {})
+        data = {}
+        for field in self.get_pdf_fields():
+            expression = expressions.get(field)
+            if expression:  # fields without an expression stay untouched in the PDF
+                data[field] = self.compute(expression)  # the expression, not the data dict
+        return self.fill_pdf_form(data)
+
     def apply_rtf_template_to_formdata(self, formdata):
         try:
             # force ezt_only=True because an RTF file may contain {{ characters
@@ -554,4 +637,10 @@ class ExportToModel(WorkflowStatusItem):
                 content_type,
                 outstream.read())
 
+    def q_admin_lookup(self, workflow, status, component, html_top):
+        if component == 'pdf-form' and self.has_pdf_model():  # otherwise falls through and returns None
+            directory = PDFFormDirectory(self)
+            directory.html_top = html_top  # hand the caller's html_top to the directory
+            return directory
+
 register_item_class(ExportToModel)
-- 
2.17.0