summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- pikepdf/_cpphelpers.py 31
-rw-r--r-- tests/test_exotic.py 5
-rw-r--r-- tests/test_pdfa.py 45
3 files changed, 74 insertions, 7 deletions
diff --git a/pikepdf/_cpphelpers.py b/pikepdf/_cpphelpers.py
index 601d58c..4e11289 100644
--- a/pikepdf/_cpphelpers.py
+++ b/pikepdf/_cpphelpers.py
@@ -109,11 +109,36 @@ def _write_startxref(new_xref_offset, output):
def repair_pdfa(filename):
+ """Repair "missing EOL before endstream" errors in PDF/A
+
+ QPDF generates PDFs without putting an EOL before the endstream keyword.
+
+ 1.
+ stream
+ dostuffdostuffdostuff_endstream <-- PDF/A error, missing LF
+ endobj
+
+ 2.
+ << /Length 5 ... >>
+ stream
+ 1234 <-- 4 characters + LF
+ endstream <-- PDF/A error, missing LF because it is read as part of stream
+ endobj
+
+ This is legal PDF, but PDF/A requires the EOL. Specifically it requires that
+ at least one EOL appears after reading /Length bytes from the stream,
+ so a stream whose data merely happens to end with an EOL character will not pass.
+
+ This assumes a simple PDF with no updates in the xref table, as would
+ normally be the case after one is passed through QPDF.
+
+ It also cannot find improperly laid out content streams inside object
+ streams so object streams need to be disabled.
+
+ """
import mmap
from shutil import copy2
- copy2(filename, filename.with_suffix('.bak.pdf'))
-
with NamedTemporaryFile(suffix='.pdf', mode='r+b') as temp:
copy2(filename, temp.name)
temp.flush()
@@ -140,3 +165,5 @@ def repair_pdfa(filename):
# Succeeded
copy2(output.name, filename)
+ mm.close()
+
diff --git a/tests/test_exotic.py b/tests/test_exotic.py
index 4524f2b..75df790 100644
--- a/tests/test_exotic.py
+++ b/tests/test_exotic.py
@@ -77,8 +77,3 @@ def test_create_form_xobjects(outdir):
pdf.add_page(page, True)
pdf.save(outdir / 'formxobj.pdf')
-
-def test_pdfa_sanity(resources, outdir):
- pdf = qpdf.QPDF.open(
- resources / 'veraPDF test suite 6-2-10-t02-pass-a.pdf')
- pdf.save(outdir / 'pdfa.pdf')
diff --git a/tests/test_pdfa.py b/tests/test_pdfa.py
new file mode 100644
index 0000000..b15b6fd
--- /dev/null
+++ b/tests/test_pdfa.py
@@ -0,0 +1,45 @@
+import pytest
+from pikepdf import qpdf
+
+import os
+import platform
+import shutil
+from pathlib import Path
+from contextlib import suppress
+from subprocess import run, PIPE, STDOUT, DEVNULL
+import xml.etree.ElementTree as ET
+
+
+VERAPDF = Path(os.environ['HOME']) / 'verapdf' / 'verapdf'
+
+
+NO_PDFA_VALIDATOR = not VERAPDF.is_file()
+
+
+def verapdf_validate(filename):
+ with open(filename, 'rb') as f:
+ proc = run([VERAPDF], stdin=f, stdout=PIPE, stderr=STDOUT, check=True)
+ result = proc.stdout.decode('utf-8')
+
+ xml_start = result.find('<?xml version')
+ xml = result[xml_start:]
+
+ root = ET.fromstring(xml)
+ node = root.find(".//taskResult[@type='VALIDATE']")
+ return node.attrib['isExecuted'] == 'true' and \
+ node.attrib['isSuccess'] == 'true'
+
+
+@pytest.mark.skipif(NO_PDFA_VALIDATOR, reason="can't find verapdf")
+def test_pdfa_sanity(resources, outdir):
+ filename = resources / 'veraPDF test suite 6-2-10-t02-pass-a.pdf'
+
+ assert verapdf_validate(filename)
+
+ pdf = qpdf.QPDF.open(filename)
+ pdf.save(outdir / 'pdfa.pdf')
+
+ assert verapdf_validate(outdir / 'pdfa.pdf')
+
+
+