diff options
-rw-r--r-- | pikepdf/_cpphelpers.py | 31 | ||||
-rw-r--r-- | tests/test_exotic.py | 5 | ||||
-rw-r--r-- | tests/test_pdfa.py | 45 |
3 files changed, 74 insertions, 7 deletions
diff --git a/pikepdf/_cpphelpers.py b/pikepdf/_cpphelpers.py index 601d58c..4e11289 100644 --- a/pikepdf/_cpphelpers.py +++ b/pikepdf/_cpphelpers.py @@ -109,11 +109,36 @@ def _write_startxref(new_xref_offset, output): def repair_pdfa(filename): + """Repair "missing EOL before endstream" errors in PDF/A + + QPDF generates PDFs without putting an EOL before the endstream keyword. + + 1. + stream + dostuffdostuffdostuff_endstream <-- PDF/A error, missing LF + endobj + + 2. + << /Length 5 ... >> + stream + 1234 <-- 4 characters + LF + endstream <-- PDF/A error, missing LF because it is read as part of stream + endobj + + This is legal in ordinary PDF, but PDF/A requires the EOL. Specifically, it requires that + at least one EOL appears after reading /Length bytes from the stream, + so a stream that happens to terminate on an EOL character will not pass. + + This assumes a simple PDF with no updates in the xref table, as would + normally be the case after one is passed through QPDF. + + It also cannot find improperly laid out content streams inside object + streams so object streams need to be disabled. 
+ + """ import mmap from shutil import copy2 - copy2(filename, filename.with_suffix('.bak.pdf')) - with NamedTemporaryFile(suffix='.pdf', mode='r+b') as temp: copy2(filename, temp.name) temp.flush() @@ -140,3 +165,5 @@ def repair_pdfa(filename): # Succeeded copy2(output.name, filename) + mm.close() + diff --git a/tests/test_exotic.py b/tests/test_exotic.py index 4524f2b..75df790 100644 --- a/tests/test_exotic.py +++ b/tests/test_exotic.py @@ -77,8 +77,3 @@ def test_create_form_xobjects(outdir): pdf.add_page(page, True) pdf.save(outdir / 'formxobj.pdf') - -def test_pdfa_sanity(resources, outdir): - pdf = qpdf.QPDF.open( - resources / 'veraPDF test suite 6-2-10-t02-pass-a.pdf') - pdf.save(outdir / 'pdfa.pdf') diff --git a/tests/test_pdfa.py b/tests/test_pdfa.py new file mode 100644 index 0000000..b15b6fd --- /dev/null +++ b/tests/test_pdfa.py @@ -0,0 +1,45 @@ +import pytest +from pikepdf import qpdf + +import os +import platform +import shutil +from pathlib import Path +from contextlib import suppress +from subprocess import run, PIPE, STDOUT, DEVNULL +import xml.etree.ElementTree as ET + + +VERAPDF = Path(os.environ['HOME']) / 'verapdf' / 'verapdf' + + +NO_PDFA_VALIDATOR = not VERAPDF.is_file() + + +def verapdf_validate(filename): + with open(filename, 'rb') as f: + proc = run([VERAPDF], stdin=f, stdout=PIPE, stderr=STDOUT, check=True) + result = proc.stdout.decode('utf-8') + + xml_start = result.find('<?xml version') + xml = result[xml_start:] + + root = ET.fromstring(xml) + node = root.find(".//taskResult[@type='VALIDATE']") + return node.attrib['isExecuted'] == 'true' and \ + node.attrib['isSuccess'] == 'true' + + +@pytest.mark.skipif(NO_PDFA_VALIDATOR, reason="can't find verapdf") +def test_pdfa_sanity(resources, outdir): + filename = resources / 'veraPDF test suite 6-2-10-t02-pass-a.pdf' + + assert verapdf_validate(filename) + + pdf = qpdf.QPDF.open(filename) + pdf.save(outdir / 'pdfa.pdf') + + assert verapdf_validate(outdir / 'pdfa.pdf') + + + |