summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/pikepdf/__init__.py42
-rw-r--r--src/pikepdf/_cpphelpers.py20
-rw-r--r--src/pikepdf/_methods.py319
-rw-r--r--src/pikepdf/_version.py13
-rw-r--r--src/pikepdf/codec.py48
-rw-r--r--src/pikepdf/models/__init__.py17
-rw-r--r--src/pikepdf/models/encryption.py154
-rw-r--r--src/pikepdf/models/image.py247
-rw-r--r--src/pikepdf/models/matrix.py16
-rw-r--r--src/pikepdf/models/metadata.py86
-rw-r--r--src/pikepdf/objects.py49
-rw-r--r--src/qpdf/annotation.cpp52
-rw-r--r--src/qpdf/object.cpp102
-rw-r--r--src/qpdf/pikepdf.cpp98
-rw-r--r--src/qpdf/pikepdf.h30
-rw-r--r--src/qpdf/qpdf.cpp585
-rw-r--r--src/qpdf/qpdf_inputsource.h2
-rw-r--r--src/qpdf/qpdf_pagelist.cpp87
-rw-r--r--src/qpdf/qpdf_pipeline.h77
19 files changed, 1633 insertions, 411 deletions
diff --git a/src/pikepdf/__init__.py b/src/pikepdf/__init__.py
index 2e42605..8de467a 100644
--- a/src/pikepdf/__init__.py
+++ b/src/pikepdf/__init__.py
@@ -4,38 +4,46 @@
#
# Copyright (C) 2017, James R. Barlow (https://github.com/jbarlow83/)
-
-from pkg_resources import (
- get_distribution as _get_distribution,
- DistributionNotFound
-)
+"""A library for manipulating PDFs"""
try:
from . import _qpdf
except ImportError:
raise ImportError("pikepdf's extension library failed to import")
-from ._qpdf import (
- PdfError, Pdf, PasswordError, ObjectStreamMode, StreamDecodeLevel
-)
+from ._version import __version__
+from ._qpdf import PdfError, Pdf, PasswordError, ObjectStreamMode, StreamDecodeLevel
from .objects import (
- Object, ObjectType, Name, String, Array, Dictionary, Stream, Operator
+ Object,
+ ObjectType,
+ Name,
+ String,
+ Array,
+ Dictionary,
+ Stream,
+ Operator,
)
from .models import (
- PdfImage, PdfInlineImage, UnsupportedImageTypeError, PdfMatrix,
- parse_content_stream
+ PdfImage,
+ PdfInlineImage,
+ UnsupportedImageTypeError,
+ PdfMatrix,
+ Encryption,
+ Permissions,
+ parse_content_stream,
)
from . import _methods
-
-try:
- __version__ = _get_distribution(__name__).version
-except DistributionNotFound:
- __version__ = "Not installed"
+from . import codec
__libqpdf_version__ = _qpdf.qpdf_version()
def open(*args, **kwargs): # pylint: disable=redefined-builtin
- "Alias for :func:`pikepdf.Pdf.open`."
+ """Alias for :func:`pikepdf.Pdf.open`. Open a PDF."""
return Pdf.open(*args, **kwargs)
+
+
+def new(*args, **kwargs):
+ """Alias for :func:`pikepdf.Pdf.new`. Create a new empty PDF."""
+ return Pdf.new(*args, **kwargs)
diff --git a/src/pikepdf/_cpphelpers.py b/src/pikepdf/_cpphelpers.py
index d975657..7ef0654 100644
--- a/src/pikepdf/_cpphelpers.py
+++ b/src/pikepdf/_cpphelpers.py
@@ -12,12 +12,13 @@ called from Python, and subject to change at any time.
import os
import sys
-
# Provide os.fspath equivalent for Python <3.6
if sys.version_info[0:2] <= (3, 5): # pragma: no cover
+
def fspath(path):
'''https://www.python.org/dev/peps/pep-0519/#os'''
import pathlib
+
if isinstance(path, (str, bytes)):
return path
@@ -36,12 +37,23 @@ if sys.version_info[0:2] <= (3, 5): # pragma: no cover
if isinstance(path, (str, bytes)):
return path
else:
- raise TypeError("expected __fspath__() to return str or bytes, "
- "not " + type(path).__name__)
+ raise TypeError(
+ "expected __fspath__() to return str or bytes, "
+ "not " + type(path).__name__
+ )
raise TypeError(
"expected str, bytes, pathlib.Path or os.PathLike object, not "
- + path_type.__name__)
+ + path_type.__name__
+ )
+
else:
fspath = os.fspath
+
+
+def update_xmp_pdfversion(pdf, version):
+
+ with pdf.open_metadata(set_pikepdf_as_editor=False, update_docinfo=False) as meta:
+ if 'pdf:PDFVersion' in meta:
+ meta['pdf:PDFVersion'] = version
diff --git a/src/pikepdf/_methods.py b/src/pikepdf/_methods.py
index 6c2b90b..ac6134c 100644
--- a/src/pikepdf/_methods.py
+++ b/src/pikepdf/_methods.py
@@ -12,22 +12,21 @@ bindings after the fact.
We can also move the implementation to C++ if desired.
"""
-from tempfile import NamedTemporaryFile
-from subprocess import run, PIPE
-from io import BytesIO
-
-from collections.abc import KeysView
-
import inspect
+from collections import namedtuple
+from collections.abc import KeysView
+from io import BytesIO
+from subprocess import PIPE, run
+from tempfile import NamedTemporaryFile
-from . import Pdf, Dictionary, Array, Name, Stream, Object
+from . import Array, Dictionary, Name, Object, Pdf, Stream
from ._qpdf import _ObjectMapping
-from .models import PdfMetadata
-
+from .models import PdfMetadata, Permissions, EncryptionInfo
# pylint: disable=no-member,unsupported-membership-test,unsubscriptable-object
-def extends(cls_cpp):
+
+def augments(cls_cpp):
"""Attach methods of a Python support class to an existing class
This monkeypatches all methods defined in the support class onto an
@@ -35,37 +34,44 @@ def extends(cls_cpp):
.. code-block:: python
- @extends(ClassDefinedInCpp)
+ @augments(ClassDefinedInCpp)
class SupportClass:
def foo(self):
pass
- The method 'foo' will be monkeypatched on ClassDefinedInCpp. SupportClass
+ The Python method 'foo' will be monkeypatched on ClassDefinedInCpp. SupportClass
has no meaning on its own and should not be used, but gets returned from
this function so IDE code inspection doesn't get too confused.
We don't subclass because it's much more convenient to monkeypatch Python
methods onto the existing Python binding of the C++ class. For one thing,
this allows the implementation to be moved from Python to C++ or vice
- versa. It saves having to implement an intermediate subclass and then
- ensures that the superclass never 'leaks' to pikepdf users.
+ versa. It saves having to implement an intermediate Python subclass and then
+ ensures that the C++ superclass never 'leaks' to pikepdf users. Finally,
+ wrapper classes and subclasses can become problematic if the call stack
+ crosses the C++/Python boundary multiple times.
Any existing methods may be used, regardless of whether they defined
elsewhere in the support class or in the target class.
+
+ The target class does not have to be C++ or derived from pybind11.
"""
- def real_class_extend(cls, cls_cpp=cls_cpp):
+ def class_augment(cls, cls_cpp=cls_cpp):
for name, fn in inspect.getmembers(cls, inspect.isfunction):
- fn.__qualname__ = fn.__qualname__.replace(
- cls.__name__, cls_cpp.__name__)
+ fn.__qualname__ = fn.__qualname__.replace(cls.__name__, cls_cpp.__name__)
setattr(cls_cpp, name, fn)
for name, fn in inspect.getmembers(cls, inspect.isdatadescriptor):
setattr(cls_cpp, name, fn)
+
def block_init(self):
+ # Prevent initialization of the support class
raise NotImplementedError(self.__class__.__name__ + '.__init__')
+
cls.__init__ = block_init
return cls
- return real_class_extend
+
+ return class_augment
def _single_page_pdf(page):
@@ -86,17 +92,15 @@ def _mudraw(buffer, fmt):
tmp_in.flush()
proc = run(
- ['mudraw', '-F', fmt, '-o', '-', tmp_in.name],
- stdout=PIPE, stderr=PIPE
+ ['mudraw', '-F', fmt, '-o', '-', tmp_in.name], stdout=PIPE, stderr=PIPE
)
if proc.stderr:
raise RuntimeError(proc.stderr.decode())
return proc.stdout
-@extends(Object)
+@augments(Object)
class Extend_Object:
-
def _repr_mimebundle_(self, **kwargs):
"""Present options to IPython for rich display of this object
@@ -127,13 +131,116 @@ class Extend_Object:
pass
return data
+ def emplace(self, other):
+ """Copy all items from other without making a new object.
-@extends(Pdf)
-class Extend_Pdf:
+ Particularly when working with pages, it may be desirable to remove all
+ of the existing page's contents and emplace (insert) a new page on top
+ of it, in a way that preserves all links and references to the original
+ page. (Or similarly, for other Dictionary objects in a PDF.)
- def _repr_mimebundle_(self, **kwargs):
+ When a page is assigned (``pdf.pages[0] = new_page``), only the
+ application knows if references to the original page are
+ still valid. For example, a PDF optimizer might restructure a page
+ object into another visually similar one, and references would be valid;
+ but for a program that reorganizes page contents such as an N-up
+ compositor, references may not be valid anymore.
+
+ This method takes precautions to ensure that child objects in common
+ with ``self`` and ``other`` are not inadvertently deleted.
+
+ Example:
+ >>> pdf.pages[0].objgen
+ (16, 0)
+ >>> pdf.pages[0].emplace(pdf.pages[1])
+ >>> pdf.pages[0].objgen
+ (16, 0) # Same object
+ """
+ del_keys = set(self.keys()) - set(other.keys())
+ for k in other.keys():
+ self[k] = other[k] # pylint: disable=unsupported-assignment-operation
+ for k in del_keys:
+ del self[k] # pylint: disable=unsupported-delete-operation
+
+ def write(self, data, *, filter=None, decode_parms=None, type_check=True):
+ """
+ Replace stream object's data with new (possibly compressed) `data`.
+
+ `filter` and `decode_parms` specify the compression that is present on
+ the input `data`.
+
+ When writing the PDF in :meth:`pikepdf.Pdf.save`,
+ pikepdf may change the compression or apply compression to data that was
+ not compressed, depending on the parameters given to that function. It
+ will never change lossless to lossy encoding.
+
+ PNG and TIFF images, even if compressed, cannot be directly inserted
+ into a PDF and displayed as images.
+
+ Args:
+ data (bytes): the new data to use for replacement
+ filter (pikepdf.Name or pikepdf.Array): The filter(s) with which the
+ data is (already) encoded
+ decode_parms (pikepdf.Dictionary or pikepdf.Array): Parameters for the
+ filters with which the object is encoded
+ type_check (bool): Check arguments; use False only if you want to
+ intentionally create malformed PDFs.
+
+ If only one `filter` is specified, it may be a name such as
+ `Name('/FlateDecode')`. If there are multiple filters, then an array
+ of names should be given.
+
+ If there is only one filter, `decode_parms` is a Dictionary of
+ parameters for that filter. If there are multiple filters, then
+ `decode_parms` is an Array of Dictionary, where each array index
+ corresponds to the filter.
"""
- Present options to IPython for rich display of this object
+
+ if type_check and filter is not None:
+ if isinstance(filter, list):
+ filter = Array(filter)
+ filter = filter.wrap_in_array()
+
+ if isinstance(decode_parms, list):
+ decode_parms = Array(decode_parms)
+ elif decode_parms is None:
+ decode_parms = Array([])
+ else:
+ decode_parms = decode_parms.wrap_in_array()
+
+ if not all(isinstance(item, Name) for item in filter):
+ raise TypeError(
+ "filter must be: pikepdf.Name or pikepdf.Array([pikepdf.Name])"
+ )
+ if not all(
+ (isinstance(item, Dictionary) or item is None) for item in decode_parms
+ ):
+ raise TypeError(
+ "decode_parms must be: pikepdf.Dictionary or "
+ "pikepdf.Array([pikepdf.Dictionary])"
+ )
+ if len(decode_parms) != 0:
+ if len(filter) != len(decode_parms):
+ raise ValueError(
+ (
+ "filter ({}) and decode_parms ({}) must be arrays of "
+ " same length"
+ ).format(repr(filter), repr(decode_parms))
+ )
+ if len(filter) == 1:
+ filter = filter[0]
+ if len(decode_parms) == 0:
+ decode_parms = None
+ elif len(decode_parms) == 1:
+ decode_parms = decode_parms[0]
+ self._write(data, filter=filter, decode_parms=decode_parms)
+
+
+@augments(Pdf)
+class Extend_Pdf:
+ def _repr_mimebundle_(self, **_kwargs):
+ """
+ Present options to IPython or Jupyter for rich display of this object
See https://ipython.readthedocs.io/en/stable/config/integrating.html#rich-display
"""
@@ -145,16 +252,12 @@ class Extend_Pdf:
data = {'application/pdf': bio.read()}
return data
- def open_metadata(
- self,
- set_pikepdf_as_editor=True,
- update_docinfo=True
- ):
+ def open_metadata(self, set_pikepdf_as_editor=True, update_docinfo=True):
"""
Open the PDF's XMP metadata for editing
Recommend for use in a ``with`` block. Changes are committed to the
- PDF when the block exits.
+ PDF when the block exits. (The ``Pdf`` must still be open.)
Example:
>>> with pdf.open_metadata() as meta:
@@ -173,11 +276,128 @@ class Extend_Pdf:
pikepdf.models.PdfMetadata
"""
return PdfMetadata(
- self,
- pikepdf_mark=set_pikepdf_as_editor,
- sync_docinfo=update_docinfo
+ self, pikepdf_mark=set_pikepdf_as_editor, sync_docinfo=update_docinfo
+ )
+
+ def make_stream(self, data):
+ """
+ Create a new pikepdf.Stream object that is attached to this PDF.
+
+ Args:
+ data (bytes): Binary data for the stream object
+ """
+ return Stream(self, data)
+
+ def add_blank_page(self, *, page_size=(612, 792)):
+ """
+ Add a blank page to this PDF. If pages already exist, the page will be added to
+ the end. Pages may be reordered using ``Pdf.pages``.
+
+ The caller may add content to the page by modifying its objects after creating
+ it.
+
+ Args:
+ page_size (tuple): The size of the page in PDF units (1/72 inch or 0.35mm).
+ Default size is set to a US Letter 8.5" x 11" page.
+ """
+ for dim in page_size:
+ if not (3 <= dim <= 14400):
+ raise ValueError('Page size must be between 3 and 14400 PDF units')
+
+ page_dict = Dictionary(
+ Type=Name.Page,
+ MediaBox=Array([0, 0, page_size[0], page_size[1]]),
+ Contents=self.make_stream(b''),
+ Resources=Dictionary(),
+ )
+ page = self.make_indirect(page_dict)
+ self._add_page(page, first=False)
+ return page
+
+ def close(self):
+ """
+ Close a Pdf object and release resources acquired by pikepdf
+
+ If pikepdf opened the file handle it will close it (e.g. when opened with a file
+ path). If the caller opened the file for pikepdf, the caller must close the file.
+
+ pikepdf lazily loads data from PDFs, so some :class:`pikepdf.Object` may
+ implicitly depend on the :class:`pikepdf.Pdf` being open. This is always the
+ case for :class:`pikepdf.Stream` but can be true for any object. Do not close
+ the `Pdf` object if you might still be accessing content from it.
+
+ When an ``Object`` is copied from one ``Pdf`` to another, the ``Object`` is copied into
+ the destination ``Pdf`` immediately, so after accessing all desired information
+ from the source ``Pdf`` it may be closed.
+
+ Caution:
+ Closing the ``Pdf`` is currently implemented by resetting it to an empty
+ sentinel. It is currently possible to edit the sentinel as if it were a live
+ object. This behavior should not be relied on and is subject to change.
+
+ """
+
+ EMPTY_PDF = (
+ b"%PDF-1.3\n"
+ b"1 0 obj\n"
+ b"<< /Type /Catalog /Pages 2 0 R >>\n"
+ b"endobj\n"
+ b"2 0 obj\n"
+ b"<< /Type /Pages /Kids [] /Count 0 >>\n"
+ b"endobj\n"
+ b"xref\n"
+ b"0 3\n"
+ b"0000000000 65535 f \n"
+ b"0000000009 00000 n \n"
+ b"0000000058 00000 n \n"
+ b"trailer << /Size 3 /Root 1 0 R >>\n"
+ b"startxref\n"
+ b"110\n"
+ b"%%EOF\n"
)
+ if self.filename:
+ description = "closed file: " + self.filename
+ else:
+ description = "closed object"
+ self._process(description, EMPTY_PDF)
+
+ def __enter__(self):
+ return self
+
+ def __exit__(self, exc_type, exc_value, traceback):
+ self.close()
+
+ @property
+ def allow(self):
+ """
+ Report permissions associated with this PDF.
+
+ By default these permissions will be replicated when the PDF is
+ saved. Permissions may also only be changed when a PDF is being saved,
+ and are only available for encrypted PDFs. If a PDF is not encrypted,
+ all operations are reported as allowed.
+
+ pikepdf has no way of enforcing permissions.
+
+ Returns: pikepdf.models.Permissions
+ """
+ results = {}
+ for field in Permissions.fields():
+ results[field] = getattr(self, '_allow_' + field)
+ return Permissions(**results)
+
+ @property
+ def encryption(self):
+ """
+ Report encryption information for this PDF.
+
+ Encryption settings may only be changed when a PDF is saved.
+
+ Returns: pikepdf.models.EncryptionInfo
+ """
+ return EncryptionInfo(self._encryption_data)
+
def _attach(self, *, basename, filebytes, mime=None, desc=''):
"""
Attach a file to this PDF
@@ -219,6 +439,7 @@ class Extend_Pdf:
if not mime:
from mimetypes import guess_type
+
mime, _encoding = guess_type(basename)
if not mime:
mime = 'application/octet-stream'
@@ -226,28 +447,28 @@ class Extend_Pdf:
filestream = Stream(self, filebytes)
filestream.Subtype = Name('/' + mime)
- filespec = Dictionary({
- '/Type': Name.Filespec,
- '/F': basename,
- '/UF': basename,
- '/Desc': desc,
- '/EF': Dictionary({
- '/F': filestream
- })
- })
+ filespec = Dictionary(
+ {
+ '/Type': Name.Filespec,
+ '/F': basename,
+ '/UF': basename,
+ '/Desc': desc,
+ '/EF': Dictionary({'/F': filestream}),
+ }
+ )
# names = self.Root.Names.EmbeddedFiles.Names.as_list()
# names.append(filename) # Key
# names.append(self.make_indirect(filespec))
- self.Root.Names.EmbeddedFiles.Names = Array([
- basename, # key
- self.make_indirect(filespec)
- ])
+ self.Root.Names.EmbeddedFiles.Names = Array(
+ [basename, self.make_indirect(filespec)] # key
+ )
if '/PageMode' not in self.Root:
self.Root.PageMode = Name.UseAttachments
-@extends(_ObjectMapping)
+
+@augments(_ObjectMapping)
class Extend_ObjectMapping:
def __contains__(self, key):
try:
diff --git a/src/pikepdf/_version.py b/src/pikepdf/_version.py
new file mode 100644
index 0000000..c9d4b7b
--- /dev/null
+++ b/src/pikepdf/_version.py
@@ -0,0 +1,13 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+# Copyright (C) 2017, James R. Barlow (https://github.com/jbarlow83/)
+
+from pkg_resources import DistributionNotFound
+from pkg_resources import get_distribution as _get_distribution
+
+try:
+ __version__ = _get_distribution(__package__).version
+except DistributionNotFound:
+ __version__ = "Not installed"
diff --git a/src/pikepdf/codec.py b/src/pikepdf/codec.py
new file mode 100644
index 0000000..d008fb2
--- /dev/null
+++ b/src/pikepdf/codec.py
@@ -0,0 +1,48 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+# Copyright (C) 2017, James R. Barlow (https://github.com/jbarlow83/)
+
+import codecs
+
+from ._qpdf import utf8_to_pdf_doc, pdf_doc_to_utf8
+
+
+def pdfdoc_encode(input, errors='strict'):
+ error_marker = b'?' if errors == 'replace' else b'\xad'
+ success, pdfdoc = utf8_to_pdf_doc(input, error_marker)
+ if not success:
+ if errors == 'strict':
+ raise ValueError("'pdfdoc' codec can't encode")
+ if errors == 'ignore':
+ pdfdoc = pdfdoc.replace(b'\xad', b'')
+ return pdfdoc, len(input)
+
+
+def pdfdoc_decode(input, errors='strict'):
+ if isinstance(input, memoryview):
+ input = input.tobytes()
+ utf8 = pdf_doc_to_utf8(input)
+ return utf8, len(input)
+
+
+class PdfDocCodec(codecs.Codec):
+ """Implements PdfDocEncoding character map used inside PDFs"""
+
+ def encode(self, input, errors='strict'):
+ return pdfdoc_encode(input, errors)
+
+ def decode(self, input, errors='strict'):
+ return pdfdoc_decode(input, errors)
+
+
+def find_pdfdoc(encoding):
+ if encoding == 'pdfdoc':
+ return codecs.CodecInfo(
+ name='pdfdoc', encode=PdfDocCodec().encode, decode=PdfDocCodec().decode
+ )
+ return None
+
+
+codecs.register(find_pdfdoc)
diff --git a/src/pikepdf/models/__init__.py b/src/pikepdf/models/__init__.py
index b0d27bc..023b836 100644
--- a/src/pikepdf/models/__init__.py
+++ b/src/pikepdf/models/__init__.py
@@ -4,10 +4,11 @@
#
# Copyright (C) 2017, James R. Barlow (https://github.com/jbarlow83/)
-from .. import Object, ObjectType, PdfError
-from .matrix import PdfMatrix
+from pikepdf import Object, ObjectType, PdfError
+from .encryption import Permissions, Encryption, EncryptionInfo
from .image import PdfImage, PdfInlineImage, UnsupportedImageTypeError
+from .matrix import PdfMatrix
from .metadata import PdfMetadata
@@ -50,8 +51,10 @@ def parse_content_stream(page_or_stream, operators=''):
if not isinstance(page_or_stream, Object):
raise TypeError("stream must a PDF object")
- if page_or_stream._type_code != ObjectType.stream \
- and page_or_stream.get('/Type') != '/Page':
+ if (
+ page_or_stream._type_code != ObjectType.stream
+ and page_or_stream.get('/Type') != '/Page'
+ ):
raise TypeError("parse_content_stream called on page or stream object")
try:
@@ -87,8 +90,7 @@ class _Page:
raise AttributeError(item)
def __repr__(self):
- return repr(self.obj).replace(
- 'pikepdf.Dictionary', 'pikepdf.Page', 1)
+ return repr(self.obj).replace('pikepdf.Dictionary', 'pikepdf.Page', 1)
@property
def mediabox(self):
@@ -107,8 +109,7 @@ class _Page:
:return: True if there is text
"""
text_showing_operators = """TJ " ' Tj"""
- text_showing_insts = parse_content_stream(
- self.obj, text_showing_operators)
+ text_showing_insts = parse_content_stream(self.obj, text_showing_operators)
if len(text_showing_insts) > 0:
return True
return False
diff --git a/src/pikepdf/models/encryption.py b/src/pikepdf/models/encryption.py
new file mode 100644
index 0000000..c61df71
--- /dev/null
+++ b/src/pikepdf/models/encryption.py
@@ -0,0 +1,154 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+# Copyright (C) 2017, James R. Barlow (https://github.com/jbarlow83/)
+
+import collections
+import types
+
+
+class Permissions(types.SimpleNamespace):
+ """
+ Stores the permissions for an encrypted PDF.
+
+ Unencrypted PDFs implicitly have all permissions allowed.
+ pikepdf does not enforce the restrictions in any way. Permissions
+ can only be changed when a PDF is saved.
+ """
+
+ def __init__(
+ self,
+ accessibility=True,
+ extract=True,
+ modify_annotation=True,
+ modify_assembly=False,
+ modify_form=True,
+ modify_other=True,
+ print_lowres=True,
+ print_highres=True,
+ ):
+ kvs = locals()
+ del kvs['self']
+ super().__init__(**kvs)
+
+ def _readonly(self, *args):
+ raise TypeError("object is read-only")
+
+ __setattr__ = _readonly
+
+ __delattr__ = _readonly
+
+ def keys(self):
+ yield from (k for k in self.__dict__ if not k.startswith('_'))
+
+ def values(self):
+ yield from (v for k, v in self.__dict__.items() if not k.startswith('_'))
+
+ @classmethod
+ def fields(cls):
+ yield from (k for k in cls().__dict__ if not k.startswith('_'))
+
+
+class EncryptionInfo:
+ """
+ Reports encryption information for an encrypted PDF.
+
+ This information may not be changed, except when a PDF is saved.
+ This object is not used to specify the encryption settings to save
+ a PDF, due to non-overlapping information requirements.
+ """
+
+ def __init__(self, encdict):
+ self._encdict = encdict
+
+ @property
+ def R(self):
+ """Revision number of the security handler."""
+ return self._encdict['R']
+
+ @property
+ def V(self):
+ """Version of PDF password algorithm."""
+ return self._encdict['V']
+
+ @property
+ def P(self):
+ """Encoded permission bits.
+
+ See :meth:`Pdf.allow` instead.
+ """
+ return self._encdict['P']
+
+ @property
+ def stream_method(self):
+ """Encryption method used to encode streams."""
+ return self._encdict['stream']
+
+ @property
+ def string_method(self):
+ """Encryption method used to encode strings."""
+ return self._encdict['string']
+
+ @property
+ def file_method(self):
+ """Encryption method used to encode the whole file."""
+ return self._encdict['file']
+
+ @property
+ def user_password(self):
+ """If possible, return the user password.
+
+ The user password can only be retrieved when a PDF is opened
+ with the owner password and when older versions of the
+ encryption algorithm are used.
+
+ The password is always returned as ``bytes`` even if it has
+ a clear Unicode representation.
+ """
+ return self._encdict['user_passwd']
+
+ @property
+ def encryption_key(self):
+ """The RC4 or AES encryption key used for this file."""
+ return self._encdict['encryption_key']
+
+ @property
+ def bits(self):
+ """The number of encryption bits."""
+ return len(self._encdict['encryption_key']) * 8
+
+
+class Encryption(dict):
+ """
+ Specify the encryption settings to apply when a PDF is saved.
+
+ Args:
+ owner (str): The owner password to use. This allows full control
+ of the file. If blank, the PDF will be encrypted and
+ present as "(SECURED)" in PDF viewers. If the owner password
+ is blank, the user password should be as well.
+ user (str): The user password to use. With this password, some
+ restrictions will be imposed by a typical PDF reader.
+ If blank, the PDF can be opened by anyone, but only modified
+ as allowed by the permissions in ``allow``.
+ R (int): Select the security handler algorithm to use. Choose from:
+ ``2``, ``3``, ``4`` or ``6``. By default, the highest version
+ is selected (``6``). ``5`` is a deprecated algorithm that should
+ not be used.
+ allow (pikepdf.Permissions): The permissions to set.
+ If omitted, all permissions are granted to the user.
+ aes (bool): If True, request the AES algorithm. If False, use RC4.
+ If omitted, AES is selected whenever possible (R >= 4).
+ metadata (bool): If True, also encrypt the PDF metadata. If False,
+ metadata is not encrypted. Reading document metadata without
+ decryption may be desirable in some cases. Requires ``aes=True``.
+ If omitted, metadata is encrypted whenever possible.
+ """
+
+ def __init__(
+ self, *, owner, user, R=6, allow=Permissions(), aes=True, metadata=True
+ ):
+ self.update(
+ dict(R=R, owner=owner, user=user, allow=allow, aes=aes, metadata=metadata)
+ )
diff --git a/src/pikepdf/models/image.py b/src/pikepdf/models/image.py
index 8ecb571..6493d85 100644
--- a/src/pikepdf/models/image.py
+++ b/src/pikepdf/models/image.py
@@ -4,20 +4,23 @@
#
# Copyright (C) 2017, James R. Barlow (https://github.com/jbarlow83/)
+import struct
+from abc import ABC, abstractmethod
+from decimal import Decimal
from io import BytesIO
from itertools import zip_longest
-from abc import ABC, abstractmethod
-import struct
+from pathlib import Path
+from shutil import copyfileobj
+from tempfile import NamedTemporaryFile
+from zlib import decompress, error as ZlibError
-from decimal import Decimal
+from .. import Array, Dictionary, Name, Object, PdfError, Stream
-from .. import (
- Object, Array, PdfError, Name, Dictionary, Stream
-)
class DependencyError(Exception):
pass
+
class UnsupportedImageTypeError(Exception):
pass
@@ -37,7 +40,11 @@ def array_str_colorspace(value):
result = [str(items[n]) for n in range(3)]
result.append(bytes(items[3]))
return result
+ if len(items) == 2 and items[0] == '/ICCBased':
+ result = [str(items[0]), items[1]]
+ return result
return array_str(items)
+
return array_str(value)
@@ -112,16 +119,16 @@ class PdfImageBase(ABC):
if self._colorspaces:
if self._colorspaces[0] in self.SIMPLE_COLORSPACES:
return self._colorspaces[0]
- if self._colorspaces[0] == '/DeviceCMYK':
+ if self._colorspaces[0] in ('/DeviceCMYK', '/ICCBased'):
return self._colorspaces[0]
- if self._colorspaces[0] == '/Indexed' \
- and self._colorspaces[1] in self.SIMPLE_COLORSPACES:
+ if (
+ self._colorspaces[0] == '/Indexed'
+ and self._colorspaces[1] in self.SIMPLE_COLORSPACES
+ ):
return self._colorspaces[1]
- if self._colorspaces[0] == '/ICCBased':
- icc = self._colorspaces[1]
- return icc.stream_dict.get('/Alternate', '')
raise NotImplementedError(
- "not sure how to get colorspace: " + repr(self._colorspaces))
+ "not sure how to get colorspace: " + repr(self._colorspaces)
+ )
@property
def bits_per_component(self):
@@ -136,6 +143,11 @@ class PdfImageBase(ABC):
pass
@property
+ @abstractmethod
+ def icc(self):
+ pass
+
+ @property
def indexed(self):
"""``True`` if the image has a defined color palette"""
return '/Indexed' in self._colorspaces
@@ -147,7 +159,12 @@ class PdfImageBase(ABC):
@property
def mode(self):
- """``PIL.Image.mode`` equivalent for this image"""
+ """``PIL.Image.mode`` equivalent for this image, where possible
+
+ If an ICC profile is attached to the image, we still attempt to resolve a Pillow
+ mode.
+ """
+
m = ''
if self.indexed:
m = 'P'
@@ -160,6 +177,18 @@ class PdfImageBase(ABC):
m = 'L'
elif self.colorspace == '/DeviceCMYK':
m = 'CMYK'
+ elif self.colorspace == '/ICCBased':
+ try:
+ icc_profile = self._colorspaces[1]
+ icc_profile_nchannels = int(icc_profile['/N'])
+ if icc_profile_nchannels == 1:
+ m = 'L'
+ elif icc_profile_nchannels == 3:
+ m = 'RGB'
+ elif icc_profile_nchannels == 4:
+ m = 'CMYK'
+ except (ValueError, TypeError):
+ pass
if m == '':
raise NotImplementedError("Not sure how to handle PDF image of this type")
return m
@@ -175,7 +204,6 @@ class PdfImageBase(ABC):
[(/FilterName, {/DecodeParmName: Value, ...}), ...]
The order of /Filter matters as indicates the encoding/decoding sequence.
-
"""
return list(zip_longest(self.filters, self.decode_parms, fillvalue={}))
@@ -183,8 +211,8 @@ class PdfImageBase(ABC):
def palette(self):
"""Retrieves the color palette for this image
- :returns: (base_colorspace: str, palette: bytes)
- :rtype: tuple
+ Returns:
+ tuple (base_colorspace: str, palette: bytes)
"""
if not self.indexed:
@@ -209,6 +237,29 @@ class PdfImageBase(ABC):
def as_pil_image(self):
pass
+ @staticmethod
+ def _unstack_compression(buffer, filters):
+ """Remove stacked compression where it appears.
+
+ Stacked compression means when an image is set to:
+ ``[/FlateDecode /DCTDecode]``
+ for example.
+
+ Only Flate can be stripped off the front currently.
+
+ Args:
+ buffer (pikepdf._qpdf.Buffer): the compressed image data
+ filters (list of str): all filters on the data
+ """
+ data = memoryview(buffer)
+ while len(filters) > 1 and filters[0] == '/FlateDecode':
+ try:
+ data = decompress(data)
+ except ZlibError as e:
+ raise UnsupportedImageTypeError() from e
+ filters = filters[1:]
+ return data, filters
+
class PdfImage(PdfImageBase):
"""Support class to provide a consistent API for manipulating PDF images
@@ -237,21 +288,20 @@ class PdfImage(PdfImageBase):
obj (pikepdf.Object): an Image XObject
"""
- if isinstance(obj, Stream) and \
- obj.stream_dict.get("/Subtype") != "/Image":
+ if isinstance(obj, Stream) and obj.stream_dict.get("/Subtype") != "/Image":
raise TypeError("can't construct PdfImage from non-image")
self.obj = obj
+ self._icc = None
@classmethod
def _from_pil_image(cls, *, pdf, page, name, image): # pragma: no cover
"""Insert a PIL image into a PDF (rudimentary)
- :param pdf: the PDF to attach the image to
- :type pdf: pikepdf.Pdf
- :param page: the page to attach the image to
- :param name: the name to set the image
- :param image: image
- :type image: PIL.Image.Image
+ Args:
+ pdf (pikepdf.Pdf): the PDF to attach the image to
+ page (pikepdf.Object): the page to attach the image to
+ name (str or pikepdf.Name): the name to set the image
+ image (PIL.Image.Image): the image to insert
"""
data = image.tobytes()
@@ -279,6 +329,26 @@ class PdfImage(PdfImageBase):
"""``False`` for image XObject"""
return False
+ @property
+ def icc(self):
+ """If an ICC profile is attached, return a Pillow object that describes it.
+
+ Most of the information may be found in ``icc.profile``.
+
+ Returns:
+ PIL.ImageCms.ImageCmsProfile
+ """
+ from PIL import ImageCms
+
+ if self.colorspace != '/ICCBased':
+ return None
+ if not self._icc:
+ iccstream = self._colorspaces[1]
+ iccbuffer = iccstream.get_stream_buffer()
+ iccbytesio = BytesIO(iccbuffer)
+ self._icc = ImageCms.ImageCmsProfile(iccbytesio)
+ return self._icc
+
def _extract_direct(self, *, stream):
"""Attempt to extract the image directly to a usable image file
@@ -296,30 +366,38 @@ class PdfImage(PdfImageBase):
# saved as a standard JPEG. RGB JPEGs without YUV conversion can't
# be saved as JPEGs, and are probably bugs. Some software in the
# wild actually produces RGB JPEGs in PDFs (probably a bug).
- return (self.mode == 'RGB' and
- self.filter_decodeparms[0][1].get('/ColorTransform', 1))
+ DEFAULT_CT_RGB = 1
+ ct = self.filter_decodeparms[0][1].get('/ColorTransform', DEFAULT_CT_RGB)
+ return self.mode == 'RGB' and ct == DEFAULT_CT_RGB
def normal_dct_cmyk():
# Normal DCTDecode CMYKs have /ColorTransform 0 and can be saved.
# There is a YUVK colorspace but CMYK JPEGs don't generally use it
- return (self.mode == 'CMYK' and
- self.filter_decodeparms[0][1].get('/ColorTransform', 0))
+ DEFAULT_CT_CMYK = 0
+ ct = self.filter_decodeparms[0][1].get('/ColorTransform', DEFAULT_CT_CMYK)
+ return self.mode == 'CMYK' and ct == DEFAULT_CT_CMYK
+
+ data, filters = self._unstack_compression(
+ self.obj.get_raw_stream_buffer(), self.filters
+ )
- if self.filters == ['/CCITTFaxDecode']:
- data = self.obj.read_raw_bytes()
+ if filters == ['/CCITTFaxDecode']:
+ if self.colorspace == '/ICCBased':
+ raise UnsupportedImageTypeError("Cannot direct-extract CCITT + ICC")
stream.write(self._generate_ccitt_header(data))
stream.write(data)
return '.tif'
- elif self.filters == ['/DCTDecode'] and (
- self.mode == 'L' or normal_dct_rgb() or normal_dct_cmyk):
- buffer = self.obj.get_raw_stream_buffer()
- stream.write(buffer)
+ elif filters == ['/DCTDecode'] and (
+ self.mode == 'L' or normal_dct_rgb() or normal_dct_cmyk()
+ ):
+ stream.write(data)
return '.jpg'
raise UnsupportedImageTypeError()
def _extract_transcoded(self):
from PIL import Image
+
im = None
if self.mode == 'RGB' and self.bits_per_component == 8:
# No point in accessing the buffer here, size qpdf decodes to 3-byte
@@ -330,8 +408,7 @@ class PdfImage(PdfImageBase):
buffer = self.get_stream_buffer()
stride = 0 # tell Pillow to calculate stride from line width
ystep = 1 # image is top to bottom in memory
- im = Image.frombuffer('L', self.size, buffer, "raw", 'L', stride,
- ystep)
+ im = Image.frombuffer('L', self.size, buffer, "raw", 'L', stride, ystep)
if self.mode == 'P':
base_mode, palette = self.palette
if base_mode in ('RGB', 'L'):
@@ -347,14 +424,15 @@ class PdfImage(PdfImageBase):
im = Image.frombytes('1', self.size, data)
base_mode, palette = self.palette
- if not (palette == b'\x00\x00\x00\xff\xff\xff'
- or palette == b'\x00\xff'):
- raise NotImplementedError(
- 'monochrome image with nontrivial palette')
+ if not (palette == b'\x00\x00\x00\xff\xff\xff' or palette == b'\x00\xff'):
+ raise NotImplementedError('monochrome image with nontrivial palette')
+
+ if self.colorspace == '/ICCBased':
+ im.info['icc_profile'] = self.icc.tobytes()
return im
- def extract_to(self, *, stream):
+ def _extract_to_stream(self, *, stream):
"""Attempt to extract the image directly to a usable image file
If possible, the compressed data is extracted and inserted into
@@ -386,6 +464,51 @@ class PdfImage(PdfImageBase):
raise UnsupportedImageTypeError(repr(self))
+ def extract_to(self, *, stream=None, fileprefix=''):
+ """Attempt to extract the image directly to a usable image file
+
+ If possible, the compressed data is extracted and inserted into
+ a compressed image file format without transcoding the compressed
+ content. If this is not possible, the data will be decompressed
+ and extracted to an appropriate format.
+
+ Because it is not known until attempted what image format will be
+ extracted, users should not assume what format they are getting back.
+ When saving the image to a file, use a temporary filename, and then
+ rename the file to its final name based on the returned file extension.
+
+ Examples:
+
+ >>> im.extract_to(stream=bytes_io)
+ '.png'
+
+ >>> im.extract_to(fileprefix='/tmp/image00')
+ '/tmp/image00.jpg'
+
+ Args:
+ stream: Writable stream to write data to.
+ fileprefix (str or Path): The path to write the extracted image to,
+ without the file extension.
+
+ Returns:
+ str: If *fileprefix* was provided, then the fileprefix with the
+ appropriate extension. If no *fileprefix*, then an extension
+ indicating the file type.
+ """
+
+ if bool(stream) == bool(fileprefix):
+ raise ValueError("Cannot set both stream and fileprefix")
+ if stream:
+ return self._extract_to_stream(stream=stream)
+
+ bio = BytesIO()
+ extension = self._extract_to_stream(stream=bio)
+ bio.seek(0)
+ filepath = Path(str(Path(fileprefix)) + extension)
+ with filepath.open('wb') as target:
+ copyfileobj(bio, target)
+ return str(filepath)
+
def read_bytes(self):
"""Decompress this image and return it as unencoded bytes"""
return self.obj.read_bytes()
@@ -433,6 +556,7 @@ class PdfImage(PdfImageBase):
img_size = len(data)
tiff_header_struct = '<' + '2s' + 'H' + 'L' + 'H' + 'HHLL' * 8 + 'L'
+ # fmt: off
tiff_header = struct.pack(
tiff_header_struct,
b'II', # Byte order indication: Little endian
@@ -449,6 +573,7 @@ class PdfImage(PdfImageBase):
279, 4, 1, img_size, # StripByteCounts, LONG, 1, size of image
0 # last IFD
)
+ # fmt: on
return tiff_header
def show(self):
@@ -457,7 +582,8 @@ class PdfImage(PdfImageBase):
def __repr__(self):
return '<pikepdf.PdfImage image mode={} size={}x{} at {}>'.format(
- self.mode, self.width, self.height, hex(id(self)))
+ self.mode, self.width, self.height, hex(id(self))
+ )
def _repr_png_(self):
"""Display hook for IPython/Jupyter"""
@@ -468,14 +594,17 @@ class PdfImage(PdfImageBase):
class PdfJpxImage(PdfImage):
-
def __init__(self, obj):
super().__init__(obj)
self.pil = self.as_pil_image()
def _extract_direct(self, *, stream):
- buffer = self.obj.get_raw_stream_buffer()
- stream.write(buffer)
+ data, filters = self._unstack_compression(
+ self.obj.get_raw_stream_buffer(), self.filters
+ )
+ if filters != ['/JPXDecode']:
+ raise UnsupportedImageTypeError(self.filters)
+ stream.write(data)
return '.jp2'
@property
@@ -508,7 +637,8 @@ class PdfJpxImage(PdfImage):
def __repr__(self):
return '<pikepdf.PdfJpxImage JPEG2000 image mode={} size={}x{} at {}>'.format(
- self.mode, self.width, self.height, hex(id(self)))
+ self.mode, self.width, self.height, hex(id(self))
+ )
class PdfInlineImage(PdfImageBase):
@@ -532,7 +662,7 @@ class PdfInlineImage(PdfImageBase):
b'/LZW': b'/LZWDecode',
b'/RL': b'/RunLengthDecode',
b'/CCF': b'/CCITTFaxDecode',
- b'/DCT': b'/DCTDecode'
+ b'/DCT': b'/DCTDecode',
}
def __init__(self, *, image_data, image_object: tuple):
@@ -554,8 +684,7 @@ class PdfInlineImage(PdfImageBase):
try:
reparsed_obj = Object.parse(b'<< ' + reparse + b' >>')
except PdfError as e:
- raise PdfError(
- "parsing inline " + reparse.decode('unicode_escape')) from e
+ raise PdfError("parsing inline " + reparse.decode('unicode_escape')) from e
self.obj = reparsed_obj
self.pil = None
@@ -575,7 +704,6 @@ class PdfInlineImage(PdfImageBase):
else:
raise NotImplementedError(repr(obj))
-
def _metadata(self, name, type_, default):
return metadata_from_obj(self.obj, name, type_, default)
@@ -597,6 +725,10 @@ class PdfInlineImage(PdfImageBase):
def is_inline(self):
return True
+ @property
+ def icc(self):
+ raise ValueError("Inline images may not have ICC profiles")
+
def __repr__(self):
mode = '?'
try:
@@ -604,23 +736,24 @@ class PdfInlineImage(PdfImageBase):
except Exception:
pass
return '<pikepdf.PdfInlineImage image mode={} size={}x{} at {}>'.format(
- mode, self.width, self.height, hex(id(self)))
+ mode, self.width, self.height, hex(id(self))
+ )
def as_pil_image(self):
- from PIL import Image
-
if self.pil:
return self.pil
raise NotImplementedError('not yet')
- def extract_to(self, *, stream): # pylint: disable=unused-argument
+ def extract_to(
+ self, *, stream=None, fileprefix=''
+ ): # pylint: disable=unused-argument
raise UnsupportedImageTypeError("inline images don't support extract")
def read_bytes(self):
raise NotImplementedError("qpdf returns compressed")
- #return self._data._inline_image_bytes()
+ # return self._data._inline_image_bytes()
def get_stream_buffer(self):
raise NotImplementedError("qpdf returns compressed")
- #return memoryview(self._data.inline_image_bytes())
+ # return memoryview(self._data.inline_image_bytes())
diff --git a/src/pikepdf/models/matrix.py b/src/pikepdf/models/matrix.py
index d68fae6..4c5c2fb 100644
--- a/src/pikepdf/models/matrix.py
+++ b/src/pikepdf/models/matrix.py
@@ -4,7 +4,8 @@
#
# Copyright (C) 2017, James R. Barlow (https://github.com/jbarlow83/)
-from math import cos, sin, pi
+from math import cos, pi, sin
+
class PdfMatrix:
"""
@@ -31,6 +32,7 @@ class PdfMatrix:
"""
def __init__(self, *args):
+ # fmt: off
if not args:
self.values = ((1, 0, 0), (0, 1, 0), (0, 0, 1))
elif len(args) == 6:
@@ -51,6 +53,7 @@ class PdfMatrix:
tuple(args[0][2]))
else:
raise ValueError('arguments')
+ # fmt: on
@staticmethod
def identity():
@@ -66,10 +69,13 @@ class PdfMatrix:
a = self.values
b = other.values
return PdfMatrix(
- [[sum([float(i) * float(j)
- for i, j in zip(row, col)]
- ) for col in zip(*b)]
- for row in a]
+ [
+ [
+ sum([float(i) * float(j) for i, j in zip(row, col)])
+ for col in zip(*b)
+ ]
+ for row in a
+ ]
)
def scaled(self, x, y):
diff --git a/src/pikepdf/models/metadata.py b/src/pikepdf/models/metadata.py
index 65934cd..a029b2b 100644
--- a/src/pikepdf/models/metadata.py
+++ b/src/pikepdf/models/metadata.py
@@ -4,24 +4,20 @@
#
# Copyright (C) 2018, James R. Barlow (https://github.com/jbarlow83/)
+import re
+import sys
from collections import namedtuple
from collections.abc import MutableMapping
from datetime import datetime
from functools import wraps
from io import BytesIO
-from pkg_resources import (
- get_distribution as _get_distribution,
- DistributionNotFound
-)
from warnings import warn
-import re
-import sys
from lxml import etree
-from lxml.etree import QName, XMLSyntaxError
-from defusedxml.lxml import parse
+from lxml.etree import parse, QName, XMLSyntaxError
-from .. import Stream, Name, String, PdfError
+from .. import Name, PdfError, Stream, String
+from .. import __version__ as pikepdf_version
XMP_NS_DC = "http://purl.org/dc/elements/1.1/"
XMP_NS_PDF = "http://ns.adobe.com/pdf/1.3/"
@@ -79,27 +75,21 @@ XMP_CONTAINERS = [
XmpContainer('Seq', list, list.append),
]
-LANG_ALTS = frozenset([
- str(QName(XMP_NS_DC, 'title')),
- str(QName(XMP_NS_DC, 'description')),
- str(QName(XMP_NS_DC, 'rights')),
- str(QName(XMP_NS_XMP_RIGHTS, 'UsageTerms')),
-])
+LANG_ALTS = frozenset(
+ [
+ str(QName(XMP_NS_DC, 'title')),
+ str(QName(XMP_NS_DC, 'description')),
+ str(QName(XMP_NS_DC, 'rights')),
+ str(QName(XMP_NS_XMP_RIGHTS, 'UsageTerms')),
+ ]
+)
# These are the illegal characters in XML 1.0. (XML 1.1 is a bit more permissive,
# but we'll be strict to ensure wider compatibility.)
re_xml_illegal_chars = re.compile(
r"(?u)[^\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD\u10000-\u10FFFF]"
)
-re_xml_illegal_bytes = re.compile(
- br"[^\x09\x0A\x0D\x20-\xFF]|&#0;"
-)
-
-# Repeat this to avoid circular from top package's pikepdf.__version__
-try:
- pikepdf_version = _get_distribution(__name__).version
-except DistributionNotFound:
- pikepdf_version = ""
+re_xml_illegal_bytes = re.compile(br"[^\x09\x0A\x0D\x20-\xFF]|&#0;")
def encode_pdf_date(d: datetime) -> str:
@@ -171,6 +161,7 @@ class AuthorConverter:
if sys.version_info < (3, 7):
+
def fromisoformat(datestr):
# strptime %z can't parse a timezone with punctuation
if re.search(r'[+-]\d{2}[-:]\d{2}$', datestr):
@@ -179,9 +170,12 @@ if sys.version_info < (3, 7):
return datetime.strptime(datestr, "%Y-%m-%dT%H:%M:%S%z")
except ValueError:
return datetime.strptime(datestr, "%Y-%m-%dT%H:%M:%S")
+
+
else:
fromisoformat = datetime.fromisoformat
+
class DateConverter:
@staticmethod
def xmp_from_docinfo(docinfo_val):
@@ -203,6 +197,7 @@ def ensure_loaded(fn):
if not self._xmp:
self._load()
return fn(self, *args, **kwargs)
+
return wrapper
@@ -228,10 +223,10 @@ class PdfMetadata(MutableMapping):
To update metadata, use a with block.
- .. code-block:: python
+ Example:
- with pdf.open_metadata() as records:
- records['dc:title'] = 'New Title'
+ >>> with pdf.open_metadata() as records:
+ records['dc:title'] = 'New Title'
See Also:
:meth:`pikepdf.Pdf.open_metadata`
@@ -289,7 +284,9 @@ class PdfMetadata(MutableMapping):
continue
self[qname] = val
except (ValueError, AttributeError) as e:
- msg = "The metadata field {} could not be copied to XMP".format(docinfo_name)
+ msg = "The metadata field {} could not be copied to XMP".format(
+ docinfo_name
+ )
if raise_failure:
raise ValueError(msg) from e
else:
@@ -314,6 +311,15 @@ class PdfMetadata(MutableMapping):
pis = self._xmp.xpath('/processing-instruction()')
for pi in pis:
etree.strip_tags(self._xmp, pi.tag)
+ try:
+ self._get_rdf_root()
+ except ValueError:
+ if self._xmp.find('.', self.NS).tag == '{adobe:ns:meta/}xmpmeta':
+ # Looks like: <x:xmpmeta></x:xmpmeta>, so reload with template
+ # that includes <rdf:RDF>
+ return self._load_from(XMP_EMPTY)
+ else:
+ raise # Probably not XMP
@ensure_loaded
def __enter__(self):
@@ -347,7 +353,11 @@ class PdfMetadata(MutableMapping):
try:
value = converter.docinfo_from_xmp(value)
except ValueError:
- warn("The DocumentInfo field {} could not be updated from XMP".format(docinfo_name))
+ warn(
+ "The DocumentInfo field {} could not be updated from XMP".format(
+ docinfo_name
+ )
+ )
value = None
if value is None:
if docinfo_name in self._pdf.docinfo:
@@ -562,19 +572,19 @@ class PdfMetadata(MutableMapping):
val = AltList([clean(val)])
if isinstance(val, (list, set)):
rdfdesc = etree.SubElement(
- rdf, QName(XMP_NS_RDF, 'Description'),
- attrib={
- QName(XMP_NS_RDF, 'about'): '',
- },
+ rdf,
+ QName(XMP_NS_RDF, 'Description'),
+ attrib={QName(XMP_NS_RDF, 'about'): ''},
)
node = etree.SubElement(rdfdesc, self._qname(key))
add_array(node, val)
elif isinstance(val, str):
rdfdesc = etree.SubElement(
- rdf, QName(XMP_NS_RDF, 'Description'),
+ rdf,
+ QName(XMP_NS_RDF, 'Description'),
attrib={
QName(XMP_NS_RDF, 'about'): '',
- self._qname(key): clean(val)
+ self._qname(key): clean(val),
},
)
else:
@@ -588,7 +598,11 @@ class PdfMetadata(MutableMapping):
node, attrib, _oldval, parent = next(self._get_elements(key))
if attrib: # Inline
del node.attrib[attrib]
- if len(node.attrib) == 1 and len(node) == 0 and QName(XMP_NS_RDF, 'about') in node.attrib:
+ if (
+ len(node.attrib) == 1
+ and len(node) == 0
+ and QName(XMP_NS_RDF, 'about') in node.attrib
+ ):
# The only thing left on this node is rdf:about="", so remove it
parent.remove(node)
else:
diff --git a/src/pikepdf/objects.py b/src/pikepdf/objects.py
index a888b97..2e42eb9 100644
--- a/src/pikepdf/objects.py
+++ b/src/pikepdf/objects.py
@@ -8,24 +8,33 @@
The purpose of these is to provide nice-looking classes to allow explicit
construction of PDF objects and more pythonic idioms and facilitate discovery
-by documentation generators.
+by documentation generators and linters.
It's also a place to narrow the scope of input types to those more easily
converted to C++.
-In reality all of these return objects of class pikepdf.Object or rather
-QPDFObjectHandle which is a generic type.
-
+There is some deliberate "smoke and mirrors" here: all of the objects are truly
+instances of ``pikepdf.Object``, which is a variant container object. The
+``__new__`` constructs a ``pikepdf.Object`` in each case, and the rest of the
+class definition is present as an aid for code introspection.
"""
from . import _qpdf
-from ._qpdf import Object, ObjectType
-# pylint: disable=unused-import
-from ._qpdf import Operator
+# pylint: disable=unused-import, abstract-method
+from ._qpdf import Object, ObjectType, Operator
+
+# By default pikepdf.Object will identify itself as pikepdf._qpdf.Object
+# Here we change the module to discourage people from using that internal name
+# Instead it will become pikepdf.objects.Object
+Object.__module__ = __name__
+ObjectType.__module__ = __name__
+Operator.__module__ = __name__
-class _ObjectMeta(type):
+
+# type(Object) is the metaclass that pybind11 defines; we wish to extend that
+class _ObjectMeta(type(Object)):
"""Supports instance checking"""
def __instancecheck__(cls, instance):
@@ -38,9 +47,13 @@ class _NameObjectMeta(_ObjectMeta):
"""Supports usage pikepdf.Name.Whatever -> Name('/Whatever')"""
def __getattr__(self, attr):
+ if attr.startswith('_'):
+ return _ObjectMeta.__getattr__(attr)
return Name('/' + attr)
- def __setattr__(self, name, value):
+ def __setattr__(self, attr, value):
+ if attr.startswith('_'):
+ return _ObjectMeta.__setattr__(attr, value)
raise TypeError("Attributes may not be set on pikepdf.Name")
def __getitem__(self, item):
@@ -56,7 +69,7 @@ class _NameObjectMeta(_ObjectMeta):
)
-class Name(metaclass=_NameObjectMeta):
+class Name(Object, metaclass=_NameObjectMeta):
"""Constructs a PDF Name object
Names can be constructed with two notations:
@@ -69,6 +82,7 @@ class Name(metaclass=_NameObjectMeta):
that are normally expected to be in a PDF. The latter is preferred for
dynamic names and attributes.
"""
+
object_type = ObjectType.name
def __new__(cls, name):
@@ -79,8 +93,9 @@ class Name(metaclass=_NameObjectMeta):
return _qpdf._new_name(name)
-class String(metaclass=_ObjectMeta):
+class String(Object, metaclass=_ObjectMeta):
"""Constructs a PDF String object"""
+
object_type = ObjectType.string
def __new__(cls, s):
@@ -97,8 +112,9 @@ class String(metaclass=_ObjectMeta):
return _qpdf._new_string_utf8(s)
-class Array(metaclass=_ObjectMeta):
+class Array(Object, metaclass=_ObjectMeta):
"""Constructs a PDF Array object"""
+
object_type = ObjectType.array
def __new__(cls, a=None):
@@ -118,8 +134,9 @@ class Array(metaclass=_ObjectMeta):
return _qpdf._new_array(a)
-class Dictionary(metaclass=_ObjectMeta):
+class Dictionary(Object, metaclass=_ObjectMeta):
"""Constructs a PDF Dictionary object"""
+
object_type = ObjectType.dictionary
def __new__(cls, d=None, **kwargs):
@@ -147,15 +164,15 @@ class Dictionary(metaclass=_ObjectMeta):
if kwargs:
# Add leading slash
# Allows Dictionary(MediaBox=(0,0,1,1), Type=Name('/Page')...
- return _qpdf._new_dictionary(
- {('/' + k) : v for k, v in kwargs.items()})
+ return _qpdf._new_dictionary({('/' + k): v for k, v in kwargs.items()})
if not d:
d = {}
return _qpdf._new_dictionary(d)
-class Stream(metaclass=_ObjectMeta):
+class Stream(Object, metaclass=_ObjectMeta):
"""Constructs a PDF Stream object"""
+
object_type = ObjectType.stream
def __new__(cls, owner, obj):
diff --git a/src/qpdf/annotation.cpp b/src/qpdf/annotation.cpp
new file mode 100644
index 0000000..f82ebdf
--- /dev/null
+++ b/src/qpdf/annotation.cpp
@@ -0,0 +1,52 @@
+/*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Copyright (C) 2019, James R. Barlow (https://github.com/jbarlow83/)
+ */
+
+
+
+#include <qpdf/Constants.h>
+#include <qpdf/Types.h>
+#include <qpdf/DLL.h>
+#include <qpdf/QPDFExc.hh>
+#include <qpdf/PointerHolder.hh>
+#include <qpdf/QPDFAnnotationObjectHelper.hh>
+
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "pikepdf.h"
+
+
+void init_annotation(py::module &m)
+{
+ py::class_<QPDFAnnotationObjectHelper>(m, "Annotation")
+ .def(py::init<QPDFObjectHandle &>(), py::keep_alive<0, 1>())
+ .def_property_readonly("subtype", &QPDFAnnotationObjectHelper::getSubtype)
+ .def_property_readonly("flags", &QPDFAnnotationObjectHelper::getFlags)
+ .def_property_readonly("appearance_state", &QPDFAnnotationObjectHelper::getAppearanceState)
+ .def_property_readonly("appearance_dict", &QPDFAnnotationObjectHelper::getAppearanceDictionary)
+ .def("get_appearance_stream",
+ [](QPDFAnnotationObjectHelper& anno, QPDFObjectHandle& which, std::string const& state = "") {
+ // if (!which.isName())
+ // throw py::type_error("which must be pikepdf.Name");
+ return anno.getAppearanceStream(which.getName(), state);
+ },
+ py::arg("which"),
+ py::arg("state") = ""
+ )
+ .def("get_page_content_for_appearance",
+ [](QPDFAnnotationObjectHelper& anno, QPDFObjectHandle& name, int rotate, int required_flags, int forbidden_flags) {
+ //auto name = name_.getName();
+ return anno.getPageContentForAppearance(name.getName(), rotate, required_flags, forbidden_flags);
+ },
+ py::arg("name"),
+ py::arg("rotate"),
+ py::arg("required_flags") = 0,
+ py::arg("forbidden_flags") = an_invisible | an_hidden
+ )
+ ;
+}
diff --git a/src/qpdf/object.cpp b/src/qpdf/object.cpp
index 392d9ff..1270961 100644
--- a/src/qpdf/object.cpp
+++ b/src/qpdf/object.cpp
@@ -363,7 +363,8 @@ void init_object(py::module& m)
[](QPDFObjectHandle &h, std::shared_ptr<QPDF> possible_owner) {
return (h.getOwningQPDF() == possible_owner.get());
},
- "Test if this object is owned by the indicated *possible_owner*."
+ "Test if this object is owned by the indicated *possible_owner*.",
+ py::arg("possible_owner")
)
.def_property_readonly("is_indirect", &QPDFObjectHandle::isIndirect)
.def("__repr__", &objecthandle_repr)
@@ -555,9 +556,9 @@ void init_object(py::module& m)
}
return py::cast(value);
},
- "for dictionary objects, behave as dict.get(key, default=None)",
+ "For ``pikepdf.Dictionary`` objects, behave as ``dict.get(key, default=None)``",
py::arg("key"),
- py::arg("default_") = py::none(),
+ py::arg("default") = py::none(),
py::return_value_policy::reference_internal
)
.def("get",
@@ -570,9 +571,9 @@ void init_object(py::module& m)
}
return py::cast(value);
},
- "for dictionary objects, behave as dict.get(key, default=None)",
+ "For ``pikepdf.Dictionary`` objects, behave as ``dict.get(key, default=None)``",
py::arg("key"),
- py::arg("default_") = py::none(),
+ py::arg("default") = py::none(),
py::return_value_policy::reference_internal
)
.def("keys", &QPDFObjectHandle::getKeys)
@@ -664,6 +665,12 @@ void init_object(py::module& m)
h.eraseItem(u_index);
}
)
+ .def("wrap_in_array",
+ [](QPDFObjectHandle &h) {
+ return h.wrapInArray();
+ },
+ "Return the object wrapped in an array if not already an array."
+ )
.def("get_stream_buffer",
[](QPDFObjectHandle &h) {
PointerHolder<Buffer> phbuf = h.getStreamData();
@@ -694,37 +701,17 @@ void init_object(py::module& m)
},
"Read the content stream associated with this object without decoding"
)
- .def("write",
- [](QPDFObjectHandle &h, py::bytes data, py::args args, py::kwargs kwargs) {
+ .def("_write",
+ [](QPDFObjectHandle &h, py::bytes data, py::object filter, py::object decode_parms) {
std::string sdata = data;
- QPDFObjectHandle filter = QPDFObjectHandle::newNull();
- QPDFObjectHandle decode_parms = QPDFObjectHandle::newNull();
- if (args.size() != 0)
- throw py::value_error("Too many positional arguments");
- if (kwargs.contains("filter"))
- filter = objecthandle_encode(kwargs["filter"]);
- if (kwargs.contains("decode_parms"))
- decode_parms = objecthandle_encode(kwargs["decode_parms"]);
- h.replaceStreamData(sdata, filter, decode_parms);
+ QPDFObjectHandle h_filter = objecthandle_encode(filter);
+ QPDFObjectHandle h_decode_parms = objecthandle_encode(decode_parms);
+ h.replaceStreamData(sdata, h_filter, h_decode_parms);
},
R"~~~(
- Replace the content stream with `data`, compressed according to `filter` and `decode_parms`
-
- :param data: the new data to use for replacement
- :type data: bytes
- :param filter: The filter(s) with which the data is (already) encoded
- :param decode_parms: Parameters for the filters with which the object is encode
-
- If only one `filter` is specified, it may be a name such as
- `Name('/FlateDecode')`. If there are multiple filters, then array
- of names should be given.
-
- If there is only one filter, `decode_parms` is a Dictionary of
- parameters for that filter. If there are multiple filters, then
- `decode_parms` is an Array of Dictionary, where each array index
- is corresponds to the filter.
-
- )~~~"
+ Low level write/replace stream data without argument checking. Use .write().
+ )~~~",
+ py::arg("data"), py::arg("filter"), py::arg("decode_parms")
)
.def_property_readonly("images",
[](QPDFObjectHandle &h) {
@@ -749,7 +736,16 @@ void init_object(py::module& m)
py::arg("prepend") = false,
py::keep_alive<1, 2>()
)
- .def("page_contents_coalesce", &QPDFObjectHandle::coalesceContentStreams)
+ .def("page_contents_coalesce", &QPDFObjectHandle::coalesceContentStreams,
+ R"~~~(
+ Coalesce an array of page content streams into a single content stream.
+
+ The PDF specification allows the ``/Contents`` object to contain either
+ an array of content streams or a single content stream. However, it
+ simplifies parsing and editing if there is only a single content stream.
+ This function merges all content streams.
+ )~~~"
+ )
.def_property_readonly("_objgen",
&object_get_objgen
)
@@ -811,6 +807,41 @@ void init_object(py::module& m)
py::arg("resolved") = false,
"Convert PDF objects into their binary representation, optionally resolving indirect objects."
)
+ .def("to_json",
+ [](QPDFObjectHandle &h, bool dereference = false) -> py::bytes {
+ return h.getJSON(dereference).unparse();
+ },
+ py::arg("dereference") = false,
+ R"~~~(
+ Convert to a QPDF JSON representation of the object.
+
+ See the QPDF manual for a description of its JSON representation.
+ http://qpdf.sourceforge.net/files/qpdf-manual.html#ref.json
+
+ Not necessarily compatible with other PDF-JSON representations that
+ exist in the wild.
+
+ * Names are encoded as UTF-8 strings
+ * Indirect references are encoded as strings containing ``obj gen R``
+ * Strings are encoded as UTF-8 strings with unrepresentable binary
+ characters encoded as ``\uHHHH``
+ * Encoding streams just encodes the stream's dictionary; the stream
+ data is not represented
+ * Object types that are only valid in content streams (inline
+ image, operator) as well as "reserved" objects are not
+ representable and will be serialized as ``null``.
+
+ Args:
+ dereference (bool): If True, dereference the object if this is an
+ indirect object.
+
+ Returns:
+ bytes: JSON bytestring of object. The object is UTF-8 encoded
+ and may be decoded to a Python str that represents the binary
+ values ``\x00-\xFF`` as ``U+0000`` to ``U+00FF``; that is,
+ it may contain mojibake.
+ )~~~"
+ )
; // end of QPDFObjectHandle bindings
m.def("_new_boolean", &QPDFObjectHandle::newBool, "Construct a PDF Boolean object");
@@ -900,7 +931,8 @@ void init_object(py::module& m)
[](const std::string& op) {
return QPDFObjectHandle::newOperator(op);
},
- "Construct a PDF Operator object for use in content streams"
+ "Construct a PDF Operator object for use in content streams.",
+ py::arg("op")
);
m.def("_Null", &QPDFObjectHandle::newNull,
"Construct a PDF Null object"
diff --git a/src/qpdf/pikepdf.cpp b/src/qpdf/pikepdf.cpp
new file mode 100644
index 0000000..2daa69a
--- /dev/null
+++ b/src/qpdf/pikepdf.cpp
@@ -0,0 +1,98 @@
+/*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Copyright (C) 2019, James R. Barlow (https://github.com/jbarlow83/)
+ */
+
+#include <sstream>
+#include <type_traits>
+#include <cerrno>
+#include <cstring>
+
+#include "pikepdf.h"
+
+#include <qpdf/QPDFExc.hh>
+#include <qpdf/QPDFSystemError.hh>
+#include <qpdf/QUtil.hh>
+
+#include <pybind11/stl.h>
+#include <pybind11/iostream.h>
+#include <pybind11/buffer_info.h>
+
+#include "qpdf_pagelist.h"
+#include "utils.h"
+
+
+extern "C" const char* qpdf_get_qpdf_version();
+
+
+class TemporaryErrnoChange {
+public:
+ TemporaryErrnoChange(int val) {
+ stored = errno;
+ errno = val;
+ }
+ ~TemporaryErrnoChange() {
+ errno = stored;
+ }
+private:
+ int stored;
+};
+
+
+PYBIND11_MODULE(_qpdf, m) {
+ //py::options options;
+ //options.disable_function_signatures();
+
+ m.doc() = "pikepdf provides a Pythonic interface for QPDF";
+
+ m.def("qpdf_version", &qpdf_get_qpdf_version, "Get libqpdf version");
+
+ init_qpdf(m);
+ init_pagelist(m);
+ init_object(m);
+ init_annotation(m);
+
+ m.def("utf8_to_pdf_doc",
+ [](py::str utf8, char unknown) {
+ std::string pdfdoc;
+ bool success = QUtil::utf8_to_pdf_doc(std::string(utf8), pdfdoc, unknown);
+ return py::make_tuple(success, py::bytes(pdfdoc));
+ }
+ );
+ m.def("pdf_doc_to_utf8",
+ [](py::bytes pdfdoc) -> py::str {
+ return py::str(QUtil::pdf_doc_to_utf8(pdfdoc));
+ }
+ );
+
+ static py::exception<QPDFExc> exc_main(m, "PdfError");
+ static py::exception<QPDFExc> exc_password(m, "PasswordError");
+ py::register_exception_translator([](std::exception_ptr p) {
+ try {
+ if (p) std::rethrow_exception(p);
+ } catch (const QPDFExc &e) {
+ if (e.getErrorCode() == qpdf_e_password) {
+ exc_password(e.what());
+ } else {
+ exc_main(e.what());
+ }
+ } catch (const QPDFSystemError &e) {
+ if (e.getErrno() != 0) {
+ TemporaryErrnoChange errno_holder(e.getErrno());
+ PyErr_SetFromErrnoWithFilename(PyExc_OSError, e.getDescription().c_str());
+ } else {
+ exc_main(e.what());
+ }
+ }
+ });
+
+
+#ifdef VERSION_INFO
+ m.attr("__version__") = VERSION_INFO;
+#else
+ m.attr("__version__") = "dev";
+#endif
+}
diff --git a/src/qpdf/pikepdf.h b/src/qpdf/pikepdf.h
index 7fbd6e8..0acd807 100644
--- a/src/qpdf/pikepdf.h
+++ b/src/qpdf/pikepdf.h
@@ -40,9 +40,6 @@ namespace pybind11 { namespace detail {
};
}}
-#define CUSTOM_TYPE_CONVERSION 1
-#if CUSTOM_TYPE_CONVERSION
-
// From object_convert.cpp
pybind11::object decimal_from_pdfobject(QPDFObjectHandle h);
@@ -57,24 +54,9 @@ namespace pybind11 { namespace detail {
* Conversion part 1 (Python->C++): convert a PyObject into a Object
*/
bool load(handle src, bool convert) {
- // if (src.is_none()) {
- // if (!convert) return false;
- // value = QPDFObjectHandle::newNull();
- // return true;
- // }
- // Attempting to construct these does not work...
- // if (convert) {
- // if (PYBIND11_LONG_CHECK(src.ptr())) {
- // auto as_int = src.cast<long long>();
- // value = QPDFObjectHandle::newInteger(as_int);
- // } /*else if (PyFloat_Check(src.ptr())) {
- // auto as_double = src.cast<double>();
- // value = QPDFObjectHandle::newReal(as_double);
- // } */ else {
- // return base::load(src, convert);
- // }
- // return true;
- // }
+ // Do whatever our base does
+ // Potentially we could convert some scalars to QPDFObjectHandle here,
+ // but most of the interfaces just expect straight C++ types.
return base::load(src, convert);
}
@@ -157,7 +139,6 @@ namespace pybind11 { namespace detail {
}
};
}} // namespace pybind11::detail
-#endif
namespace py = pybind11;
@@ -166,6 +147,8 @@ PYBIND11_MAKE_OPAQUE(std::vector<QPDFObjectHandle>);
typedef std::map<std::string, QPDFObjectHandle> ObjectMap;
PYBIND11_MAKE_OPAQUE(ObjectMap);
+// From qpdf.cpp
+void init_qpdf(py::module& m);
// From object.cpp
size_t list_range_check(QPDFObjectHandle h, int index);
@@ -183,6 +166,9 @@ QPDFObjectHandle objecthandle_encode(const py::handle handle);
std::vector<QPDFObjectHandle> array_builder(const py::iterable iter);
std::map<std::string, QPDFObjectHandle> dict_builder(const py::dict dict);
+// From annotation.cpp
+void init_annotation(py::module &m);
+
// Support for recursion checks
class StackGuard
{
diff --git a/src/qpdf/qpdf.cpp b/src/qpdf/qpdf.cpp
index 5bb8ea9..0a5fc26 100644
--- a/src/qpdf/qpdf.cpp
+++ b/src/qpdf/qpdf.cpp
@@ -28,10 +28,9 @@
#include "qpdf_pagelist.h"
#include "qpdf_inputsource.h"
+#include "qpdf_pipeline.h"
#include "utils.h"
-extern "C" const char* qpdf_get_qpdf_version();
-
void check_stream_is_usable(py::object stream)
{
@@ -58,6 +57,7 @@ open_pdf(
q->setPasswordIsHexKey(hex_password);
q->setIgnoreXRefStreams(ignore_xref_streams);
q->setAttemptRecovery(attempt_recovery);
+ q->setImmediateCopyFrom(true);
if (py::hasattr(filename_or_stream, "read") && py::hasattr(filename_or_stream, "seek")) {
// Python code gave us an object with a stream interface
@@ -66,7 +66,7 @@ open_pdf(
check_stream_is_usable(stream);
// The PythonInputSource object will be owned by q
- InputSource* input_source = new PythonInputSource(stream);
+ auto input_source = PointerHolder<InputSource>(new PythonInputSource(stream));
py::gil_scoped_release release;
q->processInputSource(input_source, password.c_str());
} else {
@@ -80,7 +80,7 @@ open_pdf(
q->processFile(
description.c_str(),
file, // transferring ownership
- true, // QPDF will close the file
+ true, // QPDF will close the file (including if there are exceptions)
password.c_str()
);
file = nullptr; // QPDF owns the file and will close it
@@ -116,122 +116,320 @@ private:
};
+// Hand the Pdf and the PDF version string to the Python-side helper
+// pikepdf._cpphelpers.update_xmp_pdfversion, which does the actual work.
+// The helper is looked up on every call.
+void update_xmp_pdfversion(QPDF &q, std::string version)
+{
+    py::object helpers = py::module::import("pikepdf._cpphelpers");
+    py::object updater = helpers.attr("update_xmp_pdfversion");
+    updater(py::cast(q), version);
+}
+
+
+// Fetch the password stored under *key* in the encryption settings, if any.
+// For R <= 4 the UTF-8 password is transcoded with QUtil::utf8_to_pdf_doc and
+// rejected when the transcoding reports failure; for R >= 5 it is stored
+// unchanged. *password* is left untouched when the key is absent.
+static void read_encryption_password(
+    py::object encryption,
+    const char *key,
+    int encryption_level,
+    std::string &password
+)
+{
+    if (!encryption.contains(key))
+        return;
+    auto utf8 = encryption[key].cast<std::string>();
+    if (encryption_level > 4) {
+        password = utf8;
+        return;
+    }
+    if (!QUtil::utf8_to_pdf_doc(utf8, password))
+        throw py::value_error("Encryption level is R2/R3/R4 and password is not encodable as PDFDocEncoding");
+}
+
+// Fetch the boolean stored under *key* in the encryption settings.
+// Returns *fallback* when the key is absent; raises TypeError when the value
+// is present but is not a Python bool.
+static bool read_encryption_flag(
+    py::object encryption,
+    const char *key,
+    bool fallback
+)
+{
+    if (!encryption.contains(key))
+        return fallback;
+    if (!py::isinstance<py::bool_>(encryption[key]))
+        throw py::type_error(std::string(key) + " must be bool");
+    return encryption[key].cast<bool>();
+}
+
+// Configure *w* to encrypt its output according to *encryption*.
+//
+// *encryption* is a mapping-like Python object that may contain:
+//   "R"        - int, encryption revision 2..6 (default 6)
+//   "owner"    - owner password
+//   "user"     - user password
+//   "allow"    - object with boolean permission attributes (accessibility,
+//                extract, modify_assembly, modify_annotation, modify_form,
+//                modify_other, print_lowres, print_highres); when absent
+//                every permission is treated as false
+//   "aes"      - bool, defaults to (R >= 4)
+//   "metadata" - bool, defaults to (R >= 4)
+//
+// *owner* and *user* receive the (possibly transcoded) passwords; they are
+// caller-owned, presumably so the buffers outlive this call -- confirm
+// against QPDFWriter before changing that.
+//
+// Throws py::type_error for wrongly typed settings and py::value_error for
+// out-of-range or mutually inconsistent settings.
+void setup_encryption(
+    QPDFWriter &w,
+    py::object encryption,
+    std::string &owner,
+    std::string &user
+)
+{
+    int encryption_level = 6;
+    if (encryption.contains("R")) {
+        if (!py::isinstance<py::int_>(encryption["R"]))
+            throw py::type_error("Encryption level 'R' must be an integer");
+        encryption_level = py::int_(encryption["R"]);
+    }
+    // R=5 is accepted below but deprecated, so the message does not advertise it
+    if (encryption_level < 2 || encryption_level > 6)
+        throw py::value_error("Invalid encryption level: must be 2, 3, 4 or 6");
+
+    if (encryption_level == 5) {
+        auto warn = py::module::import("warnings").attr("warn");
+        warn("Encryption R=5 is deprecated");
+    }
+
+    read_encryption_password(encryption, "owner", encryption_level, owner);
+    read_encryption_password(encryption, "user", encryption_level, user);
+
+    // Permissions: everything is denied unless "allow" says otherwise
+    bool accessibility = false;
+    bool extract = false;
+    bool modify_assembly = false;
+    bool modify_annotation = false;
+    bool modify_form = false;
+    bool modify_other = false;
+    bool print_lowres = false;
+    bool print_highres = false;
+    if (encryption.contains("allow")) {
+        auto pyallow = encryption["allow"];
+        accessibility = pyallow.attr("accessibility").cast<bool>();
+        extract = pyallow.attr("extract").cast<bool>();
+        modify_assembly = pyallow.attr("modify_assembly").cast<bool>();
+        modify_annotation = pyallow.attr("modify_annotation").cast<bool>();
+        modify_form = pyallow.attr("modify_form").cast<bool>();
+        modify_other = pyallow.attr("modify_other").cast<bool>();
+        print_lowres = pyallow.attr("print_lowres").cast<bool>();
+        print_highres = pyallow.attr("print_highres").cast<bool>();
+    }
+
+    const bool modern = (encryption_level >= 4);
+    const bool aes = read_encryption_flag(encryption, "aes", modern);
+    const bool metadata = read_encryption_flag(encryption, "metadata", modern);
+
+    if (metadata && encryption_level < 4) {
+        throw py::value_error("Cannot encrypt metadata when R < 4");
+    }
+    if (aes && encryption_level < 4) {
+        throw py::value_error("Cannot encrypt with AES when R < 4");
+    }
+    if (encryption_level == 6 && !aes) {
+        throw py::value_error("When R = 6, AES encryption must be enabled");
+    }
+    if (metadata && !aes) {
+        throw py::value_error("Cannot encrypt metadata unless AES encryption is enabled");
+    }
+
+    // High resolution printing takes precedence over low resolution
+    qpdf_r3_print_e print = qpdf_r3p_none;
+    if (print_highres)
+        print = qpdf_r3p_full;
+    else if (print_lowres)
+        print = qpdf_r3p_low;
+
+    switch (encryption_level) {
+    case 6:
+        w.setR6EncryptionParameters(
+            user.c_str(), owner.c_str(),
+            accessibility,
+            extract,
+            modify_assembly,
+            modify_annotation,
+            modify_form,
+            modify_other,
+            print,
+            metadata
+        );
+        break;
+    case 5:
+        // Deprecated; the warning was already issued above
+        w.setR5EncryptionParameters(
+            user.c_str(), owner.c_str(),
+            accessibility,
+            extract,
+            modify_assembly,
+            modify_annotation,
+            modify_form,
+            modify_other,
+            print,
+            metadata
+        );
+        break;
+    case 4:
+        w.setR4EncryptionParameters(
+            user.c_str(), owner.c_str(),
+            accessibility,
+            extract,
+            modify_assembly,
+            modify_annotation,
+            modify_form,
+            modify_other,
+            print,
+            metadata,
+            aes
+        );
+        break;
+    case 3:
+        w.setR3EncryptionParameters(
+            user.c_str(), owner.c_str(),
+            accessibility,
+            extract,
+            modify_assembly,
+            modify_annotation,
+            modify_form,
+            modify_other,
+            print
+        );
+        break;
+    case 2:
+        // R2 only knows four coarse permissions; any printing counts as printing
+        w.setR2EncryptionParameters(
+            user.c_str(), owner.c_str(),
+            (print != qpdf_r3p_none),
+            modify_assembly,
+            extract,
+            modify_annotation
+        );
+        break;
+    default:
+        // Unreachable: the level was range-checked above
+        break;
+    }
+}
+
+
+typedef std::pair<std::string, int> pdf_version_extension;
+
+// Convert a Python PDF version specification to a (version, extension level)
+// pair. Accepts either a value castable to std::string, which yields
+// extension level 0, or a (str, int) tuple. Throws py::type_error when the
+// object is neither.
+pdf_version_extension get_version_extension(py::object ver_ext)
+{
+    try {
+        return pdf_version_extension(ver_ext.cast<std::string>(), 0);
+    } catch (const py::cast_error &) {
+        // Not a plain string; fall through and try the (str, int) form
+    }
+    try {
+        return ver_ext.cast<pdf_version_extension>();
+    } catch (const py::cast_error &) {
+        throw py::type_error("PDF version must be a tuple: (str, int)");
+    }
+}
+
+
+/* Helper class to ensure streams we open get closed by destructor.
+ *
+ * Holds a Python object (None until set() is called) and calls its close()
+ * method when the Closer is destroyed. Not copyable or movable, so close()
+ * is called at most once per Closer.
+ *
+ * NOTE(review): the destructor calls into Python and so assumes the GIL is
+ * held wherever a Closer goes out of scope -- confirm at each use site.
+ */
+class Closer
+{
+public:
+    Closer() : monitored(py::none()) {}
+    ~Closer() {
+        if (this->monitored.is_none())
+            return;
+        try {
+            this->monitored.attr("close")();
+        } catch (py::error_already_set &e) {
+            // Destructors are implicitly noexcept, so letting a Python
+            // exception escape here would call std::terminate. Hand it to
+            // Python's unraisable-exception hook instead of losing it.
+            e.restore();
+            PyErr_WriteUnraisable(this->monitored.ptr());
+        }
+    }
+    // Start monitoring *monitored*; it will be closed on destruction
+    void set(py::object monitored) {
+        this->monitored = monitored;
+    }
+    Closer(const Closer& other) = delete;
+    Closer(Closer&& other) = delete;
+    Closer& operator= (const Closer& other) = delete;
+    Closer& operator= (Closer&& other) = delete;
+
+private:
+    py::object monitored;
+};
+
void save_pdf(
QPDF& q,
py::object filename_or_stream,
bool static_id=false,
bool preserve_pdfa=true,
- std::string min_version="",
- std::string force_version="",
+ py::object min_version=py::none(),
+ py::object force_version=py::none(),
+ bool fix_metadata_version=true,
bool compress_streams=true,
- qpdf_stream_decode_level_e stream_decode_level=qpdf_dl_generalized,
+ py::object stream_decode_level=py::none(),
qpdf_object_stream_e object_stream_mode=qpdf_o_preserve,
bool normalize_content=false,
bool linearize=false,
bool qdf=false,
- py::object progress=py::none())
+ py::object progress=py::none(),
+ py::object encryption=py::none())
{
+ std::string owner;
+ std::string user;
+ std::string description;
QPDFWriter w(q);
- // Parameters
if (static_id) {
w.setStaticID(true);
}
w.setNewlineBeforeEndstream(preserve_pdfa);
- if (!min_version.empty()) {
- w.setMinimumPDFVersion(min_version, 0);
- }
- if (!force_version.empty()) {
- w.forcePDFVersion(force_version, 0);
+
+ if (!min_version.is_none()) {
+ auto version_ext = get_version_extension(min_version);
+ w.setMinimumPDFVersion(version_ext.first, version_ext.second);
}
w.setCompressStreams(compress_streams);
- w.setDecodeLevel(stream_decode_level);
+ if (!stream_decode_level.is_none()) {
+ // Unconditionally calling setDecodeLevel has side effects, disabling
+ // preserve encryption in particular
+ w.setDecodeLevel(stream_decode_level.cast<qpdf_stream_decode_level_e>());
+ }
w.setObjectStreamMode(object_stream_mode);
- if (normalize_content && linearize) {
- throw py::value_error("cannot save with both normalize_content and linearize");
- }
- w.setContentNormalization(normalize_content);
- w.setLinearization(linearize);
- w.setQDFMode(qdf);
-
- if (!progress.is_none()) {
- auto reporter = PointerHolder<QPDFWriter::ProgressReporter>(new PikeProgressReporter(progress));
- w.registerProgressReporter(reporter);
- }
+ py::object stream;
+ Closer stream_closer;
if (py::hasattr(filename_or_stream, "write") && py::hasattr(filename_or_stream, "seek")) {
// Python code gave us an object with a stream interface
- py::object stream = filename_or_stream;
+ stream = filename_or_stream;
check_stream_is_usable(stream);
+ description = py::repr(stream);
+ } else {
+ py::object filename = fspath(filename_or_stream);
+ py::object ospath = py::module::import("os").attr("path");
+ py::object samefile = ospath.attr("samefile");
+ py::object exists = ospath.attr("exists");
+ if (exists(filename).cast<bool>() && samefile(filename, q.getFilename()).cast<bool>()) {
+ throw py::value_error("Cannot overwrite input file");
+ }
+ stream = py::module::import("io").attr("open")(filename, "wb");
+ stream_closer.set(stream);
+ description = py::str(filename);
+ }
- // TODO might be able to improve this by streaming rather than buffering
- // using subclass of Pipeline that routes calls to Python.
- w.setOutputMemory();
+ // We must set up the output pipeline before we configure encryption
+ Pl_PythonOutput output_pipe(description.c_str(), stream);
+ w.setOutputPipeline(&output_pipe);
- // It would be kind to release the GIL here, but this is not possible if
- // another thread has an object and tries to mess with it. Correctness
- // is more important than performance.
- w.write();
+ if (encryption.is(py::bool_(true)) && !q.isEncrypted()) {
+ throw py::value_error("can't perserve encryption parameters on a file with no encryption");
+ }
- // But now that we've held the GIL forever, we can release it and take
- // it back again; at least in theory giving other threads a chance to
- // to do something.
- {
- py::gil_scoped_release release;
- }
+ if (
+ (encryption.is(py::bool_(true)) || py::isinstance<py::dict>(encryption))
+ && (normalize_content || !stream_decode_level.is_none())
+ ) {
+ throw py::value_error("cannot save with encryption and normalize_content or stream_decode_level");
+ }
- // getBuffer returns Buffer* and qpdf says we are responsible for
- // deleting it, so capture it in a unique_ptr
- std::unique_ptr<Buffer> output_buffer(w.getBuffer());
-
- // Create a memoryview of the buffer that libqpdf created
- // Awkward API alert:
- // QPDFWriter::getBuffer -> Buffer* (caller frees memory)
- // and Buffer::getBuffer -> unsigned char* (caller does not own memory)
- py::buffer_info output_buffer_info(
- output_buffer->getBuffer(),
- output_buffer->getSize());
- py::memoryview view_output_buffer(output_buffer_info);
-
- // Send it to the stream object (probably copying)
- stream.attr("write")(view_output_buffer);
+ if (encryption.is(py::bool_(true))) {
+ w.setPreserveEncryption(true); // Keep existing encryption
+ } else if (encryption.is_none() || encryption.is(py::bool_(false))) {
+ w.setPreserveEncryption(false); // Remove encryption
} else {
- py::object filename = filename_or_stream;
- std::string description = py::str(filename);
- // Delete the intended filename, in case it is the same as the input file.
- // This ensures that the input file will continue to exist in memory on Linux.
- portable_unlink(filename);
- FILE* file = portable_fopen(filename, "wb");
- w.setOutputFile(description.c_str(), file, true);
- w.write();
- file = nullptr; // QPDF will close it
+ setup_encryption(w, encryption, owner, user);
}
-}
+ if (normalize_content && linearize) {
+ throw py::value_error("cannot save with both normalize_content and linearize");
+ }
+ w.setContentNormalization(normalize_content);
+ w.setLinearization(linearize);
+ w.setQDFMode(qdf);
-PYBIND11_MODULE(_qpdf, m) {
- //py::options options;
- //options.disable_function_signatures();
+ if (!force_version.is_none()) {
+ auto version_ext = get_version_extension(force_version);
+ w.forcePDFVersion(version_ext.first, version_ext.second);
+ }
+ if (fix_metadata_version) {
+ update_xmp_pdfversion(q, w.getFinalVersion());
+ }
- m.doc() = "pikepdf provides a Pythonic interface for QPDF";
+ if (!progress.is_none()) {
+ auto reporter = PointerHolder<QPDFWriter::ProgressReporter>(new PikeProgressReporter(progress));
+ w.registerProgressReporter(reporter);
+ }
- m.def("qpdf_version", &qpdf_get_qpdf_version, "Get libqpdf version");
+ w.write();
+}
- static py::exception<QPDFExc> exc_main(m, "PdfError");
- static py::exception<QPDFExc> exc_password(m, "PasswordError");
- py::register_exception_translator([](std::exception_ptr p) {
- try {
- if (p) std::rethrow_exception(p);
- } catch (const QPDFExc &e) {
- if (e.getErrorCode() == qpdf_e_password) {
- exc_password(e.what());
- } else {
- exc_main(e.what());
- }
- }
- });
+void init_qpdf(py::module &m)
+{
py::enum_<qpdf_object_stream_e>(m, "ObjectStreamMode")
.value("disable", qpdf_object_stream_e::qpdf_o_disable)
.value("preserve", qpdf_object_stream_e::qpdf_o_preserve)
@@ -243,7 +441,12 @@ PYBIND11_MODULE(_qpdf, m) {
.value("specialized", qpdf_stream_decode_level_e::qpdf_dl_specialized)
.value("all", qpdf_stream_decode_level_e::qpdf_dl_all);
- init_pagelist(m);
+ py::enum_<QPDF::encryption_method_e>(m, "EncryptionMethod")
+ .value("none", QPDF::encryption_method_e::e_none)
+ .value("unknown", QPDF::encryption_method_e::e_unknown)
+ .value("rc4", QPDF::encryption_method_e::e_rc4)
+ .value("aes", QPDF::encryption_method_e::e_aes)
+ .value("aesv3", QPDF::encryption_method_e::e_aesv3);
py::class_<QPDF, std::shared_ptr<QPDF>>(m, "Pdf", "In-memory representation of a PDF")
.def_static("new",
@@ -253,20 +456,31 @@ PYBIND11_MODULE(_qpdf, m) {
q->setSuppressWarnings(true);
return q;
},
- "create a new empty PDF from stratch"
+ "Create a new empty PDF from scratch."
)
.def_static("open", open_pdf,
R"~~~(
- Open an existing file at `filename_or_stream`.
+ Open an existing file at *filename_or_stream*.
- If `filename_or_stream` is path-like, the file will be opened. The
- file should not be modified by another process while it is open in
- pikepdf.
+ If *filename_or_stream* is path-like, the file will be opened for reading.
+ The file should not be modified by another process while it is open in
+ pikepdf. The file will not be altered when opened in this way. Any changes
+ to the file must be persisted by using ``.save()``.
- If `filename_or_stream` has `.read()` and `.seek()` methods, the file
+ If *filename_or_stream* has ``.read()`` and ``.seek()`` methods, the file
will be accessed as a readable binary stream. pikepdf will read the
entire stream into a private buffer.
+ ``.open()`` may be used in a ``with``-block, ``.close()`` will be called when
+ the block exits.
+
+ Examples:
+
+ >>> with Pdf.open("test.pdf") as pdf:
+ ...
+
+ >>> pdf = Pdf.open("test.pdf", password="rosebud")
+
Args:
filename_or_stream (os.PathLike): Filename of PDF to open
password (str or bytes): User or owner password to open an
@@ -278,7 +492,8 @@ PYBIND11_MODULE(_qpdf, m) {
ignore_xref_streams (bool): If True, ignore cross-reference
streams. See qpdf documentation.
suppress_warnings (bool): If True (default), warnings are not
- printed to stderr. Use `get_warnings()` to retrieve warnings.
+ printed to stderr. Use :meth:`pikepdf.Pdf.get_warnings()` to
+ retrieve warnings.
attempt_recovery (bool): If True (default), attempt to recover
from PDF parsing errors.
inherit_page_attributes (bool): If True (default), push attributes
@@ -289,7 +504,7 @@ PYBIND11_MODULE(_qpdf, m) {
file.
pikepdf.PdfError: If for other reasons we could not open
the file.
- TypeError: If the type of `filename_or_stream` is not
+ TypeError: If the type of ``filename_or_stream`` is not
usable.
FileNotFoundError: If the file was not found.
)~~~",
@@ -307,15 +522,15 @@ PYBIND11_MODULE(_qpdf, m) {
}
)
.def_property_readonly("filename", &QPDF::getFilename,
- "the source filename of an existing PDF, when available")
+ "The source filename of an existing PDF, when available.")
.def_property_readonly("pdf_version", &QPDF::getPDFVersion,
- "the PDF standard version, such as '1.7'")
+ "The PDF standard version, such as '1.7'.")
.def_property_readonly("extension_level", &QPDF::getExtensionLevel)
.def_property_readonly("Root", &QPDF::getRoot,
- "the /Root object of the PDF"
+ "The /Root object of the PDF."
)
.def_property_readonly("root", &QPDF::getRoot,
- "alias for .Root, the /Root object of the PDF"
+ "Alias for .Root, the /Root object of the PDF."
)
.def_property("docinfo",
[](QPDF& q) {
@@ -330,7 +545,16 @@ PYBIND11_MODULE(_qpdf, m) {
throw py::value_error("docinfo must be an indirect object - use Pdf.make_indirect");
q.getTrailer().replaceKey("/Info", replace);
},
- "access the document information dictionary"
+ R"~~~(
+ Access the (deprecated) document information dictionary.
+
+ The document information dictionary is a brief metadata record
+ that can store some information about the origin of a PDF. It is
+ deprecated and removed in the PDF 2.0 specification. Use the
+ ``.open_metadata()`` API instead, which will edit the modern (and
+ unfortunately, more complicated) XMP metadata object and synchronize
+ changes to the document information dictionary.
+ )~~~"
)
.def_property_readonly("trailer", &QPDF::getTrailer,
R"~~~(
@@ -394,8 +618,9 @@ PYBIND11_MODULE(_qpdf, m) {
The page can be either be a newly constructed PDF object or it can
be obtained from another PDF.
- :param pikepdf.Object page: The page object to attach
- :param bool first: If True, prepend this before the first page; if False append after last page
+ Args:
+ page (pikepdf.Object): The page object to attach
+ first (bool): If True, prepend this before the first page; if False append after last page
)~~~",
py::arg("page"),
py::arg("first")=false,
@@ -423,11 +648,14 @@ PYBIND11_MODULE(_qpdf, m) {
.def("save",
save_pdf,
R"~~~(
- Save all modifications to this :class:`pikepdf.Pdf`
+ Save all modifications to this :class:`pikepdf.Pdf`.
Args:
filename (str or stream): Where to write the output. If a file
- exists in this location it will be overwritten.
+ exists in this location it will be overwritten. The file
+ should not be the same as the input file, because data from
+ the input file may be lazily loaded; as such overwriting
+ in place will null-out objects.
static_id (bool): Indicates that the ``/ID`` metadata, normally
calculated as a hash of certain PDF contents and metadata
@@ -437,12 +665,20 @@ PYBIND11_MODULE(_qpdf, m) {
manner compliant with PDF/A and other stricter variants.
This should be True, the default, in most cases.
- min_version (str): Sets the minimum version of PDF
+ min_version (str or tuple): Sets the minimum version of PDF
specification that should be required. If left alone QPDF
- will decide.
- force_version (str): Override the version recommend by QPDF,
+ will decide. If a tuple, the second element is an integer, the
+ extension level.
+ force_version (str or tuple): Override the version recommended by QPDF,
potentially creating an invalid file that does not display
- in old versions. See QPDF manual for details.
+ in old versions. See QPDF manual for details. If a tuple, the
+ second element is an integer, the extension level.
+ fix_metadata_version (bool): If True (default) and the XMP metadata
+ contains the optional PDF version field, ensure the version in
+ metadata is correct. If the XMP metadata does not contain a PDF
+ version field, none will be added. To ensure that the field is
+ added, edit the metadata and insert a placeholder value in
+ ``pdf:PDFVersion``.
object_stream_mode (pikepdf.ObjectStreamMode):
``disable`` prevents the use of object streams.
@@ -472,10 +708,24 @@ PYBIND11_MODULE(_qpdf, m) {
the program ``fix-qdf`` to fix convert back to a standard
PDF.
+ progress (callable): Specify a callback function that is called
+ as the PDF is written. The function will be called with an
+ integer between 0-100 as the sole parameter, the progress
+ percentage. This function may not access or modify the PDF
+ while it is being written, or data corruption will almost
+ certainly occur.
+
+ encryption (pikepdf.models.Encryption or bool): If ``False``
+ or omitted, existing encryption will be removed. If ``True``
+ encryption settings are copied from the originating PDF.
+ Alternately, an ``Encryption`` object may be provided that
+ sets the parameters for new encryption.
+
You may call ``.save()`` multiple times with different parameters
to generate different versions of a file, and you *may* continue
to modify the file after saving it. ``.save()`` does not modify
- the ``Pdf`` object in memory.
+ the ``Pdf`` object in memory, except possibly by updating the XMP
+ metadata version with ``fix_metadata_version``.
.. note::
@@ -491,13 +741,15 @@ PYBIND11_MODULE(_qpdf, m) {
py::arg("preserve_pdfa")=true,
py::arg("min_version")="",
py::arg("force_version")="",
+ py::arg("fix_metadata_version")=true,
py::arg("compress_streams")=true,
- py::arg("stream_decode_level")=qpdf_stream_decode_level_e::qpdf_dl_generalized,
+ py::arg("stream_decode_level")=py::none(),
py::arg("object_stream_mode")=qpdf_object_stream_e::qpdf_o_preserve,
py::arg("normalize_content")=false,
py::arg("linearize")=false,
py::arg("qdf")=false,
- py::arg("progress")=py::none()
+ py::arg("progress")=py::none(),
+ py::arg("encryption")=py::none()
)
.def("_get_object_id", &QPDF::getObjectByID)
.def("get_object",
@@ -510,7 +762,8 @@ PYBIND11_MODULE(_qpdf, m) {
Returns:
pikepdf.Object
)~~~",
- py::return_value_policy::reference_internal
+ py::return_value_policy::reference_internal,
+ py::arg("objgen")
)
.def("get_object",
[](QPDF &q, int objid, int gen) {
@@ -522,7 +775,9 @@ PYBIND11_MODULE(_qpdf, m) {
Returns:
pikepdf.Object
)~~~",
- py::return_value_policy::reference_internal
+ py::return_value_policy::reference_internal,
+ py::arg("objid"),
+ py::arg("gen")
)
.def("make_indirect", &QPDF::makeIndirectObject,
R"~~~(
@@ -544,7 +799,8 @@ PYBIND11_MODULE(_qpdf, m) {
Returns:
pikepdf.Object
- )~~~"
+ )~~~",
+ py::arg("h")
)
.def("make_indirect",
[](QPDF &q, py::object obj) -> QPDFObjectHandle {
@@ -555,7 +811,8 @@ PYBIND11_MODULE(_qpdf, m) {
Returns:
pikepdf.Object
- )~~~"
+ )~~~",
+ py::arg("obj")
)
.def("copy_foreign",
[](QPDF &q, QPDFObjectHandle &h) -> QPDFObjectHandle {
@@ -563,20 +820,106 @@ PYBIND11_MODULE(_qpdf, m) {
},
"Copy object from foreign PDF to this one.",
py::return_value_policy::reference_internal,
- py::keep_alive<1, 2>()
+ py::keep_alive<1, 2>(),
+ py::arg("h")
)
.def("_replace_object",
[](QPDF &q, int objid, int gen, QPDFObjectHandle &h) {
q.replaceObject(objid, gen, h);
}
)
- ; // class Pdf
-
- init_object(m);
+ .def("_swap_objects",
+ [](QPDF &q, std::pair<int, int> objgen1, std::pair<int, int> objgen2) {
+ QPDFObjGen o1(objgen1.first, objgen1.second);
+ QPDFObjGen o2(objgen2.first, objgen2.second);
+ q.swapObjects(o1, o2);
+ }
+ )
+ .def("_process",
+ [](QPDF &q, std::string description, py::bytes data) {
+ std::string s = data;
+ q.processMemoryFile(
+ description.c_str(),
+ s.data(),
+ s.size()
+ );
+ },
+ R"~~~(
+ Process a new in-memory PDF, replacing the existing PDF
-#ifdef VERSION_INFO
- m.attr("__version__") = VERSION_INFO;
-#else
- m.attr("__version__") = "dev";
-#endif
+ Used to implement Pdf.close().
+ )~~~"
+ )
+ .def_property_readonly("_allow_accessibility",
+ [](QPDF &q) {
+ return q.allowAccessibility();
+ }
+ )
+ .def_property_readonly("_allow_extract",
+ [](QPDF &q) {
+ return q.allowExtractAll();
+ }
+ )
+ .def_property_readonly("_allow_print_lowres",
+ [](QPDF &q) {
+ return q.allowPrintLowRes();
+ }
+ )
+ .def_property_readonly("_allow_print_highres",
+ [](QPDF &q) {
+ return q.allowPrintHighRes();
+ }
+ )
+ .def_property_readonly("_allow_modify_assembly",
+ [](QPDF &q) {
+ return q.allowModifyAssembly();
+ }
+ )
+ .def_property_readonly("_allow_modify_form",
+ [](QPDF &q) {
+ return q.allowModifyForm();
+ }
+ )
+ .def_property_readonly("_allow_modify_annotation",
+ [](QPDF &q) {
+ return q.allowModifyAnnotation();
+ }
+ )
+ .def_property_readonly("_allow_modify_other",
+ [](QPDF &q) {
+ return q.allowModifyOther();
+ }
+ )
+ .def_property_readonly("_allow_modify_all",
+ [](QPDF &q) {
+ return q.allowModifyAll();
+ }
+ )
+ .def_property_readonly("_encryption_data",
+ [](QPDF &q) {
+ int R = 0;
+ int P = 0;
+ int V = 0;
+ QPDF::encryption_method_e stream_method = QPDF::e_unknown;
+ QPDF::encryption_method_e string_method = QPDF::e_unknown;
+ QPDF::encryption_method_e file_method = QPDF::e_unknown;
+ if (!q.isEncrypted(R, P, V, stream_method, string_method, file_method))
+ return py::dict();
+
+ auto user_passwd = q.getTrimmedUserPassword();
+ auto encryption_key = q.getEncryptionKey();
+
+ return py::dict(
+ py::arg("R") = R,
+ py::arg("P") = P,
+ py::arg("V") = V,
+ py::arg("stream") = stream_method,
+ py::arg("string") = string_method,
+ py::arg("file") = file_method,
+ py::arg("user_passwd") = py::bytes(user_passwd),
+ py::arg("encryption_key") = py::bytes(encryption_key)
+ );
+ }
+ )
+ ; // class Pdf
}
diff --git a/src/qpdf/qpdf_inputsource.h b/src/qpdf/qpdf_inputsource.h
index dc26267..b29b309 100644
--- a/src/qpdf/qpdf_inputsource.h
+++ b/src/qpdf/qpdf_inputsource.h
@@ -17,7 +17,7 @@
#include <qpdf/Buffer.hh>
#include <qpdf/QPDF.hh>
#include <qpdf/InputSource.hh>
-
+#include <qpdf/QUtil.hh>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
diff --git a/src/qpdf/qpdf_pagelist.cpp b/src/qpdf/qpdf_pagelist.cpp
index d8222dd..07c496d 100644
--- a/src/qpdf/qpdf_pagelist.cpp
+++ b/src/qpdf/qpdf_pagelist.cpp
@@ -121,18 +121,6 @@ void PageList::set_pages_from_iterable(py::slice slice, py::iterable other)
void PageList::delete_page(size_t index)
{
auto page = this->get_page(index);
- /*
- // Need a dec_ref to match the inc_ref in insert_page, but it's unclear
- // how to do that. The item will be set the current QPDF always.
- // Accessing data from another PDF seems to involve some pipeline
- // magic in QPDF around libqpdf/QPDFWriter.cc:1614
- if (original page owner != &this->getQPDF()) {
- // If we are removing a page not originally owned by our QPDF,
- // remove the reference count we put it in insert_page()
- py::object pyqpdf = py::cast(page_owner);
- pyqpdf.dec_ref();
- }
- */
this->qpdf->removePage(page);
}
@@ -175,24 +163,6 @@ void PageList::insert_page(size_t index, QPDFObjectHandle page)
// qpdf does not accept duplicating pages within the same file,
// so manually create a copy
page = this->qpdf->makeIndirectObject(page);
- } else {
- // libqpdf does not transfer a page's contents to the new QPDF.
- // Instead WHEN ASKED TO WRITE it will go back and get the data
- // from objecthandle->getOwningQPDF(). Therefore we must ensure
- // our previous owner is kept alive.
-#if 1
- auto tinfo = py::detail::get_type_info(typeid(QPDF));
- py::handle pyqpdf = py::detail::get_object_handle(page_owner, tinfo);
- py::handle pypage = py::cast(page);
- py::detail::keep_alive_impl(pypage, pyqpdf);
-#else
- // MSVC++ complains about the symbol
- // QPDF::Members::~Members() not being exported when this version
- // is used, but it works for GCC and Clang.
- py::handle pyqpdf = py::cast(page_owner);
- py::handle pypage = py::cast(page);
- py::detail::keep_alive_impl(pypage, pyqpdf);
-#endif
}
if (index != this->count()) {
QPDFObjectHandle refpage = this->get_page(index);
@@ -202,7 +172,6 @@ void PageList::insert_page(size_t index, QPDFObjectHandle page)
}
}
-
void init_pagelist(py::module &m)
{
py::class_<PageList>(m, "PageList")
@@ -229,12 +198,13 @@ void init_pagelist(py::module &m)
.def("__delitem__", &PageList::delete_pages_from_iterable)
.def("__len__", &PageList::count)
.def("p",
- [](PageList &pl, size_t index) {
- if (index == 0) // Indexing past end is checked in .get_page
+ [](PageList &pl, size_t pnum) {
+ if (pnum == 0) // Indexing past end is checked in .get_page
throw py::index_error("page access out of range in 1-based indexing");
- return pl.get_page(index - 1);
+ return pl.get_page(pnum - 1);
},
- "convenience - look up page number in ordinal numbering, .p(1) is first page"
+ "Convenience - look up page number in ordinal numbering, ``.p(1)`` is first page",
+ py::arg("pnum")
)
.def("__iter__",
[](PageList &pl) {
@@ -252,7 +222,16 @@ void init_pagelist(py::module &m)
[](PageList &pl, ssize_t index, py::object obj) {
size_t uindex = uindex_from_index(pl, index);
pl.insert_page(uindex, obj);
- }, py::keep_alive<1, 3>()
+ }, py::keep_alive<1, 3>(),
+ R"~~~(
+ Insert a page at the specified location.
+
+ Args:
+ index (int): location at which to insert page, 0-based indexing
+ obj (pikepdf.Object): page object to insert
+ )~~~",
+ py::arg("index"),
+ py::arg("obj")
)
.def("reverse",
[](PageList &pl) {
@@ -262,13 +241,16 @@ void init_pagelist(py::module &m)
PySlice_New(Py_None, Py_None, step.ptr()));
py::list reversed_pages = pl.get_pages(reversed);
pl.set_pages_from_iterable(ordinary_indices, reversed_pages);
- }
+ },
+ "Reverse the order of pages."
)
.def("append",
[](PageList &pl, py::object page) {
pl.insert_page(pl.count(), page);
},
- py::keep_alive<1, 2>()
+ py::keep_alive<1, 2>(),
+ "Add another page to the end.",
+ py::arg("page")
)
.def("extend",
[](PageList &pl, PageList &other) {
@@ -279,7 +261,9 @@ void init_pagelist(py::module &m)
pl.insert_page(pl.count(), other.get_page(i));
}
},
- py::keep_alive<1, 2>()
+ py::keep_alive<1, 2>(),
+ "Extend the ``Pdf`` by adding pages from another ``Pdf.pages``.",
+ py::arg("other")
)
.def("extend",
[](PageList &pl, py::iterable iterable) {
@@ -290,6 +274,29 @@ void init_pagelist(py::module &m)
++it;
}
},
- py::keep_alive<1, 2>()
+ py::keep_alive<1, 2>(),
+ "Extend the ``Pdf`` by adding pages from an iterable of pages.",
+ py::arg("iterable")
+ )
+ .def("remove",
+ [](PageList &pl, py::kwargs kwargs) {
+ auto pnum = kwargs["p"].cast<size_t>();
+ if (pnum == 0) // Indexing past end is checked in .get_page
+ throw py::index_error("page access out of range in 1-based indexing");
+ pl.delete_page(pnum - 1);
+ },
+ R"~~~(
+ Remove a page (using 1-based numbering)
+
+ Args:
+ p (int): 1-based page number
+ )~~~"
+ )
+ .def("__repr__",
+ [](PageList &pl) {
+ return std::string("<pikepdf._qpdf.PageList len=")
+ + std::to_string(pl.count())
+ + std::string(">");
+ }
);
}
diff --git a/src/qpdf/qpdf_pipeline.h b/src/qpdf/qpdf_pipeline.h
new file mode 100644
index 0000000..f922827
--- /dev/null
+++ b/src/qpdf/qpdf_pipeline.h
@@ -0,0 +1,77 @@
+/*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Copyright (C) 2017, James R. Barlow (https://github.com/jbarlow83/)
+ */
+
+#include <cstdio>
+#include <cstring>
+
+#include <qpdf/Constants.h>
+#include <qpdf/Types.h>
+#include <qpdf/DLL.h>
+#include <qpdf/QPDFExc.hh>
+#include <qpdf/PointerHolder.hh>
+#include <qpdf/Buffer.hh>
+#include <qpdf/QPDF.hh>
+#include <qpdf/Pipeline.hh>
+#include <qpdf/QUtil.hh>
+
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "pikepdf.h"
+
+
+// Terminal qpdf Pipeline that forwards all data written to it to a Python
+// stream object by calling the stream's write() method, and calls flush()
+// on the stream, when it has one, once the pipeline is finished.
+//
+// Both write() and finish() acquire the GIL before touching the stream.
+class Pl_PythonOutput : public Pipeline
+{
+public:
+    // *identifier* is passed to the Pipeline base class and used when
+    // reporting write failures; *stream* is the Python object written to.
+    Pl_PythonOutput(const char *identifier, py::object stream) :
+        Pipeline(identifier, nullptr),
+        stream(stream)
+    {
+    }
+
+    virtual ~Pl_PythonOutput() = default;
+    Pl_PythonOutput(const Pl_PythonOutput&) = delete;
+    Pl_PythonOutput& operator= (const Pl_PythonOutput&) = delete;
+    Pl_PythonOutput(Pl_PythonOutput&&) = delete;
+    Pl_PythonOutput& operator= (Pl_PythonOutput&&) = delete;
+
+    // Write *len* bytes starting at *buf* to the stream, retrying after
+    // partial writes until everything has been consumed.
+    void write(unsigned char *buf, size_t len)
+    {
+        py::gil_scoped_acquire gil;
+        while (len > 0) {
+            // Expose the remaining bytes to Python without copying
+            py::buffer_info buffer(buf, len);
+            py::memoryview view_buffer(buffer);
+            py::object result = this->stream.attr("write")(view_buffer);
+            size_t written = 0;
+            try {
+                written = result.cast<size_t>();
+            } catch (const py::cast_error &) {
+                throw py::type_error("Unexpected return type of write()");
+            }
+            if (written == 0) {
+                // No progress was made; report it as a system error
+                QUtil::throw_system_error(this->identifier);
+            }
+            if (written > len) {
+                // A misbehaving stream must not make len wrap around and
+                // send us reading past the end of the caller's buffer
+                throw py::value_error("write() reported more bytes than were provided");
+            }
+            buf += written;
+            len -= written;
+        }
+    }
+
+    // Flush the stream if it provides flush(); errors raised by flush()
+    // itself propagate to the caller.
+    void finish()
+    {
+        py::gil_scoped_acquire gil;
+        // A failed Python attribute lookup surfaces as py::error_already_set,
+        // so test for the method instead of trying to catch its absence
+        if (py::hasattr(this->stream, "flush")) {
+            this->stream.attr("flush")();
+        }
+    }
+
+private:
+    py::object stream;
+};