summary refs log tree commit diff
path: root/src/pikepdf
diff options
context:
space:
mode:
Diffstat (limited to 'src/pikepdf')
-rw-r--r-- src/pikepdf/__init__.py 42
-rw-r--r-- src/pikepdf/_cpphelpers.py 20
-rw-r--r-- src/pikepdf/_methods.py 319
-rw-r--r-- src/pikepdf/_version.py 13
-rw-r--r-- src/pikepdf/codec.py 48
-rw-r--r-- src/pikepdf/models/__init__.py 17
-rw-r--r-- src/pikepdf/models/encryption.py 154
-rw-r--r-- src/pikepdf/models/image.py 247
-rw-r--r-- src/pikepdf/models/matrix.py 16
-rw-r--r-- src/pikepdf/models/metadata.py 86
-rw-r--r-- src/pikepdf/objects.py 49
11 files changed, 819 insertions, 192 deletions
diff --git a/src/pikepdf/__init__.py b/src/pikepdf/__init__.py
index 2e42605..8de467a 100644
--- a/src/pikepdf/__init__.py
+++ b/src/pikepdf/__init__.py
@@ -4,38 +4,46 @@
#
# Copyright (C) 2017, James R. Barlow (https://github.com/jbarlow83/)
-
-from pkg_resources import (
- get_distribution as _get_distribution,
- DistributionNotFound
-)
+"""A library for manipulating PDFs"""
try:
from . import _qpdf
except ImportError:
raise ImportError("pikepdf's extension library failed to import")
-from ._qpdf import (
- PdfError, Pdf, PasswordError, ObjectStreamMode, StreamDecodeLevel
-)
+from ._version import __version__
+from ._qpdf import PdfError, Pdf, PasswordError, ObjectStreamMode, StreamDecodeLevel
from .objects import (
- Object, ObjectType, Name, String, Array, Dictionary, Stream, Operator
+ Object,
+ ObjectType,
+ Name,
+ String,
+ Array,
+ Dictionary,
+ Stream,
+ Operator,
)
from .models import (
- PdfImage, PdfInlineImage, UnsupportedImageTypeError, PdfMatrix,
- parse_content_stream
+ PdfImage,
+ PdfInlineImage,
+ UnsupportedImageTypeError,
+ PdfMatrix,
+ Encryption,
+ Permissions,
+ parse_content_stream,
)
from . import _methods
-
-try:
- __version__ = _get_distribution(__name__).version
-except DistributionNotFound:
- __version__ = "Not installed"
+from . import codec
__libqpdf_version__ = _qpdf.qpdf_version()
def open(*args, **kwargs): # pylint: disable=redefined-builtin
- "Alias for :func:`pikepdf.Pdf.open`."
+ """Alias for :func:`pikepdf.Pdf.open`. Open a PDF."""
return Pdf.open(*args, **kwargs)
+
+
+def new(*args, **kwargs):
+ """Alias for :func:`pikepdf.Pdf.new`. Create a new empty PDF."""
+ return Pdf.new(*args, **kwargs)
diff --git a/src/pikepdf/_cpphelpers.py b/src/pikepdf/_cpphelpers.py
index d975657..7ef0654 100644
--- a/src/pikepdf/_cpphelpers.py
+++ b/src/pikepdf/_cpphelpers.py
@@ -12,12 +12,13 @@ called from Python, and subject to change at any time.
import os
import sys
-
# Provide os.fspath equivalent for Python <3.6
if sys.version_info[0:2] <= (3, 5): # pragma: no cover
+
def fspath(path):
'''https://www.python.org/dev/peps/pep-0519/#os'''
import pathlib
+
if isinstance(path, (str, bytes)):
return path
@@ -36,12 +37,23 @@ if sys.version_info[0:2] <= (3, 5): # pragma: no cover
if isinstance(path, (str, bytes)):
return path
else:
- raise TypeError("expected __fspath__() to return str or bytes, "
- "not " + type(path).__name__)
+ raise TypeError(
+ "expected __fspath__() to return str or bytes, "
+ "not " + type(path).__name__
+ )
raise TypeError(
"expected str, bytes, pathlib.Path or os.PathLike object, not "
- + path_type.__name__)
+ + path_type.__name__
+ )
+
else:
fspath = os.fspath
+
+
+def update_xmp_pdfversion(pdf, version):
+
+ with pdf.open_metadata(set_pikepdf_as_editor=False, update_docinfo=False) as meta:
+ if 'pdf:PDFVersion' in meta:
+ meta['pdf:PDFVersion'] = version
diff --git a/src/pikepdf/_methods.py b/src/pikepdf/_methods.py
index 6c2b90b..ac6134c 100644
--- a/src/pikepdf/_methods.py
+++ b/src/pikepdf/_methods.py
@@ -12,22 +12,21 @@ bindings after the fact.
We can also move the implementation to C++ if desired.
"""
-from tempfile import NamedTemporaryFile
-from subprocess import run, PIPE
-from io import BytesIO
-
-from collections.abc import KeysView
-
import inspect
+from collections import namedtuple
+from collections.abc import KeysView
+from io import BytesIO
+from subprocess import PIPE, run
+from tempfile import NamedTemporaryFile
-from . import Pdf, Dictionary, Array, Name, Stream, Object
+from . import Array, Dictionary, Name, Object, Pdf, Stream
from ._qpdf import _ObjectMapping
-from .models import PdfMetadata
-
+from .models import PdfMetadata, Permissions, EncryptionInfo
# pylint: disable=no-member,unsupported-membership-test,unsubscriptable-object
-def extends(cls_cpp):
+
+def augments(cls_cpp):
"""Attach methods of a Python support class to an existing class
This monkeypatches all methods defined in the support class onto an
@@ -35,37 +34,44 @@ def extends(cls_cpp):
.. code-block:: python
- @extends(ClassDefinedInCpp)
+ @augments(ClassDefinedInCpp)
class SupportClass:
def foo(self):
pass
- The method 'foo' will be monkeypatched on ClassDefinedInCpp. SupportClass
+ The Python method 'foo' will be monkeypatched on ClassDefinedInCpp. SupportClass
has no meaning on its own and should not be used, but gets returned from
this function so IDE code inspection doesn't get too confused.
We don't subclass because it's much more convenient to monkeypatch Python
methods onto the existing Python binding of the C++ class. For one thing,
this allows the implementation to be moved from Python to C++ or vice
- versa. It saves having to implement an intermediate subclass and then
- ensures that the superclass never 'leaks' to pikepdf users.
+ versa. It saves having to implement an intermediate Python subclass and then
+ ensures that the C++ superclass never 'leaks' to pikepdf users. Finally,
+ wrapper classes and subclasses can become problematic if the call stack
+ crosses the C++/Python boundary multiple times.
Any existing methods may be used, regardless of whether they defined
elsewhere in the support class or in the target class.
+
+ The target class does not have to be C++ or derived from pybind11.
"""
- def real_class_extend(cls, cls_cpp=cls_cpp):
+ def class_augment(cls, cls_cpp=cls_cpp):
for name, fn in inspect.getmembers(cls, inspect.isfunction):
- fn.__qualname__ = fn.__qualname__.replace(
- cls.__name__, cls_cpp.__name__)
+ fn.__qualname__ = fn.__qualname__.replace(cls.__name__, cls_cpp.__name__)
setattr(cls_cpp, name, fn)
for name, fn in inspect.getmembers(cls, inspect.isdatadescriptor):
setattr(cls_cpp, name, fn)
+
def block_init(self):
+ # Prevent initialization of the support class
raise NotImplementedError(self.__class__.__name__ + '.__init__')
+
cls.__init__ = block_init
return cls
- return real_class_extend
+
+ return class_augment
def _single_page_pdf(page):
@@ -86,17 +92,15 @@ def _mudraw(buffer, fmt):
tmp_in.flush()
proc = run(
- ['mudraw', '-F', fmt, '-o', '-', tmp_in.name],
- stdout=PIPE, stderr=PIPE
+ ['mudraw', '-F', fmt, '-o', '-', tmp_in.name], stdout=PIPE, stderr=PIPE
)
if proc.stderr:
raise RuntimeError(proc.stderr.decode())
return proc.stdout
-@extends(Object)
+@augments(Object)
class Extend_Object:
-
def _repr_mimebundle_(self, **kwargs):
"""Present options to IPython for rich display of this object
@@ -127,13 +131,116 @@ class Extend_Object:
pass
return data
+ def emplace(self, other):
+ """Copy all items from other without making a new object.
-@extends(Pdf)
-class Extend_Pdf:
+ Particularly when working with pages, it may be desirable to remove all
+ of the existing page's contents and emplace (insert) a new page on top
+ of it, in a way that preserves all links and references to the original
+ page. (Or similarly, for other Dictionary objects in a PDF.)
- def _repr_mimebundle_(self, **kwargs):
+ When a page is assigned (``pdf.pages[0] = new_page``), only the
+ application knows if references to the original page are
+ still valid. For example, a PDF optimizer might restructure a page
+ object into another visually similar one, and references would be valid;
+ but for a program that reorganizes page contents such as a N-up
+ compositor, references may not be valid anymore.
+
+ This method takes precautions to ensure that child objects in common
+ with ``self`` and ``other`` are not inadvertently deleted.
+
+ Example:
+ >>> pdf.pages[0].objgen
+ (16, 0)
+ >>> pdf.pages[0].emplace(pdf.pages[1])
+ >>> pdf.pages[0].objgen
+ (16, 0) # Same object
+ """
+ del_keys = set(self.keys()) - set(other.keys())
+ for k in other.keys():
+ self[k] = other[k] # pylint: disable=unsupported-assignment-operation
+ for k in del_keys:
+ del self[k] # pylint: disable=unsupported-delete-operation
+
+ def write(self, data, *, filter=None, decode_parms=None, type_check=True):
+ """
+ Replace stream object's data with new (possibly compressed) `data`.
+
+ `filter` and `decode_parms` specify the compression that is present on
+ the input `data`.
+
+ When writing the PDF in :meth:`pikepdf.Pdf.save`,
+ pikepdf may change the compression or apply compression to data that was
+ not compressed, depending on the parameters given to that function. It
+ will never change lossless to lossy encoding.
+
+ PNG and TIFF images, even if compressed, cannot be directly inserted
+ into a PDF and displayed as images.
+
+ Args:
+ data (bytes): the new data to use for replacement
+ filter (pikepdf.Name or pikepdf.Array): The filter(s) with which the
+ data is (already) encoded
+ decode_parms (pikepdf.Dictionary or pikepdf.Array): Parameters for the
+ filters with which the object is encoded
+ type_check (bool): Check arguments; use False only if you want to
+ intentionally create malformed PDFs.
+
+ If only one `filter` is specified, it may be a name such as
+ `Name('/FlateDecode')`. If there are multiple filters, then an array
+ of names should be given.
+
+ If there is only one filter, `decode_parms` is a Dictionary of
+ parameters for that filter. If there are multiple filters, then
+ `decode_parms` is an Array of Dictionary, where each array index
+ corresponds to the filter.
"""
- Present options to IPython for rich display of this object
+
+ if type_check and filter is not None:
+ if isinstance(filter, list):
+ filter = Array(filter)
+ filter = filter.wrap_in_array()
+
+ if isinstance(decode_parms, list):
+ decode_parms = Array(decode_parms)
+ elif decode_parms is None:
+ decode_parms = Array([])
+ else:
+ decode_parms = decode_parms.wrap_in_array()
+
+ if not all(isinstance(item, Name) for item in filter):
+ raise TypeError(
+ "filter must be: pikepdf.Name or pikepdf.Array([pikepdf.Name])"
+ )
+ if not all(
+ (isinstance(item, Dictionary) or item is None) for item in decode_parms
+ ):
+ raise TypeError(
+ "decode_parms must be: pikepdf.Dictionary or "
+ "pikepdf.Array([pikepdf.Dictionary])"
+ )
+ if len(decode_parms) != 0:
+ if len(filter) != len(decode_parms):
+ raise ValueError(
+ (
+ "filter ({}) and decode_parms ({}) must be arrays of "
+ " same length"
+ ).format(repr(filter), repr(decode_parms))
+ )
+ if len(filter) == 1:
+ filter = filter[0]
+ if len(decode_parms) == 0:
+ decode_parms = None
+ elif len(decode_parms) == 1:
+ decode_parms = decode_parms[0]
+ self._write(data, filter=filter, decode_parms=decode_parms)
+
+
+@augments(Pdf)
+class Extend_Pdf:
+ def _repr_mimebundle_(self, **_kwargs):
+ """
+ Present options to IPython or Jupyter for rich display of this object
See https://ipython.readthedocs.io/en/stable/config/integrating.html#rich-display
"""
@@ -145,16 +252,12 @@ class Extend_Pdf:
data = {'application/pdf': bio.read()}
return data
- def open_metadata(
- self,
- set_pikepdf_as_editor=True,
- update_docinfo=True
- ):
+ def open_metadata(self, set_pikepdf_as_editor=True, update_docinfo=True):
"""
Open the PDF's XMP metadata for editing
Recommend for use in a ``with`` block. Changes are committed to the
- PDF when the block exits.
+ PDF when the block exits. (The ``Pdf`` must still be open.)
Example:
>>> with pdf.open_metadata() as meta:
@@ -173,11 +276,128 @@ class Extend_Pdf:
pikepdf.models.PdfMetadata
"""
return PdfMetadata(
- self,
- pikepdf_mark=set_pikepdf_as_editor,
- sync_docinfo=update_docinfo
+ self, pikepdf_mark=set_pikepdf_as_editor, sync_docinfo=update_docinfo
+ )
+
+ def make_stream(self, data):
+ """
+ Create a new pikepdf.Stream object that is attached to this PDF.
+
+ Args:
+ data (bytes): Binary data for the stream object
+ """
+ return Stream(self, data)
+
+ def add_blank_page(self, *, page_size=(612, 792)):
+ """
+ Add a blank page to this PDF. If pages already exist, the page will be added to
+ the end. Pages may be reordered using ``Pdf.pages``.
+
+ The caller may add content to the page by modifying its objects after creating
+ it.
+
+ Args:
+ page_size (tuple): The size of the page in PDF units (1/72 inch or 0.35mm).
+ Default size is set to a US Letter 8.5" x 11" page.
+ """
+ for dim in page_size:
+ if not (3 <= dim <= 14400):
+ raise ValueError('Page size must be between 3 and 14400 PDF units')
+
+ page_dict = Dictionary(
+ Type=Name.Page,
+ MediaBox=Array([0, 0, page_size[0], page_size[1]]),
+ Contents=self.make_stream(b''),
+ Resources=Dictionary(),
+ )
+ page = self.make_indirect(page_dict)
+ self._add_page(page, first=False)
+ return page
+
+ def close(self):
+ """
+ Close a Pdf object and release resources acquired by pikepdf
+
+ If pikepdf opened the file handle it will close it (e.g. when opened with a file
+ path). If the caller opened the file for pikepdf, the caller must close the file.
+
+ pikepdf lazily loads data from PDFs, so some :class:`pikepdf.Object` may
+ implicitly depend on the :class:`pikepdf.Pdf` being open. This is always the
+ case for :class:`pikepdf.Stream` but can be true for any object. Do not close
+ the `Pdf` object if you might still be accessing content from it.
+
+ When an ``Object`` is copied from one ``Pdf`` to another, the ``Object`` is copied into
+ the destination ``Pdf`` immediately, so after accessing all desired information
+ from the source ``Pdf`` it may be closed.
+
+ Caution:
+ Closing the ``Pdf`` is currently implemented by resetting it to an empty
+ sentinel. It is currently possible to edit the sentinel as if it were a live
+ object. This behavior should not be relied on and is subject to change.
+
+ """
+
+ EMPTY_PDF = (
+ b"%PDF-1.3\n"
+ b"1 0 obj\n"
+ b"<< /Type /Catalog /Pages 2 0 R >>\n"
+ b"endobj\n"
+ b"2 0 obj\n"
+ b"<< /Type /Pages /Kids [] /Count 0 >>\n"
+ b"endobj\n"
+ b"xref\n"
+ b"0 3\n"
+ b"0000000000 65535 f \n"
+ b"0000000009 00000 n \n"
+ b"0000000058 00000 n \n"
+ b"trailer << /Size 3 /Root 1 0 R >>\n"
+ b"startxref\n"
+ b"110\n"
+ b"%%EOF\n"
)
+ if self.filename:
+ description = "closed file: " + self.filename
+ else:
+ description = "closed object"
+ self._process(description, EMPTY_PDF)
+
+ def __enter__(self):
+ return self
+
+ def __exit__(self, exc_type, exc_value, traceback):
+ self.close()
+
+ @property
+ def allow(self):
+ """
+ Report permissions associated with this PDF.
+
+ By default these permissions will be replicated when the PDF is
+ saved. Permissions may also only be changed when a PDF is being saved,
+ and are only available for encrypted PDFs. If a PDF is not encrypted,
+ all operations are reported as allowed.
+
+ pikepdf has no way of enforcing permissions.
+
+ Returns: pikepdf.models.Permissions
+ """
+ results = {}
+ for field in Permissions.fields():
+ results[field] = getattr(self, '_allow_' + field)
+ return Permissions(**results)
+
+ @property
+ def encryption(self):
+ """
+ Report encryption information for this PDF.
+
+ Encryption settings may only be changed when a PDF is saved.
+
+ Returns: pikepdf.models.EncryptionInfo
+ """
+ return EncryptionInfo(self._encryption_data)
+
def _attach(self, *, basename, filebytes, mime=None, desc=''):
"""
Attach a file to this PDF
@@ -219,6 +439,7 @@ class Extend_Pdf:
if not mime:
from mimetypes import guess_type
+
mime, _encoding = guess_type(basename)
if not mime:
mime = 'application/octet-stream'
@@ -226,28 +447,28 @@ class Extend_Pdf:
filestream = Stream(self, filebytes)
filestream.Subtype = Name('/' + mime)
- filespec = Dictionary({
- '/Type': Name.Filespec,
- '/F': basename,
- '/UF': basename,
- '/Desc': desc,
- '/EF': Dictionary({
- '/F': filestream
- })
- })
+ filespec = Dictionary(
+ {
+ '/Type': Name.Filespec,
+ '/F': basename,
+ '/UF': basename,
+ '/Desc': desc,
+ '/EF': Dictionary({'/F': filestream}),
+ }
+ )
# names = self.Root.Names.EmbeddedFiles.Names.as_list()
# names.append(filename) # Key
# names.append(self.make_indirect(filespec))
- self.Root.Names.EmbeddedFiles.Names = Array([
- basename, # key
- self.make_indirect(filespec)
- ])
+ self.Root.Names.EmbeddedFiles.Names = Array(
+ [basename, self.make_indirect(filespec)] # key
+ )
if '/PageMode' not in self.Root:
self.Root.PageMode = Name.UseAttachments
-@extends(_ObjectMapping)
+
+@augments(_ObjectMapping)
class Extend_ObjectMapping:
def __contains__(self, key):
try:
diff --git a/src/pikepdf/_version.py b/src/pikepdf/_version.py
new file mode 100644
index 0000000..c9d4b7b
--- /dev/null
+++ b/src/pikepdf/_version.py
@@ -0,0 +1,13 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+# Copyright (C) 2017, James R. Barlow (https://github.com/jbarlow83/)
+
+from pkg_resources import DistributionNotFound
+from pkg_resources import get_distribution as _get_distribution
+
+try:
+ __version__ = _get_distribution(__package__).version
+except DistributionNotFound:
+ __version__ = "Not installed"
diff --git a/src/pikepdf/codec.py b/src/pikepdf/codec.py
new file mode 100644
index 0000000..d008fb2
--- /dev/null
+++ b/src/pikepdf/codec.py
@@ -0,0 +1,48 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+# Copyright (C) 2017, James R. Barlow (https://github.com/jbarlow83/)
+
+import codecs
+
+from ._qpdf import utf8_to_pdf_doc, pdf_doc_to_utf8
+
+
+def pdfdoc_encode(input, errors='strict'):
+ error_marker = b'?' if errors == 'replace' else b'\xad'
+ success, pdfdoc = utf8_to_pdf_doc(input, error_marker)
+ if not success:
+ if errors == 'strict':
+ raise ValueError("'pdfdoc' codec can't encode")
+ if errors == 'ignore':
+ pdfdoc = pdfdoc.replace(b'\xad', b'')
+ return pdfdoc, len(input)
+
+
+def pdfdoc_decode(input, errors='strict'):
+ if isinstance(input, memoryview):
+ input = input.tobytes()
+ utf8 = pdf_doc_to_utf8(input)
+ return utf8, len(input)
+
+
+class PdfDocCodec(codecs.Codec):
+ """Implements PdfDocEncoding character map used inside PDFs"""
+
+ def encode(self, input, errors='strict'):
+ return pdfdoc_encode(input, errors)
+
+ def decode(self, input, errors='strict'):
+ return pdfdoc_decode(input, errors)
+
+
+def find_pdfdoc(encoding):
+ if encoding == 'pdfdoc':
+ return codecs.CodecInfo(
+ name='pdfdoc', encode=PdfDocCodec().encode, decode=PdfDocCodec().decode
+ )
+ return None
+
+
+codecs.register(find_pdfdoc)
diff --git a/src/pikepdf/models/__init__.py b/src/pikepdf/models/__init__.py
index b0d27bc..023b836 100644
--- a/src/pikepdf/models/__init__.py
+++ b/src/pikepdf/models/__init__.py
@@ -4,10 +4,11 @@
#
# Copyright (C) 2017, James R. Barlow (https://github.com/jbarlow83/)
-from .. import Object, ObjectType, PdfError
-from .matrix import PdfMatrix
+from pikepdf import Object, ObjectType, PdfError
+from .encryption import Permissions, Encryption, EncryptionInfo
from .image import PdfImage, PdfInlineImage, UnsupportedImageTypeError
+from .matrix import PdfMatrix
from .metadata import PdfMetadata
@@ -50,8 +51,10 @@ def parse_content_stream(page_or_stream, operators=''):
if not isinstance(page_or_stream, Object):
raise TypeError("stream must a PDF object")
- if page_or_stream._type_code != ObjectType.stream \
- and page_or_stream.get('/Type') != '/Page':
+ if (
+ page_or_stream._type_code != ObjectType.stream
+ and page_or_stream.get('/Type') != '/Page'
+ ):
raise TypeError("parse_content_stream called on page or stream object")
try:
@@ -87,8 +90,7 @@ class _Page:
raise AttributeError(item)
def __repr__(self):
- return repr(self.obj).replace(
- 'pikepdf.Dictionary', 'pikepdf.Page', 1)
+ return repr(self.obj).replace('pikepdf.Dictionary', 'pikepdf.Page', 1)
@property
def mediabox(self):
@@ -107,8 +109,7 @@ class _Page:
:return: True if there is text
"""
text_showing_operators = """TJ " ' Tj"""
- text_showing_insts = parse_content_stream(
- self.obj, text_showing_operators)
+ text_showing_insts = parse_content_stream(self.obj, text_showing_operators)
if len(text_showing_insts) > 0:
return True
return False
diff --git a/src/pikepdf/models/encryption.py b/src/pikepdf/models/encryption.py
new file mode 100644
index 0000000..c61df71
--- /dev/null
+++ b/src/pikepdf/models/encryption.py
@@ -0,0 +1,154 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+# Copyright (C) 2017, James R. Barlow (https://github.com/jbarlow83/)
+
+import collections
+import types
+
+
+class Permissions(types.SimpleNamespace):
+ """
+ Stores the permissions for an encrypted PDF.
+
+ Unencrypted PDFs implicitly have all permissions allowed.
+ pikepdf does not enforce the restrictions in any way. Permissions
+ can only be changed when a PDF is saved.
+ """
+
+ def __init__(
+ self,
+ accessibility=True,
+ extract=True,
+ modify_annotation=True,
+ modify_assembly=False,
+ modify_form=True,
+ modify_other=True,
+ print_lowres=True,
+ print_highres=True,
+ ):
+ kvs = locals()
+ del kvs['self']
+ super().__init__(**kvs)
+
+ def _readonly(self, *args):
+ raise TypeError("object is read-only")
+
+ __setattr__ = _readonly
+
+ __delattr__ = _readonly
+
+ def keys(self):
+ yield from (k for k in self.__dict__ if not k.startswith('_'))
+
+ def values(self):
+ yield from (v for k, v in self.__dict__.items() if not k.startswith('_'))
+
+ @classmethod
+ def fields(cls):
+ yield from (k for k in cls().__dict__ if not k.startswith('_'))
+
+
+class EncryptionInfo:
+ """
+ Reports encryption information for an encrypted PDF.
+
+ This information may not be changed, except when a PDF is saved.
+ This object is not used to specify the encryption settings to save
+ a PDF, due to non-overlapping information requirements.
+ """
+
+ def __init__(self, encdict):
+ self._encdict = encdict
+
+ @property
+ def R(self):
+ """Revision number of the security handler."""
+ return self._encdict['R']
+
+ @property
+ def V(self):
+ """Version of PDF password algorithm."""
+ return self._encdict['V']
+
+ @property
+ def P(self):
+ """Encoded permission bits.
+
+ See :meth:`Pdf.allow` instead.
+ """
+ return self._encdict['P']
+
+ @property
+ def stream_method(self):
+ """Encryption method used to encode streams."""
+ return self._encdict['stream']
+
+ @property
+ def string_method(self):
+ """Encryption method used to encode strings."""
+ return self._encdict['string']
+
+ @property
+ def file_method(self):
+ """Encryption method used to encode the whole file."""
+ return self._encdict['file']
+
+ @property
+ def user_password(self):
+ """If possible, return the user password.
+
+ The user password can only be retrieved when a PDF is opened
+ with the owner password and when older versions of the
+ encryption algorithm are used.
+
+ The password is always returned as ``bytes`` even if it has
+ a clear Unicode representation.
+ """
+ return self._encdict['user_passwd']
+
+ @property
+ def encryption_key(self):
+ """The RC4 or AES encryption key used for this file."""
+ return self._encdict['encryption_key']
+
+ @property
+ def bits(self):
+ """The number of encryption bits."""
+ return len(self._encdict['encryption_key']) * 8
+
+
+class Encryption(dict):
+ """
+ Specify the encryption settings to apply when a PDF is saved.
+
+ Args:
+ owner (str): The owner password to use. This allows full control
+ of the file. If blank, the PDF will be encrypted and
+ present as "(SECURED)" in PDF viewers. If the owner password
+ is blank, the user password should be as well.
+ user (str): The user password to use. With this password, some
+ restrictions will be imposed by a typical PDF reader.
+ If blank, the PDF can be opened by anyone, but only modified
+ as allowed by the permissions in ``allow``.
+ R (int): Select the security handler algorithm to use. Choose from:
+ ``2``, ``3``, ``4`` or ``6``. By default, the highest version
+ is selected (``6``). ``5`` is a deprecated algorithm that should
+ not be used.
+ allow (pikepdf.Permissions): The permissions to set.
+ If omitted, the defaults of ``pikepdf.Permissions`` are used.
+ aes (bool): If True, request the AES algorithm. If False, use RC4.
+ If omitted, AES is selected whenever possible (R >= 4).
+ metadata (bool): If True, also encrypt the PDF metadata. If False,
+ metadata is not encrypted. Reading document metadata without
+ decryption may be desirable in some cases. Requires ``aes=True``.
+ If omitted, metadata is encrypted whenever possible.
+ """
+
+ def __init__(
+ self, *, owner, user, R=6, allow=Permissions(), aes=True, metadata=True
+ ):
+ self.update(
+ dict(R=R, owner=owner, user=user, allow=allow, aes=aes, metadata=metadata)
+ )
diff --git a/src/pikepdf/models/image.py b/src/pikepdf/models/image.py
index 8ecb571..6493d85 100644
--- a/src/pikepdf/models/image.py
+++ b/src/pikepdf/models/image.py
@@ -4,20 +4,23 @@
#
# Copyright (C) 2017, James R. Barlow (https://github.com/jbarlow83/)
+import struct
+from abc import ABC, abstractmethod
+from decimal import Decimal
from io import BytesIO
from itertools import zip_longest
-from abc import ABC, abstractmethod
-import struct
+from pathlib import Path
+from shutil import copyfileobj
+from tempfile import NamedTemporaryFile
+from zlib import decompress, error as ZlibError
-from decimal import Decimal
+from .. import Array, Dictionary, Name, Object, PdfError, Stream
-from .. import (
- Object, Array, PdfError, Name, Dictionary, Stream
-)
class DependencyError(Exception):
pass
+
class UnsupportedImageTypeError(Exception):
pass
@@ -37,7 +40,11 @@ def array_str_colorspace(value):
result = [str(items[n]) for n in range(3)]
result.append(bytes(items[3]))
return result
+ if len(items) == 2 and items[0] == '/ICCBased':
+ result = [str(items[0]), items[1]]
+ return result
return array_str(items)
+
return array_str(value)
@@ -112,16 +119,16 @@ class PdfImageBase(ABC):
if self._colorspaces:
if self._colorspaces[0] in self.SIMPLE_COLORSPACES:
return self._colorspaces[0]
- if self._colorspaces[0] == '/DeviceCMYK':
+ if self._colorspaces[0] in ('/DeviceCMYK', '/ICCBased'):
return self._colorspaces[0]
- if self._colorspaces[0] == '/Indexed' \
- and self._colorspaces[1] in self.SIMPLE_COLORSPACES:
+ if (
+ self._colorspaces[0] == '/Indexed'
+ and self._colorspaces[1] in self.SIMPLE_COLORSPACES
+ ):
return self._colorspaces[1]
- if self._colorspaces[0] == '/ICCBased':
- icc = self._colorspaces[1]
- return icc.stream_dict.get('/Alternate', '')
raise NotImplementedError(
- "not sure how to get colorspace: " + repr(self._colorspaces))
+ "not sure how to get colorspace: " + repr(self._colorspaces)
+ )
@property
def bits_per_component(self):
@@ -136,6 +143,11 @@ class PdfImageBase(ABC):
pass
@property
+ @abstractmethod
+ def icc(self):
+ pass
+
+ @property
def indexed(self):
"""``True`` if the image has a defined color palette"""
return '/Indexed' in self._colorspaces
@@ -147,7 +159,12 @@ class PdfImageBase(ABC):
@property
def mode(self):
- """``PIL.Image.mode`` equivalent for this image"""
+ """``PIL.Image.mode`` equivalent for this image, where possible
+
+ If an ICC profile is attached to the image, we still attempt to resolve a Pillow
+ mode.
+ """
+
m = ''
if self.indexed:
m = 'P'
@@ -160,6 +177,18 @@ class PdfImageBase(ABC):
m = 'L'
elif self.colorspace == '/DeviceCMYK':
m = 'CMYK'
+ elif self.colorspace == '/ICCBased':
+ try:
+ icc_profile = self._colorspaces[1]
+ icc_profile_nchannels = int(icc_profile['/N'])
+ if icc_profile_nchannels == 1:
+ m = 'L'
+ elif icc_profile_nchannels == 3:
+ m = 'RGB'
+ elif icc_profile_nchannels == 4:
+ m = 'CMYK'
+ except (ValueError, TypeError):
+ pass
if m == '':
raise NotImplementedError("Not sure how to handle PDF image of this type")
return m
@@ -175,7 +204,6 @@ class PdfImageBase(ABC):
[(/FilterName, {/DecodeParmName: Value, ...}), ...]
The order of /Filter matters as indicates the encoding/decoding sequence.
-
"""
return list(zip_longest(self.filters, self.decode_parms, fillvalue={}))
@@ -183,8 +211,8 @@ class PdfImageBase(ABC):
def palette(self):
"""Retrieves the color palette for this image
- :returns: (base_colorspace: str, palette: bytes)
- :rtype: tuple
+ Returns:
+ tuple (base_colorspace: str, palette: bytes)
"""
if not self.indexed:
@@ -209,6 +237,29 @@ class PdfImageBase(ABC):
def as_pil_image(self):
pass
+ @staticmethod
+ def _unstack_compression(buffer, filters):
+ """Remove stacked compression where it appears.
+
+ Stacked compression means when an image is set to:
+ ``[/FlateDecode /DCTDecode]``
+ for example.
+
+ Only Flate can be stripped off the front currently.
+
+ Args:
+ buffer (pikepdf._qpdf.Buffer): the compressed image data
+ filters (list of str): all filters on the data
+ """
+ data = memoryview(buffer)
+ while len(filters) > 1 and filters[0] == '/FlateDecode':
+ try:
+ data = decompress(data)
+ except ZlibError as e:
+ raise UnsupportedImageTypeError() from e
+ filters = filters[1:]
+ return data, filters
+
class PdfImage(PdfImageBase):
"""Support class to provide a consistent API for manipulating PDF images
@@ -237,21 +288,20 @@ class PdfImage(PdfImageBase):
obj (pikepdf.Object): an Image XObject
"""
- if isinstance(obj, Stream) and \
- obj.stream_dict.get("/Subtype") != "/Image":
+ if isinstance(obj, Stream) and obj.stream_dict.get("/Subtype") != "/Image":
raise TypeError("can't construct PdfImage from non-image")
self.obj = obj
+ self._icc = None
@classmethod
def _from_pil_image(cls, *, pdf, page, name, image): # pragma: no cover
"""Insert a PIL image into a PDF (rudimentary)
- :param pdf: the PDF to attach the image to
- :type pdf: pikepdf.Pdf
- :param page: the page to attach the image to
- :param name: the name to set the image
- :param image: image
- :type image: PIL.Image.Image
+ Args:
+ pdf (pikepdf.Pdf): the PDF to attach the image to
+ page (pikepdf.Object): the page to attach the image to
+ name (str or pikepdf.Name): the name to set the image
+ image (PIL.Image.Image): the image to insert
"""
data = image.tobytes()
@@ -279,6 +329,26 @@ class PdfImage(PdfImageBase):
"""``False`` for image XObject"""
return False
+ @property
+ def icc(self):
+ """If an ICC profile is attached, return a Pillow object that describes it.
+
+ Most of the information may be found in ``icc.profile``.
+
+ Returns:
+ PIL.ImageCms.ImageCmsProfile
+ """
+ from PIL import ImageCms
+
+ if self.colorspace != '/ICCBased':
+ return None
+ if not self._icc:
+ iccstream = self._colorspaces[1]
+ iccbuffer = iccstream.get_stream_buffer()
+ iccbytesio = BytesIO(iccbuffer)
+ self._icc = ImageCms.ImageCmsProfile(iccbytesio)
+ return self._icc
+
def _extract_direct(self, *, stream):
"""Attempt to extract the image directly to a usable image file
@@ -296,30 +366,38 @@ class PdfImage(PdfImageBase):
# saved as a standard JPEG. RGB JPEGs without YUV conversion can't
# be saved as JPEGs, and are probably bugs. Some software in the
# wild actually produces RGB JPEGs in PDFs (probably a bug).
- return (self.mode == 'RGB' and
- self.filter_decodeparms[0][1].get('/ColorTransform', 1))
+ DEFAULT_CT_RGB = 1
+ ct = self.filter_decodeparms[0][1].get('/ColorTransform', DEFAULT_CT_RGB)
+ return self.mode == 'RGB' and ct == DEFAULT_CT_RGB
def normal_dct_cmyk():
# Normal DCTDecode CMYKs have /ColorTransform 0 and can be saved.
# There is a YUVK colorspace but CMYK JPEGs don't generally use it
- return (self.mode == 'CMYK' and
- self.filter_decodeparms[0][1].get('/ColorTransform', 0))
+ DEFAULT_CT_CMYK = 0
+ ct = self.filter_decodeparms[0][1].get('/ColorTransform', DEFAULT_CT_CMYK)
+ return self.mode == 'CMYK' and ct == DEFAULT_CT_CMYK
+
+ data, filters = self._unstack_compression(
+ self.obj.get_raw_stream_buffer(), self.filters
+ )
- if self.filters == ['/CCITTFaxDecode']:
- data = self.obj.read_raw_bytes()
+ if filters == ['/CCITTFaxDecode']:
+ if self.colorspace == '/ICCBased':
+ raise UnsupportedImageTypeError("Cannot direct-extract CCITT + ICC")
stream.write(self._generate_ccitt_header(data))
stream.write(data)
return '.tif'
- elif self.filters == ['/DCTDecode'] and (
- self.mode == 'L' or normal_dct_rgb() or normal_dct_cmyk):
- buffer = self.obj.get_raw_stream_buffer()
- stream.write(buffer)
+ elif filters == ['/DCTDecode'] and (
+ self.mode == 'L' or normal_dct_rgb() or normal_dct_cmyk()
+ ):
+ stream.write(data)
return '.jpg'
raise UnsupportedImageTypeError()
def _extract_transcoded(self):
from PIL import Image
+
im = None
if self.mode == 'RGB' and self.bits_per_component == 8:
# No point in accessing the buffer here, size qpdf decodes to 3-byte
@@ -330,8 +408,7 @@ class PdfImage(PdfImageBase):
buffer = self.get_stream_buffer()
stride = 0 # tell Pillow to calculate stride from line width
ystep = 1 # image is top to bottom in memory
- im = Image.frombuffer('L', self.size, buffer, "raw", 'L', stride,
- ystep)
+ im = Image.frombuffer('L', self.size, buffer, "raw", 'L', stride, ystep)
if self.mode == 'P':
base_mode, palette = self.palette
if base_mode in ('RGB', 'L'):
@@ -347,14 +424,15 @@ class PdfImage(PdfImageBase):
im = Image.frombytes('1', self.size, data)
base_mode, palette = self.palette
- if not (palette == b'\x00\x00\x00\xff\xff\xff'
- or palette == b'\x00\xff'):
- raise NotImplementedError(
- 'monochrome image with nontrivial palette')
+ if not (palette == b'\x00\x00\x00\xff\xff\xff' or palette == b'\x00\xff'):
+ raise NotImplementedError('monochrome image with nontrivial palette')
+
+ if self.colorspace == '/ICCBased':
+ im.info['icc_profile'] = self.icc.tobytes()
return im
- def extract_to(self, *, stream):
+ def _extract_to_stream(self, *, stream):
"""Attempt to extract the image directly to a usable image file
If possible, the compressed data is extracted and inserted into
@@ -386,6 +464,51 @@ class PdfImage(PdfImageBase):
raise UnsupportedImageTypeError(repr(self))
+ def extract_to(self, *, stream=None, fileprefix=''):
+ """Attempt to extract the image directly to a usable image file
+
+ If possible, the compressed data is extracted and inserted into
+ a compressed image file format without transcoding the compressed
+ content. If this is not possible, the data will be decompressed
+ and extracted to an appropriate format.
+
+ Because it is not known until attempted what image format will be
+ extracted, users should not assume what format they are getting back.
+ When saving the image to a file, use a temporary filename, and then
+ rename the file to its final name based on the returned file extension.
+
+ Examples:
+
+ >>> im.extract_to(stream=bytes_io)
+ '.png'
+
+ >>> im.extract_to(fileprefix='/tmp/image00')
+ '/tmp/image00.jpg'
+
+ Args:
+ stream: Writable stream to write data to.
+ fileprefix (str or Path): The path to write the extracted image to,
+ without the file extension.
+
+ Returns:
+ str: If *fileprefix* was provided, then the fileprefix with the
+ appropriate extension. If no *fileprefix*, then an extension
+ indicating the file type.
+ """
+
+ if bool(stream) == bool(fileprefix):
+ raise ValueError("Cannot set both stream and fileprefix")
+ if stream:
+ return self._extract_to_stream(stream=stream)
+
+ bio = BytesIO()
+ extension = self._extract_to_stream(stream=bio)
+ bio.seek(0)
+ filepath = Path(str(Path(fileprefix)) + extension)
+ with filepath.open('wb') as target:
+ copyfileobj(bio, target)
+ return str(filepath)
+
def read_bytes(self):
"""Decompress this image and return it as unencoded bytes"""
return self.obj.read_bytes()
@@ -433,6 +556,7 @@ class PdfImage(PdfImageBase):
img_size = len(data)
tiff_header_struct = '<' + '2s' + 'H' + 'L' + 'H' + 'HHLL' * 8 + 'L'
+ # fmt: off
tiff_header = struct.pack(
tiff_header_struct,
b'II', # Byte order indication: Little endian
@@ -449,6 +573,7 @@ class PdfImage(PdfImageBase):
279, 4, 1, img_size, # StripByteCounts, LONG, 1, size of image
0 # last IFD
)
+ # fmt: on
return tiff_header
def show(self):
@@ -457,7 +582,8 @@ class PdfImage(PdfImageBase):
def __repr__(self):
return '<pikepdf.PdfImage image mode={} size={}x{} at {}>'.format(
- self.mode, self.width, self.height, hex(id(self)))
+ self.mode, self.width, self.height, hex(id(self))
+ )
def _repr_png_(self):
"""Display hook for IPython/Jupyter"""
@@ -468,14 +594,17 @@ class PdfImage(PdfImageBase):
class PdfJpxImage(PdfImage):
-
def __init__(self, obj):
super().__init__(obj)
self.pil = self.as_pil_image()
def _extract_direct(self, *, stream):
- buffer = self.obj.get_raw_stream_buffer()
- stream.write(buffer)
+ data, filters = self._unstack_compression(
+ self.obj.get_raw_stream_buffer(), self.filters
+ )
+ if filters != ['/JPXDecode']:
+ raise UnsupportedImageTypeError(self.filters)
+ stream.write(data)
return '.jp2'
@property
@@ -508,7 +637,8 @@ class PdfJpxImage(PdfImage):
def __repr__(self):
return '<pikepdf.PdfJpxImage JPEG2000 image mode={} size={}x{} at {}>'.format(
- self.mode, self.width, self.height, hex(id(self)))
+ self.mode, self.width, self.height, hex(id(self))
+ )
class PdfInlineImage(PdfImageBase):
@@ -532,7 +662,7 @@ class PdfInlineImage(PdfImageBase):
b'/LZW': b'/LZWDecode',
b'/RL': b'/RunLengthDecode',
b'/CCF': b'/CCITTFaxDecode',
- b'/DCT': b'/DCTDecode'
+ b'/DCT': b'/DCTDecode',
}
def __init__(self, *, image_data, image_object: tuple):
@@ -554,8 +684,7 @@ class PdfInlineImage(PdfImageBase):
try:
reparsed_obj = Object.parse(b'<< ' + reparse + b' >>')
except PdfError as e:
- raise PdfError(
- "parsing inline " + reparse.decode('unicode_escape')) from e
+ raise PdfError("parsing inline " + reparse.decode('unicode_escape')) from e
self.obj = reparsed_obj
self.pil = None
@@ -575,7 +704,6 @@ class PdfInlineImage(PdfImageBase):
else:
raise NotImplementedError(repr(obj))
-
def _metadata(self, name, type_, default):
return metadata_from_obj(self.obj, name, type_, default)
@@ -597,6 +725,10 @@ class PdfInlineImage(PdfImageBase):
def is_inline(self):
return True
+ @property
+ def icc(self):
+ raise ValueError("Inline images may not have ICC profiles")
+
def __repr__(self):
mode = '?'
try:
@@ -604,23 +736,24 @@ class PdfInlineImage(PdfImageBase):
except Exception:
pass
return '<pikepdf.PdfInlineImage image mode={} size={}x{} at {}>'.format(
- mode, self.width, self.height, hex(id(self)))
+ mode, self.width, self.height, hex(id(self))
+ )
def as_pil_image(self):
- from PIL import Image
-
if self.pil:
return self.pil
raise NotImplementedError('not yet')
- def extract_to(self, *, stream): # pylint: disable=unused-argument
+ def extract_to(
+ self, *, stream=None, fileprefix=''
+ ): # pylint: disable=unused-argument
raise UnsupportedImageTypeError("inline images don't support extract")
def read_bytes(self):
raise NotImplementedError("qpdf returns compressed")
- #return self._data._inline_image_bytes()
+ # return self._data._inline_image_bytes()
def get_stream_buffer(self):
raise NotImplementedError("qpdf returns compressed")
- #return memoryview(self._data.inline_image_bytes())
+ # return memoryview(self._data.inline_image_bytes())
diff --git a/src/pikepdf/models/matrix.py b/src/pikepdf/models/matrix.py
index d68fae6..4c5c2fb 100644
--- a/src/pikepdf/models/matrix.py
+++ b/src/pikepdf/models/matrix.py
@@ -4,7 +4,8 @@
#
# Copyright (C) 2017, James R. Barlow (https://github.com/jbarlow83/)
-from math import cos, sin, pi
+from math import cos, pi, sin
+
class PdfMatrix:
"""
@@ -31,6 +32,7 @@ class PdfMatrix:
"""
def __init__(self, *args):
+ # fmt: off
if not args:
self.values = ((1, 0, 0), (0, 1, 0), (0, 0, 1))
elif len(args) == 6:
@@ -51,6 +53,7 @@ class PdfMatrix:
tuple(args[0][2]))
else:
raise ValueError('arguments')
+ # fmt: on
@staticmethod
def identity():
@@ -66,10 +69,13 @@ class PdfMatrix:
a = self.values
b = other.values
return PdfMatrix(
- [[sum([float(i) * float(j)
- for i, j in zip(row, col)]
- ) for col in zip(*b)]
- for row in a]
+ [
+ [
+ sum([float(i) * float(j) for i, j in zip(row, col)])
+ for col in zip(*b)
+ ]
+ for row in a
+ ]
)
def scaled(self, x, y):
diff --git a/src/pikepdf/models/metadata.py b/src/pikepdf/models/metadata.py
index 65934cd..a029b2b 100644
--- a/src/pikepdf/models/metadata.py
+++ b/src/pikepdf/models/metadata.py
@@ -4,24 +4,20 @@
#
# Copyright (C) 2018, James R. Barlow (https://github.com/jbarlow83/)
+import re
+import sys
from collections import namedtuple
from collections.abc import MutableMapping
from datetime import datetime
from functools import wraps
from io import BytesIO
-from pkg_resources import (
- get_distribution as _get_distribution,
- DistributionNotFound
-)
from warnings import warn
-import re
-import sys
from lxml import etree
-from lxml.etree import QName, XMLSyntaxError
-from defusedxml.lxml import parse
+from lxml.etree import parse, QName, XMLSyntaxError
-from .. import Stream, Name, String, PdfError
+from .. import Name, PdfError, Stream, String
+from .. import __version__ as pikepdf_version
XMP_NS_DC = "http://purl.org/dc/elements/1.1/"
XMP_NS_PDF = "http://ns.adobe.com/pdf/1.3/"
@@ -79,27 +75,21 @@ XMP_CONTAINERS = [
XmpContainer('Seq', list, list.append),
]
-LANG_ALTS = frozenset([
- str(QName(XMP_NS_DC, 'title')),
- str(QName(XMP_NS_DC, 'description')),
- str(QName(XMP_NS_DC, 'rights')),
- str(QName(XMP_NS_XMP_RIGHTS, 'UsageTerms')),
-])
+LANG_ALTS = frozenset(
+ [
+ str(QName(XMP_NS_DC, 'title')),
+ str(QName(XMP_NS_DC, 'description')),
+ str(QName(XMP_NS_DC, 'rights')),
+ str(QName(XMP_NS_XMP_RIGHTS, 'UsageTerms')),
+ ]
+)
# These are the illegal characters in XML 1.0. (XML 1.1 is a bit more permissive,
# but we'll be strict to ensure wider compatibility.)
re_xml_illegal_chars = re.compile(
r"(?u)[^\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD\u10000-\u10FFFF]"
)
-re_xml_illegal_bytes = re.compile(
- br"[^\x09\x0A\x0D\x20-\xFF]|&#0;"
-)
-
-# Repeat this to avoid circular from top package's pikepdf.__version__
-try:
- pikepdf_version = _get_distribution(__name__).version
-except DistributionNotFound:
- pikepdf_version = ""
+re_xml_illegal_bytes = re.compile(br"[^\x09\x0A\x0D\x20-\xFF]|&#0;")
def encode_pdf_date(d: datetime) -> str:
@@ -171,6 +161,7 @@ class AuthorConverter:
if sys.version_info < (3, 7):
+
def fromisoformat(datestr):
# strptime %z can't parse a timezone with punctuation
if re.search(r'[+-]\d{2}[-:]\d{2}$', datestr):
@@ -179,9 +170,12 @@ if sys.version_info < (3, 7):
return datetime.strptime(datestr, "%Y-%m-%dT%H:%M:%S%z")
except ValueError:
return datetime.strptime(datestr, "%Y-%m-%dT%H:%M:%S")
+
+
else:
fromisoformat = datetime.fromisoformat
+
class DateConverter:
@staticmethod
def xmp_from_docinfo(docinfo_val):
@@ -203,6 +197,7 @@ def ensure_loaded(fn):
if not self._xmp:
self._load()
return fn(self, *args, **kwargs)
+
return wrapper
@@ -228,10 +223,10 @@ class PdfMetadata(MutableMapping):
To update metadata, use a with block.
- .. code-block:: python
+ Example:
- with pdf.open_metadata() as records:
- records['dc:title'] = 'New Title'
+ >>> with pdf.open_metadata() as records:
+ records['dc:title'] = 'New Title'
See Also:
:meth:`pikepdf.Pdf.open_metadata`
@@ -289,7 +284,9 @@ class PdfMetadata(MutableMapping):
continue
self[qname] = val
except (ValueError, AttributeError) as e:
- msg = "The metadata field {} could not be copied to XMP".format(docinfo_name)
+ msg = "The metadata field {} could not be copied to XMP".format(
+ docinfo_name
+ )
if raise_failure:
raise ValueError(msg) from e
else:
@@ -314,6 +311,15 @@ class PdfMetadata(MutableMapping):
pis = self._xmp.xpath('/processing-instruction()')
for pi in pis:
etree.strip_tags(self._xmp, pi.tag)
+ try:
+ self._get_rdf_root()
+ except ValueError:
+ if self._xmp.find('.', self.NS).tag == '{adobe:ns:meta/}xmpmeta':
+ # Looks like: <x:xmpmeta></x:xmpmeta>, so reload with template
+ # that includes <rdf:RDF>
+ return self._load_from(XMP_EMPTY)
+ else:
+ raise # Probably not XMP
@ensure_loaded
def __enter__(self):
@@ -347,7 +353,11 @@ class PdfMetadata(MutableMapping):
try:
value = converter.docinfo_from_xmp(value)
except ValueError:
- warn("The DocumentInfo field {} could not be updated from XMP".format(docinfo_name))
+ warn(
+ "The DocumentInfo field {} could not be updated from XMP".format(
+ docinfo_name
+ )
+ )
value = None
if value is None:
if docinfo_name in self._pdf.docinfo:
@@ -562,19 +572,19 @@ class PdfMetadata(MutableMapping):
val = AltList([clean(val)])
if isinstance(val, (list, set)):
rdfdesc = etree.SubElement(
- rdf, QName(XMP_NS_RDF, 'Description'),
- attrib={
- QName(XMP_NS_RDF, 'about'): '',
- },
+ rdf,
+ QName(XMP_NS_RDF, 'Description'),
+ attrib={QName(XMP_NS_RDF, 'about'): ''},
)
node = etree.SubElement(rdfdesc, self._qname(key))
add_array(node, val)
elif isinstance(val, str):
rdfdesc = etree.SubElement(
- rdf, QName(XMP_NS_RDF, 'Description'),
+ rdf,
+ QName(XMP_NS_RDF, 'Description'),
attrib={
QName(XMP_NS_RDF, 'about'): '',
- self._qname(key): clean(val)
+ self._qname(key): clean(val),
},
)
else:
@@ -588,7 +598,11 @@ class PdfMetadata(MutableMapping):
node, attrib, _oldval, parent = next(self._get_elements(key))
if attrib: # Inline
del node.attrib[attrib]
- if len(node.attrib) == 1 and len(node) == 0 and QName(XMP_NS_RDF, 'about') in node.attrib:
+ if (
+ len(node.attrib) == 1
+ and len(node) == 0
+ and QName(XMP_NS_RDF, 'about') in node.attrib
+ ):
# The only thing left on this node is rdf:about="", so remove it
parent.remove(node)
else:
diff --git a/src/pikepdf/objects.py b/src/pikepdf/objects.py
index a888b97..2e42eb9 100644
--- a/src/pikepdf/objects.py
+++ b/src/pikepdf/objects.py
@@ -8,24 +8,33 @@
The purpose of these is to provide nice-looking classes to allow explicit
construction of PDF objects and more pythonic idioms and facilitate discovery
-by documentation generators.
+by documentation generators and linters.
It's also a place to narrow the scope of input types to those more easily
converted to C++.
-In reality all of these return objects of class pikepdf.Object or rather
-QPDFObjectHandle which is a generic type.
-
+There is some deliberate "smoke and mirrors" here: all of the objects are truly
+instances of ``pikepdf.Object``, which is a variant container object. The
+``__new__`` constructs a ``pikepdf.Object`` in each case, and the rest of the
+class definition is present as an aid for code introspection.
"""
from . import _qpdf
-from ._qpdf import Object, ObjectType
-# pylint: disable=unused-import
-from ._qpdf import Operator
+# pylint: disable=unused-import, abstract-method
+from ._qpdf import Object, ObjectType, Operator
+
+# By default pikepdf.Object will identify itself as pikepdf._qpdf.Object
+# Here we change the module to discourage people from using that internal name
+# Instead it will become pikepdf.objects.Object
+Object.__module__ = __name__
+ObjectType.__module__ = __name__
+Operator.__module__ = __name__
-class _ObjectMeta(type):
+
+# type(Object) is the metaclass that pybind11 defines; we wish to extend that
+class _ObjectMeta(type(Object)):
"""Supports instance checking"""
def __instancecheck__(cls, instance):
@@ -38,9 +47,13 @@ class _NameObjectMeta(_ObjectMeta):
"""Supports usage pikepdf.Name.Whatever -> Name('/Whatever')"""
def __getattr__(self, attr):
+ if attr.startswith('_'):
+ return _ObjectMeta.__getattr__(attr)
return Name('/' + attr)
- def __setattr__(self, name, value):
+ def __setattr__(self, attr, value):
+ if attr.startswith('_'):
+ return _ObjectMeta.__setattr__(attr, value)
raise TypeError("Attributes may not be set on pikepdf.Name")
def __getitem__(self, item):
@@ -56,7 +69,7 @@ class _NameObjectMeta(_ObjectMeta):
)
-class Name(metaclass=_NameObjectMeta):
+class Name(Object, metaclass=_NameObjectMeta):
"""Constructs a PDF Name object
Names can be constructed with two notations:
@@ -69,6 +82,7 @@ class Name(metaclass=_NameObjectMeta):
that are normally expected to be in a PDF. The latter is preferred for
dynamic names and attributes.
"""
+
object_type = ObjectType.name
def __new__(cls, name):
@@ -79,8 +93,9 @@ class Name(metaclass=_NameObjectMeta):
return _qpdf._new_name(name)
-class String(metaclass=_ObjectMeta):
+class String(Object, metaclass=_ObjectMeta):
"""Constructs a PDF String object"""
+
object_type = ObjectType.string
def __new__(cls, s):
@@ -97,8 +112,9 @@ class String(metaclass=_ObjectMeta):
return _qpdf._new_string_utf8(s)
-class Array(metaclass=_ObjectMeta):
+class Array(Object, metaclass=_ObjectMeta):
"""Constructs a PDF Array object"""
+
object_type = ObjectType.array
def __new__(cls, a=None):
@@ -118,8 +134,9 @@ class Array(metaclass=_ObjectMeta):
return _qpdf._new_array(a)
-class Dictionary(metaclass=_ObjectMeta):
+class Dictionary(Object, metaclass=_ObjectMeta):
"""Constructs a PDF Dictionary object"""
+
object_type = ObjectType.dictionary
def __new__(cls, d=None, **kwargs):
@@ -147,15 +164,15 @@ class Dictionary(metaclass=_ObjectMeta):
if kwargs:
# Add leading slash
# Allows Dictionary(MediaBox=(0,0,1,1), Type=Name('/Page')...
- return _qpdf._new_dictionary(
- {('/' + k) : v for k, v in kwargs.items()})
+ return _qpdf._new_dictionary({('/' + k): v for k, v in kwargs.items()})
if not d:
d = {}
return _qpdf._new_dictionary(d)
-class Stream(metaclass=_ObjectMeta):
+class Stream(Object, metaclass=_ObjectMeta):
"""Constructs a PDF Stream object"""
+
object_type = ObjectType.stream
def __new__(cls, owner, obj):