diff options
-rw-r--r-- | debian/copyright | 5 | ||||
-rw-r--r-- | docs/conf.py | 10 | ||||
-rw-r--r-- | docs/images/28fish.jpg | bin | 0 -> 543264 bytes | |||
-rw-r--r-- | docs/index.rst | 6 | ||||
-rw-r--r-- | docs/installation.rst | 2 | ||||
-rw-r--r-- | docs/topics/content_streams.rst | 47 | ||||
-rw-r--r-- | docs/topics/images.rst | 9 | ||||
-rw-r--r-- | docs/topics/pages.rst | 2 | ||||
-rw-r--r-- | docs/topics/streams.rst | 13 | ||||
-rw-r--r-- | docs/tutorial.rst | 72 | ||||
-rw-r--r-- | src/qpdf/object.cpp | 7 | ||||
-rw-r--r-- | src/qpdf/qpdf.cpp | 7 |
12 files changed, 119 insertions, 61 deletions
diff --git a/debian/copyright b/debian/copyright index 17e69de..92f6daf 100644 --- a/debian/copyright +++ b/debian/copyright @@ -43,6 +43,11 @@ Copyright: (C) 2009 Rama License: CC-BY-SA 2.0 See: https://commons.wikimedia.org/wiki/File:Pike_square_img_3653.jpg +Files: docs/images/28fish.jpg +Copyright: (C) 2009 Fae +License: CC-BY-4.0 + See: https://upload.wikimedia.org/wikipedia/commons/0/0c/Twenty_eight_types_of_fish._Engraving_by_R._Scott_after_T._B_Wellcome_V0022737EL.jpg + Files: tests/*.py Copyright: (C) 2017 James R. Barlow License: CC0-1.0 diff --git a/docs/conf.py b/docs/conf.py index 5d3c986..474d53f 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -12,12 +12,15 @@ # All configuration values have a default; values that are commented out # serve to show the default. -import sys import os -from pkg_resources import get_distribution import subprocess +import sys from unittest.mock import MagicMock +from pkg_resources import get_distribution + +import pikepdf + on_rtd = os.environ.get('READTHEDOCS') == 'True' if on_rtd: # Borrowed from https://github.com/YannickJadoul/Parselmouth/blob/master/docs/conf.py @@ -76,7 +79,6 @@ else: sys.path.insert(0, os.path.join(os.path.abspath('.'), './_ext')) sys.path.insert(0, os.path.join(os.path.abspath('.'), '..')) -import pikepdf # -- General configuration ------------------------------------------------ @@ -116,7 +118,7 @@ master_doc = 'index' # General information about the project. project = u'pikepdf' -copyright = u'2018, James R. Barlow' +copyright = u'2020, James R. Barlow' author = u'James R. 
Barlow' # The version info for the project you're documenting, acts as replacement for diff --git a/docs/images/28fish.jpg b/docs/images/28fish.jpg Binary files differnew file mode 100644 index 0000000..ec06d02 --- /dev/null +++ b/docs/images/28fish.jpg diff --git a/docs/index.rst b/docs/index.rst index bd47eac..94878a1 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -101,9 +101,9 @@ In use pikepdf is used by the same author's `OCRmyPDF <https://github.com/jbarlow83/OCRmyPDF>`_ to inspect input PDFs, graft the -generated OCR layers on to page content, and output PDFs. Its code contains main -practical examples, particular in ``pdfinfo.py``, ``_weave.py``, and -``optimize.py``. pikepdf is also used in the test suite. +generated OCR layers on to page content, and output PDFs. Its code contains several +practical examples, particular in ``pdfinfo.py``, ``graft.py``, and +``optimize.py``. pikepdf is also used in its test suite. .. toctree:: :maxdepth: 2 diff --git a/docs/installation.rst b/docs/installation.rst index 2224c6e..2586e1e 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -24,7 +24,6 @@ virtual environment). pip install pikepdf - Use ``pip install --user pikepdf`` to install the package for the current user only. Use ``pip install pikepdf`` to install to a virtual environment. @@ -46,7 +45,6 @@ package manager (such as ``apt``). These versions may lag behind the version distributed with PyPI, but may be convenient for users that cannot use binary wheels. - .. |pikepdf| image:: https://repology.org/badge/vertical-allrepos/pikepdf.svg :alt: Package status diff --git a/docs/topics/content_streams.rst b/docs/topics/content_streams.rst index cc1cea7..c0d23ff 100644 --- a/docs/topics/content_streams.rst +++ b/docs/topics/content_streams.rst @@ -6,8 +6,10 @@ XObject that describes where and how to draw images, vectors, and text. Content streams are binary data that can be thought of as a list of operators and zero or more operands. 
Operands are given first, followed by the operator. -It is a stack-based language based loosely on PostScript (not actually PostScript!) -but without any programmable features. There are no variables, loops or conditionals. +It is a stack-based language, loosely based on PostScript. (It's not actually +PostScript, but sometimes well-meaning people mistakenly say that it is!) +Like HTML, it has a precise grammar, and also like (pure) HTML, it has no loops, +conditionals or variables. A typical example is as follows (with additional whitespace and PostScript-style ``%``-comments): @@ -106,18 +108,25 @@ stream. .. note:: - You need to translate the image so that it is centered at the bottom left - corner of the page, rotate, and then reverse the translation. + To rotate an image, first translate it so that the image is centered at ``(0, 0)``, + then apply the rotation, then translate it to its new center position. + This is because rotations occur around ``(0, 0)``. + +.. note:: + + While the coordinate system is pinned to ``(0, 0)`` and **up is positive**, + the page's MediaBox does not have to be. Editing content streams robustly -------------------------------- The stateful nature of PDF content streams makes editing them complicated. Edits -like this will work when the input file is known to have a fixed structure -(that is, the state at the time of editing is known). You can always prepend -content to the top of the content stream, since the initial state is known. -And you can often append content to the end the stream, since the final state is -predictable if every ``q`` (push state) has a matching ``Q`` (pop state). +like the example above will work when the input file is known to have a fixed +structure (that is, the state at the time of editing is known). You can always +prepend content to the top of the content stream, since the initial state is +known. 
And you can often append content to the end of the stream, since the final +state is predictable if every ``q`` (push state) has a matching ``Q`` (pop +state). Otherwise, you must track the graphics state and maintain a stack of states. @@ -126,21 +135,17 @@ representation that is easier edit and then serializing it back, totally rewriting the content stream. Content streams should be thought of as an output format. -Some manipulations are more manageable. You can often prepend content to the -top of the content stream or append to the end, or both, if the internal -content stream is well-formed on each end. - Extracting text from PDFs ------------------------- -If you guessed that the content streams were the place to look for text inside a PDF -– you'd be correct. Unfortunately, extracting the text is fairly difficult because -content stream actually specifies as a font and glyph numbers to use. Sometimes, there -is a 1:1 transparent mapping between Unicode numbers and glyph numbers, and dump of the -content stream will show the text. In general, you cannot rely on there being a -transparent mapping; in fact, it is perfectly legal for a font to specify no Unicode -mapping at all, or to use an unconventional mapping (when a PDF contains a subsetted -font for example). +If you guessed that the content streams were the place to look for text inside a +PDF – you'd be correct. Unfortunately, extracting the text is fairly difficult +because the content stream actually specifies a font and glyph numbers to use. +Sometimes, there is a 1:1 transparent mapping between Unicode numbers and glyph +numbers, and a dump of the content stream will show the text. In general, you +cannot rely on there being a transparent mapping; in fact, it is perfectly legal +for a font to specify no Unicode mapping at all, or to use an unconventional +mapping (when a PDF contains a subsetted font for example). 
**We strongly recommend against trying to scrape text from the content stream.** diff --git a/docs/topics/images.rst b/docs/topics/images.rst index 042609c..19ec2e8 100644 --- a/docs/topics/images.rst +++ b/docs/topics/images.rst @@ -69,7 +69,8 @@ Extracting images Extracting images is straightforward. :meth:`~pikepdf.PdfImage.extract_to` will extract images to a specified file prefix. The extension is determined while extracting and appended to the filename. Where possible, ``extract_to`` -writes compressed data directly to the stream without transcoding. +writes compressed data directly to the stream without transcoding. (Transcoding +lossy formats like JPEG can reduce their quality.) .. ipython:: :verbatim: @@ -80,7 +81,7 @@ writes compressed data directly to the stream without transcoding. It also possible to extract to a writable Python stream using ``.extract_to(stream=...`)``. -You can also retrieve the image as a Pillow image: +You can also retrieve the image as a Pillow image (this will transcode): .. ipython:: @@ -88,9 +89,9 @@ You can also retrieve the image as a Pillow image: Another way to view the image is using Pillow's ``Image.show()`` method. -Not all images can be extracted. Also, some PDFs describe an image with a +Not all image types can be extracted. Also, some PDFs describe an image with a mask, with transparency effects. pikepdf can only extract the images -themselves, not rasterize them exactly as they appear in a PDF viewer. In +themselves, not rasterize them exactly as they would appear in a PDF viewer. In the vast majority of cases, however, the image can be extracted as it appears. .. 
note:: diff --git a/docs/topics/pages.rst b/docs/topics/pages.rst index 056da90..afcb5a5 100644 --- a/docs/topics/pages.rst +++ b/docs/topics/pages.rst @@ -163,6 +163,8 @@ Emplacing pages To preserve indirect references, use :meth:`pikepdf.Object.emplace`, which will (conceptually) delete all of the content of target and replace it with the content of source, thus preserving indirect references to the page. +(Think of this as demolishing the interior of a house, but keeping it at the +same address.) .. ipython:: diff --git a/docs/topics/streams.rst b/docs/topics/streams.rst index e46726e..81891df 100644 --- a/docs/topics/streams.rst +++ b/docs/topics/streams.rst @@ -8,15 +8,22 @@ stream can be encoded with one or more filters. Images are a type of stream object. Most of the interesting content in a PDF (images and content streams) are -inside page objects. +inside stream objects. Because the PDF specification unfortunately defines several terms involve the word stream, let's attempt to clarify: +.. figure:: /images/28fish.jpg + :figwidth: 30% + :align: right + :alt: Image of many species of fish + + When it comes to taxonomy, software developers have it easy. + stream object A PDF object that contains binary data and a metadata dictionary to describes it, represented as :class:`pikepdf.Stream`. In HTML this is equivalent to - a ``<img>`` with inline image data. + an ``<object>`` tag with attributes and data. object stream A stream object (not a typo, an object stream really is a type of stream @@ -34,6 +41,8 @@ Form XObject A group of images, text and drawing commands that can be rendered elsewhere in a PDF as a group. This is often used when a group of objects are needed at different scales or multiple pages. In HTML this is like an ``<svg>``. + It is not a fillable PDF form (although a fillable PDF form could involve + Form XObjects). 
Reading stream objects ---------------------- diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 9cc6c8a..f07ed8a 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -33,8 +33,11 @@ name conflicts or ``from pikepdf import Pdf as PDF`` if you prefer uppercase. The PDF class API follows the example of the widely-used `Pillow image library <https://pillow.readthedocs.io/en/latest/>`_. For clarity there is no default constructor since the arguments used for creation and -opening are different. ``Pdf.open()`` also accepts seekable streams as input, -and ``Pdf.save()`` accepts streams as output. +opening are different. To make a new empty PDF, use ``Pdf.new()`` not ``Pdf()``. + +``Pdf.open()`` also accepts seekable streams as input, and ``Pdf.save()`` accepts +streams as output. :class:`pathlib.Path` objects are fully supported anywhere +pikepdf accepts a filename. Inspecting pages ---------------- @@ -59,7 +62,7 @@ How many pages? pikepdf integrates with IPython and Jupyter's rich object APIs so that you can view PDFs, PDF pages, or images within PDF in a IPython window or Jupyter -notebook. This makes it to test visual changes. +notebook. This makes it easier to test visual changes. .. ipython:: :verbatim: @@ -93,12 +96,12 @@ Pages are dictionaries In PDFs, the main data structure is the **dictionary**, a key-value data structure much like a Python ``dict`` or ``attrdict``. The major difference is -that the keys can only be **names**, and can only be PDF types, including +that the keys can only be **names**, and the values can only be PDF types, including other dictionaries. PDF dictionaries are represented as :class:`pikepdf.Dictionary`, and names -are of type :class:`pikepdf.Name`. A page is just a dictionary with a few -required files and a reference from the document's "page tree". (pikepdf manages +are of type :class:`pikepdf.Name`. A page is just a dictionary with certain +required keys and a reference from the document's "page tree". 
(pikepdf manages the page tree for you.) .. ipython:: @@ -116,18 +119,15 @@ Let's example the page's ``repr()`` output: .. ipython:: - In [1]: page1 + In [1]: repr(page1) The angle brackets in the output indicate that this object cannot be constructed with a Python expression because it contains a reference. When angle brackets are omitted from the ``repr()`` of a pikepdf object, then the object can be replicated with a Python expression, such as ``eval(repr(x)) == x``. Pages -typically concern indirect references to themselves and other pages, so they +typically have indirect references to themselves and other pages, so they cannot be represented as an expression. -In Jupyter and IPython, pikepdf will instead attempt to display a preview of the PDF -page, assuming a PDF rendering backend is available. - Item and attribute notation --------------------------- @@ -136,14 +136,16 @@ keys (``page1['/MediaBox']``). .. ipython:: - In [1]: page1.MediaBox # preferred notation for required names + In [1]: page1.MediaBox # preferred notation for standard PDF names In [1]: page1['/MediaBox'] # also works -By convention, pikepdf uses attribute notation for standard names, and item -notation for names that are set by PDF developers. For example, the images +By convention, pikepdf uses attribute notation for standard names (the names +that are normally part of a dictionary, according to the PDF Reference Manual), +and item notation for names that may not always appear. For example, the images belong to a page always appear at ``page.Resources.XObject`` but the name -of images is set by the PDF creator: +of images is arbitrarily chosen by whatever software generates the PDF (``/Im0``, +in this case). (Whenever expressed as strings, names begin with ``/``.) .. ipython:: :verbatim: @@ -179,8 +181,7 @@ Saving changes -------------- Naturally, you can save your changes with :meth:`pikepdf.Pdf.save`. -``filename`` can be a :class:`pathlib.Path`, which we accept everywhere. 
(Saving -is commented out to avoid upsetting the documentation generator.) +``filename`` can be a :class:`pathlib.Path`, which we accept everywhere. .. ipython:: :verbatim: @@ -188,12 +189,24 @@ is commented out to avoid upsetting the documentation generator.) In [1]: pdf.save('output.pdf') You may save a file multiple times, and you may continue modifying it after -saving. +saving. For example, you could create an unencrypted version of a document, then +apply a watermark, and create an encrypted version. + +.. note:: + + You may not overwrite the input file (or whatever Python object provides the + data) when saving or at any other time. pikepdf assumes it will have + exclusive access to the input file or input data you give to it, until + ``pdf.close()`` is called. + +Saving secure PDFs +^^^^^^^^^^^^^^^^^^ To save an encrypted (password protected) PDF, use a :class:`pikepdf.Encryption` -object to specify the encryption settings. By default, pikepdf selects the strongest -security handler and algorithm (AES-256), but allows full access to modify file contents. -A :class:`pikepdf.Permissions` object can be used to specify restrictions. +object to specify the encryption settings. By default, pikepdf selects the +strongest security handler and algorithm (AES-256), but allows full access to +modify file contents. A :class:`pikepdf.Permissions` object can be used to +specify restrictions. .. ipython:: :verbatim: @@ -204,6 +217,23 @@ A :class:`pikepdf.Permissions` object can be used to specify restrictions. ...: user="user password", owner="owner password", allow=no_extracting ...: )) +As in all PDFs, if a user password is set, it will not be possible to +open the PDF without the password. If the owner password is set, changes will +not be permitted without the owner password. If the user password is an empty +string and an owner password is set, the PDF can be viewed by anyone with the +user (or owner) password. 
PDF viewers only enforce ``pikepdf.Permissions`` +restrictions when a PDF is opened with the user password, since the owner may +change anything. + +pikepdf does not and cannot enforce the restrictions in ``pikepdf.Permissions`` +if you open a file with the user password. Someone with either the user or +owner password can access all the contents of the PDF. If you are developing an +application, however, you should consider enforcing the restrictions. + +For widest compatibility, passwords should be ASCII, since the PDF reference +manual is unclear about how non-ASCII passwords are supposed to be encoded. +See the documentation on ``Pdf.save()`` for more details. + Next steps ---------- diff --git a/src/qpdf/object.cpp b/src/qpdf/object.cpp index 190bf97..664ed0e 100644 --- a/src/qpdf/object.cpp +++ b/src/qpdf/object.cpp @@ -624,7 +624,7 @@ void init_object(py::module& m) .def_property_readonly("objgen", &object_get_objgen, R"~~~( - Return the object-generation number pair for this object + Return the object-generation number pair for this object. If this is a direct object, then the returned value is ``(0, 0)``. By definition, if this is an indirect object, it has a "objgen", @@ -632,7 +632,10 @@ void init_object(py::module& m) Direct objects cannot necessarily be looked up. The generation number is usually 0, except for PDFs that have been - incrementally updated. + incrementally updated. Incrementally updated PDFs are now uncommon, + since it does not take too long for modern CPUs to reconstruct an + entire PDF. pikepdf will consolidate all incremental updates + when saving. )~~~" ) diff --git a/src/qpdf/qpdf.cpp b/src/qpdf/qpdf.cpp index 14acc9b..f097838 100644 --- a/src/qpdf/qpdf.cpp +++ b/src/qpdf/qpdf.cpp @@ -502,8 +502,11 @@ void init_qpdf(py::module &m) Args: filename_or_stream (os.PathLike): Filename of PDF to open password (str or bytes): User or owner password to open an - encrypted PDF. If a str is given it will be converted to - UTF-8. + encrypted PDF. 
If the type of this parameter is ``str`` + it will be encoded as UTF-8. If the type is ``bytes`` it will + be saved verbatim. Passwords are always padded or + truncated to 32 bytes internally. Use ASCII passwords for + maximum compatibility. hex_password (bool): If True, interpret the password as a hex-encoded version of the exact encryption key to use, without performing the normal key computation. Useful in forensics. |