12 files changed, 119 insertions, 61 deletions
diff --git a/debian/copyright b/debian/copyright
index 17e69de..92f6daf 100644
--- a/debian/copyright
+++ b/debian/copyright
@@ -43,6 +43,11 @@ Copyright: (C) 2009 Rama
 License: CC-BY-SA 2.0
  See: https://commons.wikimedia.org/wiki/File:Pike_square_img_3653.jpg
 
+Files: docs/images/28fish.jpg
+Copyright: (C) 2009 Fae
+License: CC-BY-4.0
+ See: https://upload.wikimedia.org/wikipedia/commons/0/0c/Twenty_eight_types_of_fish._Engraving_by_R._Scott_after_T._B_Wellcome_V0022737EL.jpg
+
 Files: tests/*.py
 Copyright: (C) 2017 James R. Barlow
 License: CC0-1.0
diff --git a/docs/conf.py b/docs/conf.py
index 5d3c986..474d53f 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -12,12 +12,15 @@
 # All configuration values have a default; values that are commented out
 # serve to show the default.
 
-import sys
 import os
-from pkg_resources import get_distribution
 import subprocess
+import sys
 from unittest.mock import MagicMock
 
+from pkg_resources import get_distribution
+
+import pikepdf
+
 on_rtd = os.environ.get('READTHEDOCS') == 'True'
 if on_rtd:
     # Borrowed from https://github.com/YannickJadoul/Parselmouth/blob/master/docs/conf.py
@@ -76,7 +79,6 @@ else:
 sys.path.insert(0, os.path.join(os.path.abspath('.'), './_ext'))
 sys.path.insert(0, os.path.join(os.path.abspath('.'), '..'))
 
-import pikepdf
 
 # -- General configuration ------------------------------------------------
 
@@ -116,7 +118,7 @@ master_doc = 'index'
 
 # General information about the project.
 project = u'pikepdf'
-copyright = u'2018, James R. Barlow'
+copyright = u'2020, James R. Barlow'
 author = u'James R. Barlow'
 
 # The version info for the project you're documenting, acts as replacement for
diff --git a/docs/images/28fish.jpg b/docs/images/28fish.jpg
new file mode 100644
index 0000000..ec06d02
--- /dev/null
+++ b/docs/images/28fish.jpg
diff --git a/docs/index.rst b/docs/index.rst
index bd47eac..94878a1 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -101,9 +101,9 @@ In use
 
 pikepdf is used by the same author's `OCRmyPDF
 <https://github.com/jbarlow83/OCRmyPDF>`_ to inspect input PDFs, graft the
-generated OCR layers on to page content, and output PDFs. Its code contains main
-practical examples, particular in ``pdfinfo.py``, ``_weave.py``, and
-``optimize.py``. pikepdf is also used in the test suite.
+generated OCR layers on to page content, and output PDFs. Its code contains several
+practical examples, particularly in ``pdfinfo.py``, ``graft.py``, and
+``optimize.py``. pikepdf is also used in its test suite.
 
 .. toctree::
     :maxdepth: 2
diff --git a/docs/installation.rst b/docs/installation.rst
index 2224c6e..2586e1e 100644
--- a/docs/installation.rst
+++ b/docs/installation.rst
@@ -24,7 +24,6 @@ virtual environment).
 
     pip install pikepdf
 
-
 Use ``pip install --user pikepdf`` to install the package for the current user
 only. Use ``pip install pikepdf`` to install to a virtual environment.
 
@@ -46,7 +45,6 @@ package manager (such as ``apt``). These versions may lag behind the version
 distributed with PyPI, but may be convenient for users that cannot use binary
 wheels.
 
-
 .. |pikepdf| image:: https://repology.org/badge/vertical-allrepos/pikepdf.svg
     :alt: Package status
 
diff --git a/docs/topics/content_streams.rst b/docs/topics/content_streams.rst
index cc1cea7..c0d23ff 100644
--- a/docs/topics/content_streams.rst
+++ b/docs/topics/content_streams.rst
@@ -6,8 +6,10 @@ XObject that describes where and how to draw images, vectors, and text.
 
 Content streams are binary data that can be thought of as a list of operators
 and zero or more operands. Operands are given first, followed by the operator.
-It is a stack-based language based loosely on PostScript (not actually PostScript!)
-but without any programmable features. There are no variables, loops or conditionals.
+It is a stack-based language, loosely based on PostScript. (It's not actually
+PostScript, but sometimes well-meaning people mistakenly say that it is!)
+Like HTML, it has a precise grammar, and also like (pure) HTML, it has no loops,
+conditionals or variables.
 
 A typical example is as follows (with additional whitespace and PostScript-style
 ``%``-comments):
@@ -106,18 +108,25 @@ stream.
 
 .. note::
 
-  You need to translate the image so that it is centered at the bottom left
-  corner of the page, rotate, and then reverse the translation.
+  To rotate an image, first translate it so that the image is centered at ``(0, 0)``,
+  then apply the rotation, and finally translate it to its new center position.
+  This is because rotations occur around ``(0, 0)``.
+
+.. note::
+
+  While the coordinate system is pinned to ``(0, 0)`` and **up is positive**,
+  the page's MediaBox does not have to be.
 
 Editing content streams robustly
 --------------------------------
 
 The stateful nature of PDF content streams makes editing them complicated. Edits
-like this will work when the input file is known to have a fixed structure
-(that is, the state at the time of editing is known). You can always prepend
-content to the top of the content stream, since the initial state is known.
-And you can often append content to the end the stream, since the final state is
-predictable if every ``q`` (push state) has a matching ``Q`` (pop state).
+like the example above will work when the input file is known to have a fixed
+structure (that is, the state at the time of editing is known). You can always
+prepend content to the top of the content stream, since the initial state is
+known. And you can often append content to the end of the stream, since the final
+state is predictable if every ``q`` (push state) has a matching ``Q`` (pop
+state).
 
 Otherwise, you must track the graphics state and maintain a stack of states.
 
@@ -126,21 +135,17 @@ representation that is easier edit and then serializing it back, totally
 rewriting the content stream. Content streams should be thought of as an
 output format.
 
-Some manipulations are more manageable. You can often prepend content to the
-top of the content stream or append to the end, or both, if the internal
-content stream is well-formed on each end.
-
 Extracting text from PDFs
 -------------------------
 
-If you guessed that the content streams were the place to look for text inside a PDF
-– you'd be correct. Unfortunately, extracting the text is fairly difficult because
-content stream actually specifies as a font and glyph numbers to use. Sometimes, there
-is a 1:1 transparent mapping between Unicode numbers and glyph numbers, and dump of the
-content stream will show the text. In general, you cannot rely on there being a
-transparent mapping; in fact, it is perfectly legal for a font to specify no Unicode
-mapping at all, or to use an unconventional mapping (when a PDF contains a subsetted
-font for example).
+If you guessed that the content streams were the place to look for text inside a
+PDF – you'd be correct. Unfortunately, extracting the text is fairly difficult
+because the content stream actually specifies a font and glyph numbers to use.
+Sometimes, there is a 1:1 transparent mapping between Unicode numbers and glyph
+numbers, and a dump of the content stream will show the text. In general, you
+cannot rely on there being a transparent mapping; in fact, it is perfectly legal
+for a font to specify no Unicode mapping at all, or to use an unconventional
+mapping (when a PDF contains a subsetted font for example).
 
 **We strongly recommend against trying to scrape text from the content stream.**
 
diff --git a/docs/topics/images.rst b/docs/topics/images.rst
index 042609c..19ec2e8 100644
--- a/docs/topics/images.rst
+++ b/docs/topics/images.rst
@@ -69,7 +69,8 @@ Extracting images
 Extracting images is straightforward. :meth:`~pikepdf.PdfImage.extract_to` will
 extract images to a specified file prefix. The extension is determined while
 extracting and appended to the filename. Where possible, ``extract_to``
-writes compressed data directly to the stream without transcoding.
+writes compressed data directly to the stream without transcoding. (Transcoding
+lossy formats like JPEG can reduce their quality.)
 
 .. ipython::
     :verbatim:
@@ -80,7 +81,7 @@ writes compressed data directly to the stream without transcoding.
 It also possible to extract to a writable Python stream using
 ``.extract_to(stream=...`)``.
 
-You can also retrieve the image as a Pillow image:
+You can also retrieve the image as a Pillow image (this will transcode):
 
 .. ipython::
 
@@ -88,9 +89,9 @@ You can also retrieve the image as a Pillow image:
 
 Another way to view the image is using Pillow's ``Image.show()`` method.
 
-Not all images can be extracted. Also, some PDFs describe an image with a
+Not all image types can be extracted. Also, some PDFs describe an image with a
 mask, with transparency effects. pikepdf can only extract the images
-themselves, not rasterize them exactly as they appear in a PDF viewer. In
+themselves, not rasterize them exactly as they would appear in a PDF viewer. In
 the vast majority of cases, however, the image can be extracted as it appears.
 
 .. note::
diff --git a/docs/topics/pages.rst b/docs/topics/pages.rst
index 056da90..afcb5a5 100644
--- a/docs/topics/pages.rst
+++ b/docs/topics/pages.rst
@@ -163,6 +163,8 @@ Emplacing pages
 To preserve indirect references, use :meth:`pikepdf.Object.emplace`,
 which will (conceptually) delete all of the content of target and replace it
 with the content of source, thus preserving indirect references to the page.
+(Think of this as demolishing the interior of a house, but keeping it at the
+same address.)
 
 .. ipython::
 
diff --git a/docs/topics/streams.rst b/docs/topics/streams.rst
index e46726e..81891df 100644
--- a/docs/topics/streams.rst
+++ b/docs/topics/streams.rst
@@ -8,15 +8,22 @@ stream can be encoded with one or more filters. Images are a type of stream
 object.
 
 Most of the interesting content in a PDF (images and content streams) are
-inside page objects.
+inside stream objects.
 
 Because the PDF specification unfortunately defines several terms involve the
 word stream, let's attempt to clarify:
 
+.. figure:: /images/28fish.jpg
+  :figwidth: 30%
+  :align: right
+  :alt: Image of many species of fish
+
+  When it comes to taxonomy, software developers have it easy.
+
 stream object
   A PDF object that contains binary data and a metadata dictionary to describes
   it, represented as :class:`pikepdf.Stream`. In HTML this is equivalent to
-  a ``<img>`` with inline image data.
+  an ``<object>`` tag with attributes and data.
 
 object stream
   A stream object (not a typo, an object stream really is a type of stream
@@ -34,6 +41,8 @@ Form XObject
   A group of images, text and drawing commands that can be rendered elsewhere
   in a PDF as a group. This is often used when a group of objects are needed
   at different scales or multiple pages. In HTML this is like an ``<svg>``.
+  It is not a fillable PDF form (although a fillable PDF form could involve
+  Form XObjects).
 
 Reading stream objects
 ----------------------
diff --git a/docs/tutorial.rst b/docs/tutorial.rst
index 9cc6c8a..f07ed8a 100644
--- a/docs/tutorial.rst
+++ b/docs/tutorial.rst
@@ -33,8 +33,11 @@ name conflicts or ``from pikepdf import Pdf as PDF`` if you prefer uppercase.
 The PDF class API follows the example of the widely-used
 `Pillow image library <https://pillow.readthedocs.io/en/latest/>`_. For clarity
 there is no default constructor since the arguments used for creation and
-opening are different. ``Pdf.open()`` also accepts seekable streams as input,
-and ``Pdf.save()`` accepts streams as output.
+opening are different. To make a new empty PDF, use ``Pdf.new()`` not ``Pdf()``.
+
+``Pdf.open()`` also accepts seekable streams as input, and ``Pdf.save()`` accepts
+streams as output. :class:`pathlib.Path` objects are fully supported anywhere
+pikepdf accepts a filename.
 
 Inspecting pages
 ----------------
@@ -59,7 +62,7 @@ How many pages?
 
 pikepdf integrates with IPython and Jupyter's rich object APIs so that you can
 view PDFs, PDF pages, or images within PDF in a IPython window or Jupyter
-notebook. This makes it to test visual changes.
+notebook. This makes it easier to test visual changes.
 
 .. ipython::
     :verbatim:
@@ -93,12 +96,12 @@ Pages are dictionaries
 
 In PDFs, the main data structure is the **dictionary**, a key-value data
 structure much like a Python ``dict`` or ``attrdict``. The major difference is
-that the keys can only be **names**, and can only be PDF types, including
+that the keys can only be **names**, and the values can only be PDF types, including
 other dictionaries.
 
 PDF dictionaries are represented as :class:`pikepdf.Dictionary`, and names
-are of type :class:`pikepdf.Name`. A page is just a dictionary with a few
-required files and a reference from the document's "page tree". (pikepdf manages
+are of type :class:`pikepdf.Name`. A page is just a dictionary with certain
+required keys and a reference from the document's "page tree". (pikepdf manages
 the page tree for you.)
 
 .. ipython::
@@ -116,18 +119,15 @@ Let's example the page's ``repr()`` output:
 
 .. ipython::
 
-    In [1]: page1
+    In [1]: repr(page1)
 
 The angle brackets in the output indicate that this object cannot be constructed
 with a Python expression because it contains a reference. When angle brackets
 are omitted from the ``repr()`` of a pikepdf object, then the object can be
 replicated with a Python expression, such as ``eval(repr(x)) == x``. Pages
-typically concern indirect references to themselves and other pages, so they
+typically have indirect references to themselves and other pages, so they
 cannot be represented as an expression.
 
-In Jupyter and IPython, pikepdf will instead attempt to display a preview of the PDF
-page, assuming a PDF rendering backend is available.
-
 Item and attribute notation
 ---------------------------
 
@@ -136,14 +136,16 @@ keys (``page1['/MediaBox']``).
 
 .. ipython::
 
-    In [1]: page1.MediaBox      # preferred notation for required names
+    In [1]: page1.MediaBox      # preferred notation for standard PDF names
 
     In [1]: page1['/MediaBox']  # also works
 
-By convention, pikepdf uses attribute notation for standard names, and item
-notation for names that are set by PDF developers. For example, the images
+By convention, pikepdf uses attribute notation for standard names (the names
+that are normally part of a dictionary, according to the PDF Reference Manual),
+and item notation for names that may not always appear. For example, the images that
 belong to a page always appear at ``page.Resources.XObject`` but the name
-of images is set by the PDF creator:
+of images is arbitrarily chosen by whatever software generates the PDF (``/Im0``,
+in this case). (Whenever expressed as strings, names begin with ``/``.)
 
 .. ipython::
     :verbatim:
@@ -179,8 +181,7 @@ Saving changes
 --------------
 
 Naturally, you can save your changes with :meth:`pikepdf.Pdf.save`.
-``filename`` can be a :class:`pathlib.Path`, which we accept everywhere. (Saving
-is commented out to avoid upsetting the documentation generator.)
+``filename`` can be a :class:`pathlib.Path`, which we accept everywhere.
 
 .. ipython::
     :verbatim:
@@ -188,12 +189,24 @@ is commented out to avoid upsetting the documentation generator.)
     In [1]: pdf.save('output.pdf')
 
 You may save a file multiple times, and you may continue modifying it after
-saving.
+saving. For example, you could create an unencrypted version of a document, then
+apply a watermark, and create an encrypted version.
+
+.. note::
+
+    You may not overwrite the input file (or whatever Python object provides the
+    data) when saving or at any other time. pikepdf assumes it will have
+    exclusive access to the input file or input data you give it, until
+    ``pdf.close()`` is called.
+
+Saving secure PDFs
+^^^^^^^^^^^^^^^^^^
 
 To save an encrypted (password protected) PDF, use a :class:`pikepdf.Encryption`
-object to specify the encryption settings. By default, pikepdf selects the strongest
-security handler and algorithm (AES-256), but allows full access to modify file contents.
-A :class:`pikepdf.Permissions` object can be used to specify restrictions.
+object to specify the encryption settings. By default, pikepdf selects the
+strongest security handler and algorithm (AES-256), but allows full access to
+modify file contents. A :class:`pikepdf.Permissions` object can be used to
+specify restrictions.
 
 .. ipython::
     :verbatim:
@@ -204,6 +217,23 @@ A :class:`pikepdf.Permissions` object can be used to specify restrictions.
        ...:      user="user password", owner="owner password", allow=no_extracting
        ...: ))
 
+As in all PDFs, if a user password is set, it will not be possible to
+open the PDF without the password. If the owner password is set, changes will
+not be permitted without the owner password. If the user password is an empty
+string and an owner password is set, the PDF can be viewed by anyone with the
+user (or owner) password. PDF viewers only enforce ``pikepdf.Permissions``
+restrictions when a PDF is opened with the user password, since the owner may
+change anything.
+
+pikepdf does not and cannot enforce the restrictions in ``pikepdf.Permissions``
+if you open a file with the user password. Someone with either the user or
+owner password can access all the contents of the PDF. If you are developing an
+application, however, you should consider enforcing the restrictions.
+
+For widest compatibility, passwords should be ASCII, since the PDF reference
+manual is unclear about how non-ASCII passwords are supposed to be encoded.
+See the documentation on ``Pdf.save()`` for more details.
+
 Next steps
 ----------
 
diff --git a/src/qpdf/object.cpp b/src/qpdf/object.cpp
index 190bf97..664ed0e 100644
--- a/src/qpdf/object.cpp
+++ b/src/qpdf/object.cpp
@@ -624,7 +624,7 @@ void init_object(py::module& m)
         .def_property_readonly("objgen",
             &object_get_objgen,
             R"~~~(
-            Return the object-generation number pair for this object
+            Return the object-generation number pair for this object.
 
             If this is a direct object, then the returned value is ``(0, 0)``.
             By definition, if this is an indirect object, it has a "objgen",
@@ -632,7 +632,10 @@ void init_object(py::module& m)
             Direct objects cannot necessarily be looked up.
 
             The generation number is usually 0, except for PDFs that have been
-            incrementally updated.
+            incrementally updated. Incrementally updated PDFs are now uncommon,
+            since it does not take too long for modern CPUs to reconstruct an
+            entire PDF. pikepdf will consolidate all incremental updates
+            when saving.
 
             )~~~"
         )
diff --git a/src/qpdf/qpdf.cpp b/src/qpdf/qpdf.cpp
index 14acc9b..f097838 100644
--- a/src/qpdf/qpdf.cpp
+++ b/src/qpdf/qpdf.cpp
@@ -502,8 +502,11 @@ void init_qpdf(py::module &m)
             Args:
                 filename_or_stream (os.PathLike): Filename of PDF to open
                 password (str or bytes): User or owner password to open an
-                    encrypted PDF. If a str is given it will be converted to
-                    UTF-8.
+                    encrypted PDF. If the type of this parameter is ``str``
+                    it will be encoded as UTF-8. If the type is ``bytes`` it will
+                    be used verbatim. Passwords are always padded or
+                    truncated to 32 bytes internally. Use ASCII passwords for
+                    maximum compatibility.
                 hex_password (bool): If True, interpret the password as a
                     hex-encoded version of the exact encryption key to use, without
                     performing the normal key computation. Useful in forensics.