diff options
-rw-r--r-- | docs/objects.rst | 30 | ||||
-rw-r--r-- | src/pikepdf/_methods.py | 23 | ||||
-rw-r--r-- | src/qpdf/qpdf.cpp | 4 | ||||
-rw-r--r-- | tests/test_pdf.py | 7 |
4 files changed, 53 insertions, 11 deletions
diff --git a/docs/objects.rst b/docs/objects.rst index c08f460..c20fcc4 100644 --- a/docs/objects.rst +++ b/docs/objects.rst @@ -18,9 +18,9 @@ value in a PDF is assigned to a Python ``float``, pikepdf will convert it to ``Decimal``. Types that are not directly convertible to Python are represented as -:class:`pikepdf.Object`, a compound object that offers a superset of methods, -some work only if the underlying type is suitable. You can use the EAFP -idiom or ``isinstance`` to determine the type more precisely. This partly +:class:`pikepdf.Object`, a compound object that offers a superset of possible +methods, some of which work only if the underlying type is suitable. Use the EAFP +idiom, or ``isinstance`` to determine the type more precisely. This partly reflects the fact that the PDF specification allows many data fields to be one of several types. @@ -67,3 +67,27 @@ the appropriate pikepdf object when passed to pikepdf APIs – when possible. However, pikepdf sends ``pikepdf.Object`` types back to Python on return calls, in most cases, because pikepdf needs to keep track of objects that came from PDFs originally. + + +Object lifecycle and memory management +====================================== + +As mentioned above, a :class:`pikepdf.Object` may reference data that is lazily +loaded from its source :class:`pikepdf.Pdf`. Closing the `Pdf` with +:meth:`pikepdf.Pdf.close` will invalidate some objects, depending on whether +or not the data was loaded, and other implementation details that may change. +Generally speaking, a :class:`pikepdf.Pdf` should be held open until it is no +longer needed, and objects that were derived from it may or may not be usable +after it is closed. + +Simple objects (booleans, integers, decimals, ``None``) are copied directly +to Python as pure Python objects. + +For PDF stream objects, use :meth:`pikepdf.Object.read_bytes()` to obtain a +copy of the object as pure bytes data, if this information is required after +closing a PDF. 
+ +When objects are copied from one :class:`pikepdf.Pdf` to another, the +underlying data is copied immediately into the target. As such it is possible +to merge hundreds of `Pdf` into one, keeping only a single source and the +target file open at a time. diff --git a/src/pikepdf/_methods.py b/src/pikepdf/_methods.py index 55e9526..8d897c0 100644 --- a/src/pikepdf/_methods.py +++ b/src/pikepdf/_methods.py @@ -208,14 +208,25 @@ class Extend_Pdf: def close(self): """ - Close a PDF object (probably) + Close a Pdf object and release resources acquired by pikepdf - Strictly speaking this decrements the reference count of the open - file object associated with this PDF. It is possible that other - pikepdf objects also reference the same file object; if these exist, - the file object **will remain open** until they are released. + If pikepdf opened the file handle it will close it (e.g. when opened with a file + path). If the caller opened the file for pikepdf, the caller must close the file. + + pikepdf lazily loads data from PDFs, so some :class:`pikepdf.Object` may + implicitly depend on the :class:`pikepdf.Pdf` being open. This is always the + case for :class:`pikepdf.Stream` but can be true for any object. Do not close + the `Pdf` object if you might still be accessing content from it. + + When an `Object` is copied from one `Pdf` to another, the Object is copied into + the destination `Pdf` immediately, so after accessing all desired information + from the source Pdf it may be closed. + + Caution: + Closing the `Pdf` is currently implemented by resetting it to an empty + sentinel. It is currently possible to edit the sentinel as if it were a live + object. This behavior should not be relied on and is subject to change. - The PDF is closed by resetting to an empty in-memory PDF. 
""" EMPTY_PDF = ( diff --git a/src/qpdf/qpdf.cpp b/src/qpdf/qpdf.cpp index 78d7570..26d7396 100644 --- a/src/qpdf/qpdf.cpp +++ b/src/qpdf/qpdf.cpp @@ -259,8 +259,8 @@ void init_qpdf(py::module &m) will be accessed as a readable binary stream. pikepdf will read the entire stream into a private buffer. - If `.open()` is used in a `with` block, pikepdf will attempt to close - the underlying file object. + `.open()` may be used in a `with`-block, `.close()` will be called when + the block exits. Args: filename_or_stream (os.PathLike): Filename of PDF to open diff --git a/tests/test_pdf.py b/tests/test_pdf.py index 9c4ec60..d9d7422 100644 --- a/tests/test_pdf.py +++ b/tests/test_pdf.py @@ -224,3 +224,10 @@ def test_with_block(resources): with pikepdf.open(resources / 'pal-1bit-trivial.pdf') as pdf: desc = pdf.filename assert pdf.filename != desc + + +def test_with_block_abuse(resources): + with pikepdf.open(resources / 'pal-1bit-trivial.pdf') as pdf: + im0 = pdf.pages[0].Resources.XObject['/Im0'] + with pytest.raises(PdfError): + im0.read_bytes() |