summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.appveyor.yml4
-rw-r--r--.gitignore3
-rw-r--r--.pre-commit-config.yaml6
-rw-r--r--.pylintrc2
-rw-r--r--.travis.yml12
-rw-r--r--Makefile7
-rw-r--r--README.md17
-rw-r--r--debian/changelog24
-rw-r--r--debian/compat1
-rw-r--r--debian/control6
-rw-r--r--debian/copyright452
-rw-r--r--debian/patches/Fix-issue-25-year-missing-leading-zero-on-some-platforms.patch51
-rw-r--r--debian/patches/disable-test_docinfo_problems.patch27
-rw-r--r--debian/patches/disable-test_icc_extract.patch27
-rw-r--r--debian/patches/docs-build-use-DEB_VERSION_UPSTREAM.patch6
-rw-r--r--debian/patches/drop-installation-from-docs-contents.patch2
-rw-r--r--debian/patches/drop-pybind11-from-setup.py.patch22
-rw-r--r--debian/patches/drop-setuptools_scm_git_archive-from-setup.py.patch19
-rw-r--r--debian/patches/fix_xmp_metadata_without_xmpmeta_wrapper.patch110
-rw-r--r--debian/patches/series5
-rw-r--r--debian/tests/control2
-rw-r--r--docs/_ext/fix_pybind11_autodoc.py31
-rw-r--r--docs/api.rst (renamed from docs/pikepdf.rst)84
-rw-r--r--docs/changelog.rst256
-rw-r--r--docs/conf.py163
-rw-r--r--docs/encoding.rst41
-rw-r--r--docs/images/pike-release.jpgbin0 -> 88609 bytes
-rw-r--r--docs/images/pike-tree.jpgbin0 -> 86085 bytes
-rw-r--r--docs/index.rst6
-rw-r--r--docs/installation.rst86
-rw-r--r--docs/objects.rst34
-rw-r--r--docs/page_copying.rst37
-rw-r--r--docs/release_notes.rst436
-rw-r--r--docs/resources.rst6
-rw-r--r--docs/tutorial.rst12
-rw-r--r--docs/tutorial/page.rst16
-rw-r--r--docs/tutorial/pages.rst46
-rw-r--r--docs/tutorial/streams.rst18
-rw-r--r--pyproject.toml32
-rw-r--r--requirements/docs.txt1
-rw-r--r--requirements/test.txt11
-rw-r--r--setup.cfg10
-rw-r--r--setup.py58
-rw-r--r--src/pikepdf/__init__.py42
-rw-r--r--src/pikepdf/_cpphelpers.py20
-rw-r--r--src/pikepdf/_methods.py319
-rw-r--r--src/pikepdf/_version.py13
-rw-r--r--src/pikepdf/codec.py48
-rw-r--r--src/pikepdf/models/__init__.py17
-rw-r--r--src/pikepdf/models/encryption.py154
-rw-r--r--src/pikepdf/models/image.py247
-rw-r--r--src/pikepdf/models/matrix.py16
-rw-r--r--src/pikepdf/models/metadata.py86
-rw-r--r--src/pikepdf/objects.py49
-rw-r--r--src/qpdf/annotation.cpp52
-rw-r--r--src/qpdf/object.cpp102
-rw-r--r--src/qpdf/pikepdf.cpp98
-rw-r--r--src/qpdf/pikepdf.h30
-rw-r--r--src/qpdf/qpdf.cpp585
-rw-r--r--src/qpdf/qpdf_inputsource.h2
-rw-r--r--src/qpdf/qpdf_pagelist.cpp87
-rw-r--r--src/qpdf/qpdf_pipeline.h77
-rw-r--r--tests/conftest.py15
-rw-r--r--tests/resources/1biticc.pdfbin0 -> 5661 bytes
-rw-r--r--tests/resources/graph-encrypted.pdfbin293636 -> 296661 bytes
-rw-r--r--tests/resources/pike-flate-jp2.pdfbin0 -> 18471 bytes
-rw-r--r--tests/test_codec.py16
-rw-r--r--tests/test_dictionary.py6
-rw-r--r--tests/test_encrypt.py128
-rw-r--r--tests/test_formxobject.py45
-rw-r--r--tests/test_image_access.py143
-rw-r--r--tests/test_io.py26
-rw-r--r--tests/test_ipython.py3
-rw-r--r--tests/test_metadata.py132
-rw-r--r--tests/test_object.py185
-rw-r--r--tests/test_pages.py72
-rw-r--r--tests/test_parsers.py49
-rw-r--r--tests/test_pdf.py91
-rw-r--r--tests/test_pdfa.py10
-rw-r--r--tests/test_private_pdfs.py16
-rw-r--r--tests/test_refcount.py11
-rw-r--r--tests/test_sanity.py40
82 files changed, 3935 insertions, 1286 deletions
diff --git a/.appveyor.yml b/.appveyor.yml
index 98d4ff0..77afcca 100644
--- a/.appveyor.yml
+++ b/.appveyor.yml
@@ -19,9 +19,9 @@ install:
$userpath = "$env:APPDATA\Python\Python$env:PYTHON\Scripts"
if ($env:PLATFORM -eq "x64") {
$env:PYTHON = "$env:PYTHON-x64"
- $qpdfdll = "https://github.com/qpdf/qpdf/releases/download/release-qpdf-8.3.0/qpdf-8.3.0-bin-msvc64.zip"
+ $qpdfdll = "https://github.com/qpdf/qpdf/releases/download/release-qpdf-8.4.0/qpdf-8.4.0-bin-msvc64.zip"
} else {
- $qpdfdll = "https://github.com/qpdf/qpdf/releases/download/release-qpdf-8.3.0/qpdf-8.3.0-bin-msvc32.zip"
+ $qpdfdll = "https://github.com/qpdf/qpdf/releases/download/release-qpdf-8.4.0/qpdf-8.4.0-bin-msvc32.zip"
}
$env:PATH = "C:\Python$env:PYTHON;C:\Python$env:PYTHON\Scripts;$userpath;$env:PATH"
echo $env:PATH
diff --git a/.gitignore b/.gitignore
index e0d09e0..6cc9660 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,7 +10,7 @@ MANIFEST
.vscode/
_generate
build
-.venv/
+.venv*/
*.sublime*
.eggs/
.cache/
@@ -20,6 +20,7 @@ var/
.pytest_cache/
.coverage
coverage/
+pip-wheel-metadata/
# Main directory testing
/*.pdf
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..9c554d9
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,6 @@
+repos:
+- repo: https://github.com/ambv/black
+ rev: stable
+ hooks:
+ - id: black
+ language_version: python3
diff --git a/.pylintrc b/.pylintrc
index 85d1076..a1754fc 100644
--- a/.pylintrc
+++ b/.pylintrc
@@ -312,7 +312,7 @@ indent-after-paren=4
indent-string=' '
# Maximum number of characters on a single line.
-max-line-length=100
+max-line-length=88
# Maximum number of lines in a module
max-module-lines=1000
diff --git a/.travis.yml b/.travis.yml
index 104b46e..a0774c9 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -8,7 +8,7 @@ cache:
env:
global:
- - QPDF_RELEASE=https://github.com/qpdf/qpdf/releases/download/release-qpdf-8.3.0/qpdf-8.3.0.tar.gz
+ - QPDF_RELEASE=https://github.com/qpdf/qpdf/releases/download/release-qpdf-8.4.0/qpdf-8.4.0.tar.gz
- TWINE_USERNAME=ocrmypdf-travis
# TWINE_PASSWORD
- secure: "d1PfCVoqvFFwAqm0QEozLLoUdEHaY0kAvawfc4lKdLSjI+yOJYoNdknU0r3TdwttNEF2VV+aY9q/4wVnYrEiF4L13E3s+UtDqIXwGk/b14JrdExIx/0yj642kbCJPycZqqRZgvGwYuhb6EF7e/QrsNYMwZ95E9oTyWa0ZaEkiPrrVJh9XSNDpw9I8REL3GecpfvF/GpHWX0VBHoaJfCgDzDDvHQPdfIXAZg+OLJOLNrR2ivvUD3gR371376fYPMPNsqMNqBghLdX8lnX2zkEc67An9ZBLi1dx46PhHjn8c06QOBQ25wcwtCxSnaXygkq5HXUXnpWmCbPcy3n98bJBE1P86M1eWo5c3KV4zwY3pvC6/ldFFAX0nC5Qr8xVpiBZIZKhqBEsX7HlCIRdN5OzmWXTkRhO05GtloH+IPuS8PH09vlaGfdCmBdJkvQjnkXL9Jdw5JJcIt9c//CgRjJ4CtHySA3I0XEnqbLHRhsYAyfJfM4ya3ou+eETpWVpnkZ4kbn8fuUkIpZL6YS9XtJCVCfh5uNpJ7BV0DzlZqdV//K3s9CTNyFac0L521YcFRwl0Nb72AlzbtGwWgWh1C8qmlJ/ENf0XI3dafvcqzPL61rwBlo0sah9DWxwUDWMUicUtp8qP5GK8VxHse+QlolJQVSb07jD6bf7+mILX0B3Mg="
@@ -84,16 +84,16 @@ matrix:
- clang --version
- $PYTHON_CMD --version
- echo "CC=$CC CXX=$CXX"
- - $PYTHON_CMD -m pip install pybind11 cibuildwheel==0.10.0 setuptools_scm
+ - $PYTHON_CMD -m pip install --user cibuildwheel==0.10.1
script:
- $PYTHON_CMD setup.py sdist -d wheelhouse
- export CCACHE_BASEDIR=`python3 -c "import tempfile; import os; print(os.path.realpath(tempfile.gettempdir()))"`
- - export CIBW_BEFORE_BUILD='pip install pybind11 setuptools_scm pytest-runner'
+ - export CIBW_BEFORE_BUILD='pip install pybind11'
- export CIBW_SKIP="cp27-* cp34-*"
- - cibuildwheel --output-dir wheelhouse
+ - $PYTHON_CMD -m cibuildwheel --output-dir wheelhouse
- |
if [[ $TRAVIS_TAG ]]; then
- $PYTHON_CMD -m pip install twine
+ $PYTHON_CMD -m pip install --user twine
$PYTHON_CMD -m twine upload wheelhouse/*.whl wheelhouse/*.tar.gz
fi
@@ -104,7 +104,7 @@ matrix:
services:
- docker
install:
- - $PYTHON_CMD -m pip install cibuildwheel==0.10.0
+ - $PYTHON_CMD -m pip install cibuildwheel==0.10.1
#- mkdir gcc-x86_64 && wget -q https://github.com/Noctem/pogeo-toolchain/releases/download/v1.4/gcc-7.2-binutils-2.29-centos5-x86-64.tar.bz2 -O - | tar xj -C gcc-x86_64 --strip-components=1
#- mkdir ccache && wget -q https://www.samba.org/ftp/ccache/ccache-3.3.4.tar.bz2 -O - | tar xj -C ccache --strip-components=1
- mkdir qpdf && wget -q $QPDF_RELEASE -O - | tar xz -C qpdf --strip-components=1
diff --git a/Makefile b/Makefile
index f483ae9..89c1ab0 100644
--- a/Makefile
+++ b/Makefile
@@ -5,7 +5,7 @@ all: build
.PHONY: invalidate-cppcov
invalidate-cppcov:
- find . -name "*.gcno" -print0 | xargs -0 rm
+ find . -name "*.gcno" -delete
.PHONY: build
build: invalidate-cppcov
@@ -19,7 +19,7 @@ clean-coverage-pycov:
.PHONY: clean-coverage-cppcov
clean-coverage-cppcov:
rm -rf coverage/cppcov
- find . -name "*.gcda" -print0 | xargs -0 rm
+ find . -name "*.gcda" -delete
rm -f coverage/cpp.info
.PHONY: clean-coverage
@@ -56,5 +56,6 @@ cppcov: clean-coverage-cppcov build-cppcov pycov coverage/cppcov
coverage: cppcov pycov
.PHONY: docs
-docs:
+docs: build
+ $(MAKE) -C docs clean
$(MAKE) -C docs html
diff --git a/README.md b/README.md
index 3325a20..8e937de 100644
--- a/README.md
+++ b/README.md
@@ -11,10 +11,10 @@ Python + QPDF = "py" + "qpdf" = "pyqpdf", which looks like a dyslexia test. Say
```python
# Elegant, Pythonic API
-pdf = pikepdf.open('input.pdf')
-num_pages = len(pdf.pages)
-del pdf.pages[-1]
-pdf.save('output.pdf')
+with pikepdf.open('input.pdf') as pdf:
+ num_pages = len(pdf.pages)
+ del pdf.pages[-1]
+ pdf.save('output.pdf')
```
**To install:**
@@ -34,8 +34,6 @@ Features
This library is similar to PyPDF2 and pdfrw - it provides low level access to PDF features and allows editing and content transformation of existing PDFs. Some knowledge of the PDF specification may be helpful. It does not have the capability to render a PDF to image.
-Python 2.7 and earlier versions of Python 3 are not currently supported but support is probably not difficult to achieve. Pull requests are welcome.
-
| **Feature** | **pikepdf** | **PyPDF2** | **pdfrw** |
|---------------------------------------------------------------------|-------------------------------------|-------------------------------------------|-----------------------------------------|
| Editing, manipulation and transformation of existing PDFs | ✔ | ✔ | ✔ |
@@ -43,7 +41,7 @@ Python 2.7 and earlier versions of Python 3 are not currently supported but supp
| Implementation | C++ and Python | Python | Python |
| PDF versions supported | 1.1 to 1.7 | 1.3? | 1.7 |
| Python versions supported | 3.5-3.7 | 2.6-3.6 | 2.6-3.6 |
-| Supports password protected (encrypted) PDFs | ✔ (except public key) | Only obsolete RC4 | ✘ |
+| Save and load password protected (encrypted) PDFs | ✔ (except public key) | ✘ (Only obsolete RC4) | ✘ (not at all) |
| Save and load PDF compressed object streams (PDF 1.5) | ✔ | ✘ | ✘ |
| Creates linearized ("fast web view") PDFs | ✔ | ✘ | ✘ |
| Actively maintained | ![pikepdf commit activity][pikepdf-commits] | ![PyPDF2 commit activity][pypdf2-commits] | ![pdfrw commit activity][pdfrw-commits] |
@@ -62,6 +60,11 @@ Python 2.7 and earlier versions of Python 3 are not currently supported but supp
[pdfrw-commits]: https://img.shields.io/github/commit-activity/y/pmaupin/pdfrw.svg
+In Production
+-------------
+
+* [OCRmyPDF](https://github.com/jbarlow83/OCRmyPDF) uses pikepdf to graft OCR text layers onto existing PDFs, to examine the contents of input PDFs, and to optimize PDFs.
+
License
-------
diff --git a/debian/changelog b/debian/changelog
index a9e4c61..8abb809 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,27 @@
+pikepdf (1.5.0.post0+dfsg-1) unstable; urgency=medium
+
+ [ Sean Whitton ]
+ * New upstream release.
+ - Drop python3-defusedxml build-dep
+ - Drop python3-pytest-runner build-dep
+ + Drop rw-build-tree d/tests/control restriction
+ - Tighten python3-pybind11 dependency to require 2.3.0.
+ * Add drop-pybind11-from-setup.py.patch, disable-test_icc_extract.patch.
+ * Drop Fix-issue-25-year-missing-leading-zero-on-some-platforms.patch,
+ drop-setuptools_scm_git_archive-from-setup.py.patch and
+ fix_xmp_metadata_without_xmpmeta_wrapper.patch.
+ * Refresh remaining patches.
+ * d/copyright updates:
+ - Update Files-Excluded
+ - Update upstream copyright years.
+ - Add info for docs/images/pike-{release,tree}.jpg,
+ tests/resources/pike-flate-jp2.pdf
+
+ [ Ondřej Nový ]
+ * Use debhelper-compat instead of debian/compat.
+
+ -- Sean Whitton <spwhitton@spwhitton.name> Thu, 15 Aug 2019 18:47:38 +0100
+
pikepdf (1.0.5+dfsg-3) unstable; urgency=medium
* Cherry pick upstream commit 4d22fe4 as
diff --git a/debian/compat b/debian/compat
deleted file mode 100644
index f599e28..0000000
--- a/debian/compat
+++ /dev/null
@@ -1 +0,0 @@
-10
diff --git a/debian/control b/debian/control
index ee1d2cb..5e56555 100644
--- a/debian/control
+++ b/debian/control
@@ -4,13 +4,11 @@ Priority: optional
Maintainer: Debian Python Modules Team <python-modules-team@lists.alioth.debian.org>
Uploaders: Sean Whitton <spwhitton@spwhitton.name>
Build-Depends:
- debhelper (>= 10),
+ debhelper-compat (= 10),
dh-python,
libqpdf-dev,
python3-all-dev,
- python3-defusedxml,
- python3-pybind11,
- python3-pytest-runner,
+ python3-pybind11 (>= 2.3.0),
python3-setuptools,
python3-setuptools-scm,
python3-setuptools-scm-git-archive,
diff --git a/debian/copyright b/debian/copyright
index 815795c..a4356b9 100644
--- a/debian/copyright
+++ b/debian/copyright
@@ -1,10 +1,17 @@
Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
Upstream-Name: pikepdf
Source: https://github.com/pikepdf/pikepdf
-Files-Excluded: tests/resources/enron1_gs.pdf
+Files-Excluded: tests/resources/enron1_gs.pdf tests/resources/tree.png tests/resources/tree-icc.pdf
+Comment:
+ tests/resources/tree* is from
+ https://svgsilh.com/009688/image/1847429.html which claims CC0-1.0,
+ but it is a modified version of
+ https://pixabay.com/illustrations/christmas-tree-tree-christmas-1847429/
+ which is under a license which is not clearly DFSG:
+ https://pixabay.com/service/license/
Files: *
-Copyright: (C) 2017 James R. Barlow
+Copyright: (C) 2017-2019 James R. Barlow
License: MPL-2.0
Comment:
The file licenses/license.wheel.txt is relevant only when a binary
@@ -26,7 +33,7 @@ Files: debian/*
Copyright: (C) 2018 Sean Whitton <spwhitton@spwhitton.name>
License: MPL-2.0
-Files: docs/images/pike.jpg tests/resources/pike-jp2.pdf
+Files: docs/images/pike.jpg tests/resources/pike*jp2.pdf
Copyright: Public domain
License: public-domain
From the U.S. Fish and Wildlife Service National Image Library.
@@ -85,6 +92,18 @@ License: CeCILL-2.0 or CC-BY-SA-2.0-FR
Comment:
Obtained from: https://commons.wikimedia.org/wiki/File:Pike_square_img_3653.jpg
+Files: docs/images/pike-release.jpg
+Copyright: (C) 2014 Azerty197666
+License: CC-BY-SA-4.0
+Comment:
+ Obtained from: https://commons.wikimedia.org/wiki/File:Release_of_a_pike.jpg
+
+Files: docs/images/pike-tree.jpg
+Copyright: Public domain
+License: CC0-1.0
+Comment:
+ Obtained from: https://www.maxpixel.net/Pike-Fish-Fishing-1082782
+
License: MPL-2.0
This Source Code Form is subject to the terms of the Mozilla Public
License, v. 2.0.
@@ -1594,3 +1613,430 @@ License: CC-BY-SA-2.0-FR
demande.
.
Creative Commons peut être contacté à https://creativecommons.org/.
+
+License: CC-BY-SA-4.0
+ Attribution-ShareAlike 4.0 International
+ .
+ =======================================================================
+ .
+ Creative Commons Corporation ("Creative Commons") is not a law firm and
+ does not provide legal services or legal advice. Distribution of
+ Creative Commons public licenses does not create a lawyer-client or
+ other relationship. Creative Commons makes its licenses and related
+ information available on an "as-is" basis. Creative Commons gives no
+ warranties regarding its licenses, any material licensed under their
+ terms and conditions, or any related information. Creative Commons
+ disclaims all liability for damages resulting from their use to the
+ fullest extent possible.
+ .
+ Using Creative Commons Public Licenses
+ .
+ Creative Commons public licenses provide a standard set of terms and
+ conditions that creators and other rights holders may use to share
+ original works of authorship and other material subject to copyright
+ and certain other rights specified in the public license below. The
+ following considerations are for informational purposes only, are not
+ exhaustive, and do not form part of our licenses.
+ .
+ Considerations for licensors: Our public licenses are
+ intended for use by those authorized to give the public
+ permission to use material in ways otherwise restricted by
+ copyright and certain other rights. Our licenses are
+ irrevocable. Licensors should read and understand the terms
+ and conditions of the license they choose before applying it.
+ Licensors should also secure all rights necessary before
+ applying our licenses so that the public can reuse the
+ material as expected. Licensors should clearly mark any
+ material not subject to the license. This includes other CC-
+ licensed material, or material used under an exception or
+ limitation to copyright. More considerations for licensors:
+ wiki.creativecommons.org/Considerations_for_licensors
+ .
+ Considerations for the public: By using one of our public
+ licenses, a licensor grants the public permission to use the
+ licensed material under specified terms and conditions. If
+ the licensor's permission is not necessary for any reason--for
+ example, because of any applicable exception or limitation to
+ copyright--then that use is not regulated by the license. Our
+ licenses grant only permissions under copyright and certain
+ other rights that a licensor has authority to grant. Use of
+ the licensed material may still be restricted for other
+ reasons, including because others have copyright or other
+ rights in the material. A licensor may make special requests,
+ such as asking that all changes be marked or described.
+ Although not required by our licenses, you are encouraged to
+ respect those requests where reasonable. More_considerations
+ for the public:
+ wiki.creativecommons.org/Considerations_for_licensees
+ .
+ =======================================================================
+ .
+ Creative Commons Attribution-ShareAlike 4.0 International Public
+ License
+ .
+ By exercising the Licensed Rights (defined below), You accept and agree
+ to be bound by the terms and conditions of this Creative Commons
+ Attribution-ShareAlike 4.0 International Public License ("Public
+ License"). To the extent this Public License may be interpreted as a
+ contract, You are granted the Licensed Rights in consideration of Your
+ acceptance of these terms and conditions, and the Licensor grants You
+ such rights in consideration of benefits the Licensor receives from
+ making the Licensed Material available under these terms and
+ conditions.
+ .
+ .
+ Section 1 -- Definitions.
+ .
+ a. Adapted Material means material subject to Copyright and Similar
+ Rights that is derived from or based upon the Licensed Material
+ and in which the Licensed Material is translated, altered,
+ arranged, transformed, or otherwise modified in a manner requiring
+ permission under the Copyright and Similar Rights held by the
+ Licensor. For purposes of this Public License, where the Licensed
+ Material is a musical work, performance, or sound recording,
+ Adapted Material is always produced where the Licensed Material is
+ synched in timed relation with a moving image.
+ .
+ b. Adapter's License means the license You apply to Your Copyright
+ and Similar Rights in Your contributions to Adapted Material in
+ accordance with the terms and conditions of this Public License.
+ .
+ c. BY-SA Compatible License means a license listed at
+ creativecommons.org/compatiblelicenses, approved by Creative
+ Commons as essentially the equivalent of this Public License.
+ .
+ d. Copyright and Similar Rights means copyright and/or similar rights
+ closely related to copyright including, without limitation,
+ performance, broadcast, sound recording, and Sui Generis Database
+ Rights, without regard to how the rights are labeled or
+ categorized. For purposes of this Public License, the rights
+ specified in Section 2(b)(1)-(2) are not Copyright and Similar
+ Rights.
+ .
+ e. Effective Technological Measures means those measures that, in the
+ absence of proper authority, may not be circumvented under laws
+ fulfilling obligations under Article 11 of the WIPO Copyright
+ Treaty adopted on December 20, 1996, and/or similar international
+ agreements.
+ .
+ f. Exceptions and Limitations means fair use, fair dealing, and/or
+ any other exception or limitation to Copyright and Similar Rights
+ that applies to Your use of the Licensed Material.
+ .
+ g. License Elements means the license attributes listed in the name
+ of a Creative Commons Public License. The License Elements of this
+ Public License are Attribution and ShareAlike.
+ .
+ h. Licensed Material means the artistic or literary work, database,
+ or other material to which the Licensor applied this Public
+ License.
+ .
+ i. Licensed Rights means the rights granted to You subject to the
+ terms and conditions of this Public License, which are limited to
+ all Copyright and Similar Rights that apply to Your use of the
+ Licensed Material and that the Licensor has authority to license.
+ .
+ j. Licensor means the individual(s) or entity(ies) granting rights
+ under this Public License.
+ .
+ k. Share means to provide material to the public by any means or
+ process that requires permission under the Licensed Rights, such
+ as reproduction, public display, public performance, distribution,
+ dissemination, communication, or importation, and to make material
+ available to the public including in ways that members of the
+ public may access the material from a place and at a time
+ individually chosen by them.
+ .
+ l. Sui Generis Database Rights means rights other than copyright
+ resulting from Directive 96/9/EC of the European Parliament and of
+ the Council of 11 March 1996 on the legal protection of databases,
+ as amended and/or succeeded, as well as other essentially
+ equivalent rights anywhere in the world.
+ .
+ m. You means the individual or entity exercising the Licensed Rights
+ under this Public License. Your has a corresponding meaning.
+ .
+ .
+ Section 2 -- Scope.
+ .
+ a. License grant.
+ .
+ 1. Subject to the terms and conditions of this Public License,
+ the Licensor hereby grants You a worldwide, royalty-free,
+ non-sublicensable, non-exclusive, irrevocable license to
+ exercise the Licensed Rights in the Licensed Material to:
+ .
+ a. reproduce and Share the Licensed Material, in whole or
+ in part; and
+ .
+ b. produce, reproduce, and Share Adapted Material.
+ .
+ 2. Exceptions and Limitations. For the avoidance of doubt, where
+ Exceptions and Limitations apply to Your use, this Public
+ License does not apply, and You do not need to comply with
+ its terms and conditions.
+ .
+ 3. Term. The term of this Public License is specified in Section
+ 6(a).
+ .
+ 4. Media and formats; technical modifications allowed. The
+ Licensor authorizes You to exercise the Licensed Rights in
+ all media and formats whether now known or hereafter created,
+ and to make technical modifications necessary to do so. The
+ Licensor waives and/or agrees not to assert any right or
+ authority to forbid You from making technical modifications
+ necessary to exercise the Licensed Rights, including
+ technical modifications necessary to circumvent Effective
+ Technological Measures. For purposes of this Public License,
+ simply making modifications authorized by this Section 2(a)
+ (4) never produces Adapted Material.
+ .
+ 5. Downstream recipients.
+ .
+ a. Offer from the Licensor -- Licensed Material. Every
+ recipient of the Licensed Material automatically
+ receives an offer from the Licensor to exercise the
+ Licensed Rights under the terms and conditions of this
+ Public License.
+ .
+ b. Additional offer from the Licensor -- Adapted Material.
+ Every recipient of Adapted Material from You
+ automatically receives an offer from the Licensor to
+ exercise the Licensed Rights in the Adapted Material
+ under the conditions of the Adapter's License You apply.
+ .
+ c. No downstream restrictions. You may not offer or impose
+ any additional or different terms or conditions on, or
+ apply any Effective Technological Measures to, the
+ Licensed Material if doing so restricts exercise of the
+ Licensed Rights by any recipient of the Licensed
+ Material.
+ .
+ 6. No endorsement. Nothing in this Public License constitutes or
+ may be construed as permission to assert or imply that You
+ are, or that Your use of the Licensed Material is, connected
+ with, or sponsored, endorsed, or granted official status by,
+ the Licensor or others designated to receive attribution as
+ provided in Section 3(a)(1)(A)(i).
+ .
+ b. Other rights.
+ .
+ 1. Moral rights, such as the right of integrity, are not
+ licensed under this Public License, nor are publicity,
+ privacy, and/or other similar personality rights; however, to
+ the extent possible, the Licensor waives and/or agrees not to
+ assert any such rights held by the Licensor to the limited
+ extent necessary to allow You to exercise the Licensed
+ Rights, but not otherwise.
+ .
+ 2. Patent and trademark rights are not licensed under this
+ Public License.
+ .
+ 3. To the extent possible, the Licensor waives any right to
+ collect royalties from You for the exercise of the Licensed
+ Rights, whether directly or through a collecting society
+ under any voluntary or waivable statutory or compulsory
+ licensing scheme. In all other cases the Licensor expressly
+ reserves any right to collect such royalties.
+ .
+ .
+ Section 3 -- License Conditions.
+ .
+ Your exercise of the Licensed Rights is expressly made subject to the
+ following conditions.
+ .
+ a. Attribution.
+ .
+ 1. If You Share the Licensed Material (including in modified
+ form), You must:
+ .
+ a. retain the following if it is supplied by the Licensor
+ with the Licensed Material:
+ .
+ i. identification of the creator(s) of the Licensed
+ Material and any others designated to receive
+ attribution, in any reasonable manner requested by
+ the Licensor (including by pseudonym if
+ designated);
+ .
+ ii. a copyright notice;
+ .
+ iii. a notice that refers to this Public License;
+ .
+ iv. a notice that refers to the disclaimer of
+ warranties;
+ .
+ v. a URI or hyperlink to the Licensed Material to the
+ extent reasonably practicable;
+ .
+ b. indicate if You modified the Licensed Material and
+ retain an indication of any previous modifications; and
+ .
+ c. indicate the Licensed Material is licensed under this
+ Public License, and include the text of, or the URI or
+ hyperlink to, this Public License.
+ .
+ 2. You may satisfy the conditions in Section 3(a)(1) in any
+ reasonable manner based on the medium, means, and context in
+ which You Share the Licensed Material. For example, it may be
+ reasonable to satisfy the conditions by providing a URI or
+ hyperlink to a resource that includes the required
+ information.
+ .
+ 3. If requested by the Licensor, You must remove any of the
+ information required by Section 3(a)(1)(A) to the extent
+ reasonably practicable.
+ .
+ b. ShareAlike.
+ .
+ In addition to the conditions in Section 3(a), if You Share
+ Adapted Material You produce, the following conditions also apply.
+ .
+ 1. The Adapter's License You apply must be a Creative Commons
+ license with the same License Elements, this version or
+ later, or a BY-SA Compatible License.
+ .
+ 2. You must include the text of, or the URI or hyperlink to, the
+ Adapter's License You apply. You may satisfy this condition
+ in any reasonable manner based on the medium, means, and
+ context in which You Share Adapted Material.
+ .
+ 3. You may not offer or impose any additional or different terms
+ or conditions on, or apply any Effective Technological
+ Measures to, Adapted Material that restrict exercise of the
+ rights granted under the Adapter's License You apply.
+ .
+ .
+ Section 4 -- Sui Generis Database Rights.
+ .
+ Where the Licensed Rights include Sui Generis Database Rights that
+ apply to Your use of the Licensed Material:
+ .
+ a. for the avoidance of doubt, Section 2(a)(1) grants You the right
+ to extract, reuse, reproduce, and Share all or a substantial
+ portion of the contents of the database;
+ .
+ b. if You include all or a substantial portion of the database
+ contents in a database in which You have Sui Generis Database
+ Rights, then the database in which You have Sui Generis Database
+ Rights (but not its individual contents) is Adapted Material,
+ .
+ including for purposes of Section 3(b); and
+ c. You must comply with the conditions in Section 3(a) if You Share
+ all or a substantial portion of the contents of the database.
+ .
+ For the avoidance of doubt, this Section 4 supplements and does not
+ replace Your obligations under this Public License where the Licensed
+ Rights include other Copyright and Similar Rights.
+ .
+ .
+ Section 5 -- Disclaimer of Warranties and Limitation of Liability.
+ .
+ a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
+ EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
+ AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
+ ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
+ IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
+ WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
+ PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
+ ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
+ KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
+ ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
+ .
+ b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
+ TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
+ NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
+ INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
+ COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
+ USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
+ ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
+ DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
+ IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
+ .
+ c. The disclaimer of warranties and limitation of liability provided
+ above shall be interpreted in a manner that, to the extent
+ possible, most closely approximates an absolute disclaimer and
+ waiver of all liability.
+ .
+ .
+ Section 6 -- Term and Termination.
+ .
+ a. This Public License applies for the term of the Copyright and
+ Similar Rights licensed here. However, if You fail to comply with
+ this Public License, then Your rights under this Public License
+ terminate automatically.
+ .
+ b. Where Your right to use the Licensed Material has terminated under
+ Section 6(a), it reinstates:
+ .
+ 1. automatically as of the date the violation is cured, provided
+ it is cured within 30 days of Your discovery of the
+ violation; or
+ .
+ 2. upon express reinstatement by the Licensor.
+ .
+ For the avoidance of doubt, this Section 6(b) does not affect any
+ right the Licensor may have to seek remedies for Your violations
+ of this Public License.
+ .
+ c. For the avoidance of doubt, the Licensor may also offer the
+ Licensed Material under separate terms or conditions or stop
+ distributing the Licensed Material at any time; however, doing so
+ will not terminate this Public License.
+ .
+ d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
+ License.
+ .
+ .
+ Section 7 -- Other Terms and Conditions.
+ .
+ a. The Licensor shall not be bound by any additional or different
+ terms or conditions communicated by You unless expressly agreed.
+ .
+ b. Any arrangements, understandings, or agreements regarding the
+ Licensed Material not stated herein are separate from and
+ independent of the terms and conditions of this Public License.
+ .
+ .
+ Section 8 -- Interpretation.
+ .
+ a. For the avoidance of doubt, this Public License does not, and
+ shall not be interpreted to, reduce, limit, restrict, or impose
+ conditions on any use of the Licensed Material that could lawfully
+ be made without permission under this Public License.
+ .
+ b. To the extent possible, if any provision of this Public License is
+ deemed unenforceable, it shall be automatically reformed to the
+ minimum extent necessary to make it enforceable. If the provision
+ cannot be reformed, it shall be severed from this Public License
+ without affecting the enforceability of the remaining terms and
+ conditions.
+ .
+ c. No term or condition of this Public License will be waived and no
+ failure to comply consented to unless expressly agreed to by the
+ Licensor.
+ .
+ d. Nothing in this Public License constitutes or may be interpreted
+ as a limitation upon, or waiver of, any privileges and immunities
+ that apply to the Licensor or You, including from the legal
+ processes of any jurisdiction or authority.
+ .
+ .
+ =======================================================================
+ .
+ Creative Commons is not a party to its public licenses.
+ Notwithstanding, Creative Commons may elect to apply one of its public
+ licenses to material it publishes and in those instances will be
+ considered the "Licensor." Except for the limited purpose of indicating
+ that material is shared under a Creative Commons public license or as
+ otherwise permitted by the Creative Commons policies published at
+ creativecommons.org/policies, Creative Commons does not authorize the
+ use of the trademark "Creative Commons" or any other trademark or logo
+ of Creative Commons without its prior written consent including,
+ without limitation, in connection with any unauthorized modifications
+ to any of its public licenses or any other arrangements,
+ understandings, or agreements concerning use of licensed material. For
+ the avoidance of doubt, this paragraph does not form part of the public
+ licenses.
+ .
+ Creative Commons may be contacted at creativecommons.org.
diff --git a/debian/patches/Fix-issue-25-year-missing-leading-zero-on-some-platforms.patch b/debian/patches/Fix-issue-25-year-missing-leading-zero-on-some-platforms.patch
deleted file mode 100644
index 8c51aad..0000000
--- a/debian/patches/Fix-issue-25-year-missing-leading-zero-on-some-platforms.patch
+++ /dev/null
@@ -1,51 +0,0 @@
-From: "James R. Barlow" <jim@purplerock.ca>
-Date: Tue, 12 Feb 2019 20:42:11 -0800
-Subject: Fix issue #25 - year missing leading zero on some platforms
-
-Closes #25
-
-(cherry picked from commit 4d22fe47912c518e8b3348aedccdac3f11ed81d7)
----
- src/pikepdf/models/metadata.py | 7 +++++--
- tests/test_metadata.py | 3 ++-
- 2 files changed, 7 insertions(+), 3 deletions(-)
-
-diff --git a/src/pikepdf/models/metadata.py b/src/pikepdf/models/metadata.py
-index 1a0eeb2..65934cd 100644
---- a/src/pikepdf/models/metadata.py
-+++ b/src/pikepdf/models/metadata.py
-@@ -121,8 +121,11 @@ def encode_pdf_date(d: datetime) -> str:
- the local time.
- """
-
-- pdfmark_date_fmt = r'%Y%m%d%H%M%S'
-- s = d.strftime(pdfmark_date_fmt)
-+ # The formatting of %Y is not consistent as described in
-+ # https://bugs.python.org/issue13305 and underspecification in libc.
-+ # So explicitly format the year with leading zeros
-+ s = "{:04d}".format(d.year)
-+ s += d.strftime(r'%m%d%H%M%S')
- tz = d.strftime('%z')
- if tz:
- sign, tz_hours, tz_mins = tz[0], tz[1:3], tz[3:5]
-diff --git a/tests/test_metadata.py b/tests/test_metadata.py
-index 1d41878..41a879c 100644
---- a/tests/test_metadata.py
-+++ b/tests/test_metadata.py
-@@ -3,7 +3,7 @@ from datetime import datetime, timezone, timedelta
- import re
-
- import pytest
--from hypothesis import given
-+from hypothesis import given, example
- from hypothesis.strategies import integers
- import pikepdf
- from pikepdf import Pdf, Dictionary, Name, PasswordError, Stream
-@@ -252,6 +252,7 @@ def test_date_docinfo_from_xmp():
- integers(0, 99),
- integers(0, 99),
- )
-+@example(1, 1, 1, 0, 0, 0)
- def test_random_dates(year, month, day, hour, mins, sec):
- date_args = year, month, day, hour, mins, sec
- xmp = '{:04d}-{:02d}-{:02d}T{:02d}:{:02d}:{:02d}'.format(*date_args)
diff --git a/debian/patches/disable-test_docinfo_problems.patch b/debian/patches/disable-test_docinfo_problems.patch
index a69ca9a..fdcd1fa 100644
--- a/debian/patches/disable-test_docinfo_problems.patch
+++ b/debian/patches/disable-test_docinfo_problems.patch
@@ -9,7 +9,7 @@ Needs a test resource whose DFSG status is in doubt.
--- a/tests/test_metadata.py
+++ b/tests/test_metadata.py
-@@ -50,12 +50,6 @@ def trivial(resources):
+@@ -58,12 +58,6 @@ def trivial(resources):
@pytest.fixture
@@ -22,12 +22,10 @@ Needs a test resource whose DFSG status is in doubt.
def invalid_creationdate(resources):
# Has nuls in docinfo, old PDF
return Pdf.open(resources / 'invalid_creationdate.pdf')
-@@ -320,34 +314,6 @@ def test_remove_attribute_metadata(sandw
-
- # Ensure the whole node was deleted
+@@ -337,37 +331,6 @@ def test_remove_attribute_metadata(sandw
assert not re.search(r'rdf:Description xmlns:[^\s]+ rdf:about=""/', str(xmp))
--
--
+
+
-def test_docinfo_problems(enron1, invalid_creationdate):
- meta = enron1.open_metadata()
- meta._load() # File has invalid XML sequence &#0;
@@ -45,15 +43,20 @@ Needs a test resource whose DFSG status is in doubt.
-
-
-def test_wrong_xml(enron1):
-- enron1.Root.Metadata = Stream(enron1, b"""
+- enron1.Root.Metadata = Stream(
+- enron1,
+- b"""
- <test><xml>This is valid xml but not valid XMP</xml></test>
-- """.strip())
+- """.strip(),
+- )
- meta = enron1.open_metadata()
-- with pytest.raises(ValueError, message='not XMP'):
+- with pytest.raises(ValueError, match='not XMP'):
- with meta:
- pass
-- with pytest.raises(ValueError, message='not XMP'):
+- with pytest.raises(ValueError, match='not XMP'):
- meta['pdfaid:part']
-
-
+-
+-
def test_no_x_xmpmeta(trivial):
+ trivial.Root.Metadata = Stream(
+ trivial,
diff --git a/debian/patches/disable-test_icc_extract.patch b/debian/patches/disable-test_icc_extract.patch
new file mode 100644
index 0000000..3f6973f
--- /dev/null
+++ b/debian/patches/disable-test_icc_extract.patch
@@ -0,0 +1,27 @@
+From: Sean Whitton <spwhitton@spwhitton.name>
+Date: Thu, 15 Aug 2019 18:42:49 +0100
+Subject: disable test_icc_extract
+
+Requires a test resource in Files-Excluded.
+---
+ tests/test_image_access.py | 7 -------
+ 1 file changed, 7 deletions(-)
+
+diff --git a/tests/test_image_access.py b/tests/test_image_access.py
+index d4625d5..113a5ef 100644
+--- a/tests/test_image_access.py
++++ b/tests/test_image_access.py
+@@ -317,13 +317,6 @@ def test_icc_use(resources):
+ assert pim.icc.profile.xcolor_space == 'GRAY'
+
+
+-def test_icc_extract(resources):
+- xobj, _pdf = first_image_in(resources / 'tree-icc.pdf')
+-
+- pim = PdfImage(xobj)
+- assert pim.as_pil_image().info['icc_profile'] == pim.icc.tobytes()
+-
+-
+ def test_stacked_compression(resources):
+ xobj, _pdf = first_image_in(resources / 'pike-flate-jp2.pdf')
+
diff --git a/debian/patches/docs-build-use-DEB_VERSION_UPSTREAM.patch b/debian/patches/docs-build-use-DEB_VERSION_UPSTREAM.patch
index e440dca..f880317 100644
--- a/debian/patches/docs-build-use-DEB_VERSION_UPSTREAM.patch
+++ b/debian/patches/docs-build-use-DEB_VERSION_UPSTREAM.patch
@@ -8,8 +8,8 @@ Subject: docs build use DEB_VERSION_UPSTREAM
--- a/docs/conf.py
+++ b/docs/conf.py
-@@ -52,8 +52,6 @@ else:
- # documentation root, use os.path.abspath to make it absolute, like shown here.
+@@ -76,8 +76,6 @@ else:
+ sys.path.insert(0, os.path.join(os.path.abspath('.'), './_ext'))
sys.path.insert(0, os.path.join(os.path.abspath('.'), '..'))
-import pikepdf
@@ -17,7 +17,7 @@ Subject: docs build use DEB_VERSION_UPSTREAM
# -- General configuration ------------------------------------------------
# If your documentation needs a minimal Sphinx version, state it here.
-@@ -101,7 +99,7 @@ author = u'James R. Barlow'
+@@ -123,7 +121,7 @@ author = u'James R. Barlow'
# |version| and |release|, also used in various other places throughout the
# built documents.
diff --git a/debian/patches/drop-installation-from-docs-contents.patch b/debian/patches/drop-installation-from-docs-contents.patch
index 65ca879..bfd6dae 100644
--- a/debian/patches/drop-installation-from-docs-contents.patch
+++ b/debian/patches/drop-installation-from-docs-contents.patch
@@ -13,6 +13,6 @@ Subject: drop installation from docs contents
:name: intro_toc
- installation
- changelog
+ release_notes
tutorial
objects
diff --git a/debian/patches/drop-pybind11-from-setup.py.patch b/debian/patches/drop-pybind11-from-setup.py.patch
new file mode 100644
index 0000000..90dc374
--- /dev/null
+++ b/debian/patches/drop-pybind11-from-setup.py.patch
@@ -0,0 +1,22 @@
+From: Sean Whitton <spwhitton@spwhitton.name>
+Date: Thu, 15 Aug 2019 18:12:41 +0100
+Subject: drop pybind11 from setup.py
+
+We are successfully providing pybind11 as a build-dep but pikepdf's
+setup.py fails to detect it for some reason.
+---
+ setup.py | 1 -
+ 1 file changed, 1 deletion(-)
+
+diff --git a/setup.py b/setup.py
+index 59e9b3b..c1f62b2 100644
+--- a/setup.py
++++ b/setup.py
+@@ -138,7 +138,6 @@ setup(
+ setup_requires=[
+ 'setuptools_scm',
+ 'setuptools_scm_git_archive',
+- 'pybind11 >= 2.3.0, < 3',
+ ],
+ use_scm_version=True,
+ tests_require=tests_require,
diff --git a/debian/patches/drop-setuptools_scm_git_archive-from-setup.py.patch b/debian/patches/drop-setuptools_scm_git_archive-from-setup.py.patch
deleted file mode 100644
index 9349e43..0000000
--- a/debian/patches/drop-setuptools_scm_git_archive-from-setup.py.patch
+++ /dev/null
@@ -1,19 +0,0 @@
-From: Sean Whitton <spwhitton@spwhitton.name>
-Date: Wed, 10 Oct 2018 08:17:05 -0700
-Subject: drop setuptools_scm_git_archive from setup.py
-
-Pending resolution of #910742.
----
- setup.py | 1 -
- 1 file changed, 1 deletion(-)
-
---- a/setup.py
-+++ b/setup.py
-@@ -134,7 +134,6 @@ setup(
- setup_requires=[
- 'pytest-runner',
- 'setuptools_scm',
-- 'setuptools_scm_git_archive',
- 'pybind11 >= 2.2.4, < 3'
- ],
- use_scm_version=True,
diff --git a/debian/patches/fix_xmp_metadata_without_xmpmeta_wrapper.patch b/debian/patches/fix_xmp_metadata_without_xmpmeta_wrapper.patch
deleted file mode 100644
index 0254bee..0000000
--- a/debian/patches/fix_xmp_metadata_without_xmpmeta_wrapper.patch
+++ /dev/null
@@ -1,110 +0,0 @@
-From d31ea8fed2004345b3c274172ff0c28b7c6aca16 Mon Sep 17 00:00:00 2001
-From: "James R. Barlow" <jim@purplerock.ca>
-Date: Wed, 6 Feb 2019 00:36:59 -0800
-Subject: [PATCH] Fix handling of XMP metadata with no <x:xmpmeta> wrapper
-
----
- src/pikepdf/models/metadata.py | 12 ++++++++--
- tests/test_metadata.py | 41 ++++++++++++++++++++++++++++++++--
- 2 files changed, 49 insertions(+), 4 deletions(-)
-
-diff --git a/src/pikepdf/models/metadata.py b/src/pikepdf/models/metadata.py
-index 58c5a9d..23d246b 100644
---- a/src/pikepdf/models/metadata.py
-+++ b/src/pikepdf/models/metadata.py
-@@ -428,6 +428,14 @@ def _get_subelements(self, node):
- return result
- return ''
-
-+ def _get_rdf_root(self):
-+ rdf = self._xmp.find('.//rdf:RDF', self.NS)
-+ if rdf is None:
-+ rdf = self._xmp.getroot()
-+ if not rdf.tag == '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF':
-+ raise ValueError("Metadata seems to be XML but not XMP")
-+ return rdf
-+
- def _get_elements(self, name=''):
- """Get elements from XMP
-
-@@ -452,7 +460,7 @@ def _get_elements(self, name=''):
-
- """
- qname = self._qname(name)
-- rdf = self._xmp.find('.//rdf:RDF', self.NS)
-+ rdf = self._get_rdf_root()
- for rdfdesc in rdf.findall('rdf:Description[@rdf:about=""]', self.NS):
- if qname and qname in rdfdesc.keys():
- yield (rdfdesc, qname, rdfdesc.get(qname), rdf)
-@@ -540,7 +548,7 @@ def add_array(node, items):
- raise TypeError(val)
- except StopIteration:
- # Insert a new node
-- rdf = self._xmp.find('.//rdf:RDF', self.NS)
-+ rdf = self._get_rdf_root()
- if str(self._qname(key)) in LANG_ALTS:
- val = AltList([clean(val)])
- if isinstance(val, (list, set)):
-diff --git a/tests/test_metadata.py b/tests/test_metadata.py
-index abe05ff..be654c8 100644
---- a/tests/test_metadata.py
-+++ b/tests/test_metadata.py
-@@ -6,7 +6,7 @@
- from hypothesis import given
- from hypothesis.strategies import integers
- import pikepdf
--from pikepdf import Pdf, Dictionary, Name, PasswordError
-+from pikepdf import Pdf, Dictionary, Name, PasswordError, Stream
- from pikepdf.models.metadata import (
- decode_pdf_date, encode_pdf_date,
- XMP_NS_DC, XMP_NS_PDF, XMP_NS_XMP,
-@@ -285,7 +285,7 @@ def test_bad_char_rejection(trivial):
- ET.fromstring(str(xmp))
-
-
--def test_xpacket(sandwich):
-+def test_xpacket_generation(sandwich):
- xmpstr1 = sandwich.Root.Metadata.read_bytes()
- xpacket_begin = b'<?xpacket begin='
- xpacket_end = b'<?xpacket end='
-@@ -336,3 +336,40 @@ def test_docinfo_problems(enron1, invalid_creationdate):
- with meta as xmp:
- xmp['xmp:CreateDate'] = 'invalid date'
- assert 'could not be updated' in warned[0].message.args[0]
-+
-+
-+def test_wrong_xml(enron1):
-+ enron1.Root.Metadata = Stream(enron1, b"""
-+ <test><xml>This is valid xml but not valid XMP</xml></test>
-+ """.strip())
-+ meta = enron1.open_metadata()
-+ with pytest.raises(ValueError, message='not XMP'):
-+ with meta:
-+ pass
-+ with pytest.raises(ValueError, message='not XMP'):
-+ meta['pdfaid:part']
-+
-+
-+def test_no_x_xmpmeta(trivial):
-+ trivial.Root.Metadata = Stream(trivial, b"""
-+ <?xpacket begin="\xef\xbb\xbf" id="W5M0MpCehiHzreSzNTczkc9d"?>
-+ <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
-+ xmlns:xmp="http://ns.adobe.com/xap/1.0/">
-+ <rdf:Description rdf:about=""
-+ xmlns:pdfaid="http://www.aiim.org/pdfa/ns/id/"
-+ xmlns:xmp="http://ns.adobe.com/xap/1.0/">
-+ <pdfaid:part>1</pdfaid:part>
-+ <pdfaid:conformance>A</pdfaid:conformance>
-+ <xmp:CreatorTool>Simple Scan 3.30.2</xmp:CreatorTool>
-+ <xmp:CreateDate>2019-02-05T07:08:46+01:00</xmp:CreateDate>
-+ <xmp:ModifyDate>2019-02-05T07:08:46+01:00</xmp:ModifyDate>
-+ <xmp:MetadataDate>2019-02-05T07:08:46+01:00</xmp:MetadataDate>
-+ </rdf:Description>
-+ </rdf:RDF>
-+ <?xpacket end="w"?>
-+ """.strip())
-+
-+ with trivial.open_metadata() as xmp:
-+ assert xmp._get_rdf_root() is not None
-+ xmp['pdfaid:part'] = '2'
-+ assert xmp['pdfaid:part'] == '2'
diff --git a/debian/patches/series b/debian/patches/series
index 152c5b8..57f09d3 100644
--- a/debian/patches/series
+++ b/debian/patches/series
@@ -1,6 +1,5 @@
docs-build-use-DEB_VERSION_UPSTREAM.patch
drop-installation-from-docs-contents.patch
-drop-setuptools_scm_git_archive-from-setup.py.patch
-fix_xmp_metadata_without_xmpmeta_wrapper.patch
disable-test_docinfo_problems.patch
-Fix-issue-25-year-missing-leading-zero-on-some-platforms.patch
+drop-pybind11-from-setup.py.patch
+disable-test_icc_extract.patch
diff --git a/debian/tests/control b/debian/tests/control
index c20747f..3c956a8 100644
--- a/debian/tests/control
+++ b/debian/tests/control
@@ -1,6 +1,4 @@
Tests: test-suite
-Restrictions: rw-build-tree
-# ^ pytest-runner writes stuff to the source tree
Depends: @,
python3-pytest,
python3-pytest-helpers-namespace,
diff --git a/docs/_ext/fix_pybind11_autodoc.py b/docs/_ext/fix_pybind11_autodoc.py
new file mode 100644
index 0000000..d46dbff
--- /dev/null
+++ b/docs/_ext/fix_pybind11_autodoc.py
@@ -0,0 +1,31 @@
+# pybind11 generates some docstrings and function signatures that are functionally
+# correct but encourage users to rely on implementation details. Fix these here.
+
+replacements = {
+ 'pikepdf._qpdf.Object': 'pikepdf.Object',
+ 'pikepdf._qpdf.Pdf': 'pikepdf.Pdf',
+ 'QPDFObjectHandle': 'pikepdf.Object',
+ 'QPDFExc': 'pikepdf.PdfError',
+}
+
+
+def fix_sigs(app, what, name, obj, options, signature, return_annotation):
+ for from_, to in replacements.items():
+ if signature:
+ signature = signature.replace(from_, to)
+ if return_annotation:
+ return_annotation = return_annotation.replace(from_, to)
+ return signature, return_annotation
+
+
+def fix_doc(app, what, name, obj, options, lines):
+ for n, line in enumerate(lines[:]):
+ for from_, to in replacements.items():
+ lines[n] = lines[n].replace(from_, to)
+
+
+def setup(app):
+ app.connect('autodoc-process-signature', fix_sigs)
+ app.connect('autodoc-process-docstring', fix_doc)
+
+ return {'version': '0.1', 'parallel_read_safe': True, 'parallel_write_safe': True}
diff --git a/docs/pikepdf.rst b/docs/api.rst
index 750a3db..792fb8d 100644
--- a/docs/pikepdf.rst
+++ b/docs/api.rst
@@ -1,15 +1,15 @@
pikepdf API
***********
-Primary objects
-===============
+Main objects
+============
.. autoclass:: pikepdf.Pdf
:members:
.. autofunction:: pikepdf.open
-.. autoclass:: pikepdf.ObjectStreamMode
+.. class:: pikepdf.ObjectStreamMode
Options for saving object streams within PDFs, which are a more compact
way of saving certain types of data that was added in PDF 1.5. All
@@ -30,7 +30,7 @@ Primary objects
Generate object streams.
-.. autoclass:: pikepdf.StreamDecodeLevel
+.. class:: pikepdf.StreamDecodeLevel
.. attribute:: none
@@ -68,6 +68,8 @@ Primary objects
compression and decompression cycles. This is mostly useful for
retrieving image data.
+.. autoclass:: pikepdf.Encryption
+
.. autoexception:: pikepdf.PdfError
.. autoexception:: pikepdf.PasswordError
@@ -126,3 +128,77 @@ Support models
.. autoclass:: pikepdf.models.PdfMetadata
:members:
+
+.. autoclass:: pikepdf.models.Encryption
+ :members:
+
+.. autoclass:: pikepdf.Permissions
+ :members:
+
+ .. attribute:: accessibility
+
+ The PDF owner's permission for screen readers and accessibility
+ tools to access the PDF.
+
+ .. attribute:: extract
+
+ The PDF owner's permission for software to extract content from a PDF.
+
+ .. attribute:: modify_annotation
+
+ .. attribute:: modify_assembly
+
+ .. attribute:: modify_form
+
+ .. attribute:: modify_other
+
+ The PDF owner's permission to modify various parts of a PDF.
+
+ .. attribute:: print_lowres
+
+ .. attribute:: print_highres
+
+ The PDF owner's permission to print at low or high resolution.
+
+.. class:: pikepdf.models.EncryptionMethod
+
+ Describes which encryption method was used on a particular part of a
+ PDF. These values are returned by :class:`pikepdf.EncryptionInfo` but
+ are not currently used to specify how encryption is requested.
+
+ .. attribute:: none
+
+ Data was not encrypted.
+
+ .. attribute:: unknown
+
+ An unknown algorithm was used.
+
+ .. attribute:: rc4
+
+ The RC4 encryption algorithm was used (obsolete).
+
+ .. attribute:: aes
+
+ The AES-based algorithm was used as described in the PDF 1.7 reference manual.
+
+ .. attribute:: aesv3
+
+ An improved version of the AES-based algorithm was used as described in the
+ Adobe Supplement to the ISO 32000, requiring PDF 1.7 extension level 3. This
+ algorithm still uses AES, but allows both AES-128 and AES-256, and improves how
+ the key is derived from the password.
+
+.. autoclass:: pikepdf.models.EncryptionInfo
+ :members:
+
+Internal objects
+================
+
+These objects are returned by other pikepdf objects. They are part of the API,
+but not intended to be created explicitly.
+
+.. autoclass:: pikepdf._qpdf.PageList
+ :members:
+
+ A ``list``-like object enumerating all pages in a :class:`pikepdf.Pdf`.
diff --git a/docs/changelog.rst b/docs/changelog.rst
deleted file mode 100644
index 0ac5e8e..0000000
--- a/docs/changelog.rst
+++ /dev/null
@@ -1,256 +0,0 @@
-.. _changelog:
-
-Changelog
-#########
-
-pikepdf releases use the `semantic versioning <http://semver.org>`_ policy.
-
-The pikepdf API (as provided by ``import pikepdf``) is quite stable and is in production use.
-
-Note that the C++ extension module ``pikepdf._qpdf`` is a private interface within pikepdf that applications should not use directly.
-
-v1.0.5
-======
-
-* Fixed an issue where an invalid date in XMP metadata would cause an exception when updating DocumentInfo. For now, we warn that some DocumentInfo is not convertible. (In the future, we should also check if the XMP date is valid, because it probably is not.)
-
-* Rebuilt the binary wheels with libqpdf 8.3.0. libqpdf 8.2.1 is still supported.
-
-v1.0.4
-======
-
-* Updates to tests/resources (provenance of one test file, replaced another test file with a synthetic one)
-
-v1.0.3
-======
-
-* Fixed regression on negative indexing of pages.
-
-v1.0.2
-======
-
-* Fixed an issue where invalid values such as out of range years (e.g. 0) in DocumentInfo would raise exceptions when using DocumentInfo to populate XMP metadata with ``.load_from_docinfo``.
-
-v1.0.1
-======
-
-* Fixed an exception with handling metadata that contains the invalid XML entity ``&#0;`` (an escaped NUL)
-
-v1.0.0
-======
-
-* Changed version to 1.0.
-
-v0.10.2
-=======
-
-Fixes
------
-
-* Fixed segfault when overwriting the pikepdf file that is currently open on Linux.
-
-* Fixed removal of an attribute metadata value when values were present on the same node.
-
-v0.10.1
-=======
-
-Fixes
------
-
-* Avoid canonical XML since it is apparently too strict for XMP.
-
-v0.10.0
-=======
-
-Fixes
------
-
-* Fixed several issues related to generating XMP metadata that passed veraPDF validation.
-
-* Fixed a random test suite failure for very large negative integers.
-
-* The lxml library is now required.
-
-v0.9.2
-======
-
-Fixes
------
-
-* Added all of the commonly used XML namespaces to XMP metadata handling, so we are less likely to name something 'ns1', etc.
-
-* Skip a test that fails on Windows.
-
-* Fixed build errors in documentation.
-
-v0.9.1
-======
-
-Fixes
------
-
-* Fix ``Object.write()`` accepting positional arguments it wouldn't use
-
-* Fix handling of XMP data with timezones (or missing timezone information) in a few cases
-
-* Fix generation of XMP with invalid XML characters if the invalid characters were inside a non-scalar object
-
-v0.9.0
-======
-
-Updates
--------
-
-* New API to access and edit PDF metadata and make consistent edits to the new and old style of PDF metadata.
-
-* 32-bit binary wheels are now available for Windows
-
-* PDFs can now be saved in QPDF's "qdf" mode
-
-* The Python package defusedxml is now required
-
-* The Python package python-xmp-toolkit and its dependency libexempi are suggested for testing, but not required
-
-Fixes
------
-
-* Fixed handling of filenames that contain multibyte characters on non-UTF-8 systems
-
-Breaking
---------
-
-* The ``Pdf.metadata`` property was removed, and replaced with the new metadata API
-
-* ``Pdf.attach()`` has been removed, because the interface as implemented had no way to deal with existing attachments.
-
-v0.3.7
-======
-
-* Add API for inline images to unparse themselves
-
-v0.3.6
-======
-
-* Performance of reading files from memory improved to avoid unnecessary copies.
-
-* It is finally possible to use ``for key in pdfobj`` to iterate contents of PDF Dictionary, Stream and Array objects. Generally these objects behave more like Python containers should now.
-
-* Package API declared beta.
-
-v0.3.5
-======
-
-Breaking
---------
-
-* ``Pdf.save(...stream_data_mode=...)`` has been dropped in favor of the newer ``compress_streams=`` and ``stream_decode_level`` parameters.
-
-Fixes
------
-
-* A use-after-free memory error that caused occasional segfaults and "QPDFFakeName" errors when opening from stream objects has been resolved.
-
-v0.3.4
-======
-
-Updates
--------
-
-* pybind11 vendoring has ended now that v2.2.4 has been released
-
-v0.3.3
-======
-
-Breaking
---------
-
-* libqpdf 8.2.1 is now required
-
-Updates
--------
-
-* Improved support for working with JPEG2000 images in PDFs
-* Added progress callback for saving files, ``Pdf.save(..., progress=)``
-* Updated pybind11 subtree
-
-Fixes
------
-
-* ``del obj.AttributeName`` was not implemented. The attribute interface is now consistent
-* Deleting named attributes now defers to the attribute dictionary for Stream objects, as get/set do
-* Fixed handling of JPEG2000 images where metadata must be retrieved from the file
-
-v0.3.2
-======
-
-Updates
--------
-
-* Added support for direct image extraction of CMYK and grayscale JPEGs, where previously only RGB (internally YUV) was supported
-* ``Array()`` now creates an empty array properly
-* The syntax ``Name.Foo in Dictionary()``, e.g. ``Name.XObject in page.Resources``, now works
-
-v0.3.1
-======
-
-Breaking
---------
-
-* ``pikepdf.open`` now validates its keyword arguments properly, potentially breaking code that passed invalid arguments
-* libqpdf 8.1.0 is now required - libqpdf 8.1.0 API is now used for creating Unicode strings
-* If a non-existent file is opened with ``pikepdf.open``, a ``FileNotFoundError`` is raised instead of a generic error
-* We are now *temporarily* vendoring a copy of pybind11 since its master branch contains unreleased and important fixes for Python 3.7.
-
-Updates
--------
-
-* The syntax ``Name.Thing`` (e.g. ``Name.DecodeParms``) is now supported as equivalent to ``Name('/Thing')`` and is the recommended way to refer names within a PDF
-* New API ``Pdf.remove_unneeded_resources()`` which removes objects from each page's resource dictionary that are not used in the page. This can be used to create smaller files.
-
-Fixes
------
-
-* Fixed an error parsing inline images that have masks
-* Fixed several instances of catching C++ exceptions by value instead of by reference
-
-v0.3.0
-======
-
-Breaking
---------
-
-* Modified ``Object.write`` method signature to require ``filter`` and ``decode_parms`` as keyword arguments
-* Implement automatic type conversion from the PDF Null type to ``None``
-* Removed ``Object.unparse_resolved`` in favor of ``Object.unparse(resolved=True)``
-* libqpdf 8.0.2 is now required at minimum
-
-Updates
--------
-
-* Improved IPython/Jupyter interface to directly export temporary PDFs
-* Updated to qpdf 8.1.0 in wheels
-* Added Python 3.7 support for Windows
-* Added a number of missing options from QPDF to ``Pdf.open`` and ``Pdf.save``
-* Added ability to delete a slice of pages
-* Began using Jupyter notebooks for documentation
-
-v0.2.2
-======
-
-* Added Python 3.7 support to build and test (not yet available for Windows, due to lack of availability on Appveyor)
-* Removed setter API from ``PdfImage`` because it never worked anyway
-* Improved handling of ``PdfImage`` with trivial palettes
-
-v0.2.1
-======
-
-* ``Object.check_owner`` renamed to ``Object.is_owned_by``
-* ``Object.objgen`` and ``Object.get_object_id`` are now public functions
-* Major internal reorganization with ``pikepdf.models`` becoming the submodule that holds support code to ease access to PDF objects as opposed to wrapping QPDF.
-
-v0.2.0
-======
-
-* Implemented automatic type conversion for ``int``, ``bool`` and ``Decimal``, eliminating the ``pikepdf.{Integer,Boolean,Real}`` types. Removed a lot of associated numerical code.
-
-Everything before v0.2.0 can be considered too old to document.
diff --git a/docs/conf.py b/docs/conf.py
index f0dd83d..f01a2ba 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -22,19 +22,41 @@ on_rtd = os.environ.get('READTHEDOCS') == 'True'
if on_rtd:
# Borrowed from https://github.com/YannickJadoul/Parselmouth/blob/master/docs/conf.py
rtd_version = os.environ.get('READTHEDOCS_VERSION')
- setup_py_version = subprocess.check_output([sys.executable, 'setup.py', '--version'], cwd='..').decode('ascii').strip()
+ setup_py_version = (
+ subprocess.check_output([sys.executable, 'setup.py', '--version'], cwd='..')
+ .decode('ascii')
+ .strip()
+ )
if rtd_version == 'stable':
branch = None
try:
- subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'pikepdf=={}'.format(setup_py_version)])
+ subprocess.check_call(
+ [
+ sys.executable,
+ '-m',
+ 'pip',
+ 'install',
+ 'pikepdf=={}'.format(setup_py_version),
+ ]
+ )
except subprocess.CalledProcessError:
branch = 'master'
else:
branch = 'master' if rtd_version == 'latest' else rtd_version
if branch is not None:
- subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--only-binary', 'pikepdf', 'pikepdf'])
+ subprocess.check_call(
+ [
+ sys.executable,
+ '-m',
+ 'pip',
+ 'install',
+ '--only-binary',
+ 'pikepdf',
+ 'pikepdf',
+ ]
+ )
class Mock(MagicMock):
@classmethod
@@ -47,15 +69,17 @@ if on_rtd:
else:
sys.path.insert(0, os.path.abspath(os.path.join('..', 'installed')))
+
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
+sys.path.insert(0, os.path.join(os.path.abspath('.'), './_ext'))
sys.path.insert(0, os.path.join(os.path.abspath('.'), '..'))
# -- General configuration ------------------------------------------------
# If your documentation needs a minimal Sphinx version, state it here.
-#needs_sphinx = '1.0'
+# needs_sphinx = '1.0'
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
@@ -66,13 +90,11 @@ extensions = [
'sphinx.ext.autosummary',
'sphinx.ext.napoleon',
'IPython.sphinxext.ipython_console_highlighting',
- 'IPython.sphinxext.ipython_directive'
+ 'IPython.sphinxext.ipython_directive',
+ 'fix_pybind11_autodoc',
]
-ipython_execlines = [
- 'import pikepdf',
- 'from pikepdf import Pdf'
-]
+ipython_execlines = ['import pikepdf', 'from pikepdf import Pdf']
autosummary_generate = True
@@ -85,7 +107,7 @@ templates_path = ['_templates']
source_suffix = '.rst'
# The encoding of source files.
-#source_encoding = 'utf-8-sig'
+# source_encoding = 'utf-8-sig'
# The master toctree document.
master_doc = 'index'
@@ -111,9 +133,9 @@ language = None
# There are two options for replacing |today|: either, you set today to some
# non-false value, then it is used:
-#today = ''
+# today = ''
# Else, today_fmt is used as the format for a strftime call.
-#today_fmt = '%B %d, %Y'
+# today_fmt = '%B %d, %Y'
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
@@ -121,27 +143,27 @@ exclude_patterns = ['_build', '**.ipynb_checkpoints', '_notebooks']
# The reST default role (used for this markup: `text`) to use for all
# documents.
-#default_role = None
+# default_role = None
# If true, '()' will be appended to :func: etc. cross-reference text.
-#add_function_parentheses = True
+# add_function_parentheses = True
# If true, the current module name will be prepended to all description
# unit titles (such as .. function::).
-#add_module_names = True
+# add_module_names = True
# If true, sectionauthor and moduleauthor directives will be shown in the
# output. They are ignored by default.
-#show_authors = False
+# show_authors = False
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'
# A list of ignored prefixes for module index sorting.
-#modindex_common_prefix = []
+# modindex_common_prefix = []
# If true, keep warnings as "system message" paragraphs in the built documents.
-#keep_warnings = False
+# keep_warnings = False
# If true, `todo` and `todoList` produce output, else they produce nothing.
todo_include_todos = False
@@ -156,26 +178,26 @@ html_theme = 'sphinx_rtd_theme'
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
-#html_theme_options = {}
+# html_theme_options = {}
# Add any paths that contain custom themes here, relative to this directory.
-#html_theme_path = []
+# html_theme_path = []
# The name for this set of Sphinx documents. If None, it defaults to
# "<project> v<release> documentation".
-#html_title = None
+# html_title = None
# A shorter title for the navigation bar. Default is the same as html_title.
-#html_short_title = None
+# html_short_title = None
# The name of an image file (relative to this directory) to place at the top
# of the sidebar.
-#html_logo = None
+# html_logo = None
# The name of an image file (within the static path) to use as favicon of the
# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
# pixels large.
-#html_favicon = None
+# html_favicon = None
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
@@ -185,62 +207,62 @@ html_static_path = ['_static']
# Add any extra paths that contain custom files (such as robots.txt or
# .htaccess) here, relative to this directory. These files are copied
# directly to the root of the documentation.
-#html_extra_path = []
+# html_extra_path = []
# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
# using the given strftime format.
-#html_last_updated_fmt = '%b %d, %Y'
+# html_last_updated_fmt = '%b %d, %Y'
# If true, SmartyPants will be used to convert quotes and dashes to
# typographically correct entities.
-#html_use_smartypants = True
+# html_use_smartypants = True
# Custom sidebar templates, maps document names to template names.
-#html_sidebars = {}
+# html_sidebars = {}
# Additional templates that should be rendered to pages, maps page names to
# template names.
-#html_additional_pages = {}
+# html_additional_pages = {}
# If false, no module index is generated.
-#html_domain_indices = True
+# html_domain_indices = True
# If false, no index is generated.
-#html_use_index = True
+# html_use_index = True
# If true, the index is split into individual pages for each letter.
-#html_split_index = False
+# html_split_index = False
# If true, links to the reST sources are added to the pages.
-#html_show_sourcelink = True
+# html_show_sourcelink = True
# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
-#html_show_sphinx = True
+# html_show_sphinx = True
# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
-#html_show_copyright = True
+# html_show_copyright = True
# If true, an OpenSearch description file will be output, and all pages will
# contain a <link> tag referring to it. The value of this option must be the
# base URL from which the finished HTML is served.
-#html_use_opensearch = ''
+# html_use_opensearch = ''
# This is the file name suffix for HTML files (e.g. ".xhtml").
-#html_file_suffix = None
+# html_file_suffix = None
# Language to be used for generating the HTML full-text search index.
# Sphinx supports the following languages:
# 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja'
# 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr'
-#html_search_language = 'en'
+# html_search_language = 'en'
# A dictionary with options for the search language support, empty by default.
# Now only 'ja' uses this config value
-#html_search_options = {'type': 'default'}
+# html_search_options = {'type': 'default'}
# The name of a javascript file (relative to the configuration directory) that
# implements a search results scorer. If empty, the default will be used.
-#html_search_scorer = 'scorer.js'
+# html_search_scorer = 'scorer.js'
# Output file base name for HTML help builder.
htmlhelp_basename = 'pikepdfdoc'
@@ -248,59 +270,52 @@ htmlhelp_basename = 'pikepdfdoc'
# -- Options for LaTeX output ---------------------------------------------
latex_elements = {
-# The paper size ('letterpaper' or 'a4paper').
-#'papersize': 'letterpaper',
-
-# The font size ('10pt', '11pt' or '12pt').
-#'pointsize': '10pt',
-
-# Additional stuff for the LaTeX preamble.
-#'preamble': '',
-
-# Latex figure (float) alignment
-#'figure_align': 'htbp',
+ # The paper size ('letterpaper' or 'a4paper').
+ #'papersize': 'letterpaper',
+ # The font size ('10pt', '11pt' or '12pt').
+ #'pointsize': '10pt',
+ # Additional stuff for the LaTeX preamble.
+ #'preamble': '',
+ # Latex figure (float) alignment
+ #'figure_align': 'htbp',
}
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [
- (master_doc, 'pikepdf.tex', u'pikepdf Documentation',
- u'James R. Barlow', 'manual'),
+ (master_doc, 'pikepdf.tex', u'pikepdf Documentation', u'James R. Barlow', 'manual')
]
# The name of an image file (relative to this directory) to place at the top of
# the title page.
-#latex_logo = None
+# latex_logo = None
# For "manual" documents, if this is true, then toplevel headings are parts,
# not chapters.
-#latex_use_parts = False
+# latex_use_parts = False
# If true, show page references after internal links.
-#latex_show_pagerefs = False
+# latex_show_pagerefs = False
# If true, show URL addresses after external links.
-#latex_show_urls = False
+# latex_show_urls = False
# Documents to append as an appendix to all manuals.
-#latex_appendices = []
+# latex_appendices = []
# If false, no module index is generated.
-#latex_domain_indices = True
+# latex_domain_indices = True
# -- Options for manual page output ---------------------------------------
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
-man_pages = [
- (master_doc, 'pikepdf', u'pikepdf Documentation',
- [author], 1)
-]
+man_pages = [(master_doc, 'pikepdf', u'pikepdf Documentation', [author], 1)]
# If true, show URL addresses after external links.
-#man_show_urls = False
+# man_show_urls = False
# -- Options for Texinfo output -------------------------------------------
@@ -309,22 +324,28 @@ man_pages = [
# (source start file, target name, title, author,
# dir menu entry, description, category)
texinfo_documents = [
- (master_doc, 'pikepdf', u'pikepdf Documentation',
- author, 'pikepdf', 'Python bindings for QPDF.',
- 'Miscellaneous'),
+ (
+ master_doc,
+ 'pikepdf',
+ u'pikepdf Documentation',
+ author,
+ 'pikepdf',
+ 'Python bindings for QPDF.',
+ 'Miscellaneous',
+ )
]
# Documents to append as an appendix to all manuals.
-#texinfo_appendices = []
+# texinfo_appendices = []
# If false, no module index is generated.
-#texinfo_domain_indices = True
+# texinfo_domain_indices = True
# How to display URL addresses: 'footnote', 'no', or 'inline'.
-#texinfo_show_urls = 'footnote'
+# texinfo_show_urls = 'footnote'
# If true, do not generate a @detailmenu in the "Top" node's menu.
-#texinfo_no_detailmenu = False
+# texinfo_no_detailmenu = False
# Example configuration for intersphinx: refer to the Python standard library.
diff --git a/docs/encoding.rst b/docs/encoding.rst
new file mode 100644
index 0000000..88d2e15
--- /dev/null
+++ b/docs/encoding.rst
@@ -0,0 +1,41 @@
+Character encoding
+******************
+
+In most circumstances, pikepdf performs appropriate encodings and
+decodings on its own, or returns :class:`pikepdf.String` if it is not clear
+whether to present data as a string or binary data.
+
+``str(pikepdf.String)`` is performed by inspecting the binary data. If the
+binary data begins with a UTF-16 byte order mark, then the data is
+interpreted as UTF-16 and returned as a Python ``str``. Otherwise, the
+binary data is interpreted as PDFDocEncoding and decoded to a Python
+``str``. Again, in most cases this is correct
+behavior and will operate transparently.
+
+Some functions are available in circumstances where it is necessary to force
+a particular conversion.
+
+PDFDocEncoding
+==============
+
+The PDF specification defines PDFDocEncoding, a character encoding used only
+in PDFs. It is quite similar to ASCII but not equivalent.
+
+When pikepdf is imported, it automatically registers ``"pdfdoc"`` as a codec
+with the standard library, so that it may be used in string and byte
+conversions.
+
+.. code-block:: python
+
+ "•".encode('pdfdoc') == b'\x81'
+
+Other codecs
+============
+
+Two other codecs are commonly used in PDFs, but they are already part of the
+standard library.
+
+**WinAnsiEncoding** is identical to Windows Code Page 1252, and may be converted
+using the ``"cp1252"`` codec.
+
+**MacRomanEncoding** may be converted using the ``"macroman"`` codec.
diff --git a/docs/images/pike-release.jpg b/docs/images/pike-release.jpg
new file mode 100644
index 0000000..092ab6f
--- /dev/null
+++ b/docs/images/pike-release.jpg
Binary files differ
diff --git a/docs/images/pike-tree.jpg b/docs/images/pike-tree.jpg
new file mode 100644
index 0000000..79aa70c
--- /dev/null
+++ b/docs/images/pike-tree.jpg
Binary files differ
diff --git a/docs/index.rst b/docs/index.rst
index bf31c04..9cad0bb 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -111,16 +111,18 @@ practical examples, particular in ``pdfinfo.py``, ``_weave.py``, and
:caption: Introduction
:name: intro_toc
- changelog
+ release_notes
tutorial
objects
+ page_copying
.. toctree::
:maxdepth: 2
:caption: Reference
:name: reference_toc
- pikepdf
+ api
+ encoding
arch
resources
diff --git a/docs/installation.rst b/docs/installation.rst
index 75e51ed..a8f9a5b 100644
--- a/docs/installation.rst
+++ b/docs/installation.rst
@@ -1,13 +1,23 @@
Installation
============
+.. figure:: images/pike-tree.jpg
+ :scale: 50%
+ :alt: Picture of pike fish impaled on tree branch
+ :align: right
+
+ A pike installation failure.
+
+As a Python package
+-------------------
+
.. |latest| image:: https://img.shields.io/pypi/v/pikepdf.svg
:alt: pikepdf latest released version on PyPI
|latest|
Most users on Linux, macOS or Windows with x64 systems should take advantage of
-the binary wheels.
+the binary wheels to install the Python package.
.. code-block:: bash
@@ -22,29 +32,32 @@ added if anyone uses them.
Binary wheels should work on most systems work on Linux distributions 2007
and newer, macOS 10.11 and newer (for Homebrew), Windows 7 and newer.
-Managed distributions
----------------------
+Use ``pip install --user pikepdf`` to install the package for the current user
+only. Use ``pip install pikepdf`` to install to a virtual environment.
-pikepdf is not yet widely distributed, but a few Linux distributions do make it
-available.
+Managed Linux distributions
+---------------------------
-**Debian**
+**Debian, Ubuntu and other APT distributions**
-.. |deb-experimental| image:: https://repology.org/badge/version-for-repo/debian_experimental/pikepdf.svg
- :alt: Debian experimental
+.. |apt| image:: https://repology.org/badge/vertical-allrepos/pikepdf.svg
+ :alt: Package status in apt world
-|deb-experimental|
+|apt|
.. code-block:: bash
- apt-get -t experimental install pikepdf
+ apt install pikepdf
**Fedora 29**
-.. |fedora| image:: https://repology.org/badge/version-only-for-repo/fedora_29/python:pikepdf.svg
+.. |fedora| image:: https://repology.org/badge/version-for-repo/fedora_29/python:pikepdf.svg
:alt: Fedora 29
-|fedora|
+.. |rawhide| image:: https://repology.org/badge/version-for-repo/fedora_rawhide/python:pikepdf.svg
+ :alt: Fedora Rawhide
+
+|fedora| |rawhide|
.. code-block:: bash
@@ -52,18 +65,34 @@ available.
**ArchLinux**
+.. |aur| image:: https://repology.org/badge/version-for-repo/aur/python:pikepdf.svg
+
+|aur|
+
Available in `ArchLinux User Repository <https://aur.archlinux.org/packages/python-pikepdf/>`_.
.. code-block:: bash
pacman -S pikepdf
+Installing on FreeBSD 11.2
+--------------------------
+
+No FreeBSD ports exist, but all of the dependencies are currently available. You can try
+something like:
+
+.. code-block:: bash
+
+ pkg install python3 lang/python3
+ pkg install py36-lxml qpdf
+ pip install --user pikepdf
+
Building from source
--------------------
**Requirements**
-.. |qpdf-version| replace:: 8.2.1
+.. |qpdf-version| replace:: 8.4.0
pikepdf requires:
@@ -84,7 +113,7 @@ must
(Consider using the binary wheels, which bundle the required version of
libqpdf.)
-**GCC and Clang**
+**Compiling with GCC or Clang**
- clone this repository
- install libjpeg, zlib and libqpdf on your platform, including headers
@@ -116,14 +145,19 @@ Running a regular ``pip install`` command will detect the
version of the compiler used to build Python and attempt to build the
extension with it. We must force the use of Visual Studio 2015.
-- clone this repository
-- ``"%VS140COMNTOOLS%\..\..\VC\vcvarsall.bat" x64``
-- ``set DISTUTILS_USE_SDK=1``
-- ``set MSSdk=1``
-- download |msvc-zip| from the `QPDF releases page <https://github.com/qpdf/qpdf/releases>`_
-- extract ``bin\qpdfXX.dll`` from the zip file above, where XX is the version
- of the ABI, and copy it to the ``src/pikepdf`` folder in the repository
-- run ``pip install .`` in the root directory of the repository
+#. Clone this repository.
+#. In a command prompt, run:
+
+ .. code-block:: bat
+
+ "%VS140COMNTOOLS%\..\..\VC\vcvarsall.bat" x64
+ set DISTUTILS_USE_SDK=1
+ set MSSdk=1
+
+#. Download |msvc-zip| from the `QPDF releases page <https://github.com/qpdf/qpdf/releases>`_.
+#. Extract ``bin\qpdfXX.dll`` from the zip file above, where XX is the version
+ of the ABI, and copy it to the ``src/pikepdf`` folder in the repository.
+#. Run ``pip install .`` in the root directory of the repository.
.. note::
@@ -148,6 +182,8 @@ Building the documentation
Documentation is generated using Sphinx and you are currently reading it. To
regenerate it:
-- ``pip install -r requirements/docs.txt``
-- ``cd pikepdf/docs``
-- ``make html``
+.. code-block:: bash
+
+ pip install -r requirements/docs.txt
+ cd pikepdf/docs
+ make html
diff --git a/docs/objects.rst b/docs/objects.rst
index c08f460..4b62cc6 100644
--- a/docs/objects.rst
+++ b/docs/objects.rst
@@ -1,5 +1,5 @@
-pikepdf Object Model
-********************
+Object model
+************
This section covers the object model pikepdf uses in more detail.
@@ -18,9 +18,9 @@ value in a PDF is assigned to a Python ``float``, pikepdf will convert it to
``Decimal``.
Types that are not directly convertible to Python are represented as
-:class:`pikepdf.Object`, a compound object that offers a superset of methods,
-some work only if the underlying type is suitable. You can use the EAFP
-idiom or ``isinstance`` to determine the type more precisely. This partly
+:class:`pikepdf.Object`, a compound object that offers a superset of possible
+methods, some of which work only if the underlying type is suitable. Use the EAFP
+idiom or ``isinstance`` to determine the type more precisely. This partly
reflects the fact that the PDF specification allows many data fields to be
one of several types.
@@ -67,3 +67,27 @@ the appropriate pikepdf object when passed to pikepdf APIs – when possible.
However, pikepdf sends ``pikepdf.Object`` types back to Python on return calls,
in most cases, because pikepdf needs to keep track of objects that came from
PDFs originally.
+
+
+Object lifecycle and memory management
+======================================
+
+As mentioned above, a :class:`pikepdf.Object` may reference data that is lazily
+loaded from its source :class:`pikepdf.Pdf`. Closing the `Pdf` with
+:meth:`pikepdf.Pdf.close` will invalidate some objects, depending on whether
+or not the data was loaded, and other implementation details that may change.
+Generally speaking, a :class:`pikepdf.Pdf` should be held open until it is no
+longer needed, and objects that were derived from it may or may not be usable
+after it is closed.
+
+Simple objects (booleans, integers, decimals, ``None``) are copied directly
+to Python as pure Python objects.
+
+For PDF stream objects, use :meth:`pikepdf.Object.read_bytes()` to obtain a
+copy of the object as pure bytes data, if this information is required after
+closing a PDF.
+
+When objects are copied from one :class:`pikepdf.Pdf` to another, the
+underlying data is copied immediately into the target. As such it is possible
+to merge hundreds of `Pdf` into one, keeping only a single source and the
+target file open at a time.
diff --git a/docs/page_copying.rst b/docs/page_copying.rst
new file mode 100644
index 0000000..7d0c456
--- /dev/null
+++ b/docs/page_copying.rst
@@ -0,0 +1,37 @@
+.. _page-copying:
+
+Copying and updating pages
+**************************
+
+You can rearrange or duplicate pages within a PDF, with an important caveat:
+
+.. warning::
+
+ ``pdf.pages[0] = pdf.pages[42]`` will create a shallow copy of
+ ``pdf.pages[42]``, unlike the usual behavior in Python.
+
+Assigning one page to another within the same PDF will create a shallow copy of
+the source page. This does differ from the usual Python semantics, where
+assigning a list element to another element in the same list would merely create
+two references to an identical object. (Normally after setting ``list[0] =
+list[1]``, ``list[0] is list[1]``.) We break this convention with the shallow
+copy, and only guarantee ``page[0] == page[1]``.
+
+There is one important reason we have to do it this way: suppose that there
+was a table of contents entry that points to ``pdf.pages[42]``. After we set
+``pages[0]`` to be the same, where should the table of contents entry point?
+We leave it pointed at ``pdf.pages[42]``.
+
+What if there was a table of contents entry that referenced ``pages[0]``?
+(In PDFs, the table of contents references a page object, not a page number.)
+Is that entry still valid after reassignment? As the library, we don't know.
+As the application developer, you have to decide. (pikepdf does not currently
+have support code for managing table of contents objects, but you can
+manipulate them.)
+
+Updating a page in place
+========================
+
+Use :meth:`pikepdf.Object.emplace` to emplace one PDF page over top of another
+while preserving all references to the original page. ``emplace()`` sets all
+of the keys and values of the pages to be equal.
diff --git a/docs/release_notes.rst b/docs/release_notes.rst
new file mode 100644
index 0000000..5b9f461
--- /dev/null
+++ b/docs/release_notes.rst
@@ -0,0 +1,436 @@
+.. _changelog:
+
+Release notes
+#############
+
+.. figure:: images/pike-release.jpg
+ :figwidth: 30%
+ :alt: pike fish being released to water
+ :align: right
+
+ Releasing a pike.
+
+pikepdf releases use the `semantic versioning <http://semver.org>`__
+policy.
+
+The pikepdf API (as provided by ``import pikepdf``) is quite stable and
+is in production use.
+
+Note that the C++ extension module ``pikepdf._qpdf`` is a private
+interface within pikepdf that applications should not use directly.
+
+v1.5.0
+======
+
+- Improved interpretation of images within PDFs that use an ICC colorspace.
+ Where possible we embed the ICC profile when extracting the image, and
+ provide access to the ICC profile.
+- Fixed saving PDFs with their existing encryption.
+- Fixed documentation to reflect the fact that saving a PDF without
+ specifying encryption settings will remove encryption.
+- Added a test to prevent overwriting the input PDF since overwriting
+ corrupts lazy loading.
+- ``Object.write(filters=, decode_parms=)`` now detects invalid parameters
+ instead of writing invalid values to ``Filters`` and ``DecodeParms``.
+- We can now extract some images that had stacked compression, provided it
+ is ``/FlateDecode``.
+- Add convenience function ``Object.wrap_in_array()``.
+
+v1.4.0
+======
+
+- Added support for saving encrypted PDFs. (Reading them has been supported
+ for a long time.)
+- Added support for setting the PDF extension level as well as version.
+- Added support for converting strings to and from PDFDocEncoding, by
+ registering a ``"pdfdoc"`` codec.
+
+v1.3.1
+======
+
+- Updated pybind11 to v2.3.0, fixing a possible GIL deadlock when
+ pikepdf objects were shared across threads. (#27)
+- Fixed an issue where PDFs with valid XMP metadata but missing an
+ element that is usually present would be rejected as malformed XMP.
+
+v1.3.0
+======
+
+- Remove dependency on ``defusedxml.lxml``. Unfortunately this module
+ of ``defusedxml`` is deprecated because the issues it worked around
+ have been resolved in the underlying ``lxml`` library. In the absence
+ of other options for XML hardening we have reverted to standard
+ ``lxml``.
+- Fixed an issue where ``PdfImage.extract_to()`` would write a file in
+ the wrong directory.
+- Eliminated an intermediate buffer that was used when saving to an IO
+ stream (as opposed to a filename). We would previously write the
+ entire output to a memory buffer and then write to the output buffer;
+ we now write directly to the stream.
+- Added ``Object.emplace()`` as a workaround for when one wants to
+ update a page without generating a new page object so that
+ links/table of contents entries to the original page are preserved.
+- Improved documentation. Eliminated all ``arg0`` placeholder variable
+ names.
+- Added ``PageList.remove(p=1)``, so that it is possible to remove
+ pages using counting numbers.
+
+v1.2.0
+======
+
+- Implemented ``Pdf.close()`` and ``with``-block context manager, to
+ allow Pdf objects to be closed without relying on ``del``.
+- ``PdfImage.extract_to()`` has a new keyword argument ``fileprefix=``,
+ which specifies a filepath where an image should be extracted with
+ pikepdf setting the appropriate file suffix. This simplifies the API
+ for the most common case of extracting images to files.
+- Fixed an internal test that should have suppressed the extraction of
+ JPEGs with a nonstandard ColorTransform parameter set. Without the
+ proper color transform applied, the extracted JPEGs will typically
+ look very pink. Now, these images should fail to extract as was
+ intended.
+- Fixed that ``Pdf.save(object_stream_mode=...)`` was ignored if the
+ default ``fix_metadata_version=True`` was also set.
+- Data from one ``Pdf`` is now copied to other ``Pdf`` objects
+ immediately, instead of creating a reference that required source
+ PDFs to remain available. ``Pdf`` objects no longer reference each
+ other.
+- libqpdf 8.4.0 is now required
+- Various documentation improvements
+
+v1.1.0
+======
+
+- Added workaround for macOS/clang build problem of the wrong exception
+ type being thrown in some cases.
+- Improved translation of certain system errors to their Python
+ equivalents.
+- Fixed issues resulting from platform differences in
+ ``datetime.strftime``. (#25)
+- Added ``Pdf.new``, ``Pdf.add_blank_page`` and ``Pdf.make_stream``
+ convenience methods for creating new PDFs from scratch.
+- Added binding for new QPDF JSON feature: ``Object.to_json``.
+- We now automatically update the XMP PDFVersion metadata field to be
+ consistent with the PDF's declared version, if the field is present.
+- Made our Python-augmented C++ classes easier for Python code
+ inspectors to understand.
+- Eliminated use of the ``imghdr`` library.
+- Autoformatted Python code with black.
+- Fixed handling of XMP metadata that omits the standard
+ ``<x:xmpmeta>`` wrapper.
+
+v1.0.5
+======
+
+- Fixed an issue where an invalid date in XMP metadata would cause an
+ exception when updating DocumentInfo. For now, we warn that some
+ DocumentInfo is not convertible. (In the future, we should also check
+ if the XMP date is valid, because it probably is not.)
+- Rebuilt the binary wheels with libqpdf 8.3.0. libqpdf 8.2.1 is still
+ supported.
+
+v1.0.4
+======
+
+- Updates to tests/resources (provenance of one test file, replaced
+ another test file with a synthetic one)
+
+v1.0.3
+======
+
+- Fixed regression on negative indexing of pages.
+
+v1.0.2
+======
+
+- Fixed an issue where invalid values such as out of range years (e.g.
+ 0) in DocumentInfo would raise exceptions when using DocumentInfo to
+ populate XMP metadata with ``.load_from_docinfo``.
+
+v1.0.1
+======
+
+- Fixed an exception with handling metadata that contains the invalid
+ XML entity ``&#0;`` (an escaped NUL)
+
+v1.0.0
+======
+
+- Changed version to 1.0.
+
+v0.10.2
+=======
+
+Fixes
+-----
+
+- Fixed segfault when overwriting the pikepdf file that is currently
+ open on Linux.
+- Fixed removal of an attribute metadata value when values were present
+ on the same node.
+
+v0.10.1
+=======
+
+.. _fixes-1:
+
+Fixes
+-----
+
+- Avoid canonical XML since it is apparently too strict for XMP.
+
+v0.10.0
+=======
+
+.. _fixes-2:
+
+Fixes
+-----
+
+- Fixed several issues related to generating XMP metadata that passed
+ veraPDF validation.
+- Fixed a random test suite failure for very large negative integers.
+- The lxml library is now required.
+
+v0.9.2
+======
+
+.. _fixes-3:
+
+Fixes
+-----
+
+- Added all of the commonly used XML namespaces to XMP metadata
+ handling, so we are less likely to name something 'ns1', etc.
+- Skip a test that fails on Windows.
+- Fixed build errors in documentation.
+
+v0.9.1
+======
+
+.. _fixes-4:
+
+Fixes
+-----
+
+- Fix ``Object.write()`` accepting positional arguments it wouldn't use
+- Fix handling of XMP data with timezones (or missing timezone
+ information) in a few cases
+- Fix generation of XMP with invalid XML characters if the invalid
+ characters were inside a non-scalar object
+
+v0.9.0
+======
+
+Updates
+-------
+
+- New API to access and edit PDF metadata and make consistent edits to
+ the new and old style of PDF metadata.
+- 32-bit binary wheels are now available for Windows
+- PDFs can now be saved in QPDF's "qdf" mode
+- The Python package defusedxml is now required
+- The Python package python-xmp-toolkit and its dependency libexempi
+ are suggested for testing, but not required
+
+.. _fixes-5:
+
+Fixes
+-----
+
+- Fixed handling of filenames that contain multibyte characters on
+ non-UTF-8 systems
+
+Breaking
+--------
+
+- The ``Pdf.metadata`` property was removed, and replaced with the new
+ metadata API
+- ``Pdf.attach()`` has been removed, because the interface as
+ implemented had no way to deal with existing attachments.
+
+v0.3.7
+======
+
+- Add API for inline images to unparse themselves
+
+v0.3.6
+======
+
+- Performance of reading files from memory improved to avoid
+ unnecessary copies.
+- It is finally possible to use ``for key in pdfobj`` to iterate
+ contents of PDF Dictionary, Stream and Array objects. Generally these
+ objects behave more like Python containers should now.
+- Package API declared beta.
+
+v0.3.5
+======
+
+.. _breaking-1:
+
+Breaking
+--------
+
+- ``Pdf.save(...stream_data_mode=...)`` has been dropped in favor of
+ the newer ``compress_streams=`` and ``stream_decode_level``
+ parameters.
+
+.. _fixes-6:
+
+Fixes
+-----
+
+- A use-after-free memory error that caused occasional segfaults and
+ "QPDFFakeName" errors when opening from stream objects has been
+ resolved.
+
+v0.3.4
+======
+
+.. _updates-1:
+
+Updates
+-------
+
+- pybind11 vendoring has ended now that v2.2.4 has been released
+
+v0.3.3
+======
+
+.. _breaking-2:
+
+Breaking
+--------
+
+- libqpdf 8.2.1 is now required
+
+.. _updates-2:
+
+Updates
+-------
+
+- Improved support for working with JPEG2000 images in PDFs
+- Added progress callback for saving files,
+ ``Pdf.save(..., progress=)``
+- Updated pybind11 subtree
+
+.. _fixes-7:
+
+Fixes
+-----
+
+- ``del obj.AttributeName`` was not implemented. The attribute
+ interface is now consistent
+- Deleting named attributes now defers to the attribute dictionary for
+ Stream objects, as get/set do
+- Fixed handling of JPEG2000 images where metadata must be retrieved
+ from the file
+
+v0.3.2
+======
+
+.. _updates-3:
+
+Updates
+-------
+
+- Added support for direct image extraction of CMYK and grayscale
+ JPEGs, where previously only RGB (internally YUV) was supported
+- ``Array()`` now creates an empty array properly
+- The syntax ``Name.Foo in Dictionary()``, e.g.
+ ``Name.XObject in page.Resources``, now works
+
+v0.3.1
+======
+
+.. _breaking-3:
+
+Breaking
+--------
+
+- ``pikepdf.open`` now validates its keyword arguments properly,
+ potentially breaking code that passed invalid arguments
+- libqpdf 8.1.0 is now required - libqpdf 8.1.0 API is now used for
+ creating Unicode strings
+- If a non-existent file is opened with ``pikepdf.open``, a
+ ``FileNotFoundError`` is raised instead of a generic error
+- We are now *temporarily* vendoring a copy of pybind11 since its
+ master branch contains unreleased and important fixes for Python 3.7.
+
+.. _updates-4:
+
+Updates
+-------
+
+- The syntax ``Name.Thing`` (e.g. ``Name.DecodeParms``) is now
+ supported as equivalent to ``Name('/Thing')`` and is the recommended
+ way to refer to names within a PDF
+- New API ``Pdf.remove_unneeded_resources()`` which removes objects
+ from each page's resource dictionary that are not used in the page.
+ This can be used to create smaller files.
+
+.. _fixes-8:
+
+Fixes
+-----
+
+- Fixed an error parsing inline images that have masks
+- Fixed several instances of catching C++ exceptions by value instead
+ of by reference
+
+v0.3.0
+======
+
+.. _breaking-4:
+
+Breaking
+--------
+
+- Modified ``Object.write`` method signature to require ``filter`` and
+ ``decode_parms`` as keyword arguments
+- Implement automatic type conversion from the PDF Null type to
+ ``None``
+- Removed ``Object.unparse_resolved`` in favor of
+ ``Object.unparse(resolved=True)``
+- libqpdf 8.0.2 is now required at minimum
+
+.. _updates-5:
+
+Updates
+-------
+
+- Improved IPython/Jupyter interface to directly export temporary PDFs
+- Updated to qpdf 8.1.0 in wheels
+- Added Python 3.7 support for Windows
+- Added a number of missing options from QPDF to ``Pdf.open`` and
+ ``Pdf.save``
+- Added ability to delete a slice of pages
+- Began using Jupyter notebooks for documentation
+
+v0.2.2
+======
+
+- Added Python 3.7 support to build and test (not yet available for
+ Windows, due to lack of availability on Appveyor)
+- Removed setter API from ``PdfImage`` because it never worked anyway
+- Improved handling of ``PdfImage`` with trivial palettes
+
+v0.2.1
+======
+
+- ``Object.check_owner`` renamed to ``Object.is_owned_by``
+- ``Object.objgen`` and ``Object.get_object_id`` are now public
+ functions
+- Major internal reorganization with ``pikepdf.models`` becoming the
+ submodule that holds support code to ease access to PDF objects as
+ opposed to wrapping QPDF.
+
+v0.2.0
+======
+
+- Implemented automatic type conversion for ``int``, ``bool`` and
+ ``Decimal``, eliminating the ``pikepdf.{Integer,Boolean,Real}``
+ types. Removed a lot of associated numerical code.
+
+Everything before v0.2.0 can be considered too old to document.
diff --git a/docs/resources.rst b/docs/resources.rst
index 0bcc3c7..03d5cf9 100644
--- a/docs/resources.rst
+++ b/docs/resources.rst
@@ -5,10 +5,14 @@ Resources
* `PDF 1.7`_ ISO Specification PDF 32000-1:2008
-* `Adobe extensions`_ to the PDF specification
+* `Adobe Supplement to ISO 32000 BaseVersion 1.7 ExtensionLevel 3`_, Adobe Acrobat 9.0, June 2008, for AESv3
+
+* Other `Adobe extensions`_ to the PDF specification
.. _QPDF Manual: http://qpdf.sourceforge.net/files/qpdf-manual.html
.. _PDF 1.7: https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf
.. _Adobe extensions: https://www.adobe.com/devnet/pdf/pdf_reference.html
+
+.. _Adobe Supplement to ISO 32000 BaseVersion 1.7 ExtensionLevel 3: https://www.adobe.com/content/dam/acom/en/devnet/pdf/adobe_supplement_iso32000.pdf
diff --git a/docs/tutorial.rst b/docs/tutorial.rst
index 98a721d..17ae49b 100644
--- a/docs/tutorial.rst
+++ b/docs/tutorial.rst
@@ -1,6 +1,10 @@
Tutorial
********
+.. figure:: images/pike-cartoon.png
+ :figwidth: 30%
+ :align: right
+
**Opening and saving**
In contrast to better known PDF libraries, pikepdf uses a single object to
@@ -11,8 +15,8 @@ this :class:`pikepdf.Pdf`.
from pikepdf import Pdf
new_pdf = Pdf.new()
- sample_pdf = Pdf.open('sample.pdf')
- sample_pdf.save('sample2.pdf')
+ with Pdf.open('sample.pdf') as pdf:
+ pdf.save('output.pdf')
You may of course use ``from pikepdf import Pdf as ...`` if the short class
name conflicts or ``from pikepdf import Pdf as PDF`` if you prefer uppercase.
@@ -30,8 +34,8 @@ and ``Pdf.save()`` accepts streams as output.
This tutorial begins on the assumption that working with pages - splitting
and merging, saving and loading, is the most basic thing users want to do.
(The ``qpdf`` commandline tool, on which pikepdf is based, also does an
-excellent job of file level PDF handling.) What pikepdf does is make qpdf's
-powerful API more accessible.
+excellent job of file level PDF handling.) pikepdf makes qpdf's powerful API
+more accessible.
.. toctree::
:maxdepth: 1
diff --git a/docs/tutorial/page.rst b/docs/tutorial/page.rst
index 73a2dc4..06805da 100644
--- a/docs/tutorial/page.rst
+++ b/docs/tutorial/page.rst
@@ -66,14 +66,14 @@ Returning to the page's output:
In [1]: page1
-The angle brackets in the output indicate that this object cannot be
-constructed with a Python expression because it contains a reference. When
-angle brackets are omitted from the ``repr()`` of a pikepdf object, then the
-object can be replicated with a Python expression, such as
-``eval(repr(x)) == x``.
-
-In Jupyter and IPython, pikepdf will instead attempt to display a preview of
-the PDF page. An explicit ``repr(page)`` will show the text representation.
+The angle brackets in the output indicate that this object cannot be constructed with a
+Python expression because it contains a reference. When angle brackets are omitted from
+the ``repr()`` of a pikepdf object, then the object can be replicated with a Python
+expression, such as ``eval(repr(x)) == x``. Pages typically contain indirect references
+to themselves and other pages, so they cannot be represented as an expression.
+
+In Jupyter and IPython, pikepdf will instead attempt to display a preview of the PDF
+page, assuming a PDF rendering backend is available.
This page's MediaBox is a direct object. The MediaBox describes
the size of the page in PDF coordinates (1/72 inch multiplied by the value of
diff --git a/docs/tutorial/pages.rst b/docs/tutorial/pages.rst
index b36cdb5..9a3dea5 100644
--- a/docs/tutorial/pages.rst
+++ b/docs/tutorial/pages.rst
@@ -21,17 +21,15 @@ How many pages?
In [2]: len(pdf.pages)
-Thanks to IPython’s rich Python object representations you can view the PDF
-while you work on it if you execute this example in a Jupyter notebook. Click
-the *View PDF* link below to view the file. **You can view the PDF after each
-change you make.** If you’re reading this documentation online or as part of
-distribution, you won’t see the rich representation.
+pikepdf integrates with IPython and Jupyter's rich object APIs so that you can
+view PDFs, PDF pages, or images within a PDF in an IPython window or Jupyter
+notebook. This makes it easier to test visual changes.
.. ipython::
:verbatim:
In [1]: pdf
- Out[1]: View PDF
+ Out[1]: « In Jupyter you would see the PDF here »
You can also examine individual pages, which we’ll explore in the next
section. Suffice to say that you can access pages by indexing them and
@@ -56,7 +54,7 @@ document scanners.
In [1]: pdf
-Pretty nice, isn’t it? Of course, the pages in this file are in correct
+Pretty nice, isn’t it? But the pages in this file already were in correct
order, so let’s put them back.
.. ipython::
@@ -113,6 +111,21 @@ We can also replace specific pages with assignment (or slicing).
In [1]: pdf.pages[2] = congress.pages[0]
+.. note::
+
+ Some interactive PDF features such as hyperlinks internal to the document
+ may stop working when a page is copied from one file to another.
+
+
+Copying pages within a PDF
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+When a page is copied (assigned) to a different position within the same PDF,
+the copy is constructed as a new page rather than a reference to the existing
+one. This is different from standard Python behavior.
+
+For a detailed explanation and workarounds, see :ref:`page-copying`.
+
Saving changes
~~~~~~~~~~~~~~
@@ -128,6 +141,23 @@ is commented out to avoid upsetting the documentation generator.)
You may save a file multiple times, and you may continue modifying it after
saving.
+Saving with encryption
+~~~~~~~~~~~~~~~~~~~~~~
+
+To save an encrypted (password protected) PDF, use a :class:`pikepdf.Encryption`
+object to specify the encryption settings. By default, pikepdf selects the strongest
+security handler and algorithm, but allows full access to modify file contents.
+A :class:`pikepdf.Permissions` object can be used to specify restrictions.
+
+.. ipython::
+ :verbatim:
+
+ In [1]: no_extracting = pikepdf.Permissions(extract=False)
+
+ In [1]: pdf.save('output.pdf', encryption=pikepdf.Encryption(
+ ...: user="user password", owner="owner password", allow=no_extracting
+ ...: ))
+
.. _splitpdf:
Split a PDF one page PDFs
@@ -190,6 +220,8 @@ numbers:
In [1]: pdf.pages[0] # Also the first page in the document
+ In [1]: pdf.pages.remove(p=1) # Remove first page in the document
+
To avoid confusion, the ``.p()`` accessor does not accept Python slices,
and ``.p(0)`` raises an exception. It is also not possible to delete using it.
diff --git a/docs/tutorial/streams.rst b/docs/tutorial/streams.rst
index 830578d..405f9f8 100644
--- a/docs/tutorial/streams.rst
+++ b/docs/tutorial/streams.rst
@@ -87,3 +87,21 @@ pikepdf provides a C++ optimized content stream parser.
>>> page = pdf.pages[0]
>>> for operands, command in parse_content_stream(page):
>>> print(command)
+
+Extracting text from PDFs
+-------------------------
+
+If you guessed that the content streams were the place to look for text inside a PDF
+– you'd be correct. Unfortunately, extracting the text is fairly difficult because the
+content stream actually specifies a font and glyph numbers to use. Sometimes, there
+is a 1:1 transparent mapping between Unicode numbers and glyph numbers, and a dump of the
+content stream will show the text. In general, you cannot rely on there being a
+transparent mapping; in fact, it is perfectly legal for a font to specify no Unicode
+mapping at all, or to use an unconventional mapping (when a PDF contains a subsetted
+font for example).
+
+**We strongly recommend against trying to scrape text from the content stream.**
+
+pikepdf does not currently implement text extraction. We recommend `pdfminer.six <https://github.com/pdfminer/pdfminer.six>`_, a
+read-only text extraction tool. If you wish to write PDFs containing text, consider
+`reportlab <https://www.reportlab.com/opensource/>`_.
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..653b6f6
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,32 @@
+[build-system]
+requires = [
+ "setuptools >= 30.3.0",
+ "wheel",
+ "setuptools_scm",
+ "setuptools_scm_git_archive",
+ "pybind11 >= 2.2.4, < 3"
+]
+build-backend = "setuptools.build_meta"
+
+[tool.black]
+line-length = 88
+target-version = ["py35"]
+skip-string-normalization = true
+include = '\.pyi?$'
+exclude = '''
+/(
+ \.eggs
+ | \.git
+ | \.hg
+ | \.mypy_cache
+ | \.tox
+ | \.venv
+ | _build
+ | buck-out
+ | build
+ | dist
+ | docs
+ | misc
+ | \.egg-info
+)/
+'''
diff --git a/requirements/docs.txt b/requirements/docs.txt
index e683077..8a7df44 100644
--- a/requirements/docs.txt
+++ b/requirements/docs.txt
@@ -1,4 +1,3 @@
-defusedxml
ipython
matplotlib
pybind11 # not strictly necessary if pybind11 is vendored
diff --git a/requirements/test.txt b/requirements/test.txt
index e798212..1844781 100644
--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -1,7 +1,8 @@
attrs >= 17.4.0
-hypothesis >= 3.56.9
+hypothesis >= 4.16, < 5
Pillow >= 5.0.0
-pytest >= 3.6.0, < 4.1.0
-pytest-xdist >= 1.22.2
-pytest-helpers-namespace >= 2017.11.11
-pytest-timeout >= 1.3.0
+pytest >= 4.4.0, < 5
+pytest-xdist >= 1.28, < 2
+pytest-helpers-namespace >= 2019.1.8
+pytest-timeout >= 1.3.3
+python-xmp-toolkit >= 2.0.1 ; sys_platform != "win32"
diff --git a/setup.cfg b/setup.cfg
index 50244a4..c0e2838 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -24,8 +24,12 @@ exclude_lines =
if 0:
if __name__ == .__main__.:
-omit =
- src/pikepdf/_boneyard.py
-
[coverage:html]
directory = coverage/pycov
+
+[isort]
+multi_line_output=3
+include_trailing_comma=True
+force_grid_wrap=0
+use_parentheses=True
+line_length=88
diff --git a/setup.py b/setup.py
index 3793c56..c1f62b2 100644
--- a/setup.py
+++ b/setup.py
@@ -21,9 +21,15 @@ class get_pybind_include(object):
if exists('src/vendor/pybind11'):
return 'src/vendor/pybind11/include'
import pybind11
+
return pybind11.get_include(self.user)
+extra_includes = []
+if 'bsd' in sys.platform:
+ extra_includes = ['/usr/local/include']
+
+
ext_modules = [
Extension(
'pikepdf._qpdf',
@@ -32,11 +38,12 @@ ext_modules = [
include_dirs=[
# Path to pybind11 headers
get_pybind_include(),
- get_pybind_include(user=True)
+ get_pybind_include(user=True),
+ *extra_includes,
],
libraries=['qpdf'],
- language='c++'
- ),
+ language='c++',
+ )
]
@@ -47,6 +54,7 @@ def has_flag(compiler, flagname):
the specified compiler.
"""
import tempfile
+
with tempfile.NamedTemporaryFile('w', suffix='.cpp') as tmpf:
tmpf.write('int main (int argc, char **argv) { return 0; }')
try:
@@ -66,16 +74,15 @@ def cpp_flag(compiler):
elif has_flag(compiler, '-std=c++11'):
return '-std=c++11'
else:
- raise RuntimeError('Unsupported compiler -- at least C++11 support '
- 'is needed!')
+ raise RuntimeError(
+ 'Unsupported compiler -- at least C++11 support ' 'is needed!'
+ )
class BuildExt(build_ext):
"""A custom build extension for adding compiler-specific options."""
- c_opts = {
- 'msvc': ['/EHsc'],
- 'unix': [],
- }
+
+ c_opts = {'msvc': ['/EHsc'], 'unix': []}
if sys.platform == 'darwin':
c_opts['unix'] += ['-stdlib=libc++', '-mmacosx-version-min=10.7']
@@ -86,7 +93,10 @@ class BuildExt(build_ext):
if ct == 'unix':
opts.append('-DVERSION_INFO="%s"' % self.distribution.get_version())
opts.append(cpp_flag(self.compiler))
- if has_flag(self.compiler, '-fvisibility=hidden'):
+ if (
+ has_flag(self.compiler, '-fvisibility=hidden')
+ and sys.platform != 'darwin'
+ ):
opts.append('-fvisibility=hidden')
elif ct == 'msvc':
opts.append('/DVERSION_INFO=\\"%s\\"' % self.distribution.get_version())
@@ -94,19 +104,18 @@ class BuildExt(build_ext):
ext.extra_compile_args = opts
build_ext.build_extensions(self)
+
setup_py_cwd = dirname(__file__)
with open(join(setup_py_cwd, 'requirements/docs.txt')) as f:
docs_require = [
- line.strip() for line in f
- if line.strip() and not line.strip().startswith('#')
+ line.strip() for line in f if line.strip() and not line.strip().startswith('#')
]
with open(join(setup_py_cwd, 'requirements/test.txt')) as f:
tests_require = [
- line.strip() for line in f
- if line.strip() and not line.strip().startswith('#')
+ line.strip() for line in f if line.strip() and not line.strip().startswith('#')
]
with open(join(setup_py_cwd, 'README.md'), encoding='utf-8') as f:
@@ -121,29 +130,20 @@ setup(
long_description=readme,
long_description_content_type='text/markdown',
ext_modules=ext_modules,
- install_requires=[
- 'defusedxml >= 0.5.0',
- 'lxml >= 4.0',
- ],
- extras_require={
- 'docs': docs_require
- },
+ install_requires=['lxml >= 4.0'],
+ extras_require={'docs': docs_require},
cmdclass={'build_ext': BuildExt},
zip_safe=False,
python_requires='>=3.5',
setup_requires=[
- 'pytest-runner',
'setuptools_scm',
- 'pybind11 >= 2.2.4, < 3'
+ 'setuptools_scm_git_archive',
],
use_scm_version=True,
tests_require=tests_require,
package_dir={'': 'src'},
packages=setuptools.find_packages('src'),
- package_data={
- '': ['*.txt'],
- 'pikepdf': ['qpdf21.dll']
- },
+ package_data={'': ['*.txt'], 'pikepdf': ['qpdf21.dll']},
classifiers=[
"Development Status :: 4 - Beta",
"Intended Audience :: Developers",
@@ -160,6 +160,6 @@ setup(
project_urls={
'Documentation': 'https://pikepdf.readthedocs.io/',
'Source': 'https://github.com/pikepdf/pikepdf',
- 'Tracker': 'https://github.com/pikepdf/pikepdf/issues'
- }
+ 'Tracker': 'https://github.com/pikepdf/pikepdf/issues',
+ },
)
diff --git a/src/pikepdf/__init__.py b/src/pikepdf/__init__.py
index 2e42605..8de467a 100644
--- a/src/pikepdf/__init__.py
+++ b/src/pikepdf/__init__.py
@@ -4,38 +4,46 @@
#
# Copyright (C) 2017, James R. Barlow (https://github.com/jbarlow83/)
-
-from pkg_resources import (
- get_distribution as _get_distribution,
- DistributionNotFound
-)
+"""A library for manipulating PDFs"""
try:
from . import _qpdf
except ImportError:
raise ImportError("pikepdf's extension library failed to import")
-from ._qpdf import (
- PdfError, Pdf, PasswordError, ObjectStreamMode, StreamDecodeLevel
-)
+from ._version import __version__
+from ._qpdf import PdfError, Pdf, PasswordError, ObjectStreamMode, StreamDecodeLevel
from .objects import (
- Object, ObjectType, Name, String, Array, Dictionary, Stream, Operator
+ Object,
+ ObjectType,
+ Name,
+ String,
+ Array,
+ Dictionary,
+ Stream,
+ Operator,
)
from .models import (
- PdfImage, PdfInlineImage, UnsupportedImageTypeError, PdfMatrix,
- parse_content_stream
+ PdfImage,
+ PdfInlineImage,
+ UnsupportedImageTypeError,
+ PdfMatrix,
+ Encryption,
+ Permissions,
+ parse_content_stream,
)
from . import _methods
-
-try:
- __version__ = _get_distribution(__name__).version
-except DistributionNotFound:
- __version__ = "Not installed"
+from . import codec
__libqpdf_version__ = _qpdf.qpdf_version()
def open(*args, **kwargs): # pylint: disable=redefined-builtin
- "Alias for :func:`pikepdf.Pdf.open`."
+ """Alias for :func:`pikepdf.Pdf.open`. Open a PDF."""
return Pdf.open(*args, **kwargs)
+
+
+def new(*args, **kwargs):
+ """Alias for :func:`pikepdf.Pdf.new`. Create a new empty PDF."""
+ return Pdf.new(*args, **kwargs)
diff --git a/src/pikepdf/_cpphelpers.py b/src/pikepdf/_cpphelpers.py
index d975657..7ef0654 100644
--- a/src/pikepdf/_cpphelpers.py
+++ b/src/pikepdf/_cpphelpers.py
@@ -12,12 +12,13 @@ called from Python, and subject to change at any time.
import os
import sys
-
# Provide os.fspath equivalent for Python <3.6
if sys.version_info[0:2] <= (3, 5): # pragma: no cover
+
def fspath(path):
'''https://www.python.org/dev/peps/pep-0519/#os'''
import pathlib
+
if isinstance(path, (str, bytes)):
return path
@@ -36,12 +37,23 @@ if sys.version_info[0:2] <= (3, 5): # pragma: no cover
if isinstance(path, (str, bytes)):
return path
else:
- raise TypeError("expected __fspath__() to return str or bytes, "
- "not " + type(path).__name__)
+ raise TypeError(
+ "expected __fspath__() to return str or bytes, "
+ "not " + type(path).__name__
+ )
raise TypeError(
"expected str, bytes, pathlib.Path or os.PathLike object, not "
- + path_type.__name__)
+ + path_type.__name__
+ )
+
else:
fspath = os.fspath
+
+
+def update_xmp_pdfversion(pdf, version):
+
+ with pdf.open_metadata(set_pikepdf_as_editor=False, update_docinfo=False) as meta:
+ if 'pdf:PDFVersion' in meta:
+ meta['pdf:PDFVersion'] = version
diff --git a/src/pikepdf/_methods.py b/src/pikepdf/_methods.py
index 6c2b90b..ac6134c 100644
--- a/src/pikepdf/_methods.py
+++ b/src/pikepdf/_methods.py
@@ -12,22 +12,21 @@ bindings after the fact.
We can also move the implementation to C++ if desired.
"""
-from tempfile import NamedTemporaryFile
-from subprocess import run, PIPE
-from io import BytesIO
-
-from collections.abc import KeysView
-
import inspect
+from collections import namedtuple
+from collections.abc import KeysView
+from io import BytesIO
+from subprocess import PIPE, run
+from tempfile import NamedTemporaryFile
-from . import Pdf, Dictionary, Array, Name, Stream, Object
+from . import Array, Dictionary, Name, Object, Pdf, Stream
from ._qpdf import _ObjectMapping
-from .models import PdfMetadata
-
+from .models import PdfMetadata, Permissions, EncryptionInfo
# pylint: disable=no-member,unsupported-membership-test,unsubscriptable-object
-def extends(cls_cpp):
+
+def augments(cls_cpp):
"""Attach methods of a Python support class to an existing class
This monkeypatches all methods defined in the support class onto an
@@ -35,37 +34,44 @@ def extends(cls_cpp):
.. code-block:: python
- @extends(ClassDefinedInCpp)
+ @augments(ClassDefinedInCpp)
class SupportClass:
def foo(self):
pass
- The method 'foo' will be monkeypatched on ClassDefinedInCpp. SupportClass
+ The Python method 'foo' will be monkeypatched on ClassDefinedInCpp. SupportClass
has no meaning on its own and should not be used, but gets returned from
this function so IDE code inspection doesn't get too confused.
We don't subclass because it's much more convenient to monkeypatch Python
methods onto the existing Python binding of the C++ class. For one thing,
this allows the implementation to be moved from Python to C++ or vice
- versa. It saves having to implement an intermediate subclass and then
- ensures that the superclass never 'leaks' to pikepdf users.
+ versa. It saves having to implement an intermediate Python subclass and then
+ ensures that the C++ superclass never 'leaks' to pikepdf users. Finally,
+ wrapper classes and subclasses can become problematic if the call stack
+ crosses the C++/Python boundary multiple times.
Any existing methods may be used, regardless of whether they defined
elsewhere in the support class or in the target class.
+
+ The target class does not have to be C++ or derived from pybind11.
"""
- def real_class_extend(cls, cls_cpp=cls_cpp):
+ def class_augment(cls, cls_cpp=cls_cpp):
for name, fn in inspect.getmembers(cls, inspect.isfunction):
- fn.__qualname__ = fn.__qualname__.replace(
- cls.__name__, cls_cpp.__name__)
+ fn.__qualname__ = fn.__qualname__.replace(cls.__name__, cls_cpp.__name__)
setattr(cls_cpp, name, fn)
for name, fn in inspect.getmembers(cls, inspect.isdatadescriptor):
setattr(cls_cpp, name, fn)
+
def block_init(self):
+ # Prevent initialization of the support class
raise NotImplementedError(self.__class__.__name__ + '.__init__')
+
cls.__init__ = block_init
return cls
- return real_class_extend
+
+ return class_augment
def _single_page_pdf(page):
@@ -86,17 +92,15 @@ def _mudraw(buffer, fmt):
tmp_in.flush()
proc = run(
- ['mudraw', '-F', fmt, '-o', '-', tmp_in.name],
- stdout=PIPE, stderr=PIPE
+ ['mudraw', '-F', fmt, '-o', '-', tmp_in.name], stdout=PIPE, stderr=PIPE
)
if proc.stderr:
raise RuntimeError(proc.stderr.decode())
return proc.stdout
-@extends(Object)
+@augments(Object)
class Extend_Object:
-
def _repr_mimebundle_(self, **kwargs):
"""Present options to IPython for rich display of this object
@@ -127,13 +131,116 @@ class Extend_Object:
pass
return data
+ def emplace(self, other):
+ """Copy all items from other without making a new object.
-@extends(Pdf)
-class Extend_Pdf:
+ Particularly when working with pages, it may be desirable to remove all
+ of the existing page's contents and emplace (insert) a new page on top
+ of it, in a way that preserves all links and references to the original
+ page. (Or similarly, for other Dictionary objects in a PDF.)
- def _repr_mimebundle_(self, **kwargs):
+ When a page is assigned (``pdf.pages[0] = new_page``), only the
+ application knows if references to the original page are
+ still valid. For example, a PDF optimizer might restructure a page
+ object into another visually similar one, and references would be valid;
+ but for a program that reorganizes page contents such as an N-up
+ compositor, references may not be valid anymore.
+
+ This method takes precautions to ensure that child objects in common
+ with ``self`` and ``other`` are not inadvertently deleted.
+
+ Example:
+ >>> pdf.pages[0].objgen
+ (16, 0)
+ >>> pdf.pages[0].emplace(pdf.pages[1])
+ >>> pdf.pages[0].objgen
+ (16, 0) # Same object
+ """
+ del_keys = set(self.keys()) - set(other.keys())
+ for k in other.keys():
+ self[k] = other[k] # pylint: disable=unsupported-assignment-operation
+ for k in del_keys:
+ del self[k] # pylint: disable=unsupported-delete-operation
+
+ def write(self, data, *, filter=None, decode_parms=None, type_check=True):
+ """
+ Replace stream object's data with new (possibly compressed) `data`.
+
+ `filter` and `decode_parms` specify the compression that is present on
+ the input `data`.
+
+ When writing the PDF in :meth:`pikepdf.Pdf.save`,
+ pikepdf may change the compression or apply compression to data that was
+ not compressed, depending on the parameters given to that function. It
+ will never change lossless to lossy encoding.
+
+ PNG and TIFF images, even if compressed, cannot be directly inserted
+ into a PDF and displayed as images.
+
+ Args:
+ data (bytes): the new data to use for replacement
+ filter (pikepdf.Name or pikepdf.Array): The filter(s) with which the
+ data is (already) encoded
+ decode_parms (pikepdf.Dictionary or pikepdf.Array): Parameters for the
+ filters with which the object is encoded
+ type_check (bool): Check arguments; use False only if you want to
+ intentionally create malformed PDFs.
+
+ If only one `filter` is specified, it may be a name such as
+ `Name('/FlateDecode')`. If there are multiple filters, then an array
+ of names should be given.
+
+ If there is only one filter, `decode_parms` is a Dictionary of
+ parameters for that filter. If there are multiple filters, then
+ `decode_parms` is an Array of Dictionary, where each array index
+ corresponds to the filter.
"""
- Present options to IPython for rich display of this object
+
+ if type_check and filter is not None:
+ if isinstance(filter, list):
+ filter = Array(filter)
+ filter = filter.wrap_in_array()
+
+ if isinstance(decode_parms, list):
+ decode_parms = Array(decode_parms)
+ elif decode_parms is None:
+ decode_parms = Array([])
+ else:
+ decode_parms = decode_parms.wrap_in_array()
+
+ if not all(isinstance(item, Name) for item in filter):
+ raise TypeError(
+ "filter must be: pikepdf.Name or pikepdf.Array([pikepdf.Name])"
+ )
+ if not all(
+ (isinstance(item, Dictionary) or item is None) for item in decode_parms
+ ):
+ raise TypeError(
+ "decode_parms must be: pikepdf.Dictionary or "
+ "pikepdf.Array([pikepdf.Dictionary])"
+ )
+ if len(decode_parms) != 0:
+ if len(filter) != len(decode_parms):
+ raise ValueError(
+ (
+ "filter ({}) and decode_parms ({}) must be arrays of "
+ " same length"
+ ).format(repr(filter), repr(decode_parms))
+ )
+ if len(filter) == 1:
+ filter = filter[0]
+ if len(decode_parms) == 0:
+ decode_parms = None
+ elif len(decode_parms) == 1:
+ decode_parms = decode_parms[0]
+ self._write(data, filter=filter, decode_parms=decode_parms)
+
+
+@augments(Pdf)
+class Extend_Pdf:
+ def _repr_mimebundle_(self, **_kwargs):
+ """
+ Present options to IPython or Jupyter for rich display of this object
See https://ipython.readthedocs.io/en/stable/config/integrating.html#rich-display
"""
@@ -145,16 +252,12 @@ class Extend_Pdf:
data = {'application/pdf': bio.read()}
return data
- def open_metadata(
- self,
- set_pikepdf_as_editor=True,
- update_docinfo=True
- ):
+ def open_metadata(self, set_pikepdf_as_editor=True, update_docinfo=True):
"""
Open the PDF's XMP metadata for editing
Recommend for use in a ``with`` block. Changes are committed to the
- PDF when the block exits.
+ PDF when the block exits. (The ``Pdf`` must still be open.)
Example:
>>> with pdf.open_metadata() as meta:
@@ -173,11 +276,128 @@ class Extend_Pdf:
pikepdf.models.PdfMetadata
"""
return PdfMetadata(
- self,
- pikepdf_mark=set_pikepdf_as_editor,
- sync_docinfo=update_docinfo
+ self, pikepdf_mark=set_pikepdf_as_editor, sync_docinfo=update_docinfo
+ )
+
+ def make_stream(self, data):
+ """
+ Create a new pikepdf.Stream object that is attached to this PDF.
+
+ Args:
+ data (bytes): Binary data for the stream object
+ """
+ return Stream(self, data)
+
+ def add_blank_page(self, *, page_size=(612, 792)):
+ """
+ Add a blank page to this PDF. If pages already exist, the page will be added to
+ the end. Pages may be reordered using ``Pdf.pages``.
+
+ The caller may add content to the page by modifying its objects after creating
+ it.
+
+ Args:
+ page_size (tuple): The size of the page in PDF units (1/72 inch or 0.35mm).
+ Default size is set to a US Letter 8.5" x 11" page.
+ """
+ for dim in page_size:
+ if not (3 <= dim <= 14400):
+ raise ValueError('Page size must be between 3 and 14400 PDF units')
+
+ page_dict = Dictionary(
+ Type=Name.Page,
+ MediaBox=Array([0, 0, page_size[0], page_size[1]]),
+ Contents=self.make_stream(b''),
+ Resources=Dictionary(),
+ )
+ page = self.make_indirect(page_dict)
+ self._add_page(page, first=False)
+ return page
+
+ def close(self):
+ """
+ Close a Pdf object and release resources acquired by pikepdf
+
+ If pikepdf opened the file handle it will close it (e.g. when opened with a file
+ path). If the caller opened the file for pikepdf, the caller must close the file.
+
+ pikepdf lazily loads data from PDFs, so some :class:`pikepdf.Object` may
+ implicitly depend on the :class:`pikepdf.Pdf` being open. This is always the
+ case for :class:`pikepdf.Stream` but can be true for any object. Do not close
+ the `Pdf` object if you might still be accessing content from it.
+
+ When an ``Object`` is copied from one ``Pdf`` to another, the ``Object`` is copied into
+ the destination ``Pdf`` immediately, so after accessing all desired information
+ from the source ``Pdf`` it may be closed.
+
+ Caution:
+ Closing the ``Pdf`` is currently implemented by resetting it to an empty
+ sentinel. It is currently possible to edit the sentinel as if it were a live
+ object. This behavior should not be relied on and is subject to change.
+
+ """
+
+ EMPTY_PDF = (
+ b"%PDF-1.3\n"
+ b"1 0 obj\n"
+ b"<< /Type /Catalog /Pages 2 0 R >>\n"
+ b"endobj\n"
+ b"2 0 obj\n"
+ b"<< /Type /Pages /Kids [] /Count 0 >>\n"
+ b"endobj\n"
+ b"xref\n"
+ b"0 3\n"
+ b"0000000000 65535 f \n"
+ b"0000000009 00000 n \n"
+ b"0000000058 00000 n \n"
+ b"trailer << /Size 3 /Root 1 0 R >>\n"
+ b"startxref\n"
+ b"110\n"
+ b"%%EOF\n"
)
+ if self.filename:
+ description = "closed file: " + self.filename
+ else:
+ description = "closed object"
+ self._process(description, EMPTY_PDF)
+
+ def __enter__(self):
+ return self
+
+ def __exit__(self, exc_type, exc_value, traceback):
+ self.close()
+
+ @property
+ def allow(self):
+ """
+ Report permissions associated with this PDF.
+
+ By default these permissions will be replicated when the PDF is
+ saved. Permissions may also only be changed when a PDF is being saved,
+ and are only available for encrypted PDFs. If a PDF is not encrypted,
+ all operations are reported as allowed.
+
+ pikepdf has no way of enforcing permissions.
+
+ Returns: pikepdf.models.Permissions
+ """
+ results = {}
+ for field in Permissions.fields():
+ results[field] = getattr(self, '_allow_' + field)
+ return Permissions(**results)
+
+ @property
+ def encryption(self):
+ """
+ Report encryption information for this PDF.
+
+ Encryption settings may only be changed when a PDF is saved.
+
+ Returns: pikepdf.models.EncryptionInfo
+ """
+ return EncryptionInfo(self._encryption_data)
+
def _attach(self, *, basename, filebytes, mime=None, desc=''):
"""
Attach a file to this PDF
@@ -219,6 +439,7 @@ class Extend_Pdf:
if not mime:
from mimetypes import guess_type
+
mime, _encoding = guess_type(basename)
if not mime:
mime = 'application/octet-stream'
@@ -226,28 +447,28 @@ class Extend_Pdf:
filestream = Stream(self, filebytes)
filestream.Subtype = Name('/' + mime)
- filespec = Dictionary({
- '/Type': Name.Filespec,
- '/F': basename,
- '/UF': basename,
- '/Desc': desc,
- '/EF': Dictionary({
- '/F': filestream
- })
- })
+ filespec = Dictionary(
+ {
+ '/Type': Name.Filespec,
+ '/F': basename,
+ '/UF': basename,
+ '/Desc': desc,
+ '/EF': Dictionary({'/F': filestream}),
+ }
+ )
# names = self.Root.Names.EmbeddedFiles.Names.as_list()
# names.append(filename) # Key
# names.append(self.make_indirect(filespec))
- self.Root.Names.EmbeddedFiles.Names = Array([
- basename, # key
- self.make_indirect(filespec)
- ])
+ self.Root.Names.EmbeddedFiles.Names = Array(
+ [basename, self.make_indirect(filespec)] # key
+ )
if '/PageMode' not in self.Root:
self.Root.PageMode = Name.UseAttachments
-@extends(_ObjectMapping)
+
+@augments(_ObjectMapping)
class Extend_ObjectMapping:
def __contains__(self, key):
try:
diff --git a/src/pikepdf/_version.py b/src/pikepdf/_version.py
new file mode 100644
index 0000000..c9d4b7b
--- /dev/null
+++ b/src/pikepdf/_version.py
@@ -0,0 +1,13 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+# Copyright (C) 2017, James R. Barlow (https://github.com/jbarlow83/)
+
+from pkg_resources import DistributionNotFound
+from pkg_resources import get_distribution as _get_distribution
+
+try:
+ __version__ = _get_distribution(__package__).version
+except DistributionNotFound:
+ __version__ = "Not installed"
diff --git a/src/pikepdf/codec.py b/src/pikepdf/codec.py
new file mode 100644
index 0000000..d008fb2
--- /dev/null
+++ b/src/pikepdf/codec.py
@@ -0,0 +1,48 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+# Copyright (C) 2017, James R. Barlow (https://github.com/jbarlow83/)
+
+import codecs
+
+from ._qpdf import utf8_to_pdf_doc, pdf_doc_to_utf8
+
+
+def pdfdoc_encode(input, errors='strict'):
+ error_marker = b'?' if errors == 'replace' else b'\xad'
+ success, pdfdoc = utf8_to_pdf_doc(input, error_marker)
+ if not success:
+ if errors == 'strict':
+ raise ValueError("'pdfdoc' codec can't encode")
+ if errors == 'ignore':
+ pdfdoc = pdfdoc.replace(b'\xad', b'')
+ return pdfdoc, len(input)
+
+
+def pdfdoc_decode(input, errors='strict'):
+ if isinstance(input, memoryview):
+ input = input.tobytes()
+ utf8 = pdf_doc_to_utf8(input)
+ return utf8, len(input)
+
+
+class PdfDocCodec(codecs.Codec):
+ """Implements PdfDocEncoding character map used inside PDFs"""
+
+ def encode(self, input, errors='strict'):
+ return pdfdoc_encode(input, errors)
+
+ def decode(self, input, errors='strict'):
+ return pdfdoc_decode(input, errors)
+
+
+def find_pdfdoc(encoding):
+ if encoding == 'pdfdoc':
+ return codecs.CodecInfo(
+ name='pdfdoc', encode=PdfDocCodec().encode, decode=PdfDocCodec().decode
+ )
+ return None
+
+
+codecs.register(find_pdfdoc)
diff --git a/src/pikepdf/models/__init__.py b/src/pikepdf/models/__init__.py
index b0d27bc..023b836 100644
--- a/src/pikepdf/models/__init__.py
+++ b/src/pikepdf/models/__init__.py
@@ -4,10 +4,11 @@
#
# Copyright (C) 2017, James R. Barlow (https://github.com/jbarlow83/)
-from .. import Object, ObjectType, PdfError
-from .matrix import PdfMatrix
+from pikepdf import Object, ObjectType, PdfError
+from .encryption import Permissions, Encryption, EncryptionInfo
from .image import PdfImage, PdfInlineImage, UnsupportedImageTypeError
+from .matrix import PdfMatrix
from .metadata import PdfMetadata
@@ -50,8 +51,10 @@ def parse_content_stream(page_or_stream, operators=''):
if not isinstance(page_or_stream, Object):
raise TypeError("stream must a PDF object")
- if page_or_stream._type_code != ObjectType.stream \
- and page_or_stream.get('/Type') != '/Page':
+ if (
+ page_or_stream._type_code != ObjectType.stream
+ and page_or_stream.get('/Type') != '/Page'
+ ):
raise TypeError("parse_content_stream called on page or stream object")
try:
@@ -87,8 +90,7 @@ class _Page:
raise AttributeError(item)
def __repr__(self):
- return repr(self.obj).replace(
- 'pikepdf.Dictionary', 'pikepdf.Page', 1)
+ return repr(self.obj).replace('pikepdf.Dictionary', 'pikepdf.Page', 1)
@property
def mediabox(self):
@@ -107,8 +109,7 @@ class _Page:
:return: True if there is text
"""
text_showing_operators = """TJ " ' Tj"""
- text_showing_insts = parse_content_stream(
- self.obj, text_showing_operators)
+ text_showing_insts = parse_content_stream(self.obj, text_showing_operators)
if len(text_showing_insts) > 0:
return True
return False
diff --git a/src/pikepdf/models/encryption.py b/src/pikepdf/models/encryption.py
new file mode 100644
index 0000000..c61df71
--- /dev/null
+++ b/src/pikepdf/models/encryption.py
@@ -0,0 +1,154 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+# Copyright (C) 2017, James R. Barlow (https://github.com/jbarlow83/)
+
+import collections
+import types
+
+
+class Permissions(types.SimpleNamespace):
+ """
+ Stores the permissions for an encrypted PDF.
+
+ Unencrypted PDFs implicitly have all permissions allowed.
+ pikepdf does not enforce the restrictions in any way. Permissions
+ can only be changed when a PDF is saved.
+ """
+
+ def __init__(
+ self,
+ accessibility=True,
+ extract=True,
+ modify_annotation=True,
+ modify_assembly=False,
+ modify_form=True,
+ modify_other=True,
+ print_lowres=True,
+ print_highres=True,
+ ):
+ kvs = locals()
+ del kvs['self']
+ super().__init__(**kvs)
+
+ def _readonly(self, *args):
+ raise TypeError("object is read-only")
+
+ __setattr__ = _readonly
+
+ __delattr__ = _readonly
+
+ def keys(self):
+ yield from (k for k in self.__dict__ if not k.startswith('_'))
+
+ def values(self):
+ yield from (v for k, v in self.__dict__.items() if not k.startswith('_'))
+
+ @classmethod
+ def fields(cls):
+ yield from (k for k in cls().__dict__ if not k.startswith('_'))
+
+
+class EncryptionInfo:
+ """
+ Reports encryption information for an encrypted PDF.
+
+ This information may not be changed, except when a PDF is saved.
+ This object is not used to specify the encryption settings to save
+ a PDF, due to non-overlapping information requirements.
+ """
+
+ def __init__(self, encdict):
+ self._encdict = encdict
+
+ @property
+ def R(self):
+ """Revision number of the security handler."""
+ return self._encdict['R']
+
+ @property
+ def V(self):
+ """Version of PDF password algorithm."""
+ return self._encdict['V']
+
+ @property
+ def P(self):
+ """Encoded permission bits.
+
+ See :meth:`Pdf.allow` instead.
+ """
+ return self._encdict['P']
+
+ @property
+ def stream_method(self):
+ """Encryption method used to encode streams."""
+ return self._encdict['stream']
+
+ @property
+ def string_method(self):
+ """Encryption method used to encode strings."""
+ return self._encdict['string']
+
+ @property
+ def file_method(self):
+ """Encryption method used to encode the whole file."""
+ return self._encdict['file']
+
+ @property
+ def user_password(self):
+ """If possible, return the user password.
+
+ The user password can only be retrieved when a PDF is opened
+ with the owner password and when older versions of the
+ encryption algorithm are used.
+
+ The password is always returned as ``bytes`` even if it has
+ a clear Unicode representation.
+ """
+ return self._encdict['user_passwd']
+
+ @property
+ def encryption_key(self):
+ """The RC4 or AES encryption key used for this file."""
+ return self._encdict['encryption_key']
+
+ @property
+ def bits(self):
+ """The number of encryption bits."""
+ return len(self._encdict['encryption_key']) * 8
+
+
+class Encryption(dict):
+ """
+ Specify the encryption settings to apply when a PDF is saved.
+
+ Args:
+ owner (str): The owner password to use. This allows full control
+ of the file. If blank, the PDF will be encrypted and
+ present as "(SECURED)" in PDF viewers. If the owner password
+ is blank, the user password should be as well.
+ user (str): The user password to use. With this password, some
+ restrictions will be imposed by a typical PDF reader.
+ If blank, the PDF can be opened by anyone, but only modified
+ as allowed by the permissions in ``allow``.
+ R (int): Select the security handler algorithm to use. Choose from:
+ ``2``, ``3``, ``4`` or ``6``. By default, the highest version
+ is selected (``6``). ``5`` is a deprecated algorithm that should
+ not be used.
+ allow (pikepdf.Permissions): The permissions to set.
+ If omitted, all permissions are granted to the user.
+ aes (bool): If True, request the AES algorithm. If False, use RC4.
+ If omitted, AES is selected whenever possible (R >= 4).
+ metadata (bool): If True, also encrypt the PDF metadata. If False,
+ metadata is not encrypted. Reading document metadata without
+ decryption may be desirable in some cases. Requires ``aes=True``.
+ If omitted, metadata is encrypted whenever possible.
+ """
+
+ def __init__(
+ self, *, owner, user, R=6, allow=Permissions(), aes=True, metadata=True
+ ):
+ self.update(
+ dict(R=R, owner=owner, user=user, allow=allow, aes=aes, metadata=metadata)
+ )
diff --git a/src/pikepdf/models/image.py b/src/pikepdf/models/image.py
index 8ecb571..6493d85 100644
--- a/src/pikepdf/models/image.py
+++ b/src/pikepdf/models/image.py
@@ -4,20 +4,23 @@
#
# Copyright (C) 2017, James R. Barlow (https://github.com/jbarlow83/)
+import struct
+from abc import ABC, abstractmethod
+from decimal import Decimal
from io import BytesIO
from itertools import zip_longest
-from abc import ABC, abstractmethod
-import struct
+from pathlib import Path
+from shutil import copyfileobj
+from tempfile import NamedTemporaryFile
+from zlib import decompress, error as ZlibError
-from decimal import Decimal
+from .. import Array, Dictionary, Name, Object, PdfError, Stream
-from .. import (
- Object, Array, PdfError, Name, Dictionary, Stream
-)
class DependencyError(Exception):
pass
+
class UnsupportedImageTypeError(Exception):
pass
@@ -37,7 +40,11 @@ def array_str_colorspace(value):
result = [str(items[n]) for n in range(3)]
result.append(bytes(items[3]))
return result
+ if len(items) == 2 and items[0] == '/ICCBased':
+ result = [str(items[0]), items[1]]
+ return result
return array_str(items)
+
return array_str(value)
@@ -112,16 +119,16 @@ class PdfImageBase(ABC):
if self._colorspaces:
if self._colorspaces[0] in self.SIMPLE_COLORSPACES:
return self._colorspaces[0]
- if self._colorspaces[0] == '/DeviceCMYK':
+ if self._colorspaces[0] in ('/DeviceCMYK', '/ICCBased'):
return self._colorspaces[0]
- if self._colorspaces[0] == '/Indexed' \
- and self._colorspaces[1] in self.SIMPLE_COLORSPACES:
+ if (
+ self._colorspaces[0] == '/Indexed'
+ and self._colorspaces[1] in self.SIMPLE_COLORSPACES
+ ):
return self._colorspaces[1]
- if self._colorspaces[0] == '/ICCBased':
- icc = self._colorspaces[1]
- return icc.stream_dict.get('/Alternate', '')
raise NotImplementedError(
- "not sure how to get colorspace: " + repr(self._colorspaces))
+ "not sure how to get colorspace: " + repr(self._colorspaces)
+ )
@property
def bits_per_component(self):
@@ -136,6 +143,11 @@ class PdfImageBase(ABC):
pass
@property
+ @abstractmethod
+ def icc(self):
+ pass
+
+ @property
def indexed(self):
"""``True`` if the image has a defined color palette"""
return '/Indexed' in self._colorspaces
@@ -147,7 +159,12 @@ class PdfImageBase(ABC):
@property
def mode(self):
- """``PIL.Image.mode`` equivalent for this image"""
+ """``PIL.Image.mode`` equivalent for this image, where possible
+
+ If an ICC profile is attached to the image, we still attempt to resolve a Pillow
+ mode.
+ """
+
m = ''
if self.indexed:
m = 'P'
@@ -160,6 +177,18 @@ class PdfImageBase(ABC):
m = 'L'
elif self.colorspace == '/DeviceCMYK':
m = 'CMYK'
+ elif self.colorspace == '/ICCBased':
+ try:
+ icc_profile = self._colorspaces[1]
+ icc_profile_nchannels = int(icc_profile['/N'])
+ if icc_profile_nchannels == 1:
+ m = 'L'
+ elif icc_profile_nchannels == 3:
+ m = 'RGB'
+ elif icc_profile_nchannels == 4:
+ m = 'CMYK'
+ except (ValueError, TypeError):
+ pass
if m == '':
raise NotImplementedError("Not sure how to handle PDF image of this type")
return m
@@ -175,7 +204,6 @@ class PdfImageBase(ABC):
[(/FilterName, {/DecodeParmName: Value, ...}), ...]
The order of /Filter matters as indicates the encoding/decoding sequence.
-
"""
return list(zip_longest(self.filters, self.decode_parms, fillvalue={}))
@@ -183,8 +211,8 @@ class PdfImageBase(ABC):
def palette(self):
"""Retrieves the color palette for this image
- :returns: (base_colorspace: str, palette: bytes)
- :rtype: tuple
+ Returns:
+ tuple (base_colorspace: str, palette: bytes)
"""
if not self.indexed:
@@ -209,6 +237,29 @@ class PdfImageBase(ABC):
def as_pil_image(self):
pass
+ @staticmethod
+ def _unstack_compression(buffer, filters):
+ """Remove stacked compression where it appears.
+
+ Stacked compression means when an image is set to:
+ ``[/FlateDecode /DCTDecode]``
+ for example.
+
+ Only Flate can be stripped off the front currently.
+
+ Args:
+ buffer (pikepdf._qpdf.Buffer): the compressed image data
+ filters (list of str): all filters on the data
+ """
+ data = memoryview(buffer)
+ while len(filters) > 1 and filters[0] == '/FlateDecode':
+ try:
+ data = decompress(data)
+ except ZlibError as e:
+ raise UnsupportedImageTypeError() from e
+ filters = filters[1:]
+ return data, filters
+
class PdfImage(PdfImageBase):
"""Support class to provide a consistent API for manipulating PDF images
@@ -237,21 +288,20 @@ class PdfImage(PdfImageBase):
obj (pikepdf.Object): an Image XObject
"""
- if isinstance(obj, Stream) and \
- obj.stream_dict.get("/Subtype") != "/Image":
+ if isinstance(obj, Stream) and obj.stream_dict.get("/Subtype") != "/Image":
raise TypeError("can't construct PdfImage from non-image")
self.obj = obj
+ self._icc = None
@classmethod
def _from_pil_image(cls, *, pdf, page, name, image): # pragma: no cover
"""Insert a PIL image into a PDF (rudimentary)
- :param pdf: the PDF to attach the image to
- :type pdf: pikepdf.Pdf
- :param page: the page to attach the image to
- :param name: the name to set the image
- :param image: image
- :type image: PIL.Image.Image
+ Args:
+ pdf (pikepdf.Pdf): the PDF to attach the image to
+ page (pikepdf.Object): the page to attach the image to
+ name (str or pikepdf.Name): the name to set the image
+ image (PIL.Image.Image): the image to insert
"""
data = image.tobytes()
@@ -279,6 +329,26 @@ class PdfImage(PdfImageBase):
"""``False`` for image XObject"""
return False
+ @property
+ def icc(self):
+ """If an ICC profile is attached, return a Pillow object that describes it.
+
+ Most of the information may be found in ``icc.profile``.
+
+ Returns:
+ PIL.ImageCms.ImageCmsProfile
+ """
+ from PIL import ImageCms
+
+ if self.colorspace != '/ICCBased':
+ return None
+ if not self._icc:
+ iccstream = self._colorspaces[1]
+ iccbuffer = iccstream.get_stream_buffer()
+ iccbytesio = BytesIO(iccbuffer)
+ self._icc = ImageCms.ImageCmsProfile(iccbytesio)
+ return self._icc
+
def _extract_direct(self, *, stream):
"""Attempt to extract the image directly to a usable image file
@@ -296,30 +366,38 @@ class PdfImage(PdfImageBase):
# saved as a standard JPEG. RGB JPEGs without YUV conversion can't
# be saved as JPEGs, and are probably bugs. Some software in the
# wild actually produces RGB JPEGs in PDFs (probably a bug).
- return (self.mode == 'RGB' and
- self.filter_decodeparms[0][1].get('/ColorTransform', 1))
+ DEFAULT_CT_RGB = 1
+ ct = self.filter_decodeparms[0][1].get('/ColorTransform', DEFAULT_CT_RGB)
+ return self.mode == 'RGB' and ct == DEFAULT_CT_RGB
def normal_dct_cmyk():
# Normal DCTDecode CMYKs have /ColorTransform 0 and can be saved.
# There is a YUVK colorspace but CMYK JPEGs don't generally use it
- return (self.mode == 'CMYK' and
- self.filter_decodeparms[0][1].get('/ColorTransform', 0))
+ DEFAULT_CT_CMYK = 0
+ ct = self.filter_decodeparms[0][1].get('/ColorTransform', DEFAULT_CT_CMYK)
+ return self.mode == 'CMYK' and ct == DEFAULT_CT_CMYK
+
+ data, filters = self._unstack_compression(
+ self.obj.get_raw_stream_buffer(), self.filters
+ )
- if self.filters == ['/CCITTFaxDecode']:
- data = self.obj.read_raw_bytes()
+ if filters == ['/CCITTFaxDecode']:
+ if self.colorspace == '/ICCBased':
+ raise UnsupportedImageTypeError("Cannot direct-extract CCITT + ICC")
stream.write(self._generate_ccitt_header(data))
stream.write(data)
return '.tif'
- elif self.filters == ['/DCTDecode'] and (
- self.mode == 'L' or normal_dct_rgb() or normal_dct_cmyk):
- buffer = self.obj.get_raw_stream_buffer()
- stream.write(buffer)
+ elif filters == ['/DCTDecode'] and (
+ self.mode == 'L' or normal_dct_rgb() or normal_dct_cmyk()
+ ):
+ stream.write(data)
return '.jpg'
raise UnsupportedImageTypeError()
def _extract_transcoded(self):
from PIL import Image
+
im = None
if self.mode == 'RGB' and self.bits_per_component == 8:
# No point in accessing the buffer here, size qpdf decodes to 3-byte
@@ -330,8 +408,7 @@ class PdfImage(PdfImageBase):
buffer = self.get_stream_buffer()
stride = 0 # tell Pillow to calculate stride from line width
ystep = 1 # image is top to bottom in memory
- im = Image.frombuffer('L', self.size, buffer, "raw", 'L', stride,
- ystep)
+ im = Image.frombuffer('L', self.size, buffer, "raw", 'L', stride, ystep)
if self.mode == 'P':
base_mode, palette = self.palette
if base_mode in ('RGB', 'L'):
@@ -347,14 +424,15 @@ class PdfImage(PdfImageBase):
im = Image.frombytes('1', self.size, data)
base_mode, palette = self.palette
- if not (palette == b'\x00\x00\x00\xff\xff\xff'
- or palette == b'\x00\xff'):
- raise NotImplementedError(
- 'monochrome image with nontrivial palette')
+ if not (palette == b'\x00\x00\x00\xff\xff\xff' or palette == b'\x00\xff'):
+ raise NotImplementedError('monochrome image with nontrivial palette')
+
+ if self.colorspace == '/ICCBased':
+ im.info['icc_profile'] = self.icc.tobytes()
return im
- def extract_to(self, *, stream):
+ def _extract_to_stream(self, *, stream):
"""Attempt to extract the image directly to a usable image file
If possible, the compressed data is extracted and inserted into
@@ -386,6 +464,51 @@ class PdfImage(PdfImageBase):
raise UnsupportedImageTypeError(repr(self))
+ def extract_to(self, *, stream=None, fileprefix=''):
+ """Attempt to extract the image directly to a usable image file
+
+ If possible, the compressed data is extracted and inserted into
+ a compressed image file format without transcoding the compressed
+ content. If this is not possible, the data will be decompressed
+ and extracted to an appropriate format.
+
+ Because it is not known until attempted what image format will be
+ extracted, users should not assume what format they are getting back.
+ When saving the image to a file, use a temporary filename, and then
+ rename the file to its final name based on the returned file extension.
+
+ Examples:
+
+ >>> im.extract_to(stream=bytes_io)
+ '.png'
+
+ >>> im.extract_to(fileprefix='/tmp/image00')
+ '/tmp/image00.jpg'
+
+ Args:
+ stream: Writable stream to write data to.
+ fileprefix (str or Path): The path to write the extracted image to,
+ without the file extension.
+
+ Returns:
+ str: If *fileprefix* was provided, then the fileprefix with the
+ appropriate extension. If no *fileprefix*, then an extension
+ indicating the file type.
+ """
+
+ if bool(stream) == bool(fileprefix):
+ raise ValueError("Cannot set both stream and fileprefix")
+ if stream:
+ return self._extract_to_stream(stream=stream)
+
+ bio = BytesIO()
+ extension = self._extract_to_stream(stream=bio)
+ bio.seek(0)
+ filepath = Path(str(Path(fileprefix)) + extension)
+ with filepath.open('wb') as target:
+ copyfileobj(bio, target)
+ return str(filepath)
+
def read_bytes(self):
"""Decompress this image and return it as unencoded bytes"""
return self.obj.read_bytes()
@@ -433,6 +556,7 @@ class PdfImage(PdfImageBase):
img_size = len(data)
tiff_header_struct = '<' + '2s' + 'H' + 'L' + 'H' + 'HHLL' * 8 + 'L'
+ # fmt: off
tiff_header = struct.pack(
tiff_header_struct,
b'II', # Byte order indication: Little endian
@@ -449,6 +573,7 @@ class PdfImage(PdfImageBase):
279, 4, 1, img_size, # StripByteCounts, LONG, 1, size of image
0 # last IFD
)
+ # fmt: on
return tiff_header
def show(self):
@@ -457,7 +582,8 @@ class PdfImage(PdfImageBase):
def __repr__(self):
return '<pikepdf.PdfImage image mode={} size={}x{} at {}>'.format(
- self.mode, self.width, self.height, hex(id(self)))
+ self.mode, self.width, self.height, hex(id(self))
+ )
def _repr_png_(self):
"""Display hook for IPython/Jupyter"""
@@ -468,14 +594,17 @@ class PdfImage(PdfImageBase):
class PdfJpxImage(PdfImage):
-
def __init__(self, obj):
super().__init__(obj)
self.pil = self.as_pil_image()
def _extract_direct(self, *, stream):
- buffer = self.obj.get_raw_stream_buffer()
- stream.write(buffer)
+ data, filters = self._unstack_compression(
+ self.obj.get_raw_stream_buffer(), self.filters
+ )
+ if filters != ['/JPXDecode']:
+ raise UnsupportedImageTypeError(self.filters)
+ stream.write(data)
return '.jp2'
@property
@@ -508,7 +637,8 @@ class PdfJpxImage(PdfImage):
def __repr__(self):
return '<pikepdf.PdfJpxImage JPEG2000 image mode={} size={}x{} at {}>'.format(
- self.mode, self.width, self.height, hex(id(self)))
+ self.mode, self.width, self.height, hex(id(self))
+ )
class PdfInlineImage(PdfImageBase):
@@ -532,7 +662,7 @@ class PdfInlineImage(PdfImageBase):
b'/LZW': b'/LZWDecode',
b'/RL': b'/RunLengthDecode',
b'/CCF': b'/CCITTFaxDecode',
- b'/DCT': b'/DCTDecode'
+ b'/DCT': b'/DCTDecode',
}
def __init__(self, *, image_data, image_object: tuple):
@@ -554,8 +684,7 @@ class PdfInlineImage(PdfImageBase):
try:
reparsed_obj = Object.parse(b'<< ' + reparse + b' >>')
except PdfError as e:
- raise PdfError(
- "parsing inline " + reparse.decode('unicode_escape')) from e
+ raise PdfError("parsing inline " + reparse.decode('unicode_escape')) from e
self.obj = reparsed_obj
self.pil = None
@@ -575,7 +704,6 @@ class PdfInlineImage(PdfImageBase):
else:
raise NotImplementedError(repr(obj))
-
def _metadata(self, name, type_, default):
return metadata_from_obj(self.obj, name, type_, default)
@@ -597,6 +725,10 @@ class PdfInlineImage(PdfImageBase):
def is_inline(self):
return True
+ @property
+ def icc(self):
+ raise ValueError("Inline images may not have ICC profiles")
+
def __repr__(self):
mode = '?'
try:
@@ -604,23 +736,24 @@ class PdfInlineImage(PdfImageBase):
except Exception:
pass
return '<pikepdf.PdfInlineImage image mode={} size={}x{} at {}>'.format(
- mode, self.width, self.height, hex(id(self)))
+ mode, self.width, self.height, hex(id(self))
+ )
def as_pil_image(self):
- from PIL import Image
-
if self.pil:
return self.pil
raise NotImplementedError('not yet')
- def extract_to(self, *, stream): # pylint: disable=unused-argument
+ def extract_to(
+ self, *, stream=None, fileprefix=''
+ ): # pylint: disable=unused-argument
raise UnsupportedImageTypeError("inline images don't support extract")
def read_bytes(self):
raise NotImplementedError("qpdf returns compressed")
- #return self._data._inline_image_bytes()
+ # return self._data._inline_image_bytes()
def get_stream_buffer(self):
raise NotImplementedError("qpdf returns compressed")
- #return memoryview(self._data.inline_image_bytes())
+ # return memoryview(self._data.inline_image_bytes())
diff --git a/src/pikepdf/models/matrix.py b/src/pikepdf/models/matrix.py
index d68fae6..4c5c2fb 100644
--- a/src/pikepdf/models/matrix.py
+++ b/src/pikepdf/models/matrix.py
@@ -4,7 +4,8 @@
#
# Copyright (C) 2017, James R. Barlow (https://github.com/jbarlow83/)
-from math import cos, sin, pi
+from math import cos, pi, sin
+
class PdfMatrix:
"""
@@ -31,6 +32,7 @@ class PdfMatrix:
"""
def __init__(self, *args):
+ # fmt: off
if not args:
self.values = ((1, 0, 0), (0, 1, 0), (0, 0, 1))
elif len(args) == 6:
@@ -51,6 +53,7 @@ class PdfMatrix:
tuple(args[0][2]))
else:
raise ValueError('arguments')
+ # fmt: on
@staticmethod
def identity():
@@ -66,10 +69,13 @@ class PdfMatrix:
a = self.values
b = other.values
return PdfMatrix(
- [[sum([float(i) * float(j)
- for i, j in zip(row, col)]
- ) for col in zip(*b)]
- for row in a]
+ [
+ [
+ sum([float(i) * float(j) for i, j in zip(row, col)])
+ for col in zip(*b)
+ ]
+ for row in a
+ ]
)
def scaled(self, x, y):
diff --git a/src/pikepdf/models/metadata.py b/src/pikepdf/models/metadata.py
index 65934cd..a029b2b 100644
--- a/src/pikepdf/models/metadata.py
+++ b/src/pikepdf/models/metadata.py
@@ -4,24 +4,20 @@
#
# Copyright (C) 2018, James R. Barlow (https://github.com/jbarlow83/)
+import re
+import sys
from collections import namedtuple
from collections.abc import MutableMapping
from datetime import datetime
from functools import wraps
from io import BytesIO
-from pkg_resources import (
- get_distribution as _get_distribution,
- DistributionNotFound
-)
from warnings import warn
-import re
-import sys
from lxml import etree
-from lxml.etree import QName, XMLSyntaxError
-from defusedxml.lxml import parse
+from lxml.etree import parse, QName, XMLSyntaxError
-from .. import Stream, Name, String, PdfError
+from .. import Name, PdfError, Stream, String
+from .. import __version__ as pikepdf_version
XMP_NS_DC = "http://purl.org/dc/elements/1.1/"
XMP_NS_PDF = "http://ns.adobe.com/pdf/1.3/"
@@ -79,27 +75,21 @@ XMP_CONTAINERS = [
XmpContainer('Seq', list, list.append),
]
-LANG_ALTS = frozenset([
- str(QName(XMP_NS_DC, 'title')),
- str(QName(XMP_NS_DC, 'description')),
- str(QName(XMP_NS_DC, 'rights')),
- str(QName(XMP_NS_XMP_RIGHTS, 'UsageTerms')),
-])
+LANG_ALTS = frozenset(
+ [
+ str(QName(XMP_NS_DC, 'title')),
+ str(QName(XMP_NS_DC, 'description')),
+ str(QName(XMP_NS_DC, 'rights')),
+ str(QName(XMP_NS_XMP_RIGHTS, 'UsageTerms')),
+ ]
+)
# These are the illegal characters in XML 1.0. (XML 1.1 is a bit more permissive,
# but we'll be strict to ensure wider compatibility.)
re_xml_illegal_chars = re.compile(
r"(?u)[^\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD\u10000-\u10FFFF]"
)
-re_xml_illegal_bytes = re.compile(
- br"[^\x09\x0A\x0D\x20-\xFF]|&#0;"
-)
-
-# Repeat this to avoid circular from top package's pikepdf.__version__
-try:
- pikepdf_version = _get_distribution(__name__).version
-except DistributionNotFound:
- pikepdf_version = ""
+re_xml_illegal_bytes = re.compile(br"[^\x09\x0A\x0D\x20-\xFF]|&#0;")
def encode_pdf_date(d: datetime) -> str:
@@ -171,6 +161,7 @@ class AuthorConverter:
if sys.version_info < (3, 7):
+
def fromisoformat(datestr):
# strptime %z can't parse a timezone with punctuation
if re.search(r'[+-]\d{2}[-:]\d{2}$', datestr):
@@ -179,9 +170,12 @@ if sys.version_info < (3, 7):
return datetime.strptime(datestr, "%Y-%m-%dT%H:%M:%S%z")
except ValueError:
return datetime.strptime(datestr, "%Y-%m-%dT%H:%M:%S")
+
+
else:
fromisoformat = datetime.fromisoformat
+
class DateConverter:
@staticmethod
def xmp_from_docinfo(docinfo_val):
@@ -203,6 +197,7 @@ def ensure_loaded(fn):
if not self._xmp:
self._load()
return fn(self, *args, **kwargs)
+
return wrapper
@@ -228,10 +223,10 @@ class PdfMetadata(MutableMapping):
To update metadata, use a with block.
- .. code-block:: python
+ Example:
- with pdf.open_metadata() as records:
- records['dc:title'] = 'New Title'
+ >>> with pdf.open_metadata() as records:
+ records['dc:title'] = 'New Title'
See Also:
:meth:`pikepdf.Pdf.open_metadata`
@@ -289,7 +284,9 @@ class PdfMetadata(MutableMapping):
continue
self[qname] = val
except (ValueError, AttributeError) as e:
- msg = "The metadata field {} could not be copied to XMP".format(docinfo_name)
+ msg = "The metadata field {} could not be copied to XMP".format(
+ docinfo_name
+ )
if raise_failure:
raise ValueError(msg) from e
else:
@@ -314,6 +311,15 @@ class PdfMetadata(MutableMapping):
pis = self._xmp.xpath('/processing-instruction()')
for pi in pis:
etree.strip_tags(self._xmp, pi.tag)
+ try:
+ self._get_rdf_root()
+ except ValueError:
+ if self._xmp.find('.', self.NS).tag == '{adobe:ns:meta/}xmpmeta':
+ # Looks like: <x:xmpmeta></x:xmpmeta>, so reload with template
+ # that includes <rdf:RDF>
+ return self._load_from(XMP_EMPTY)
+ else:
+ raise # Probably not XMP
@ensure_loaded
def __enter__(self):
@@ -347,7 +353,11 @@ class PdfMetadata(MutableMapping):
try:
value = converter.docinfo_from_xmp(value)
except ValueError:
- warn("The DocumentInfo field {} could not be updated from XMP".format(docinfo_name))
+ warn(
+ "The DocumentInfo field {} could not be updated from XMP".format(
+ docinfo_name
+ )
+ )
value = None
if value is None:
if docinfo_name in self._pdf.docinfo:
@@ -562,19 +572,19 @@ class PdfMetadata(MutableMapping):
val = AltList([clean(val)])
if isinstance(val, (list, set)):
rdfdesc = etree.SubElement(
- rdf, QName(XMP_NS_RDF, 'Description'),
- attrib={
- QName(XMP_NS_RDF, 'about'): '',
- },
+ rdf,
+ QName(XMP_NS_RDF, 'Description'),
+ attrib={QName(XMP_NS_RDF, 'about'): ''},
)
node = etree.SubElement(rdfdesc, self._qname(key))
add_array(node, val)
elif isinstance(val, str):
rdfdesc = etree.SubElement(
- rdf, QName(XMP_NS_RDF, 'Description'),
+ rdf,
+ QName(XMP_NS_RDF, 'Description'),
attrib={
QName(XMP_NS_RDF, 'about'): '',
- self._qname(key): clean(val)
+ self._qname(key): clean(val),
},
)
else:
@@ -588,7 +598,11 @@ class PdfMetadata(MutableMapping):
node, attrib, _oldval, parent = next(self._get_elements(key))
if attrib: # Inline
del node.attrib[attrib]
- if len(node.attrib) == 1 and len(node) == 0 and QName(XMP_NS_RDF, 'about') in node.attrib:
+ if (
+ len(node.attrib) == 1
+ and len(node) == 0
+ and QName(XMP_NS_RDF, 'about') in node.attrib
+ ):
# The only thing left on this node is rdf:about="", so remove it
parent.remove(node)
else:
diff --git a/src/pikepdf/objects.py b/src/pikepdf/objects.py
index a888b97..2e42eb9 100644
--- a/src/pikepdf/objects.py
+++ b/src/pikepdf/objects.py
@@ -8,24 +8,33 @@
The purpose of these is to provide nice-looking classes to allow explicit
construction of PDF objects and more pythonic idioms and facilitate discovery
-by documentation generators.
+by documentation generators and linters.
It's also a place to narrow the scope of input types to those more easily
converted to C++.
-In reality all of these return objects of class pikepdf.Object or rather
-QPDFObjectHandle which is a generic type.
-
+There is some deliberate "smoke and mirrors" here: all of the objects are truly
+instances of ``pikepdf.Object``, which is a variant container object. The
+``__new__`` constructs a ``pikepdf.Object`` in each case, and the rest of the
+class definition is present as an aid for code introspection.
"""
from . import _qpdf
-from ._qpdf import Object, ObjectType
-# pylint: disable=unused-import
-from ._qpdf import Operator
+# pylint: disable=unused-import, abstract-method
+from ._qpdf import Object, ObjectType, Operator
+
+# By default pikepdf.Object will identify itself as pikepdf._qpdf.Object
+# Here we change the module to discourage people from using that internal name
+# Instead it will become pikepdf.objects.Object
+Object.__module__ = __name__
+ObjectType.__module__ = __name__
+Operator.__module__ = __name__
-class _ObjectMeta(type):
+
+# type(Object) is the metaclass that pybind11 defines; we wish to extend that
+class _ObjectMeta(type(Object)):
"""Supports instance checking"""
def __instancecheck__(cls, instance):
@@ -38,9 +47,13 @@ class _NameObjectMeta(_ObjectMeta):
"""Supports usage pikepdf.Name.Whatever -> Name('/Whatever')"""
def __getattr__(self, attr):
+ if attr.startswith('_'):
+ return _ObjectMeta.__getattr__(attr)
return Name('/' + attr)
- def __setattr__(self, name, value):
+ def __setattr__(self, attr, value):
+ if attr.startswith('_'):
+ return _ObjectMeta.__setattr__(attr, value)
raise TypeError("Attributes may not be set on pikepdf.Name")
def __getitem__(self, item):
@@ -56,7 +69,7 @@ class _NameObjectMeta(_ObjectMeta):
)
-class Name(metaclass=_NameObjectMeta):
+class Name(Object, metaclass=_NameObjectMeta):
"""Constructs a PDF Name object
Names can be constructed with two notations:
@@ -69,6 +82,7 @@ class Name(metaclass=_NameObjectMeta):
that are normally expected to be in a PDF. The latter is preferred for
dynamic names and attributes.
"""
+
object_type = ObjectType.name
def __new__(cls, name):
@@ -79,8 +93,9 @@ class Name(metaclass=_NameObjectMeta):
return _qpdf._new_name(name)
-class String(metaclass=_ObjectMeta):
+class String(Object, metaclass=_ObjectMeta):
"""Constructs a PDF String object"""
+
object_type = ObjectType.string
def __new__(cls, s):
@@ -97,8 +112,9 @@ class String(metaclass=_ObjectMeta):
return _qpdf._new_string_utf8(s)
-class Array(metaclass=_ObjectMeta):
+class Array(Object, metaclass=_ObjectMeta):
"""Constructs a PDF Array object"""
+
object_type = ObjectType.array
def __new__(cls, a=None):
@@ -118,8 +134,9 @@ class Array(metaclass=_ObjectMeta):
return _qpdf._new_array(a)
-class Dictionary(metaclass=_ObjectMeta):
+class Dictionary(Object, metaclass=_ObjectMeta):
"""Constructs a PDF Dictionary object"""
+
object_type = ObjectType.dictionary
def __new__(cls, d=None, **kwargs):
@@ -147,15 +164,15 @@ class Dictionary(metaclass=_ObjectMeta):
if kwargs:
# Add leading slash
# Allows Dictionary(MediaBox=(0,0,1,1), Type=Name('/Page')...
- return _qpdf._new_dictionary(
- {('/' + k) : v for k, v in kwargs.items()})
+ return _qpdf._new_dictionary({('/' + k): v for k, v in kwargs.items()})
if not d:
d = {}
return _qpdf._new_dictionary(d)
-class Stream(metaclass=_ObjectMeta):
+class Stream(Object, metaclass=_ObjectMeta):
"""Constructs a PDF Stream object"""
+
object_type = ObjectType.stream
def __new__(cls, owner, obj):
diff --git a/src/qpdf/annotation.cpp b/src/qpdf/annotation.cpp
new file mode 100644
index 0000000..f82ebdf
--- /dev/null
+++ b/src/qpdf/annotation.cpp
@@ -0,0 +1,52 @@
+/*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Copyright (C) 2019, James R. Barlow (https://github.com/jbarlow83/)
+ */
+
+
+
+#include <qpdf/Constants.h>
+#include <qpdf/Types.h>
+#include <qpdf/DLL.h>
+#include <qpdf/QPDFExc.hh>
+#include <qpdf/PointerHolder.hh>
+#include <qpdf/QPDFAnnotationObjectHelper.hh>
+
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "pikepdf.h"
+
+
+void init_annotation(py::module &m)
+{
+ py::class_<QPDFAnnotationObjectHelper>(m, "Annotation")
+ .def(py::init<QPDFObjectHandle &>(), py::keep_alive<0, 1>())
+ .def_property_readonly("subtype", &QPDFAnnotationObjectHelper::getSubtype)
+ .def_property_readonly("flags", &QPDFAnnotationObjectHelper::getFlags)
+ .def_property_readonly("appearance_state", &QPDFAnnotationObjectHelper::getAppearanceState)
+ .def_property_readonly("appearance_dict", &QPDFAnnotationObjectHelper::getAppearanceDictionary)
+ .def("get_appearance_stream",
+ [](QPDFAnnotationObjectHelper& anno, QPDFObjectHandle& which, std::string const& state = "") {
+ // if (!which.isName())
+ // throw py::type_error("which must be pikepdf.Name");
+ return anno.getAppearanceStream(which.getName(), state);
+ },
+ py::arg("which"),
+ py::arg("state") = ""
+ )
+ .def("get_page_content_for_appearance",
+ [](QPDFAnnotationObjectHelper& anno, QPDFObjectHandle& name, int rotate, int required_flags, int forbidden_flags) {
+ //auto name = name_.getName();
+ return anno.getPageContentForAppearance(name.getName(), rotate, required_flags, forbidden_flags);
+ },
+ py::arg("name"),
+ py::arg("rotate"),
+ py::arg("required_flags") = 0,
+ py::arg("forbidden_flags") = an_invisible | an_hidden
+ )
+ ;
+}
diff --git a/src/qpdf/object.cpp b/src/qpdf/object.cpp
index 392d9ff..1270961 100644
--- a/src/qpdf/object.cpp
+++ b/src/qpdf/object.cpp
@@ -363,7 +363,8 @@ void init_object(py::module& m)
[](QPDFObjectHandle &h, std::shared_ptr<QPDF> possible_owner) {
return (h.getOwningQPDF() == possible_owner.get());
},
- "Test if this object is owned by the indicated *possible_owner*."
+ "Test if this object is owned by the indicated *possible_owner*.",
+ py::arg("possible_owner")
)
.def_property_readonly("is_indirect", &QPDFObjectHandle::isIndirect)
.def("__repr__", &objecthandle_repr)
@@ -555,9 +556,9 @@ void init_object(py::module& m)
}
return py::cast(value);
},
- "for dictionary objects, behave as dict.get(key, default=None)",
+ "For ``pikepdf.Dictionary`` objects, behave as ``dict.get(key, default=None)``",
py::arg("key"),
- py::arg("default_") = py::none(),
+ py::arg("default") = py::none(),
py::return_value_policy::reference_internal
)
.def("get",
@@ -570,9 +571,9 @@ void init_object(py::module& m)
}
return py::cast(value);
},
- "for dictionary objects, behave as dict.get(key, default=None)",
+ "For ``pikepdf.Dictionary`` objects, behave as ``dict.get(key, default=None)``",
py::arg("key"),
- py::arg("default_") = py::none(),
+ py::arg("default") = py::none(),
py::return_value_policy::reference_internal
)
.def("keys", &QPDFObjectHandle::getKeys)
@@ -664,6 +665,12 @@ void init_object(py::module& m)
h.eraseItem(u_index);
}
)
+ .def("wrap_in_array",
+ [](QPDFObjectHandle &h) {
+ return h.wrapInArray();
+ },
+ "Return the object wrapped in an array if not already an array."
+ )
.def("get_stream_buffer",
[](QPDFObjectHandle &h) {
PointerHolder<Buffer> phbuf = h.getStreamData();
@@ -694,37 +701,17 @@ void init_object(py::module& m)
},
"Read the content stream associated with this object without decoding"
)
- .def("write",
- [](QPDFObjectHandle &h, py::bytes data, py::args args, py::kwargs kwargs) {
+ .def("_write",
+ [](QPDFObjectHandle &h, py::bytes data, py::object filter, py::object decode_parms) {
std::string sdata = data;
- QPDFObjectHandle filter = QPDFObjectHandle::newNull();
- QPDFObjectHandle decode_parms = QPDFObjectHandle::newNull();
- if (args.size() != 0)
- throw py::value_error("Too many positional arguments");
- if (kwargs.contains("filter"))
- filter = objecthandle_encode(kwargs["filter"]);
- if (kwargs.contains("decode_parms"))
- decode_parms = objecthandle_encode(kwargs["decode_parms"]);
- h.replaceStreamData(sdata, filter, decode_parms);
+ QPDFObjectHandle h_filter = objecthandle_encode(filter);
+ QPDFObjectHandle h_decode_parms = objecthandle_encode(decode_parms);
+ h.replaceStreamData(sdata, h_filter, h_decode_parms);
},
R"~~~(
- Replace the content stream with `data`, compressed according to `filter` and `decode_parms`
-
- :param data: the new data to use for replacement
- :type data: bytes
- :param filter: The filter(s) with which the data is (already) encoded
- :param decode_parms: Parameters for the filters with which the object is encode
-
- If only one `filter` is specified, it may be a name such as
- `Name('/FlateDecode')`. If there are multiple filters, then array
- of names should be given.
-
- If there is only one filter, `decode_parms` is a Dictionary of
- parameters for that filter. If there are multiple filters, then
- `decode_parms` is an Array of Dictionary, where each array index
- is corresponds to the filter.
-
- )~~~"
+ Low level write/replace stream data without argument checking. Use .write().
+ )~~~",
+ py::arg("data"), py::arg("filter"), py::arg("decode_parms")
)
.def_property_readonly("images",
[](QPDFObjectHandle &h) {
@@ -749,7 +736,16 @@ void init_object(py::module& m)
py::arg("prepend") = false,
py::keep_alive<1, 2>()
)
- .def("page_contents_coalesce", &QPDFObjectHandle::coalesceContentStreams)
+ .def("page_contents_coalesce", &QPDFObjectHandle::coalesceContentStreams,
+ R"~~~(
+ Coalesce an array of page content streams into a single content stream.
+
+ The PDF specification allows the ``/Contents`` object to contain either
+ an array of content streams or a single content stream. However, it
+ simplifies parsing and editing if there is only a single content stream.
+ This function merges all content streams.
+ )~~~"
+ )
.def_property_readonly("_objgen",
&object_get_objgen
)
@@ -811,6 +807,41 @@ void init_object(py::module& m)
py::arg("resolved") = false,
"Convert PDF objects into their binary representation, optionally resolving indirect objects."
)
+ .def("to_json",
+ [](QPDFObjectHandle &h, bool dereference = false) -> py::bytes {
+ return h.getJSON(dereference).unparse();
+ },
+ py::arg("dereference") = false,
+ R"~~~(
+ Convert to a QPDF JSON representation of the object.
+
+ See the QPDF manual for a description of its JSON representation.
+ http://qpdf.sourceforge.net/files/qpdf-manual.html#ref.json
+
+ Not necessarily compatible with other PDF-JSON representations that
+ exist in the wild.
+
+ * Names are encoded as UTF-8 strings
+ * Indirect references are encoded as strings containing ``obj gen R``
+ * Strings are encoded as UTF-8 strings with unrepresentable binary
+ characters encoded as ``\uHHHH``
+ * Encoding streams just encodes the stream's dictionary; the stream
+ data is not represented
+ * Object types that are only valid in content streams (inline
+ image, operator) as well as "reserved" objects are not
+ representable and will be serialized as ``null``.
+
+ Args:
+ dereference (bool): If True, dereference the object if this is an
+ indirect object.
+
+ Returns:
+ bytes: JSON bytestring of object. The object is UTF-8 encoded
+ and may be decoded to a Python str that represents the binary
+ values ``\x00-\xFF`` as ``U+0000`` to ``U+00FF``; that is,
+ it may contain mojibake.
+ )~~~"
+ )
; // end of QPDFObjectHandle bindings
m.def("_new_boolean", &QPDFObjectHandle::newBool, "Construct a PDF Boolean object");
@@ -900,7 +931,8 @@ void init_object(py::module& m)
[](const std::string& op) {
return QPDFObjectHandle::newOperator(op);
},
- "Construct a PDF Operator object for use in content streams"
+ "Construct a PDF Operator object for use in content streams.",
+ py::arg("op")
);
m.def("_Null", &QPDFObjectHandle::newNull,
"Construct a PDF Null object"
diff --git a/src/qpdf/pikepdf.cpp b/src/qpdf/pikepdf.cpp
new file mode 100644
index 0000000..2daa69a
--- /dev/null
+++ b/src/qpdf/pikepdf.cpp
@@ -0,0 +1,98 @@
+/*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Copyright (C) 2019, James R. Barlow (https://github.com/jbarlow83/)
+ */
+
+#include <sstream>
+#include <type_traits>
+#include <cerrno>
+#include <cstring>
+
+#include "pikepdf.h"
+
+#include <qpdf/QPDFExc.hh>
+#include <qpdf/QPDFSystemError.hh>
+#include <qpdf/QUtil.hh>
+
+#include <pybind11/stl.h>
+#include <pybind11/iostream.h>
+#include <pybind11/buffer_info.h>
+
+#include "qpdf_pagelist.h"
+#include "utils.h"
+
+
+extern "C" const char* qpdf_get_qpdf_version();
+
+
+/*
+ * RAII guard that sets errno to `val` for the lifetime of the object and
+ * puts back the value errno had at construction time when it is destroyed.
+ */
+class TemporaryErrnoChange {
+public:
+    // explicit: a plain int must never silently convert into a guard
+    explicit TemporaryErrnoChange(int val) : stored(errno) {
+        errno = val;
+    }
+    ~TemporaryErrnoChange() {
+        errno = stored;
+    }
+    // Non-copyable: a copy would restore the saved errno a second time,
+    // possibly clobbering a value set after the first guard was destroyed.
+    TemporaryErrnoChange(const TemporaryErrnoChange&) = delete;
+    TemporaryErrnoChange& operator=(const TemporaryErrnoChange&) = delete;
+
+private:
+    int stored;  // errno as it was when the guard was constructed
+};
+
+
+// Entry point of the `_qpdf` extension module: sets the module docstring,
+// registers the bindings, the encoding helpers, the exception translator
+// and the `__version__` attribute.
+PYBIND11_MODULE(_qpdf, m) {
+ //py::options options;
+ //options.disable_function_signatures();
+
+ m.doc() = "pikepdf provides a Pythonic interface for QPDF";
+
+ m.def("qpdf_version", &qpdf_get_qpdf_version, "Get libqpdf version");
+
+ // Each init_* function is defined in another translation unit and adds
+ // its own classes/functions to this module.
+ init_qpdf(m);
+ init_pagelist(m);
+ init_object(m);
+ init_annotation(m);
+
+ // Returns a (success, bytes) tuple; `unknown` is forwarded to
+ // QUtil::utf8_to_pdf_doc as its third argument.
+ m.def("utf8_to_pdf_doc",
+ [](py::str utf8, char unknown) {
+ std::string pdfdoc;
+ bool success = QUtil::utf8_to_pdf_doc(std::string(utf8), pdfdoc, unknown);
+ return py::make_tuple(success, py::bytes(pdfdoc));
+ }
+ );
+ // Inverse direction: takes bytes, returns a Python str built from the
+ // result of QUtil::pdf_doc_to_utf8.
+ m.def("pdf_doc_to_utf8",
+ [](py::bytes pdfdoc) -> py::str {
+ return py::str(QUtil::pdf_doc_to_utf8(pdfdoc));
+ }
+ );
+
+ // Both Python exception types are registered for the C++ type QPDFExc;
+ // the translator below picks one based on the QPDF error code.
+ static py::exception<QPDFExc> exc_main(m, "PdfError");
+ static py::exception<QPDFExc> exc_password(m, "PasswordError");
+ py::register_exception_translator([](std::exception_ptr p) {
+ try {
+ if (p) std::rethrow_exception(p);
+ } catch (const QPDFExc &e) {
+ // Password errors get their own exception type; everything else
+ // from QPDF becomes PdfError.
+ if (e.getErrorCode() == qpdf_e_password) {
+ exc_password(e.what());
+ } else {
+ exc_main(e.what());
+ }
+ } catch (const QPDFSystemError &e) {
+ if (e.getErrno() != 0) {
+ // PyErr_SetFromErrnoWithFilename reads the global errno, so set it
+ // to the value carried by the exception for the duration of the
+ // call. The exception's description is passed in the filename slot.
+ TemporaryErrnoChange errno_holder(e.getErrno());
+ PyErr_SetFromErrnoWithFilename(PyExc_OSError, e.getDescription().c_str());
+ } else {
+ // No errno available: report it as a generic PdfError.
+ exc_main(e.what());
+ }
+ }
+ });
+
+
+ // VERSION_INFO is expected to come from the build; fall back to "dev".
+#ifdef VERSION_INFO
+ m.attr("__version__") = VERSION_INFO;
+#else
+ m.attr("__version__") = "dev";
+#endif
+}
diff --git a/src/qpdf/pikepdf.h b/src/qpdf/pikepdf.h
index 7fbd6e8..0acd807 100644
--- a/src/qpdf/pikepdf.h
+++ b/src/qpdf/pikepdf.h
@@ -40,9 +40,6 @@ namespace pybind11 { namespace detail {
};
}}
-#define CUSTOM_TYPE_CONVERSION 1
-#if CUSTOM_TYPE_CONVERSION
-
// From object_convert.cpp
pybind11::object decimal_from_pdfobject(QPDFObjectHandle h);
@@ -57,24 +54,9 @@ namespace pybind11 { namespace detail {
* Conversion part 1 (Python->C++): convert a PyObject into a Object
*/
bool load(handle src, bool convert) {
- // if (src.is_none()) {
- // if (!convert) return false;
- // value = QPDFObjectHandle::newNull();
- // return true;
- // }
- // Attempting to construct these does not work...
- // if (convert) {
- // if (PYBIND11_LONG_CHECK(src.ptr())) {
- // auto as_int = src.cast<long long>();
- // value = QPDFObjectHandle::newInteger(as_int);
- // } /*else if (PyFloat_Check(src.ptr())) {
- // auto as_double = src.cast<double>();
- // value = QPDFObjectHandle::newReal(as_double);
- // } */ else {
- // return base::load(src, convert);
- // }
- // return true;
- // }
+ // Do whatever our base does
+ // Potentially we could convert some scalars to QPDFObjectHandle here,
+ // but most of the interfaces just expect straight C++ types.
return base::load(src, convert);
}
@@ -157,7 +139,6 @@ namespace pybind11 { namespace detail {
}
};
}} // namespace pybind11::detail
-#endif
namespace py = pybind11;
@@ -166,6 +147,8 @@ PYBIND11_MAKE_OPAQUE(std::vector<QPDFObjectHandle>);
typedef std::map<std::string, QPDFObjectHandle> ObjectMap;
PYBIND11_MAKE_OPAQUE(ObjectMap);
+// From qpdf.cpp
+void init_qpdf(py::module& m);
// From object.cpp
size_t list_range_check(QPDFObjectHandle h, int index);
@@ -183,6 +166,9 @@ QPDFObjectHandle objecthandle_encode(const py::handle handle);
std::vector<QPDFObjectHandle> array_builder(const py::iterable iter);
std::map<std::string, QPDFObjectHandle> dict_builder(const py::dict dict);
+// From annotation.cpp
+void init_annotation(py::module &m);
+
// Support for recursion checks
class StackGuard
{
diff --git a/src/qpdf/qpdf.cpp b/src/qpdf/qpdf.cpp
index 5bb8ea9..0a5fc26 100644
--- a/src/qpdf/qpdf.cpp
+++ b/src/qpdf/qpdf.cpp
@@ -28,10 +28,9 @@
#include "qpdf_pagelist.h"
#include "qpdf_inputsource.h"
+#include "qpdf_pipeline.h"
#include "utils.h"
-extern "C" const char* qpdf_get_qpdf_version();
-
void check_stream_is_usable(py::object stream)
{
@@ -58,6 +57,7 @@ open_pdf(
q->setPasswordIsHexKey(hex_password);
q->setIgnoreXRefStreams(ignore_xref_streams);
q->setAttemptRecovery(attempt_recovery);
+ q->setImmediateCopyFrom(true);
if (py::hasattr(filename_or_stream, "read") && py::hasattr(filename_or_stream, "seek")) {
// Python code gave us an object with a stream interface
@@ -66,7 +66,7 @@ open_pdf(
check_stream_is_usable(stream);
// The PythonInputSource object will be owned by q
- InputSource* input_source = new PythonInputSource(stream);
+ auto input_source = PointerHolder<InputSource>(new PythonInputSource(stream));
py::gil_scoped_release release;
q->processInputSource(input_source, password.c_str());
} else {
@@ -80,7 +80,7 @@ open_pdf(
q->processFile(
description.c_str(),
file, // transferring ownership
- true, // QPDF will close the file
+ true, // QPDF will close the file (including if there are exceptions)
password.c_str()
);
file = nullptr; // QPDF owns the file and will close it
@@ -116,122 +116,320 @@ private:
};
+// Forward to the Python implementation of the same name in
+// pikepdf._cpphelpers, passing the Pdf and the version string.
+void update_xmp_pdfversion(QPDF &q, std::string version)
+{
+    py::object helpers = py::module::import("pikepdf._cpphelpers");
+    py::object pdf = py::cast(q);
+    helpers.attr("update_xmp_pdfversion")(pdf, version);
+}
+
+
+// Read the password stored under `key` in the encryption settings, if any.
+// For R <= 4 the UTF-8 password is transcoded to PDFDocEncoding (raising
+// ValueError when that is not possible); for newer revisions the UTF-8
+// bytes are used unchanged. `password` is left untouched when the key is
+// absent.
+static void load_encryption_password(
+    const py::object &encryption,
+    const char *key,
+    int encryption_level,
+    std::string &password
+)
+{
+    if (!encryption.contains(key))
+        return;
+    auto utf8 = encryption[key].cast<std::string>();
+    if (encryption_level <= 4) {
+        auto success = QUtil::utf8_to_pdf_doc(utf8, password);
+        if (!success)
+            throw py::value_error("Encryption level is R3/R4 and password is not encodable as PDFDocEncoding");
+    } else {
+        password = utf8;
+    }
+}
+
+
+/*
+ * Configure encryption on a QPDFWriter from a Python settings object.
+ *
+ * `encryption` is queried with `in` / `[]` for the optional keys "R",
+ * "owner", "user", "allow", "aes" and "metadata". "R" defaults to 6; "aes"
+ * and "metadata" default to true when R >= 4 and false otherwise. When
+ * "allow" is absent every permission is treated as false.
+ *
+ * `owner` and `user` receive the encoded passwords; they are caller-owned
+ * (presumably so the strings outlive this call - confirm against QPDFWriter).
+ *
+ * Raises TypeError / ValueError (as py::type_error / py::value_error) for
+ * wrongly typed or inconsistent settings.
+ */
+void setup_encryption(
+    QPDFWriter &w,
+    py::object encryption,
+    std::string &owner,
+    std::string &user
+)
+{
+    bool aes = true;
+    bool metadata = true;
+    std::map<std::string, bool> allow;
+    int encryption_level = 6;
+
+    if (encryption.contains("R")) {
+        if (!py::isinstance<py::int_>(encryption["R"]))
+            throw py::type_error("Encryption level 'R' must be an integer");
+        encryption_level = py::int_(encryption["R"]);
+    }
+    if (encryption_level < 2 || encryption_level > 6)
+        throw py::value_error("Invalid encryption level: must be 2, 3, 4 or 6");
+
+    if (encryption_level == 5) {
+        auto warn = py::module::import("warnings").attr("warn");
+        warn("Encryption R=5 is deprecated");
+    }
+
+    load_encryption_password(encryption, "owner", encryption_level, owner);
+    load_encryption_password(encryption, "user", encryption_level, user);
+
+    if (encryption.contains("allow")) {
+        auto pyallow = encryption["allow"];
+        allow["accessibility"] = pyallow.attr("accessibility").cast<bool>();
+        allow["extract"] = pyallow.attr("extract").cast<bool>();
+        allow["modify_assembly"] = pyallow.attr("modify_assembly").cast<bool>();
+        allow["modify_annotation"] = pyallow.attr("modify_annotation").cast<bool>();
+        allow["modify_form"] = pyallow.attr("modify_form").cast<bool>();
+        allow["modify_other"] = pyallow.attr("modify_other").cast<bool>();
+        allow["print_lowres"] = pyallow.attr("print_lowres").cast<bool>();
+        allow["print_highres"] = pyallow.attr("print_highres").cast<bool>();
+    }
+    if (encryption.contains("aes")) {
+        if (py::isinstance<py::bool_>(encryption["aes"]))
+            aes = py::bool_(encryption["aes"]);
+        else
+            throw py::type_error("aes must be bool");
+    } else {
+        aes = (encryption_level >= 4);
+    }
+    if (encryption.contains("metadata")) {
+        if (py::isinstance<py::bool_>(encryption["metadata"]))
+            metadata = py::bool_(encryption["metadata"]);
+        else
+            throw py::type_error("metadata must be bool");
+    } else {
+        metadata = (encryption_level >= 4);
+    }
+
+    // Reject combinations the selected revision cannot express
+    if (metadata && encryption_level < 4) {
+        throw py::value_error("Cannot encrypt metadata when R < 4");
+    }
+    if (aes && encryption_level < 4) {
+        throw py::value_error("Cannot encrypt with AES when R < 4");
+    }
+    if (encryption_level == 6 && !aes) {
+        throw py::value_error("When R = 6, AES encryption must be enabled");
+    }
+    if (metadata && !aes) {
+        throw py::value_error("Cannot encrypt metadata unless AES encryption is enabled");
+    }
+
+    // Collapse the two print flags into QPDF's three-state print setting;
+    // high resolution takes precedence over low resolution.
+    qpdf_r3_print_e print;
+    if (allow["print_highres"])
+        print = qpdf_r3p_full;
+    else if (allow["print_lowres"])
+        print = qpdf_r3p_low;
+    else
+        print = qpdf_r3p_none;
+
+    if (encryption_level == 6) {
+        w.setR6EncryptionParameters(
+            user.c_str(), owner.c_str(),
+            allow["accessibility"],
+            allow["extract"],
+            allow["modify_assembly"],
+            allow["modify_annotation"],
+            allow["modify_form"],
+            allow["modify_other"],
+            print,
+            metadata
+        );
+    } else if (encryption_level == 5) {
+        // Deprecated revision; the Python warning was already issued above
+        w.setR5EncryptionParameters(
+            user.c_str(), owner.c_str(),
+            allow["accessibility"],
+            allow["extract"],
+            allow["modify_assembly"],
+            allow["modify_annotation"],
+            allow["modify_form"],
+            allow["modify_other"],
+            print,
+            metadata
+        );
+    } else if (encryption_level == 4) {
+        w.setR4EncryptionParameters(
+            user.c_str(), owner.c_str(),
+            allow["accessibility"],
+            allow["extract"],
+            allow["modify_assembly"],
+            allow["modify_annotation"],
+            allow["modify_form"],
+            allow["modify_other"],
+            print,
+            metadata,
+            aes
+        );
+    } else if (encryption_level == 3) {
+        w.setR3EncryptionParameters(
+            user.c_str(), owner.c_str(),
+            allow["accessibility"],
+            allow["extract"],
+            allow["modify_assembly"],
+            allow["modify_annotation"],
+            allow["modify_form"],
+            allow["modify_other"],
+            print
+        );
+    } else if (encryption_level == 2) {
+        // R2 only has four coarse permissions; printing is allowed when
+        // either print flag was set.
+        w.setR2EncryptionParameters(
+            user.c_str(), owner.c_str(),
+            (print != qpdf_r3p_none),
+            allow["modify_assembly"],
+            allow["extract"],
+            allow["modify_annotation"]
+        );
+    }
+}
+
+
+typedef std::pair<std::string, int> pdf_version_extension;
+
+// Convert a Python value into a (version, extension level) pair.
+//
+// Accepts either a str (extension level defaults to 0) or a (str, int)
+// tuple. Anything else raises TypeError.
+pdf_version_extension get_version_extension(py::object ver_ext)
+{
+    try {
+        return pdf_version_extension(ver_ext.cast<std::string>(), 0);
+    } catch (const py::cast_error &) {
+        // Not a plain string; try the (str, int) form below
+    }
+
+    try {
+        return ver_ext.cast<pdf_version_extension>();
+    } catch (const py::cast_error &) {
+        throw py::type_error("PDF version must be a tuple: (str, int)");
+    }
+}
+
+
+/*
+ * Helper class to ensure streams we open get closed by destructor.
+ *
+ * Holds None until set() is given an object; on destruction, calls that
+ * object's close() method. Non-copyable and non-movable so close() runs
+ * exactly once.
+ */
+class Closer
+{
+public:
+    Closer() : monitored(py::none()) {}
+    ~Closer() {
+        if (this->monitored.is_none())
+            return;
+        try {
+            this->monitored.attr("close")();
+        } catch (py::error_already_set &e) {
+            // Destructors are implicitly noexcept, so letting a Python
+            // exception from close() escape would call std::terminate.
+            // Hand it to Python's unraisable hook instead, which reports
+            // and clears it.
+            e.restore();
+            PyErr_WriteUnraisable(this->monitored.ptr());
+        }
+    }
+    // Start monitoring `monitored`; replaces any previously set object
+    // without closing it.
+    void set(py::object monitored) {
+        this->monitored = monitored;
+    }
+    Closer(const Closer& other) = delete;
+    Closer(Closer&& other) = delete;
+    Closer& operator= (const Closer& other) = delete;
+    Closer& operator= (Closer&& other) = delete;
+
+private:
+    py::object monitored;  // object to close, or None
+};
+
void save_pdf(
QPDF& q,
py::object filename_or_stream,
bool static_id=false,
bool preserve_pdfa=true,
- std::string min_version="",
- std::string force_version="",
+ py::object min_version=py::none(),
+ py::object force_version=py::none(),
+ bool fix_metadata_version=true,
bool compress_streams=true,
- qpdf_stream_decode_level_e stream_decode_level=qpdf_dl_generalized,
+ py::object stream_decode_level=py::none(),
qpdf_object_stream_e object_stream_mode=qpdf_o_preserve,
bool normalize_content=false,
bool linearize=false,
bool qdf=false,
- py::object progress=py::none())
+ py::object progress=py::none(),
+ py::object encryption=py::none())
{
+ std::string owner;
+ std::string user;
+ std::string description;
QPDFWriter w(q);
- // Parameters
if (static_id) {
w.setStaticID(true);
}
w.setNewlineBeforeEndstream(preserve_pdfa);
- if (!min_version.empty()) {
- w.setMinimumPDFVersion(min_version, 0);
- }
- if (!force_version.empty()) {
- w.forcePDFVersion(force_version, 0);
+
+ if (!min_version.is_none()) {
+ auto version_ext = get_version_extension(min_version);
+ w.setMinimumPDFVersion(version_ext.first, version_ext.second);
}
w.setCompressStreams(compress_streams);
- w.setDecodeLevel(stream_decode_level);
+ if (!stream_decode_level.is_none()) {
+ // Unconditionally calling setDecodeLevel has side effects, disabling
+ // preserve encryption in particular
+ w.setDecodeLevel(stream_decode_level.cast<qpdf_stream_decode_level_e>());
+ }
w.setObjectStreamMode(object_stream_mode);
- if (normalize_content && linearize) {
- throw py::value_error("cannot save with both normalize_content and linearize");
- }
- w.setContentNormalization(normalize_content);
- w.setLinearization(linearize);
- w.setQDFMode(qdf);
-
- if (!progress.is_none()) {
- auto reporter = PointerHolder<QPDFWriter::ProgressReporter>(new PikeProgressReporter(progress));
- w.registerProgressReporter(reporter);
- }
+ py::object stream;
+ Closer stream_closer;
if (py::hasattr(filename_or_stream, "write") && py::hasattr(filename_or_stream, "seek")) {
// Python code gave us an object with a stream interface
- py::object stream = filename_or_stream;
+ stream = filename_or_stream;
check_stream_is_usable(stream);
+ description = py::repr(stream);
+ } else {
+ py::object filename = fspath(filename_or_stream);
+ py::object ospath = py::module::import("os").attr("path");
+ py::object samefile = ospath.attr("samefile");
+ py::object exists = ospath.attr("exists");
+ if (exists(filename).cast<bool>() && samefile(filename, q.getFilename()).cast<bool>()) {
+ throw py::value_error("Cannot overwrite input file");
+ }
+ stream = py::module::import("io").attr("open")(filename, "wb");
+ stream_closer.set(stream);
+ description = py::str(filename);
+ }
- // TODO might be able to improve this by streaming rather than buffering
- // using subclass of Pipeline that routes calls to Python.
- w.setOutputMemory();
+ // We must set up the output pipeline before we configure encryption
+ Pl_PythonOutput output_pipe(description.c_str(), stream);
+ w.setOutputPipeline(&output_pipe);
- // It would be kind to release the GIL here, but this is not possible if
- // another thread has an object and tries to mess with it. Correctness
- // is more important than performance.
- w.write();
+ if (encryption.is(py::bool_(true)) && !q.isEncrypted()) {
+ throw py::value_error("can't perserve encryption parameters on a file with no encryption");
+ }
- // But now that we've held the GIL forever, we can release it and take
- // it back again; at least in theory giving other threads a chance to
- // to do something.
- {
- py::gil_scoped_release release;
- }
+ if (
+ (encryption.is(py::bool_(true)) || py::isinstance<py::dict>(encryption))
+ && (normalize_content || !stream_decode_level.is_none())
+ ) {
+ throw py::value_error("cannot save with encryption and normalize_content or stream_decode_level");
+ }
- // getBuffer returns Buffer* and qpdf says we are responsible for
- // deleting it, so capture it in a unique_ptr
- std::unique_ptr<Buffer> output_buffer(w.getBuffer());
-
- // Create a memoryview of the buffer that libqpdf created
- // Awkward API alert:
- // QPDFWriter::getBuffer -> Buffer* (caller frees memory)
- // and Buffer::getBuffer -> unsigned char* (caller does not own memory)
- py::buffer_info output_buffer_info(
- output_buffer->getBuffer(),
- output_buffer->getSize());
- py::memoryview view_output_buffer(output_buffer_info);
-
- // Send it to the stream object (probably copying)
- stream.attr("write")(view_output_buffer);
+ if (encryption.is(py::bool_(true))) {
+ w.setPreserveEncryption(true); // Keep existing encryption
+ } else if (encryption.is_none() || encryption.is(py::bool_(false))) {
+ w.setPreserveEncryption(false); // Remove encryption
} else {
- py::object filename = filename_or_stream;
- std::string description = py::str(filename);
- // Delete the intended filename, in case it is the same as the input file.
- // This ensures that the input file will continue to exist in memory on Linux.
- portable_unlink(filename);
- FILE* file = portable_fopen(filename, "wb");
- w.setOutputFile(description.c_str(), file, true);
- w.write();
- file = nullptr; // QPDF will close it
+ setup_encryption(w, encryption, owner, user);
}
-}
+ if (normalize_content && linearize) {
+ throw py::value_error("cannot save with both normalize_content and linearize");
+ }
+ w.setContentNormalization(normalize_content);
+ w.setLinearization(linearize);
+ w.setQDFMode(qdf);
-PYBIND11_MODULE(_qpdf, m) {
- //py::options options;
- //options.disable_function_signatures();
+ if (!force_version.is_none()) {
+ auto version_ext = get_version_extension(force_version);
+ w.forcePDFVersion(version_ext.first, version_ext.second);
+ }
+ if (fix_metadata_version) {
+ update_xmp_pdfversion(q, w.getFinalVersion());
+ }
- m.doc() = "pikepdf provides a Pythonic interface for QPDF";
+ if (!progress.is_none()) {
+ auto reporter = PointerHolder<QPDFWriter::ProgressReporter>(new PikeProgressReporter(progress));
+ w.registerProgressReporter(reporter);
+ }
- m.def("qpdf_version", &qpdf_get_qpdf_version, "Get libqpdf version");
+ w.write();
+}
- static py::exception<QPDFExc> exc_main(m, "PdfError");
- static py::exception<QPDFExc> exc_password(m, "PasswordError");
- py::register_exception_translator([](std::exception_ptr p) {
- try {
- if (p) std::rethrow_exception(p);
- } catch (const QPDFExc &e) {
- if (e.getErrorCode() == qpdf_e_password) {
- exc_password(e.what());
- } else {
- exc_main(e.what());
- }
- }
- });
+void init_qpdf(py::module &m)
+{
py::enum_<qpdf_object_stream_e>(m, "ObjectStreamMode")
.value("disable", qpdf_object_stream_e::qpdf_o_disable)
.value("preserve", qpdf_object_stream_e::qpdf_o_preserve)
@@ -243,7 +441,12 @@ PYBIND11_MODULE(_qpdf, m) {
.value("specialized", qpdf_stream_decode_level_e::qpdf_dl_specialized)
.value("all", qpdf_stream_decode_level_e::qpdf_dl_all);
- init_pagelist(m);
+ py::enum_<QPDF::encryption_method_e>(m, "EncryptionMethod")
+ .value("none", QPDF::encryption_method_e::e_none)
+ .value("unknown", QPDF::encryption_method_e::e_unknown)
+ .value("rc4", QPDF::encryption_method_e::e_rc4)
+ .value("aes", QPDF::encryption_method_e::e_aes)
+ .value("aesv3", QPDF::encryption_method_e::e_aesv3);
py::class_<QPDF, std::shared_ptr<QPDF>>(m, "Pdf", "In-memory representation of a PDF")
.def_static("new",
@@ -253,20 +456,31 @@ PYBIND11_MODULE(_qpdf, m) {
q->setSuppressWarnings(true);
return q;
},
- "create a new empty PDF from stratch"
+ "Create a new empty PDF from scratch."
)
.def_static("open", open_pdf,
R"~~~(
- Open an existing file at `filename_or_stream`.
+ Open an existing file at *filename_or_stream*.
- If `filename_or_stream` is path-like, the file will be opened. The
- file should not be modified by another process while it is open in
- pikepdf.
+ If *filename_or_stream* is path-like, the file will be opened for reading.
+ The file should not be modified by another process while it is open in
+ pikepdf. The file will not be altered when opened in this way. Any changes
+ to the file must be persisted by using ``.save()``.
- If `filename_or_stream` has `.read()` and `.seek()` methods, the file
+ If *filename_or_stream* has ``.read()`` and ``.seek()`` methods, the file
will be accessed as a readable binary stream. pikepdf will read the
entire stream into a private buffer.
+ ``.open()`` may be used in a ``with``-block, ``.close()`` will be called when
+ the block exits.
+
+ Examples:
+
+ >>> with Pdf.open("test.pdf") as pdf:
+ ...
+
+ >>> pdf = Pdf.open("test.pdf", password="rosebud")
+
Args:
filename_or_stream (os.PathLike): Filename of PDF to open
password (str or bytes): User or owner password to open an
@@ -278,7 +492,8 @@ PYBIND11_MODULE(_qpdf, m) {
ignore_xref_streams (bool): If True, ignore cross-reference
streams. See qpdf documentation.
suppress_warnings (bool): If True (default), warnings are not
- printed to stderr. Use `get_warnings()` to retrieve warnings.
+ printed to stderr. Use :meth:`pikepdf.Pdf.get_warnings()` to
+ retrieve warnings.
attempt_recovery (bool): If True (default), attempt to recover
from PDF parsing errors.
inherit_page_attributes (bool): If True (default), push attributes
@@ -289,7 +504,7 @@ PYBIND11_MODULE(_qpdf, m) {
file.
pikepdf.PdfError: If for other reasons we could not open
the file.
- TypeError: If the type of `filename_or_stream` is not
+ TypeError: If the type of ``filename_or_stream`` is not
usable.
FileNotFoundError: If the file was not found.
)~~~",
@@ -307,15 +522,15 @@ PYBIND11_MODULE(_qpdf, m) {
}
)
.def_property_readonly("filename", &QPDF::getFilename,
- "the source filename of an existing PDF, when available")
+ "The source filename of an existing PDF, when available.")
.def_property_readonly("pdf_version", &QPDF::getPDFVersion,
- "the PDF standard version, such as '1.7'")
+ "The PDF standard version, such as '1.7'.")
.def_property_readonly("extension_level", &QPDF::getExtensionLevel)
.def_property_readonly("Root", &QPDF::getRoot,
- "the /Root object of the PDF"
+ "The /Root object of the PDF."
)
.def_property_readonly("root", &QPDF::getRoot,
- "alias for .Root, the /Root object of the PDF"
+ "Alias for .Root, the /Root object of the PDF."
)
.def_property("docinfo",
[](QPDF& q) {
@@ -330,7 +545,16 @@ PYBIND11_MODULE(_qpdf, m) {
throw py::value_error("docinfo must be an indirect object - use Pdf.make_indirect");
q.getTrailer().replaceKey("/Info", replace);
},
- "access the document information dictionary"
+ R"~~~(
+ Access the (deprecated) document information dictionary.
+
+ The document information dictionary is a brief metadata record
+ that can store some information about the origin of a PDF. It is
+ deprecated and removed in the PDF 2.0 specification. Use the
+ ``.open_metadata()`` API instead, which will edit the modern (and
+ unfortunately, more complicated) XMP metadata object and synchronize
+ changes to the document information dictionary.
+ )~~~"
)
.def_property_readonly("trailer", &QPDF::getTrailer,
R"~~~(
@@ -394,8 +618,9 @@ PYBIND11_MODULE(_qpdf, m) {
The page can be either be a newly constructed PDF object or it can
be obtained from another PDF.
- :param pikepdf.Object page: The page object to attach
- :param bool first: If True, prepend this before the first page; if False append after last page
+ Args:
+ page (pikepdf.Object): The page object to attach
+ first (bool): If True, prepend this before the first page; if False append after last page
)~~~",
py::arg("page"),
py::arg("first")=false,
@@ -423,11 +648,14 @@ PYBIND11_MODULE(_qpdf, m) {
.def("save",
save_pdf,
R"~~~(
- Save all modifications to this :class:`pikepdf.Pdf`
+ Save all modifications to this :class:`pikepdf.Pdf`.
Args:
filename (str or stream): Where to write the output. If a file
- exists in this location it will be overwritten.
+ exists in this location it will be overwritten. The file
+ should not be the same as the input file, because data from
+ the input file may be lazily loaded; as such overwriting
+ in place will null-out objects.
static_id (bool): Indicates that the ``/ID`` metadata, normally
calculated as a hash of certain PDF contents and metadata
@@ -437,12 +665,20 @@ PYBIND11_MODULE(_qpdf, m) {
manner compliant with PDF/A and other stricter variants.
This should be True, the default, in most cases.
- min_version (str): Sets the minimum version of PDF
+ min_version (str or tuple): Sets the minimum version of PDF
specification that should be required. If left alone QPDF
- will decide.
- force_version (str): Override the version recommend by QPDF,
+ will decide. If a tuple, the second element is an integer, the
+ extension level.
+ force_version (str or tuple): Override the version recommend by QPDF,
potentially creating an invalid file that does not display
- in old versions. See QPDF manual for details.
+ in old versions. See QPDF manual for details. If a tuple, the
+ second element is an integer, the extension level.
+ fix_metadata_version (bool): If True (default) and the XMP metadata
+ contains the optional PDF version field, ensure the version in
+ metadata is correct. If the XMP metadata does not contain a PDF
+ version field, none will be added. To ensure that the field is
+ added, edit the metadata and insert a placeholder value in
+ ``pdf:PDFVersion``.
object_stream_mode (pikepdf.ObjectStreamMode):
``disable`` prevents the use of object streams.
@@ -472,10 +708,24 @@ PYBIND11_MODULE(_qpdf, m) {
the program ``fix-qdf`` to fix convert back to a standard
PDF.
+ progress (callable): Specify a callback function that is called
+ as the PDF is written. The function will be called with an
+ integer between 0-100 as the sole parameter, the progress
+ percentage. This function may not access or modify the PDF
+ while it is being written, or data corruption will almost
+ certainly occur.
+
+ encryption (pikepdf.models.Encryption or bool): If ``False``
+ or omitted, existing encryption will be removed. If ``True``
+ encryption settings are copied from the originating PDF.
+ Alternately, an ``Encryption`` object may be provided that
+ sets the parameters for new encryption.
+
You may call ``.save()`` multiple times with different parameters
to generate different versions of a file, and you *may* continue
to modify the file after saving it. ``.save()`` does not modify
- the ``Pdf`` object in memory.
+ the ``Pdf`` object in memory, except possibly by updating the XMP
+ metadata version with ``fix_metadata_version``.
.. note::
@@ -491,13 +741,15 @@ PYBIND11_MODULE(_qpdf, m) {
py::arg("preserve_pdfa")=true,
py::arg("min_version")="",
py::arg("force_version")="",
+ py::arg("fix_metadata_version")=true,
py::arg("compress_streams")=true,
- py::arg("stream_decode_level")=qpdf_stream_decode_level_e::qpdf_dl_generalized,
+ py::arg("stream_decode_level")=py::none(),
py::arg("object_stream_mode")=qpdf_object_stream_e::qpdf_o_preserve,
py::arg("normalize_content")=false,
py::arg("linearize")=false,
py::arg("qdf")=false,
- py::arg("progress")=py::none()
+ py::arg("progress")=py::none(),
+ py::arg("encryption")=py::none()
)
.def("_get_object_id", &QPDF::getObjectByID)
.def("get_object",
@@ -510,7 +762,8 @@ PYBIND11_MODULE(_qpdf, m) {
Returns:
pikepdf.Object
)~~~",
- py::return_value_policy::reference_internal
+ py::return_value_policy::reference_internal,
+ py::arg("objgen")
)
.def("get_object",
[](QPDF &q, int objid, int gen) {
@@ -522,7 +775,9 @@ PYBIND11_MODULE(_qpdf, m) {
Returns:
pikepdf.Object
)~~~",
- py::return_value_policy::reference_internal
+ py::return_value_policy::reference_internal,
+ py::arg("objid"),
+ py::arg("gen")
)
.def("make_indirect", &QPDF::makeIndirectObject,
R"~~~(
@@ -544,7 +799,8 @@ PYBIND11_MODULE(_qpdf, m) {
Returns:
pikepdf.Object
- )~~~"
+ )~~~",
+ py::arg("h")
)
.def("make_indirect",
[](QPDF &q, py::object obj) -> QPDFObjectHandle {
@@ -555,7 +811,8 @@ PYBIND11_MODULE(_qpdf, m) {
Returns:
pikepdf.Object
- )~~~"
+ )~~~",
+ py::arg("obj")
)
.def("copy_foreign",
[](QPDF &q, QPDFObjectHandle &h) -> QPDFObjectHandle {
@@ -563,20 +820,106 @@ PYBIND11_MODULE(_qpdf, m) {
},
"Copy object from foreign PDF to this one.",
py::return_value_policy::reference_internal,
- py::keep_alive<1, 2>()
+ py::keep_alive<1, 2>(),
+ py::arg("h")
)
.def("_replace_object",
[](QPDF &q, int objid, int gen, QPDFObjectHandle &h) {
q.replaceObject(objid, gen, h);
}
)
- ; // class Pdf
-
- init_object(m);
+ .def("_swap_objects",
+ [](QPDF &q, std::pair<int, int> objgen1, std::pair<int, int> objgen2) {
+ QPDFObjGen o1(objgen1.first, objgen1.second);
+ QPDFObjGen o2(objgen2.first, objgen2.second);
+ q.swapObjects(o1, o2);
+ }
+ )
+ .def("_process",
+ [](QPDF &q, std::string description, py::bytes data) {
+ std::string s = data;
+ q.processMemoryFile(
+ description.c_str(),
+ s.data(),
+ s.size()
+ );
+ },
+ R"~~~(
+ Process a new in-memory PDF, replacing the existing PDF
-#ifdef VERSION_INFO
- m.attr("__version__") = VERSION_INFO;
-#else
- m.attr("__version__") = "dev";
-#endif
+ Used to implement Pdf.close().
+ )~~~"
+ )
+ .def_property_readonly("_allow_accessibility",
+ [](QPDF &q) {
+ return q.allowAccessibility();
+ }
+ )
+ .def_property_readonly("_allow_extract",
+ [](QPDF &q) {
+ return q.allowExtractAll();
+ }
+ )
+ .def_property_readonly("_allow_print_lowres",
+ [](QPDF &q) {
+ return q.allowPrintLowRes();
+ }
+ )
+ .def_property_readonly("_allow_print_highres",
+ [](QPDF &q) {
+ return q.allowPrintHighRes();
+ }
+ )
+ .def_property_readonly("_allow_modify_assembly",
+ [](QPDF &q) {
+ return q.allowModifyAssembly();
+ }
+ )
+ .def_property_readonly("_allow_modify_form",
+ [](QPDF &q) {
+ return q.allowModifyForm();
+ }
+ )
+ .def_property_readonly("_allow_modify_annotation",
+ [](QPDF &q) {
+ return q.allowModifyAnnotation();
+ }
+ )
+ .def_property_readonly("_allow_modify_other",
+ [](QPDF &q) {
+ return q.allowModifyOther();
+ }
+ )
+ .def_property_readonly("_allow_modify_all",
+ [](QPDF &q) {
+ return q.allowModifyAll();
+ }
+ )
+ .def_property_readonly("_encryption_data",
+ [](QPDF &q) {
+ int R = 0;
+ int P = 0;
+ int V = 0;
+ QPDF::encryption_method_e stream_method = QPDF::e_unknown;
+ QPDF::encryption_method_e string_method = QPDF::e_unknown;
+ QPDF::encryption_method_e file_method = QPDF::e_unknown;
+ if (!q.isEncrypted(R, P, V, stream_method, string_method, file_method))
+ return py::dict();
+
+ auto user_passwd = q.getTrimmedUserPassword();
+ auto encryption_key = q.getEncryptionKey();
+
+ return py::dict(
+ py::arg("R") = R,
+ py::arg("P") = P,
+ py::arg("V") = V,
+ py::arg("stream") = stream_method,
+ py::arg("string") = string_method,
+ py::arg("file") = file_method,
+ py::arg("user_passwd") = py::bytes(user_passwd),
+ py::arg("encryption_key") = py::bytes(encryption_key)
+ );
+ }
+ )
+ ; // class Pdf
}
diff --git a/src/qpdf/qpdf_inputsource.h b/src/qpdf/qpdf_inputsource.h
index dc26267..b29b309 100644
--- a/src/qpdf/qpdf_inputsource.h
+++ b/src/qpdf/qpdf_inputsource.h
@@ -17,7 +17,7 @@
#include <qpdf/Buffer.hh>
#include <qpdf/QPDF.hh>
#include <qpdf/InputSource.hh>
-
+#include <qpdf/QUtil.hh>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
diff --git a/src/qpdf/qpdf_pagelist.cpp b/src/qpdf/qpdf_pagelist.cpp
index d8222dd..07c496d 100644
--- a/src/qpdf/qpdf_pagelist.cpp
+++ b/src/qpdf/qpdf_pagelist.cpp
@@ -121,18 +121,6 @@ void PageList::set_pages_from_iterable(py::slice slice, py::iterable other)
void PageList::delete_page(size_t index)
{
auto page = this->get_page(index);
- /*
- // Need a dec_ref to match the inc_ref in insert_page, but it's unclear
- // how to do that. The item will be set the current QPDF always.
- // Accessing data from another PDF seems to involve some pipeline
- // magic in QPDF around libqpdf/QPDFWriter.cc:1614
- if (original page owner != &this->getQPDF()) {
- // If we are removing a page not originally owned by our QPDF,
- // remove the reference count we put it in insert_page()
- py::object pyqpdf = py::cast(page_owner);
- pyqpdf.dec_ref();
- }
- */
this->qpdf->removePage(page);
}
@@ -175,24 +163,6 @@ void PageList::insert_page(size_t index, QPDFObjectHandle page)
// qpdf does not accept duplicating pages within the same file,
// so manually create a copy
page = this->qpdf->makeIndirectObject(page);
- } else {
- // libqpdf does not transfer a page's contents to the new QPDF.
- // Instead WHEN ASKED TO WRITE it will go back and get the data
- // from objecthandle->getOwningQPDF(). Therefore we must ensure
- // our previous owner is kept alive.
-#if 1
- auto tinfo = py::detail::get_type_info(typeid(QPDF));
- py::handle pyqpdf = py::detail::get_object_handle(page_owner, tinfo);
- py::handle pypage = py::cast(page);
- py::detail::keep_alive_impl(pypage, pyqpdf);
-#else
- // MSVC++ complains about the symbol
- // QPDF::Members::~Members() not being exported when this version
- // is used, but it works for GCC and Clang.
- py::handle pyqpdf = py::cast(page_owner);
- py::handle pypage = py::cast(page);
- py::detail::keep_alive_impl(pypage, pyqpdf);
-#endif
}
if (index != this->count()) {
QPDFObjectHandle refpage = this->get_page(index);
@@ -202,7 +172,6 @@ void PageList::insert_page(size_t index, QPDFObjectHandle page)
}
}
-
void init_pagelist(py::module &m)
{
py::class_<PageList>(m, "PageList")
@@ -229,12 +198,13 @@ void init_pagelist(py::module &m)
.def("__delitem__", &PageList::delete_pages_from_iterable)
.def("__len__", &PageList::count)
.def("p",
- [](PageList &pl, size_t index) {
- if (index == 0) // Indexing past end is checked in .get_page
+ [](PageList &pl, size_t pnum) {
+ if (pnum == 0) // Indexing past end is checked in .get_page
throw py::index_error("page access out of range in 1-based indexing");
- return pl.get_page(index - 1);
+ return pl.get_page(pnum - 1);
},
- "convenience - look up page number in ordinal numbering, .p(1) is first page"
+ "Convenience - look up page number in ordinal numbering, ``.p(1)`` is first page",
+ py::arg("pnum")
)
.def("__iter__",
[](PageList &pl) {
@@ -252,7 +222,16 @@ void init_pagelist(py::module &m)
[](PageList &pl, ssize_t index, py::object obj) {
size_t uindex = uindex_from_index(pl, index);
pl.insert_page(uindex, obj);
- }, py::keep_alive<1, 3>()
+ }, py::keep_alive<1, 3>(),
+ R"~~~(
+ Insert a page at the specified location.
+
+ Args:
+ index (int): location at which to insert page, 0-based indexing
+ obj (pikepdf.Object): page object to insert
+ )~~~",
+ py::arg("index"),
+ py::arg("obj")
)
.def("reverse",
[](PageList &pl) {
@@ -262,13 +241,16 @@ void init_pagelist(py::module &m)
PySlice_New(Py_None, Py_None, step.ptr()));
py::list reversed_pages = pl.get_pages(reversed);
pl.set_pages_from_iterable(ordinary_indices, reversed_pages);
- }
+ },
+ "Reverse the order of pages."
)
.def("append",
[](PageList &pl, py::object page) {
pl.insert_page(pl.count(), page);
},
- py::keep_alive<1, 2>()
+ py::keep_alive<1, 2>(),
+ "Add another page to the end.",
+ py::arg("page")
)
.def("extend",
[](PageList &pl, PageList &other) {
@@ -279,7 +261,9 @@ void init_pagelist(py::module &m)
pl.insert_page(pl.count(), other.get_page(i));
}
},
- py::keep_alive<1, 2>()
+ py::keep_alive<1, 2>(),
+ "Extend the ``Pdf`` by adding pages from another ``Pdf.pages``.",
+ py::arg("other")
)
.def("extend",
[](PageList &pl, py::iterable iterable) {
@@ -290,6 +274,29 @@ void init_pagelist(py::module &m)
++it;
}
},
- py::keep_alive<1, 2>()
+ py::keep_alive<1, 2>(),
+ "Extend the ``Pdf`` by adding pages from an iterable of pages.",
+ py::arg("iterable")
+ )
+ .def("remove",
+ [](PageList &pl, py::kwargs kwargs) {
+ auto pnum = kwargs["p"].cast<size_t>();
+ if (pnum == 0) // Indexing past end is checked in .get_page
+ throw py::index_error("page access out of range in 1-based indexing");
+ pl.delete_page(pnum - 1);
+ },
+ R"~~~(
+ Remove a page (using 1-based numbering)
+
+ Args:
+ p (int): 1-based page number
+ )~~~"
+ )
+ .def("__repr__",
+ [](PageList &pl) {
+ return std::string("<pikepdf._qpdf.PageList len=")
+ + std::to_string(pl.count())
+ + std::string(">");
+ }
);
}
diff --git a/src/qpdf/qpdf_pipeline.h b/src/qpdf/qpdf_pipeline.h
new file mode 100644
index 0000000..f922827
--- /dev/null
+++ b/src/qpdf/qpdf_pipeline.h
@@ -0,0 +1,77 @@
+/*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Copyright (C) 2017, James R. Barlow (https://github.com/jbarlow83/)
+ */
+
+#include <cstdio>
+#include <cstring>
+
+#include <qpdf/Constants.h>
+#include <qpdf/Types.h>
+#include <qpdf/DLL.h>
+#include <qpdf/QPDFExc.hh>
+#include <qpdf/PointerHolder.hh>
+#include <qpdf/Buffer.hh>
+#include <qpdf/QPDF.hh>
+#include <qpdf/Pipeline.hh>
+#include <qpdf/QUtil.hh>
+
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "pikepdf.h"
+
+
+class Pl_PythonOutput : public Pipeline
+{
+public:
+ Pl_PythonOutput(const char *identifier, py::object stream) :
+ Pipeline(identifier, nullptr),
+ stream(stream)
+ {
+ }
+
+ virtual ~Pl_PythonOutput() = default;
+ Pl_PythonOutput(const Pl_PythonOutput&) = delete;
+ Pl_PythonOutput& operator= (const Pl_PythonOutput&) = delete;
+ Pl_PythonOutput(Pl_PythonOutput&&) = delete;
+ Pl_PythonOutput& operator= (Pl_PythonOutput&&) = delete;
+
+ void write(unsigned char *buf, size_t len)
+ {
+ py::gil_scoped_acquire gil;
+ size_t so_far = 0;
+ while (len > 0) {
+ py::buffer_info buffer(buf, len);
+ py::memoryview view_buffer(buffer);
+ py::object result = this->stream.attr("write")(view_buffer);
+ try {
+ so_far = result.cast<size_t>();
+ } catch (const py::cast_error &e) {
+ throw py::type_error("Unexpected return type of write()");
+ }
+ if (so_far == 0) {
+ QUtil::throw_system_error(this->identifier);
+ } else {
+ buf += so_far;
+ len -= so_far;
+ }
+ }
+ }
+
+ void finish()
+ {
+ py::gil_scoped_acquire gil;
+ try {
+ this->stream.attr("flush")();
+ } catch (const py::attr_error &e) {
+ // Suppress
+ }
+ }
+
+private:
+ py::object stream;
+};
diff --git a/tests/conftest.py b/tests/conftest.py
index 8a67e83..8887415 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,12 +1,8 @@
import os
import sys
-import platform
-
-pytest_plugins = ['helpers_namespace']
+from pathlib import Path
import pytest
-from pathlib import Path
-from subprocess import Popen, PIPE
if sys.version_info < (3, 4):
@@ -24,5 +20,10 @@ def resources():
@pytest.fixture(scope="function")
-def outdir(tmpdir):
- return Path(str(tmpdir))
+def outdir(tmp_path):
+ return tmp_path
+
+
+@pytest.fixture(scope="function")
+def outpdf(tmp_path):
+ return tmp_path / 'out.pdf'
diff --git a/tests/resources/1biticc.pdf b/tests/resources/1biticc.pdf
new file mode 100644
index 0000000..b33b4bb
--- /dev/null
+++ b/tests/resources/1biticc.pdf
Binary files differ
diff --git a/tests/resources/graph-encrypted.pdf b/tests/resources/graph-encrypted.pdf
index 6e086af..6c9741e 100644
--- a/tests/resources/graph-encrypted.pdf
+++ b/tests/resources/graph-encrypted.pdf
Binary files differ
diff --git a/tests/resources/pike-flate-jp2.pdf b/tests/resources/pike-flate-jp2.pdf
new file mode 100644
index 0000000..c074e69
--- /dev/null
+++ b/tests/resources/pike-flate-jp2.pdf
Binary files differ
diff --git a/tests/test_codec.py b/tests/test_codec.py
new file mode 100644
index 0000000..c101d8c
--- /dev/null
+++ b/tests/test_codec.py
@@ -0,0 +1,16 @@
+import pytest
+
+import pikepdf.codec
+
+
+def test_encode():
+ assert 'abc'.encode('pdfdoc') == b'abc'
+ with pytest.raises(ValueError):
+ '你好'.encode('pdfdoc')
+ assert '你好 world'.encode('pdfdoc', 'replace') == b'?? world'
+ assert '你好 world'.encode('pdfdoc', 'ignore') == b' world'
+
+
+def test_decode():
+ assert b'A'.decode('pdfdoc') == 'A'
+ assert b'\xa0'.decode('pdfdoc') == '€'
diff --git a/tests/test_dictionary.py b/tests/test_dictionary.py
index 5341968..78a558a 100644
--- a/tests/test_dictionary.py
+++ b/tests/test_dictionary.py
@@ -1,6 +1,10 @@
-from pikepdf import Pdf
import pytest
+from pikepdf import Pdf
+
+
+# pylint: disable=redefined-outer-name,pointless-statement,expression-not-assigned
+
@pytest.fixture
def congress(resources):
diff --git a/tests/test_encrypt.py b/tests/test_encrypt.py
new file mode 100644
index 0000000..50b4e8d
--- /dev/null
+++ b/tests/test_encrypt.py
@@ -0,0 +1,128 @@
+import pytest
+
+import pikepdf
+
+
+@pytest.fixture
+def trivial(resources):
+ return pikepdf.open(resources / 'pal-1bit-trivial.pdf')
+
+
+@pytest.fixture
+def graph_encrypted(resources):
+ return pikepdf.open(resources / 'graph-encrypted.pdf', password='owner')
+
+
+@pytest.mark.parametrize(
+ "R,owner,user",
+ [
+ (6, "foo", "bar"),
+ (4, "password", "password"),
+ (3, "12345678", "secret"),
+ (2, "qwerty", "123456"),
+ ],
+)
+def test_encrypt_basic(trivial, outpdf, R, owner, user):
+ trivial.save(outpdf, encryption=dict(R=R, owner=owner, user=user))
+ pdf_owner = pikepdf.open(outpdf, password=owner)
+ assert pdf_owner.is_encrypted
+ pdf_user = pikepdf.open(outpdf, password=user)
+ assert pdf_user.is_encrypted
+
+
+def test_encrypt_R5(trivial, outpdf):
+ with pytest.warns(UserWarning):
+ trivial.save(outpdf, encryption=dict(R=5, owner='foo', user='foo'))
+
+
+@pytest.mark.parametrize("R", [-1, 0, 1, 7, 9, 42])
+def test_encrypt_invalid_level_value(trivial, outpdf, R):
+ with pytest.raises(ValueError):
+ trivial.save(outpdf, encryption=dict(R=R, owner='foo', user='foo'))
+
+
+@pytest.mark.parametrize("R", [3.14, '6', b'6', None])
+def test_encrypt_invalid_level(trivial, outpdf, R):
+ with pytest.raises(TypeError):
+ trivial.save(outpdf, encryption=dict(R=R, owner='foo', user='foo'))
+
+
+def test_encrypt_without_owner(trivial, outpdf):
+ trivial.save(outpdf, encryption=dict(user='foo'))
+
+
+def test_encrypt_no_passwords(trivial, outpdf):
+ trivial.save(outpdf, encryption=dict(R=6))
+
+
+def test_encrypt_permissions_deny(trivial, outpdf):
+ perms = pikepdf.models.Permissions(extract=False)
+ trivial.save(
+ outpdf, encryption=pikepdf.Encryption(owner='sun', user='moon', allow=perms)
+ )
+ pdf = pikepdf.open(outpdf, password='sun')
+ assert not pdf.allow.extract
+ assert pdf.allow.modify_form
+
+
+def test_encrypt_info(trivial, outpdf):
+ trivial.save(outpdf, encryption=dict(R=4, owner='foo', user='bar'))
+ pdf = pikepdf.open(outpdf, password='foo')
+ assert pdf.encryption.user_password == b'bar'
+ assert pdf.encryption.bits == 128
+
+
+@pytest.mark.parametrize(
+ "R,owner,user,aes,metadata,err",
+ [
+ (6, "foo", "bar", 42, False, r"aes.*bool"),
+ (6, "password", "password", True, 42, r"metadata.*bool"),
+ (3, "12345678", "secret", False, True, r"metadata.*R < 4"),
+ (2, "qwerty", "123456", True, False, r"AES.*R < 4"),
+ (6, "rc4", "rc4", False, True, r"R = 6.*AES"),
+ (4, "met", "met", False, True, r"unless AES"),
+ ],
+)
+def test_bad_settings(trivial, outpdf, R, owner, user, aes, metadata, err):
+ with pytest.raises(Exception, match=err):
+ trivial.save(
+ outpdf,
+ encryption=pikepdf.Encryption(
+ R=R, owner=owner, user=user, aes=aes, metadata=metadata
+ ),
+ )
+
+
+def test_block_encryption_and_normalize(trivial, outpdf):
+ with pytest.raises(ValueError, match=r'encryption and normalize_content'):
+ trivial.save(
+ outpdf,
+ encryption=pikepdf.Encryption(owner='foo', user='bar'),
+ normalize_content=True,
+ )
+
+
+def test_consistency_saving_removes_encryption(graph_encrypted, outpdf):
+ # This was not intended behavior. It's a side effect of unconditionally calling
+ # w.setDecodeLevel(), which disables preserving encryption in
+ # QPDFWriter::doWriteSetup()
+ graph_encrypted.save(outpdf)
+ with pikepdf.open(outpdf) as pdf:
+ assert not pdf.is_encrypted
+
+
+def test_save_without_encryption(graph_encrypted, outpdf):
+ graph_encrypted.save(outpdf, encryption=False)
+ with pikepdf.open(outpdf) as pdf:
+ assert not pdf.is_encrypted
+
+
+def test_save_preserve_encryption(graph_encrypted, outpdf):
+ graph_encrypted.save(outpdf, encryption=True)
+ with pikepdf.open(outpdf, 'owner') as pdf:
+ assert pdf.is_encrypted
+
+
+def test_preserve_encryption_not_encrypted(trivial, outpdf):
+ with pytest.raises(ValueError):
+ trivial.save(outpdf, encryption=True)
diff --git a/tests/test_formxobject.py b/tests/test_formxobject.py
index f402d76..7e252c9 100644
--- a/tests/test_formxobject.py
+++ b/tests/test_formxobject.py
@@ -1,5 +1,6 @@
import pytest
-from pikepdf import Pdf, Object, Stream, Name, Dictionary
+
+from pikepdf import Dictionary, Name, Object, Pdf, Stream
# pylint: disable=e1137
@@ -8,7 +9,8 @@ def test_create_form_xobjects(outdir):
pdf = Pdf.new()
font = pdf.make_indirect(
- Object.parse(b"""
+ Object.parse(
+ b"""
<<
/Type /Font
/Subtype /Type1
@@ -16,14 +18,16 @@ def test_create_form_xobjects(outdir):
/BaseFont /Helvetica
/Encoding /WinAnsiEncoding
>>
- """)
+ """
+ )
)
width, height = 100, 100
image_data = b"\xff\x7f\x00" * (width * height)
image = Stream(pdf, image_data)
- image.stream_dict = Object.parse("""
+ image.stream_dict = Object.parse(
+ """
<<
/Type /XObject
/Subtype /Image
@@ -32,15 +36,17 @@ def test_create_form_xobjects(outdir):
/Width 100
/Height 100
>>
- """)
+ """
+ )
xobj_image = Dictionary({'/Im1': image})
- form_xobj_res = Dictionary({
- '/XObject': xobj_image
- })
- form_xobj = Stream(pdf, b"""
+ form_xobj_res = Dictionary({'/XObject': xobj_image})
+ form_xobj = Stream(
+ pdf,
+ b"""
/Im1 Do
- """)
+ """,
+ )
form_xobj['/Type'] = Name('/XObject')
form_xobj['/Subtype'] = Name('/Form')
form_xobj['/FormType'] = 1
@@ -50,10 +56,7 @@ def test_create_form_xobjects(outdir):
rfont = {'/F1': font}
- resources = {
- '/Font': rfont,
- '/XObject': {'/Form1': form_xobj},
- }
+ resources = {'/Font': rfont, '/XObject': {'/Form1': form_xobj}}
mediabox = [0, 0, 612, 792]
@@ -65,12 +68,14 @@ def test_create_form_xobjects(outdir):
contents = Stream(pdf, stream)
- page = pdf.make_indirect({
- '/Type': Name('/Page'),
- '/MediaBox': mediabox,
- '/Contents': contents,
- '/Resources': resources
- })
+ page = pdf.make_indirect(
+ {
+ '/Type': Name('/Page'),
+ '/MediaBox': mediabox,
+ '/Contents': contents,
+ '/Resources': resources,
+ }
+ )
pdf.pages.append(page)
pdf.save(outdir / 'formxobj.pdf')
diff --git a/tests/test_image_access.py b/tests/test_image_access.py
index 05fa010..113a5ef 100644
--- a/tests/test_image_access.py
+++ b/tests/test_image_access.py
@@ -1,16 +1,28 @@
-import pytest
-import imghdr
-from io import BytesIO
-from PIL import Image, features as PIL_features
import zlib
+from io import BytesIO
+from pathlib import Path
-# pylint: disable=w0621
-
+import pytest
+from PIL import Image
+from PIL import features as PIL_features
from pikepdf import (
- Pdf, PdfImage, PdfError, Name,
- parse_content_stream, PdfInlineImage, Stream, StreamDecodeLevel
+ Array,
+ Dictionary,
+ Name,
+ Pdf,
+ PdfError,
+ PdfImage,
+ PdfInlineImage,
+ Stream,
+ StreamDecodeLevel,
+ parse_content_stream,
)
+from pikepdf._cpphelpers import fspath
+from pikepdf.models.image import UnsupportedImageTypeError
+
+
+# pylint: disable=redefined-outer-name
def first_image_in(filename):
@@ -55,10 +67,7 @@ def test_image_replace(congress, outdir):
grayscale = pillowimage.convert('L')
grayscale = grayscale.resize((4, 4)) # So it is not obnoxious on error
- congress[0].write(
- zlib.compress(grayscale.tobytes()),
- filter=Name("/FlateDecode")
- )
+ congress[0].write(zlib.compress(grayscale.tobytes()), filter=Name("/FlateDecode"))
congress[0].ColorSpace = Name("/DeviceGray")
pdf = congress[1]
pdf.save(outdir / 'congress_gray.pdf')
@@ -69,7 +78,8 @@ def test_lowlevel_jpeg(congress):
with pytest.raises(PdfError):
congress[0].read_bytes()
- assert imghdr.what('', h=raw_bytes) == 'jpeg'
+ im = Image.open(BytesIO(raw_bytes))
+ assert im.format == 'JPEG'
pim = PdfImage(congress[0])
b = BytesIO()
@@ -89,8 +99,7 @@ def test_lowlevel_replace_jpeg(congress, outdir):
grayscale = grayscale.resize((4, 4)) # So it is not obnoxious on error
congress[0].write(
- zlib.compress(grayscale.tobytes()[:10]),
- filter=Name("/FlateDecode")
+ zlib.compress(grayscale.tobytes()[:10]), filter=Name("/FlateDecode")
)
congress[0].ColorSpace = Name('/DeviceGray')
@@ -121,11 +130,14 @@ def test_bits_per_component_missing(congress):
assert PdfImage(congress[0]).bits_per_component == 8
-@pytest.mark.parametrize('w,h,pixeldata,cs,bpc', [
- (1, 1, b'\xff', '/DeviceGray', 1),
- (1, 1, b'\xf0', '/DeviceGray', 8),
- (1, 1, b'\xff\x00\xff', '/DeviceRGB', 8)
-])
+@pytest.mark.parametrize(
+ 'w,h,pixeldata,cs,bpc',
+ [
+ (1, 1, b'\xff', '/DeviceGray', 1),
+ (1, 1, b'\xf0', '/DeviceGray', 8),
+ (1, 1, b'\xff\x00\xff', '/DeviceRGB', 8),
+ ],
+)
def test_image_roundtrip(outdir, w, h, pixeldata, cs, bpc):
pdf = Pdf.new()
@@ -149,16 +161,15 @@ def test_image_roundtrip(outdir, w, h, pixeldata, cs, bpc):
'/Type': Name('/Page'),
'/MediaBox': mediabox,
'/Contents': contents,
- '/Resources': resources
+ '/Resources': resources,
}
page = pdf.make_indirect(page_dict)
pdf.pages.append(page)
- outfile = outdir / 'test{w}{h}{cs}{bpc}.pdf'.format(
- w=w, h=h, cs=cs[1:], bpc=bpc
+ outfile = outdir / 'test{w}{h}{cs}{bpc}.pdf'.format(w=w, h=h, cs=cs[1:], bpc=bpc)
+ pdf.save(
+ outfile, compress_streams=False, stream_decode_level=StreamDecodeLevel.none
)
- pdf.save(outfile, compress_streams=False,
- stream_decode_level=StreamDecodeLevel.none)
p2 = pdf.open(outfile)
pim = PdfImage(p2.pages[0].Resources.XObject['/Im1'])
@@ -185,16 +196,17 @@ def test_image_roundtrip(outdir, w, h, pixeldata, cs, bpc):
assert pim.mode == im.mode
-@pytest.mark.parametrize('filename,bpc,filters,ext,mode,format',
+@pytest.mark.parametrize(
+ 'filename,bpc,filters,ext,mode,format_',
[
('sandwich.pdf', 1, ['/CCITTFaxDecode'], '.tif', '1', 'TIFF'),
('congress-gray.pdf', 8, ['/DCTDecode'], '.jpg', 'L', 'JPEG'),
('congress.pdf', 8, ['/DCTDecode'], '.jpg', 'RGB', 'JPEG'),
- ('cmyk-jpeg.pdf', 8, ['/DCTDecode'], '.jpg', 'CMYK', 'JPEG')
- ]
+ ('cmyk-jpeg.pdf', 8, ['/DCTDecode'], '.jpg', 'CMYK', 'JPEG'),
+ ],
)
-def test_direct_extract(resources, filename, bpc, filters, ext, mode, format):
- xobj, pdf = first_image_in(resources / filename)
+def test_direct_extract(resources, filename, bpc, filters, ext, mode, format_):
+ xobj, _pdf = first_image_in(resources / filename)
pim = PdfImage(xobj)
assert pim.bits_per_component == bpc
@@ -207,14 +219,19 @@ def test_direct_extract(resources, filename, bpc, filters, ext, mode, format):
im = Image.open(outstream)
assert im.mode == mode
- assert im.format == format
+ assert im.format == format_
-@pytest.mark.parametrize('filename,bpc', [
- ('pal.pdf', 8),
- ('pal-1bit-trivial.pdf', 1),
- pytest.param('pal-1bit-rgb.pdf', 1, marks=pytest.mark.xfail(raises=NotImplementedError)),
-])
+@pytest.mark.parametrize(
+ 'filename,bpc',
+ [
+ ('pal.pdf', 8),
+ ('pal-1bit-trivial.pdf', 1),
+ pytest.param(
+ 'pal-1bit-rgb.pdf', 1, marks=pytest.mark.xfail(raises=NotImplementedError)
+ ),
+ ],
+)
def test_image_palette(resources, filename, bpc):
pdf = Pdf.open(resources / filename)
pim = PdfImage(next(iter(pdf.pages[0].images.values())))
@@ -234,8 +251,9 @@ def test_bool_in_inline_image():
assert piim.image_mask
-@pytest.mark.skipif(not PIL_features.check_codec('jpg_2000'),
- reason='no JPEG2000 codec')
+@pytest.mark.skipif(
+ not PIL_features.check_codec('jpg_2000'), reason='no JPEG2000 codec'
+)
def test_jp2(resources):
pdf = Pdf.open(resources / 'pike-jp2.pdf')
xobj = next(iter(pdf.pages[0].images.values()))
@@ -258,3 +276,52 @@ def test_jp2(resources):
pim = PdfImage(xobj)
assert pim.colorspace == '/DeviceRGB'
assert pim.bits_per_component == 8
+
+
+def test_extract_filepath(congress, outdir):
+ xobj, _pdf = congress
+ pim = PdfImage(xobj)
+
+ # fspath is for Python 3.5
+ result = pim.extract_to(fileprefix=fspath(outdir / 'image'))
+ assert Path(result).exists()
+ assert (outdir / 'image.jpg').exists()
+
+
+def test_extract_direct_fails_nondefault_colortransform(congress):
+ xobj, _pdf = congress
+
+ xobj.DecodeParms = Dictionary(
+ ColorTransform=42 # Non standard (or allowed in the spec)
+ )
+ pim = PdfImage(xobj)
+
+ bio = BytesIO()
+ with pytest.raises(UnsupportedImageTypeError):
+ pim._extract_direct(stream=bio)
+
+ xobj.ColorSpace = Name.DeviceCMYK
+ pim = PdfImage(xobj)
+ with pytest.raises(UnsupportedImageTypeError):
+ pim._extract_direct(stream=bio)
+
+
+def test_icc_use(resources):
+ xobj, _pdf = first_image_in(resources / '1biticc.pdf')
+
+ pim = PdfImage(xobj)
+ assert pim.mode == '1'
+ assert pim.colorspace == '/ICCBased'
+ assert pim.bits_per_component == 1
+
+ assert pim.icc.profile.xcolor_space == 'GRAY'
+
+
+def test_stacked_compression(resources):
+ xobj, _pdf = first_image_in(resources / 'pike-flate-jp2.pdf')
+
+ pim = PdfImage(xobj)
+ assert pim.mode == 'RGB'
+ assert pim.colorspace == '/DeviceRGB'
+ assert pim.bits_per_component == 8
+ assert pim.filters == ['/FlateDecode', '/JPXDecode']
diff --git a/tests/test_io.py b/tests/test_io.py
new file mode 100644
index 0000000..4ce8eb5
--- /dev/null
+++ b/tests/test_io.py
@@ -0,0 +1,26 @@
+import pytest
+
+from pikepdf import Pdf
+from io import BytesIO
+
+
+@pytest.fixture
+def sandwich(resources):
+ # Has XMP, docinfo, <?adobe-xap-filters esc="CRLF"?>, shorthand attribute XMP
+ return Pdf.open(resources / 'sandwich.pdf')
+
+
+class LimitedBytesIO(BytesIO):
+ """Version of BytesIO that only accepts small reads/writes"""
+
+ def write(self, b):
+ amt = min(len(b), 100)
+ return super().write(b[:amt])
+
+
+def test_weird_output_stream(sandwich):
+ bio = BytesIO()
+ lbio = LimitedBytesIO()
+ sandwich.save(bio, static_id=True)
+ sandwich.save(lbio, static_id=True)
+ assert bio.getvalue() == lbio.getvalue()
diff --git a/tests/test_ipython.py b/tests/test_ipython.py
index 4f616c8..36e2e4b 100644
--- a/tests/test_ipython.py
+++ b/tests/test_ipython.py
@@ -2,9 +2,10 @@
Test IPython/Jupyter display hooks
"""
-import pikepdf
import pytest
+import pikepdf
+
@pytest.fixture
def graph(resources):
diff --git a/tests/test_metadata.py b/tests/test_metadata.py
index 41a879c..3de8ccf 100644
--- a/tests/test_metadata.py
+++ b/tests/test_metadata.py
@@ -1,28 +1,36 @@
-from pathlib import Path
-from datetime import datetime, timezone, timedelta
import re
+from datetime import datetime, timedelta, timezone
+import os
+from pathlib import Path
+import xml.etree.ElementTree as ET
import pytest
from hypothesis import given, example
from hypothesis.strategies import integers
+
import pikepdf
-from pikepdf import Pdf, Dictionary, Name, PasswordError, Stream
+from pikepdf import Dictionary, Name, PasswordError, Pdf, Stream
from pikepdf.models.metadata import (
- decode_pdf_date, encode_pdf_date,
- XMP_NS_DC, XMP_NS_PDF, XMP_NS_XMP,
- DateConverter
+ XMP_NS_DC,
+ XMP_NS_PDF,
+ XMP_NS_XMP,
+ DateConverter,
+ decode_pdf_date,
+ encode_pdf_date,
)
-import defusedxml.ElementTree as ET
-
try:
- from libxmp import XMPMeta
-except ImportError:
- XMPMeta = None
+ from libxmp import XMPMeta, XMPError
+except Exception:
+ XMPMeta, XMPError = None, None
+
+needs_libxmp = pytest.mark.skipif(
+ os.name == 'nt' or not XMPMeta, reason="test requires libxmp"
+)
pytestmark = pytest.mark.filterwarnings('ignore:.*XMLParser.*:DeprecationWarning')
-# pylint: disable=w0621
+# pylint: disable=redefined-outer-name,pointless-statement
@pytest.fixture
@@ -58,7 +66,10 @@ def invalid_creationdate(resources):
def test_lowlevel(sandwich):
meta = sandwich.open_metadata()
assert meta._qname('pdf:Producer') == '{http://ns.adobe.com/pdf/1.3/}Producer'
- assert meta._prefix_from_uri('{http://ns.adobe.com/pdf/1.3/}Producer') == 'pdf:Producer'
+ assert (
+ meta._prefix_from_uri('{http://ns.adobe.com/pdf/1.3/}Producer')
+ == 'pdf:Producer'
+ )
assert 'pdf:Producer' in meta
assert '{http://ns.adobe.com/pdf/1.3/}Producer' in meta
assert 'xmp:CreateDate' in meta
@@ -120,16 +131,14 @@ def test_add_new_xmp_and_mark(trivial):
) as xmp_view:
assert not xmp_view
- with trivial.open_metadata(update_docinfo=False
- ) as xmp:
+ with trivial.open_metadata(update_docinfo=False) as xmp:
assert not xmp # No changes at this point
del xmp
print(trivial.Root.Metadata.read_bytes())
- with trivial.open_metadata(update_docinfo=False
- ) as xmp:
- assert 'pikepdf' in xmp['pdf:Producer']
+ with trivial.open_metadata(update_docinfo=False) as xmp:
+ assert xmp['pdf:Producer'] == 'pikepdf ' + pikepdf.__version__
assert 'xmp:MetadataDate' in xmp
@@ -147,7 +156,9 @@ def test_update_docinfo(vera):
assert Name.Author not in vera.docinfo
-@pytest.mark.parametrize('filename', list((Path(__file__).parent / 'resources').glob('*.pdf')))
+@pytest.mark.parametrize(
+ 'filename', list((Path(__file__).parent / 'resources').glob('*.pdf'))
+)
def test_roundtrip(filename):
try:
pdf = Pdf.open(filename)
@@ -175,6 +186,7 @@ def test_build_metadata(trivial, graph, outdir):
assert xmp_date == docinfo_date.isoformat()
+@needs_libxmp
def test_python_xmp_validate_add(trivial):
with trivial.open_metadata() as xmp:
xmp['dc:creator'] = ['Bob', 'Doug']
@@ -185,9 +197,6 @@ def test_python_xmp_validate_add(trivial):
assert '<rdf:Seq><rdf:li>Bob</rdf:li><rdf:li>Doug</rdf:li>' in xmp_str
assert '<rdf:Bag><rdf:li>Mackenzie</rdf:li>' in xmp_str
- if not XMPMeta:
- pytest.skip(msg='needs libxmp')
-
xmpmeta = XMPMeta(xmp_str=str(xmp))
DC = XMP_NS_DC
assert xmpmeta.does_array_item_exist(DC, 'creator', 'Bob')
@@ -196,6 +205,7 @@ def test_python_xmp_validate_add(trivial):
assert xmpmeta.does_array_item_exist(DC, 'publisher', 'Mackenzie')
+@needs_libxmp
def test_python_xmp_validate_change_list(graph):
with graph.open_metadata() as xmp:
assert 'dc:creator' in xmp
@@ -209,14 +219,13 @@ def test_python_xmp_validate_change_list(graph):
assert xmpmeta.does_array_item_exist(DC, 'creator', 'Kreacher')
+@needs_libxmp
def test_python_xmp_validate_change(sandwich):
with sandwich.open_metadata() as xmp:
assert 'xmp:CreatorTool' in xmp
xmp['xmp:CreatorTool'] = 'Creator' # Exists as a xml tag text
xmp['pdf:Producer'] = 'Producer' # Exists as a tag node
assert str(xmp)
- if not XMPMeta:
- pytest.skip(msg='needs libxmp')
xmpmeta = XMPMeta(xmp_str=str(xmp))
assert xmpmeta.does_property_exist(XMP_NS_XMP, 'CreatorTool')
assert xmpmeta.does_property_exist(XMP_NS_PDF, 'Producer')
@@ -228,7 +237,10 @@ def test_decode_pdf_date():
("20180101010101Z00'00'", datetime(2018, 1, 1, 1, 1, 1, tzinfo=timezone.utc)),
("20180101010101Z", datetime(2018, 1, 1, 1, 1, 1, tzinfo=timezone.utc)),
("20180101010101+0000", datetime(2018, 1, 1, 1, 1, 1, tzinfo=timezone.utc)),
- ("20180101010101+0100", datetime(2018, 1, 1, 1, 1, 1, tzinfo=timezone(timedelta(hours=1)))),
+ (
+ "20180101010101+0100",
+ datetime(2018, 1, 1, 1, 1, 1, tzinfo=timezone(timedelta(hours=1))),
+ ),
]
for s, d in VALS:
assert decode_pdf_date(s) == d
@@ -291,8 +303,10 @@ def test_xpacket_generation(sandwich):
xmpstr2 = sandwich.Root.Metadata.read_bytes()
assert xmpstr2.startswith(xpacket_begin)
+
def only_one_substring(s, subs):
return s.find(subs) == s.rfind(subs)
+
assert only_one_substring(xmpstr2, xpacket_begin)
assert only_one_substring(xmpstr2, xpacket_end)
@@ -318,7 +332,9 @@ def test_remove_attribute_metadata(sandwich):
def test_no_x_xmpmeta(trivial):
- trivial.Root.Metadata = Stream(trivial, b"""
+ trivial.Root.Metadata = Stream(
+ trivial,
+ b"""
<?xpacket begin="\xef\xbb\xbf" id="W5M0MpCehiHzreSzNTczkc9d"?>
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:xmp="http://ns.adobe.com/xap/1.0/">
@@ -334,9 +350,71 @@ def test_no_x_xmpmeta(trivial):
</rdf:Description>
</rdf:RDF>
<?xpacket end="w"?>
- """.strip())
+ """.strip(),
+ )
with trivial.open_metadata() as xmp:
assert xmp._get_rdf_root() is not None
xmp['pdfaid:part'] = '2'
assert xmp['pdfaid:part'] == '2'
+
+
+def test_empty_xmpmeta(trivial):
+ trivial.Root.Metadata = Stream(
+ trivial,
+ b"""<?xpacket begin="" id=""?>
+ <x:xmpmeta xmlns:x="adobe:ns:meta/" x:xmptk="">
+ </x:xmpmeta>
+ <?xpacket end=""?>
+ """,
+ )
+ with trivial.open_metadata() as xmp:
+ pass
+
+
+@needs_libxmp
+def test_pdf_version_update(graph, outdir):
+ def get_xmp_version(filename):
+ meta = pikepdf.open(filename).open_metadata()
+ xmp = XMPMeta(xmp_str=str(meta))
+ try:
+ return xmp.get_property('http://ns.adobe.com/pdf/1.3/', 'PDFVersion')
+ except XMPError:
+ return ''
+
+ # We don't update PDFVersion unless it is present, even if we change the PDF version
+ graph.save(
+ outdir / 'empty_xmp_pdfversion.pdf',
+ force_version='1.7',
+ fix_metadata_version=True,
+ )
+ assert get_xmp_version(outdir / 'empty_xmp_pdfversion.pdf') == ''
+
+ # Add PDFVersion field for remaining tests
+ with graph.open_metadata() as m:
+ m['pdf:PDFVersion'] = graph.pdf_version
+
+ # Confirm we don't update the field when the flag is false
+ graph.save(
+ outdir / 'inconsistent_version.pdf',
+ force_version='1.6',
+ fix_metadata_version=False,
+ )
+ assert get_xmp_version(outdir / 'inconsistent_version.pdf') == '1.3'
+
+ # Confirm we update if present
+ graph.save(outdir / 'consistent_version.pdf', force_version='1.5')
+ assert get_xmp_version(outdir / 'consistent_version.pdf') == '1.5'
+
+
+def test_extension_level(trivial, outpdf):
+ trivial.save(outpdf, min_version=('1.6', 314159))
+ pdf = pikepdf.open(outpdf)
+ assert pdf.pdf_version >= '1.6' and pdf.extension_level == 314159
+
+ trivial.save(outpdf, force_version=('1.7', 42))
+ pdf = pikepdf.open(outpdf)
+ assert pdf.pdf_version == '1.7' and pdf.extension_level == 42
+
+ with pytest.raises(TypeError):
+ trivial.save(outpdf, force_version=('1.7', 'invalid extension level'))
diff --git a/tests/test_object.py b/tests/test_object.py
index 5e4b008..8fc5db0 100644
--- a/tests/test_object.py
+++ b/tests/test_object.py
@@ -1,16 +1,34 @@
+import json
+import sys
from decimal import Decimal, InvalidOperation
from math import isclose, isfinite
-import sys
+from zlib import compress
-import pikepdf
-from pikepdf import _qpdf as qpdf
-from pikepdf import (Object, String, Array, Name,
- Dictionary, Operator, PdfError)
-from hypothesis import given, example, assume
-from hypothesis.strategies import (integers, binary, lists, floats,
- characters, recursive, booleans)
import pytest
+from hypothesis import assume, example, given
+from hypothesis.strategies import (
+ binary,
+ booleans,
+ characters,
+ floats,
+ integers,
+ lists,
+ recursive,
+)
+import pikepdf
+from pikepdf import (
+ Array,
+ Dictionary,
+ Name,
+ Object,
+ Operator,
+ PdfError,
+ String,
+ Stream,
+ Pdf,
+)
+from pikepdf import _qpdf as qpdf
# pylint: disable=eval-used,unnecessary-lambda
@@ -27,15 +45,16 @@ def test_booleans():
assert encode(False) == False
-@given(characters(min_codepoint=0x20, max_codepoint=0x7f))
+@given(characters(min_codepoint=0x20, max_codepoint=0x7F))
@example('')
def test_ascii_involution(ascii_):
b = ascii_.encode('ascii')
assert encode(b) == b
-@given(characters(min_codepoint=0x0, max_codepoint=0xfef0,
- blacklist_categories=('Cs',)))
+@given(
+ characters(min_codepoint=0x0, max_codepoint=0xFEF0, blacklist_categories=('Cs',))
+)
@example('')
def test_unicode_involution(s):
assert str(encode(s)) == s
@@ -47,18 +66,20 @@ def test_binary_involution(binary_):
int64s = integers(min_value=-9223372036854775807, max_value=9223372036854775807)
+
+
@given(int64s, int64s)
def test_integer_comparison(a, b):
- equals = (a == b)
- encoded_equals = (encode(a) == encode(b))
+ equals = a == b
+ encoded_equals = encode(a) == encode(b)
assert encoded_equals == equals
- lessthan = (a < b)
- encoded_lessthan = (encode(a) < encode(b))
+ lessthan = a < b
+ encoded_lessthan = encode(a) < encode(b)
assert lessthan == encoded_lessthan
-@given(integers(-10**12, 10**12), integers(0, 12))
+@given(integers(-10 ** 12, 10 ** 12), integers(0, 12))
def test_decimal_involution(num, radix):
strnum = str(num)
if radix > len(strnum):
@@ -85,7 +106,7 @@ def test_decimal_from_float(f):
assert isclose(py_d, d, abs_tol=1e-5), (d, f.hex())
else:
- with pytest.raises(PdfError, message=repr(f)):
+ with pytest.raises(PdfError):
Object.parse(str(d))
@@ -95,13 +116,17 @@ def test_list(array):
assert a == array
-@given(lists(lists(integers(1,10), min_size=1, max_size=5),min_size=1,max_size=5))
+@given(lists(lists(integers(1, 10), min_size=1, max_size=5), min_size=1, max_size=5))
def test_nested_list(array):
a = pikepdf.Array(array)
assert a == array
-@given(recursive(integers(1,10) | booleans(), lambda children: lists(children), max_leaves=20))
+@given(
+ recursive(
+ integers(1, 10) | booleans(), lambda children: lists(children), max_leaves=20
+ )
+)
def test_nested_list2(array):
assume(isinstance(array, list))
a = pikepdf.Array(array)
@@ -125,11 +150,11 @@ def test_stack_depth():
rlimit = sys.getrecursionlimit()
try:
sys.setrecursionlimit(100)
- with pytest.raises(RecursionError, message="recursion"):
+ with pytest.raises(RecursionError):
assert encode(a) == a
- with pytest.raises(RecursionError, message="recursion"):
+ with pytest.raises(RecursionError):
encode(a) == encode(a) # pylint: disable=expression-not-assigned
- with pytest.raises(RecursionError, message="recursion"):
+ with pytest.raises(RecursionError):
repr(a)
finally:
sys.setrecursionlimit(rlimit) # So other tests are not affected
@@ -151,6 +176,11 @@ def test_len_array():
assert len(Array([3])) == 1
+def test_wrap_array():
+ assert Name('/Foo').wrap_in_array() == Array([Name('/Foo')])
+ assert Array([42]).wrap_in_array() == Array([42])
+
+
def test_name_equality():
# Who needs transitivity? :P
# While this is less than ideal ('/Foo' != b'/Foo') it allows for slightly
@@ -174,7 +204,6 @@ def test_forbidden_name_usage():
class TestHashViolation:
-
def check(self, a, b):
assert a == b, "invalid test case"
assert hash(a) == hash(b), "hash violation"
@@ -202,22 +231,23 @@ class TestHashViolation:
def test_not_constructible():
- with pytest.raises(TypeError, message="constructor"):
+ with pytest.raises(TypeError, match="constructor"):
Object()
class TestRepr:
-
def test_repr_dict(self):
- d = Dictionary({
- '/Boolean': True,
- '/Integer': 42,
- '/Real': Decimal('42.42'),
- '/String': String('hi'),
- '/Array': Array([1, 2, 3.14]),
- '/Operator': Operator('q'),
- '/Dictionary': Dictionary({'/Color': 'Red'})
- })
+ d = Dictionary(
+ {
+ '/Boolean': True,
+ '/Integer': 42,
+ '/Real': Decimal('42.42'),
+ '/String': String('hi'),
+ '/Array': Array([1, 2, 3.14]),
+ '/Operator': Operator('q'),
+ '/Dictionary': Dictionary({'/Color': 'Red'}),
+ }
+ )
expected = """\
pikepdf.Dictionary({
"/Array": [ 1, 2, Decimal('3.140000') ],
@@ -245,7 +275,7 @@ class TestRepr:
Decimal('3.14'),
String('scalar'),
Name('/Bob'),
- Operator('Q')
+ Operator('Q'),
]
for s in scalars:
assert eval(repr(s)) == s
@@ -262,12 +292,8 @@ def test_utf16_error():
class TestDictionary:
-
def test_dictionary_contains(self):
- d = Dictionary({
- '/Monty': 'Python',
- '/Flying': 'Circus'
- })
+ d = Dictionary({'/Monty': 'Python', '/Flying': 'Circus'})
assert Name.Flying in d
assert Name('/Monty') in d
assert Name.Brian not in d
@@ -298,10 +324,12 @@ class TestDictionary:
for k in d.items():
pass
+
def test_not_convertible():
class PurePythonObj:
def __repr__(self):
return 'PurePythonObj()'
+
c = PurePythonObj()
with pytest.raises(RuntimeError):
encode(c)
@@ -311,3 +339,80 @@ def test_not_convertible():
d = pikepdf.Dictionary()
with pytest.raises(RuntimeError):
d.SomeKey = c
+
+
+def test_json():
+ d = Dictionary(
+ {
+ '/Boolean': True,
+ '/Integer': 42,
+ '/Real': Decimal('42.42'),
+ '/String': String('hi'),
+ '/Array': Array([1, 2, 3.14]),
+ '/Dictionary': Dictionary({'/Color': 'Red'}),
+ }
+ )
+ json_bytes = d.to_json(False)
+ try:
+ as_dict = json.loads(json_bytes)
+ except TypeError:
+ as_dict = json.loads(json_bytes.decode('utf-8')) # Py3.5 shim
+ assert as_dict == {
+ "/Array": [1, 2, 3.140000],
+ "/Boolean": True,
+ "/Dictionary": {"/Color": "Red"},
+ "/Integer": 42,
+ "/Real": 42.42,
+ "/String": "hi",
+ }
+
+
+@pytest.fixture
+def stream_object():
+ pdf = pikepdf.new()
+ return Stream(pdf, b'')
+
+
+@pytest.fixture
+def sandwich(resources):
+ return Pdf.open(resources / 'sandwich.pdf')
+
+
+class TestObjectWrite:
+ def test_basic(self, stream_object):
+ stream_object.write(b'abc')
+ assert stream_object.read_bytes() == b'abc'
+
+ def test_compressed_readback(self, stream_object):
+ stream_object.write(compress(b'def'), filter=Name.FlateDecode)
+ assert stream_object.read_bytes() == b'def'
+
+ def test_stacked_compression(self, stream_object):
+ double_compressed = compress(compress(b'pointless'))
+ stream_object.write(
+ double_compressed, filter=[Name.FlateDecode, Name.FlateDecode]
+ )
+ assert stream_object.read_bytes() == b'pointless'
+ assert stream_object.read_raw_bytes() == double_compressed
+
+ def test_explicit_decodeparms(self, stream_object):
+ double_compressed = compress(compress(b'pointless'))
+ stream_object.write(
+ double_compressed,
+ filter=[Name.FlateDecode, Name.FlateDecode],
+ decode_parms=[None, None],
+ )
+ assert stream_object.read_bytes() == b'pointless'
+ assert stream_object.read_raw_bytes() == double_compressed
+
+ def test_no_kwargs(self, stream_object):
+ with pytest.raises(TypeError):
+ stream_object.write(compress(b'x'), [Name.FlateDecode])
+
+ def test_ccitt(self, sandwich, stream_object):
+ ccitt = b'\x00' # Not valid data, just for testing decode_parms
+ stream_object.write(
+ ccitt,
+ filter=Name.CCITTFaxDecode,
+ decode_parms=Dictionary(K=-1, Columns=8, Length=1),
+ )
diff --git a/tests/test_pages.py b/tests/test_pages.py
index a542250..c3b2ec9 100644
--- a/tests/test_pages.py
+++ b/tests/test_pages.py
@@ -1,12 +1,16 @@
-import pytest
-from pikepdf import Pdf, Stream, PdfMatrix
-
+import gc
from contextlib import suppress
from shutil import copy
-import gc
-
from sys import getrefcount as refcount
+import pytest
+
+from pikepdf import Pdf, PdfMatrix, Stream
+
+
+# pylint: disable=redefined-outer-name,pointless-statement
+
+
@pytest.fixture
def graph(resources):
return Pdf.open(resources / 'graph.pdf')
@@ -47,12 +51,17 @@ def test_delete_last_page(graph, outdir):
def test_replace_page(graph, fourpages):
q = fourpages
q2 = graph
+ q2.pages[0].CropBox = [0, 0, 500, 500]
+
+ # Ensure the page keys are different, not subsets
+ assert q.pages[1].keys() - q2.pages[0].keys()
+ assert q2.pages[0].keys() - q.pages[1].keys()
assert len(q.pages) == 4
q.pages[1] = q2.pages[0]
assert len(q.pages) == 4
- assert q.pages[1].Resources.XObject.keys() == \
- q2.pages[0].Resources.XObject.keys()
+ assert q.pages[1].keys() == q2.pages[0].keys()
+ assert q.pages[1].Resources.XObject.keys() == q2.pages[0].Resources.XObject.keys()
def test_hard_replace_page(fourpages, graph, sandwich, outdir):
@@ -98,11 +107,11 @@ def test_evil_page_deletion(resources, outdir):
assert refcount(src) == 2
pdf.pages.append(src.pages[0])
- assert refcount(src) == 3
+ assert refcount(src) == 2
del src.pages[0]
gc.collect()
- assert refcount(src) == 3
+ assert refcount(src) == 2
with suppress(PermissionError): # Fails on Windows
(outdir / 'sandwich.pdf').unlink()
@@ -115,9 +124,6 @@ def test_evil_page_deletion(resources, outdir):
pdf.save(outdir / 'out_nopages.pdf')
del pdf
gc.collect()
- # Ideally we'd see the check_refcount(src, 2) at this point, but we don't
- # have a way to find out when a PDF can be closed if a page was copied out
- # of it to another PDF
def test_append_all(sandwich, fourpages, outdir):
@@ -154,10 +160,12 @@ def test_slice_unequal_replacement(fourpages, sandwich, outdir):
assert len(pdf.pages) == 2, "number of pages must be changed"
pdf.save(outdir / 'out.pdf')
- assert pdf.pages[0].Contents.Length == page0_content_len, \
- "page 0 should be unchanged"
- assert pdf.pages[1].Contents.Length != page1_content_len, \
- "page 1's contents should have changed"
+ assert (
+ pdf.pages[0].Contents.Length == page0_content_len
+ ), "page 0 should be unchanged"
+ assert (
+ pdf.pages[1].Contents.Length != page1_content_len
+ ), "page 1's contents should have changed"
def test_slice_with_step(fourpages, sandwich, outdir):
@@ -171,24 +179,21 @@ def test_slice_with_step(fourpages, sandwich, outdir):
pdf.pages[0::2] = pdf2.pages
pdf.save(outdir / 'out.pdf')
- assert all(page.Contents.Length == pdf2_content_len
- for page in pdf.pages[0::2])
+ assert all(page.Contents.Length == pdf2_content_len for page in pdf.pages[0::2])
def test_slice_differing_lengths(fourpages, sandwich):
pdf = fourpages
pdf2 = sandwich
- with pytest.raises(ValueError,
- message="attempt to assign"):
+ with pytest.raises(ValueError, match="attempt to assign"):
pdf.pages[0::2] = pdf2.pages[0:1]
@pytest.mark.timeout(1)
def test_self_extend(fourpages):
pdf = fourpages
- with pytest.raises(ValueError,
- message="source page list modified during iteration"):
+ with pytest.raises(ValueError, match="source page list modified during iteration"):
pdf.pages.extend(pdf.pages)
@@ -240,3 +245,26 @@ def test_negative_indexing(fourpages, graph):
fourpages.pages[-42] = graph.pages[0]
with pytest.raises(IndexError):
del fourpages.pages[-42]
+
+
+def test_concatenate(resources, outdir):
+ # Issue #22
+ def concatenate(n):
+ print('concatenating same page', n, 'times')
+ output_pdf = Pdf.new()
+ for i in range(n):
+ print(i)
+ pdf_page = Pdf.open(resources / 'pal.pdf')
+ output_pdf.pages.extend(pdf_page.pages)
+ output_pdf.save(outdir / '{}.pdf'.format(n))
+
+ concatenate(5)
+
+
+def test_emplace(fourpages):
+ p0_objgen = fourpages.pages[0].objgen
+ fourpages.pages[0].emplace(fourpages.pages[1])
+ assert p0_objgen == fourpages.pages[0].objgen
+ assert fourpages.pages[0].keys() == fourpages.pages[1].keys()
+ for k in fourpages.pages[0].keys():
+ assert fourpages.pages[0][k] == fourpages.pages[1][k]
diff --git a/tests/test_parsers.py b/tests/test_parsers.py
index fac0ccd..f79e8f1 100644
--- a/tests/test_parsers.py
+++ b/tests/test_parsers.py
@@ -1,18 +1,16 @@
+import shutil
+from subprocess import PIPE, run
import sys
import pytest
-from pikepdf import (
- parse_content_stream, Pdf, Stream, Operator, Object,
- Dictionary
-)
-from pikepdf.models import _Page as Page
-from pikepdf._qpdf import StreamParser
-from subprocess import run, PIPE
-import shutil
+from pikepdf import Dictionary, Object, Operator, Pdf, Stream, parse_content_stream
+from pikepdf._qpdf import StreamParser
+from pikepdf.models import _Page as Page
# pylint: disable=useless-super-delegation
+
class PrintParser(StreamParser):
def __init__(self):
super().__init__()
@@ -48,15 +46,15 @@ def test_parser_exception(resources):
Object._parse_stream(stream, ExceptionParser())
-@pytest.mark.skipif(
- shutil.which('pdftotext') is None,
- reason="poppler not installed")
+@pytest.mark.skipif(shutil.which('pdftotext') is None, reason="poppler not installed")
+@pytest.mark.skipif(sys.version_info < (3, 6), reason="subprocess.run on 3.5")
def test_text_filter(resources, outdir):
input_pdf = resources / 'veraPDF test suite 6-2-10-t02-pass-a.pdf'
# Ensure the test PDF has detect we can find
- proc = run(['pdftotext', str(input_pdf), '-'],
- check=True, stdout=PIPE, encoding='utf-8')
+ proc = run(
+ ['pdftotext', str(input_pdf), '-'], check=True, stdout=PIPE, encoding='utf-8'
+ )
assert proc.stdout.strip() != '', "Need input test file that contains text"
pdf = Pdf.open(input_pdf)
@@ -76,8 +74,12 @@ def test_text_filter(resources, outdir):
pdf.save(outdir / 'notext.pdf', True)
- proc = run(['pdftotext', str(outdir / 'notext.pdf'), '-'],
- check=True, stdout=PIPE, encoding='utf-8')
+ proc = run(
+ ['pdftotext', str(outdir / 'notext.pdf'), '-'],
+ check=True,
+ stdout=PIPE,
+ encoding='utf-8',
+ )
assert proc.stdout.strip() == '', "Expected text to be removed"
@@ -87,13 +89,16 @@ def test_invalid_stream_object():
parse_content_stream(Dictionary({"/Hi": 3}))
-@pytest.mark.parametrize("test_file,expected", [
- ("fourpages.pdf", True),
- ("graph.pdf", False),
- ("veraPDF test suite 6-2-10-t02-pass-a.pdf", True),
- ("veraPDF test suite 6-2-3-3-t01-fail-c.pdf", False),
- ('sandwich.pdf', True)
-])
+@pytest.mark.parametrize(
+ "test_file,expected",
+ [
+ ("fourpages.pdf", True),
+ ("graph.pdf", False),
+ ("veraPDF test suite 6-2-10-t02-pass-a.pdf", True),
+ ("veraPDF test suite 6-2-3-3-t01-fail-c.pdf", False),
+ ('sandwich.pdf', True),
+ ],
+)
def test_has_text(resources, test_file, expected):
pdf = Pdf.open(resources / test_file)
for p in pdf.pages:
diff --git a/tests/test_pdf.py b/tests/test_pdf.py
index 33b949b..abe93cc 100644
--- a/tests/test_pdf.py
+++ b/tests/test_pdf.py
@@ -2,22 +2,33 @@
Testing focused on pikepdf.Pdf
"""
-import pytest
-from pikepdf import Pdf, PasswordError, Stream, PdfError
-
-import sys
import os
+import shutil
+import sys
from io import StringIO
+from pathlib import Path
from unittest.mock import Mock, patch
-import shutil
+
+import pytest
+
+import pikepdf
+from pikepdf import PasswordError, Pdf, PdfError, Stream
from pikepdf._cpphelpers import fspath # For py35
+# pylint: disable=redefined-outer-name
+
+
@pytest.fixture
def trivial(resources):
return Pdf.open(resources / 'pal-1bit-trivial.pdf')
+def test_new(outdir):
+ pdf = pikepdf.new()
+ pdf.save(outdir / 'new-empty.pdf')
+
+
def test_non_filename():
with pytest.raises(TypeError):
Pdf.open(42)
@@ -73,6 +84,15 @@ class TestPasswords:
Pdf.open(resources / 'graph-encrypted.pdf')
+class TestPermissions:
+ def test_some_permissions_missing(self, resources):
+ pdf = Pdf.open(resources / 'graph-encrypted.pdf', 'owner')
+ assert pdf.allow.print_highres == pdf.allow.modify_annotation == False
+
+ def test_permissions_all_true_not_encrypted(self, trivial):
+ assert all(trivial.allow.values())
+
+
class TestStreams:
def test_stream(self, resources):
with (resources / 'pal-1bit-trivial.pdf').open('rb') as stream:
@@ -86,6 +106,7 @@ class TestStreams:
def test_save_stream(self, trivial, outdir):
from io import BytesIO
+
pdf = trivial
pdf.save(outdir / 'nostream.pdf', static_id=True)
@@ -123,8 +144,7 @@ def test_show_xref(trivial):
trivial.show_xref_table()
-@pytest.mark.skipif(sys.version_info < (3, 6),
- reason='missing mock.assert_called')
+@pytest.mark.skipif(sys.version_info < (3, 6), reason='missing mock.assert_called')
def test_progress(trivial, outdir):
pdf = trivial
mock = Mock()
@@ -135,10 +155,7 @@ def test_progress(trivial, outdir):
def test_unicode_filename(resources, outdir):
target1 = outdir / '测试.pdf'
target2 = outdir / '通过考试.pdf'
- shutil.copy(
- fspath(resources / 'pal-1bit-trivial.pdf'),
- fspath(target1)
- )
+ shutil.copy(fspath(resources / 'pal-1bit-trivial.pdf'), fspath(target1))
pdf = Pdf.open(target1)
pdf.save(target2)
assert target2.exists()
@@ -149,12 +166,12 @@ def test_fileno_fails(resources):
with patch('os.dup') as dup:
dup.side_effect = OSError('assume dup fails')
with pytest.raises(OSError):
- pdf = Pdf.open(resources / 'pal-1bit-trivial.pdf')
+ Pdf.open(resources / 'pal-1bit-trivial.pdf')
with patch('os.dup') as dup:
dup.return_value = -1
with pytest.raises(RuntimeError):
- pdf = Pdf.open(resources / 'pal-1bit-trivial.pdf')
+ Pdf.open(resources / 'pal-1bit-trivial.pdf')
def test_min_and_force_version(trivial, outdir):
@@ -175,3 +192,51 @@ def test_min_and_force_version(trivial, outdir):
def test_normalize_linearize(trivial, outdir):
with pytest.raises(ValueError):
trivial.save(outdir / 'no.pdf', linearize=True, normalize_content=True)
+
+
+def test_make_stream(trivial, outdir):
+ pdf = trivial
+ stream = pdf.make_stream(b'q Q')
+ pdf.pages[0].Contents = stream
+ pdf.save(outdir / 's.pdf')
+
+
+def test_add_blank_page(trivial):
+ assert len(trivial.pages) == 1
+
+ invalid = [-1, 0, 2, 15000]
+ for n in invalid:
+ with pytest.raises(ValueError):
+ trivial.add_blank_page(page_size=(n, n))
+ trivial.add_blank_page()
+ assert len(trivial.pages) == 2
+
+
+def test_object_stream_mode_generated(trivial, outdir):
+ trivial.save(
+ outdir / '1.pdf',
+ fix_metadata_version=True,
+ object_stream_mode=pikepdf.ObjectStreamMode.generate,
+ )
+ assert b'/ObjStm' in (outdir / '1.pdf').read_bytes()
+
+ trivial.save(
+ outdir / '2.pdf',
+ fix_metadata_version=False,
+ object_stream_mode=pikepdf.ObjectStreamMode.generate,
+ )
+ assert b'/ObjStm' in (outdir / '2.pdf').read_bytes()
+
+
+def test_with_block(resources):
+ desc = ''
+ with pikepdf.open(resources / 'pal-1bit-trivial.pdf') as pdf:
+ desc = pdf.filename
+ assert pdf.filename != desc
+
+
+def test_with_block_abuse(resources):
+ with pikepdf.open(resources / 'pal-1bit-trivial.pdf') as pdf:
+ im0 = pdf.pages[0].Resources.XObject['/Im0']
+ with pytest.raises(PdfError):
+ im0.read_bytes()
diff --git a/tests/test_pdfa.py b/tests/test_pdfa.py
index 975b258..305b2d1 100644
--- a/tests/test_pdfa.py
+++ b/tests/test_pdfa.py
@@ -1,9 +1,11 @@
-import pytest
-from pikepdf import Pdf
import os
-from pathlib import Path
-from subprocess import run, PIPE, STDOUT
import xml.etree.ElementTree as ET
+from pathlib import Path
+from subprocess import PIPE, STDOUT, run
+
+import pytest
+
+from pikepdf import Pdf
try:
VERAPDF = Path(os.environ['HOME']) / 'verapdf' / 'verapdf'
diff --git a/tests/test_private_pdfs.py b/tests/test_private_pdfs.py
index e407fa2..25fdff9 100644
--- a/tests/test_private_pdfs.py
+++ b/tests/test_private_pdfs.py
@@ -1,13 +1,11 @@
+import gzip
+from pathlib import Path
+
import pytest
+
from pikepdf import Pdf, PdfError
-import os
-import platform
-import shutil
-from contextlib import suppress
-from shutil import copy
-import gzip
-from pathlib import Path
+# pylint: disable=redefined-outer-name
# Files with unknown copyright status can't be shared publicly
@@ -20,8 +18,8 @@ def private():
pytestmark = pytest.mark.skipif(
- not PRIVATE_RESOURCES.is_dir(),
- reason='private resources not available')
+ not PRIVATE_RESOURCES.is_dir(), reason='private resources not available'
+)
def test_pypdf2_issue_361(private):
diff --git a/tests/test_refcount.py b/tests/test_refcount.py
index a1b8912..879a7a9 100644
--- a/tests/test_refcount.py
+++ b/tests/test_refcount.py
@@ -1,14 +1,13 @@
import gc
-import sys
-import pytest
-from pikepdf import Pdf
-
-# This will break on pypy, but we're not quite targetting pypy...
from sys import getrefcount as refcount
+import pytest
+
+from pikepdf import Pdf
# Try to do some things without blowing up
+
def test_access_image(resources):
pdf = Pdf.open(resources / 'congress.pdf')
assert refcount(pdf) == 2 # refcount is always +1
@@ -66,7 +65,7 @@ def test_transfer_page(resources):
pdf2.pages.insert(2, page0)
p2p2 = pdf2.pages[2]
- assert refcount(pdf) == 4 # this, pdf, page0->pdf, pdf2's page0
+ assert refcount(pdf) == 3 # this, pdf, page0->pdf
assert refcount(p2p2) == 2
del pdf
diff --git a/tests/test_sanity.py b/tests/test_sanity.py
index df1f387..dcafb7c 100644
--- a/tests/test_sanity.py
+++ b/tests/test_sanity.py
@@ -2,19 +2,25 @@
A bunch of quick tests that confirm nothing is horribly wrong
"""
-import pytest
-
import gc
from contextlib import suppress
from shutil import copy
-import sys
+from concurrent.futures import ThreadPoolExecutor, as_completed, TimeoutError
+from io import BytesIO
+import threading
+import os
+import time
+import signal
+
+import pytest
import pikepdf
-from pikepdf import Pdf, Object, Name, Stream
+from pikepdf import Name, Object, Pdf, Stream
def test_minimum_qpdf_version():
from pikepdf import _qpdf
+
assert _qpdf.qpdf_version() >= '7.0.0'
@@ -39,20 +45,24 @@ def test_create_pdf(outdir):
pdf = Pdf.new()
font = pdf.make_indirect(
- Object.parse(b"""
+ Object.parse(
+ b"""
<<
/Type /Font
/Subtype /Type1
/Name /F1
/BaseFont /Helvetica
/Encoding /WinAnsiEncoding
- >>"""))
+ >>"""
+ )
+ )
width, height = 100, 100
image_data = b"\xff\x7f\x00" * (width * height)
image = Stream(pdf, image_data)
- image.stream_dict = Object.parse(b"""
+ image.stream_dict = Object.parse(
+ b"""
<<
/Type /XObject
/Subtype /Image
@@ -60,16 +70,14 @@ def test_create_pdf(outdir):
/BitsPerComponent 8
/Width 100
/Height 100
- >>""")
+ >>"""
+ )
rfont = {'/F1': font}
xobj = {'/Im1': image}
- resources = {
- '/Font': rfont,
- '/XObject': xobj
- }
+ resources = {'/Font': rfont, '/XObject': xobj}
mediabox = [0, 0, 612, 792]
@@ -84,8 +92,8 @@ def test_create_pdf(outdir):
'/Type': Name('/Page'),
'/MediaBox': mediabox,
'/Contents': contents,
- '/Resources': resources
- }
+ '/Resources': resources,
+ }
qpdf_page_dict = page_dict
page = pdf.make_indirect(qpdf_page_dict)
@@ -125,7 +133,9 @@ def test_open_save(resources, outdir):
out = str(outdir / 'graph.pdf')
copy(str(resources / 'graph.pdf'), out)
src = Pdf.open(out)
- src.save(out)
+ with pytest.raises(ValueError):
+ src.save(out)
+ src.save(outdir / 'graph2.pdf')
def test_readme_example(resources, outdir):