From 7831a2b66aaf5feca70f5f9c98aed876aa7fc4fd Mon Sep 17 00:00:00 2001 From: Sean Whitton Date: Sat, 18 Aug 2018 09:37:58 -0700 Subject: docs build use DEB_VERSION_UPSTREAM Gbp-Pq: Name docs-build-use-DEB_VERSION_UPSTREAM.patch --- docs/conf.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index e0f367e..f0dd83d 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -52,8 +52,6 @@ else: # documentation root, use os.path.abspath to make it absolute, like shown here. sys.path.insert(0, os.path.join(os.path.abspath('.'), '..')) -import pikepdf - # -- General configuration ------------------------------------------------ # If your documentation needs a minimal Sphinx version, state it here. @@ -101,7 +99,7 @@ author = u'James R. Barlow' # |version| and |release|, also used in various other places throughout the # built documents. -release = get_distribution('pikepdf').version +release = os.environ['DEB_VERSION_UPSTREAM'] version = '.'.join(release.split('.')[:2]) # The language for content autogenerated by Sphinx. Refer to documentation -- cgit v1.2.3 From 3854d4afa80fbbd483bfd7025f92bca6bedf834e Mon Sep 17 00:00:00 2001 From: Sean Whitton Date: Sat, 18 Aug 2018 10:52:46 -0700 Subject: drop installation from docs contents Gbp-Pq: Name drop-installation-from-docs-contents.patch --- docs/index.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/index.rst b/docs/index.rst index 4ec5ad1..bf31c04 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -111,7 +111,6 @@ practical examples, particular in ``pdfinfo.py``, ``_weave.py``, and :caption: Introduction :name: intro_toc - installation changelog tutorial objects -- cgit v1.2.3 From 8a90f32b9c611500d9ab57f3f7913c82271296e9 Mon Sep 17 00:00:00 2001 From: Sean Whitton Date: Wed, 10 Oct 2018 08:17:05 -0700 Subject: drop setuptools_scm_git_archive from setup.py Pending resolution of #910742. 
Gbp-Pq: Name drop-setuptools_scm_git_archive-from-setup.py.patch --- setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.py b/setup.py index ab5c9c2..3793c56 100644 --- a/setup.py +++ b/setup.py @@ -134,7 +134,6 @@ setup( setup_requires=[ 'pytest-runner', 'setuptools_scm', - 'setuptools_scm_git_archive', 'pybind11 >= 2.2.4, < 3' ], use_scm_version=True, -- cgit v1.2.3 From 61c86143d3d607bddeb5719d196aa40b91705fa4 Mon Sep 17 00:00:00 2001 From: Sean Whitton Date: Sat, 18 Aug 2018 09:37:58 -0700 Subject: docs build use DEB_VERSION_UPSTREAM Gbp-Pq: Name docs-build-use-DEB_VERSION_UPSTREAM.patch --- docs/conf.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index e0f367e..f0dd83d 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -52,8 +52,6 @@ else: # documentation root, use os.path.abspath to make it absolute, like shown here. sys.path.insert(0, os.path.join(os.path.abspath('.'), '..')) -import pikepdf - # -- General configuration ------------------------------------------------ # If your documentation needs a minimal Sphinx version, state it here. @@ -101,7 +99,7 @@ author = u'James R. Barlow' # |version| and |release|, also used in various other places throughout the # built documents. -release = get_distribution('pikepdf').version +release = os.environ['DEB_VERSION_UPSTREAM'] version = '.'.join(release.split('.')[:2]) # The language for content autogenerated by Sphinx. 
Refer to documentation -- cgit v1.2.3 From 828fad4a786266ca250bc374c4e6c38af2b7d679 Mon Sep 17 00:00:00 2001 From: Sean Whitton Date: Sat, 18 Aug 2018 10:52:46 -0700 Subject: drop installation from docs contents Gbp-Pq: Name drop-installation-from-docs-contents.patch --- docs/index.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/index.rst b/docs/index.rst index 4ec5ad1..bf31c04 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -111,7 +111,6 @@ practical examples, particular in ``pdfinfo.py``, ``_weave.py``, and :caption: Introduction :name: intro_toc - installation changelog tutorial objects -- cgit v1.2.3 From 9862ef7a8d0428646d8a87cde1b67988ac6f95c0 Mon Sep 17 00:00:00 2001 From: Sean Whitton Date: Wed, 10 Oct 2018 08:17:05 -0700 Subject: drop setuptools_scm_git_archive from setup.py Pending resolution of #910742. Gbp-Pq: Name drop-setuptools_scm_git_archive-from-setup.py.patch --- setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.py b/setup.py index ab5c9c2..3793c56 100644 --- a/setup.py +++ b/setup.py @@ -134,7 +134,6 @@ setup( setup_requires=[ 'pytest-runner', 'setuptools_scm', - 'setuptools_scm_git_archive', 'pybind11 >= 2.2.4, < 3' ], use_scm_version=True, -- cgit v1.2.3 From 851e67ace6fe07fb4b0eea85a7b8ad7f99854317 Mon Sep 17 00:00:00 2001 From: Sean Whitton Date: Thu, 10 Jan 2019 08:32:44 -0700 Subject: disable test_docinfo_problems Needs a test resource whose DFSG status is in doubt. 
Gbp-Pq: Name disable-test_docinfo_problems.patch --- tests/test_metadata.py | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 35eff81..8d4fae5 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -47,12 +47,6 @@ def trivial(resources): return Pdf.open(resources / 'pal-1bit-trivial.pdf') -@pytest.fixture -def enron1(resources): - # Has nuls in docinfo, old PDF - return Pdf.open(resources / 'enron1_gs.pdf') - - @pytest.fixture def invalid_creationdate(resources): # Has nuls in docinfo, old PDF @@ -290,14 +284,3 @@ def test_remove_attribute_metadata(sandwich): # Ensure the whole node was deleted assert not re.search(r'rdf:Description xmlns:[^\s]+ rdf:about=""/', str(xmp)) - - -def test_docinfo_problems(enron1, invalid_creationdate): - meta = enron1.open_metadata() - meta._load() # File has invalid XML sequence � - with meta: - with pytest.warns(UserWarning) as warned: - meta.load_from_docinfo(invalid_creationdate.docinfo) - assert 'could not be copied' in warned[0].message.args[0] - with pytest.raises(ValueError): - meta.load_from_docinfo(invalid_creationdate.docinfo, raise_failure=True) -- cgit v1.2.3 From 5ec77a64d5ab321a34b2c461e47512b2019daa06 Mon Sep 17 00:00:00 2001 From: Sean Whitton Date: Sat, 18 Aug 2018 09:37:58 -0700 Subject: docs build use DEB_VERSION_UPSTREAM Gbp-Pq: Name docs-build-use-DEB_VERSION_UPSTREAM.patch --- docs/conf.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index e0f367e..f0dd83d 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -52,8 +52,6 @@ else: # documentation root, use os.path.abspath to make it absolute, like shown here. sys.path.insert(0, os.path.join(os.path.abspath('.'), '..')) -import pikepdf - # -- General configuration ------------------------------------------------ # If your documentation needs a minimal Sphinx version, state it here. @@ -101,7 +99,7 @@ author = u'James R. 
Barlow' # |version| and |release|, also used in various other places throughout the # built documents. -release = get_distribution('pikepdf').version +release = os.environ['DEB_VERSION_UPSTREAM'] version = '.'.join(release.split('.')[:2]) # The language for content autogenerated by Sphinx. Refer to documentation -- cgit v1.2.3 From 034dc9de536e40cc3442f5ddb74a66a14e83eec3 Mon Sep 17 00:00:00 2001 From: Sean Whitton Date: Sat, 18 Aug 2018 10:52:46 -0700 Subject: drop installation from docs contents Gbp-Pq: Name drop-installation-from-docs-contents.patch --- docs/index.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/index.rst b/docs/index.rst index 4ec5ad1..bf31c04 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -111,7 +111,6 @@ practical examples, particular in ``pdfinfo.py``, ``_weave.py``, and :caption: Introduction :name: intro_toc - installation changelog tutorial objects -- cgit v1.2.3 From 1e1a8768e729d8512fb9d754a75917ce825af74d Mon Sep 17 00:00:00 2001 From: Sean Whitton Date: Wed, 10 Oct 2018 08:17:05 -0700 Subject: drop setuptools_scm_git_archive from setup.py Pending resolution of #910742. Gbp-Pq: Name drop-setuptools_scm_git_archive-from-setup.py.patch --- setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.py b/setup.py index ab5c9c2..3793c56 100644 --- a/setup.py +++ b/setup.py @@ -134,7 +134,6 @@ setup( setup_requires=[ 'pytest-runner', 'setuptools_scm', - 'setuptools_scm_git_archive', 'pybind11 >= 2.2.4, < 3' ], use_scm_version=True, -- cgit v1.2.3 From c8d9e8d54c445d6a9b4fa432d3b058fbd3971eff Mon Sep 17 00:00:00 2001 From: Sean Whitton Date: Thu, 10 Jan 2019 08:32:44 -0700 Subject: disable test_docinfo_problems Needs a test resource whose DFSG status is in doubt. 
Gbp-Pq: Name disable-test_docinfo_problems.patch --- tests/test_metadata.py | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/tests/test_metadata.py b/tests/test_metadata.py index db54463..9f85f83 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -49,12 +49,6 @@ def trivial(resources): return Pdf.open(resources / 'pal-1bit-trivial.pdf') -@pytest.fixture -def enron1(resources): - # Has nuls in docinfo, old PDF - return Pdf.open(resources / 'enron1_gs.pdf') - - @pytest.fixture def invalid_creationdate(resources): # Has nuls in docinfo, old PDF @@ -320,19 +314,3 @@ def test_remove_attribute_metadata(sandwich): # Ensure the whole node was deleted assert not re.search(r'rdf:Description xmlns:[^\s]+ rdf:about=""/', str(xmp)) - - -def test_docinfo_problems(enron1, invalid_creationdate): - meta = enron1.open_metadata() - meta._load() # File has invalid XML sequence � - with meta: - with pytest.warns(UserWarning) as warned: - meta.load_from_docinfo(invalid_creationdate.docinfo) - assert 'could not be copied' in warned[0].message.args[0] - with pytest.raises(ValueError): - meta.load_from_docinfo(invalid_creationdate.docinfo, raise_failure=True) - - with pytest.warns(UserWarning) as warned: - with meta as xmp: - xmp['xmp:CreateDate'] = 'invalid date' - assert 'could not be updated' in warned[0].message.args[0] -- cgit v1.2.3 From 4c62ea7c4dca20325007bcfe4f43e8ab5881bc20 Mon Sep 17 00:00:00 2001 From: Sean Whitton Date: Sat, 26 Jan 2019 12:54:11 -0700 Subject: Import pikepdf_1.0.5+dfsg.orig.tar.xz [dgit import orig pikepdf_1.0.5+dfsg.orig.tar.xz] --- .appveyor.yml | 61 ++ .git_archival.txt | 1 + .gitattributes | 4 + .gitignore | 37 + .pylintrc | 550 ++++++++++++ .readthedocs.yml | 10 + .travis.yml | 177 ++++ LICENSE.txt | 373 ++++++++ Makefile | 60 ++ README.md | 72 ++ debian/copyright | 770 +++++++++++++++++ docs/Makefile | 216 +++++ docs/_notebooks/pages.ipynb | 276 ++++++ docs/arch.rst | 33 + docs/changelog.rst | 256 ++++++ 
docs/conf.py | 333 ++++++++ docs/images/congress_im0.jpg | Bin 0 -> 26172 bytes docs/images/pike-cartoon.png | Bin 0 -> 148388 bytes docs/images/pike.jpg | Bin 0 -> 27097 bytes docs/images/pikemen.jpg | Bin 0 -> 29276 bytes docs/index.rst | 136 +++ docs/installation.rst | 153 ++++ docs/make.bat | 263 ++++++ docs/objects.rst | 69 ++ docs/pikepdf.rst | 128 +++ docs/resources.rst | 14 + docs/tutorial.rst | 42 + docs/tutorial/metadata.rst | 111 +++ docs/tutorial/page.rst | 175 ++++ docs/tutorial/pages.rst | 214 +++++ docs/tutorial/streams.rst | 89 ++ examples/find_links.py | 78 ++ licenses/license.wheel.txt | 659 ++++++++++++++ requirements/docs.txt | 7 + requirements/test.txt | 7 + setup.cfg | 31 + setup.py | 166 ++++ src/pikepdf/__init__.py | 41 + src/pikepdf/_cpphelpers.py | 47 + src/pikepdf/_methods.py | 270 ++++++ src/pikepdf/models/__init__.py | 114 +++ src/pikepdf/models/image.py | 626 ++++++++++++++ src/pikepdf/models/matrix.py | 125 +++ src/pikepdf/models/metadata.py | 630 ++++++++++++++ src/pikepdf/objects.py | 172 ++++ src/qpdf/object.cpp | 943 +++++++++++++++++++++ src/qpdf/object_convert.cpp | 138 +++ src/qpdf/object_repr.cpp | 244 ++++++ src/qpdf/pikepdf.h | 200 +++++ src/qpdf/qpdf.cpp | 582 +++++++++++++ src/qpdf/qpdf_inputsource.h | 137 +++ src/qpdf/qpdf_pagelist.cpp | 295 +++++++ src/qpdf/qpdf_pagelist.h | 37 + src/qpdf/shims.cpp | 45 + src/qpdf/shims.h | 30 + src/qpdf/utils.cpp | 105 +++ src/qpdf/utils.h | 15 + tests/conftest.py | 28 + tests/resources/cmyk-jpeg.pdf | Bin 0 -> 1461 bytes tests/resources/congress-gray.pdf | Bin 0 -> 97969 bytes tests/resources/congress.pdf | Bin 0 -> 193947 bytes tests/resources/formxobject.pdf | Bin 0 -> 2713 bytes tests/resources/fourpages.pdf | 82 ++ tests/resources/graph-encrypted.pdf | Bin 0 -> 293636 bytes tests/resources/graph.pdf | Bin 0 -> 296322 bytes tests/resources/image-mono-inline.pdf | 74 ++ tests/resources/invalid_creationdate.pdf | Bin 0 -> 965 bytes tests/resources/pal-1bit-rgb.pdf | Bin 0 -> 931 bytes 
tests/resources/pal-1bit-trivial.pdf | Bin 0 -> 931 bytes tests/resources/pal.pdf | Bin 0 -> 2918 bytes tests/resources/pike-jp2.pdf | Bin 0 -> 18152 bytes tests/resources/sandwich.pdf | Bin 0 -> 115546 bytes .../veraPDF test suite 6-2-10-t02-pass-a.pdf | Bin 0 -> 10049 bytes .../veraPDF test suite 6-2-3-3-t01-fail-c.pdf | 151 ++++ tests/test_dictionary.py | 37 + tests/test_formxobject.py | 76 ++ tests/test_image_access.py | 260 ++++++ tests/test_ipython.py | 24 + tests/test_metadata.py | 338 ++++++++ tests/test_object.py | 313 +++++++ tests/test_pages.py | 242 ++++++ tests/test_parsers.py | 101 +++ tests/test_pdf.py | 177 ++++ tests/test_pdfa.py | 64 ++ tests/test_private_pdfs.py | 30 + tests/test_refcount.py | 77 ++ tests/test_sanity.py | 137 +++ 87 files changed, 12578 insertions(+) create mode 100644 .appveyor.yml create mode 100644 .git_archival.txt create mode 100644 .gitattributes create mode 100644 .gitignore create mode 100644 .pylintrc create mode 100644 .readthedocs.yml create mode 100644 .travis.yml create mode 100644 LICENSE.txt create mode 100644 Makefile create mode 100644 README.md create mode 100644 debian/copyright create mode 100644 docs/Makefile create mode 100644 docs/_notebooks/pages.ipynb create mode 100644 docs/arch.rst create mode 100644 docs/changelog.rst create mode 100644 docs/conf.py create mode 100644 docs/images/congress_im0.jpg create mode 100644 docs/images/pike-cartoon.png create mode 100644 docs/images/pike.jpg create mode 100644 docs/images/pikemen.jpg create mode 100644 docs/index.rst create mode 100644 docs/installation.rst create mode 100644 docs/make.bat create mode 100644 docs/objects.rst create mode 100644 docs/pikepdf.rst create mode 100644 docs/resources.rst create mode 100644 docs/tutorial.rst create mode 100644 docs/tutorial/metadata.rst create mode 100644 docs/tutorial/page.rst create mode 100644 docs/tutorial/pages.rst create mode 100644 docs/tutorial/streams.rst create mode 100644 examples/find_links.py create mode 
100644 licenses/license.wheel.txt create mode 100644 requirements/docs.txt create mode 100644 requirements/test.txt create mode 100644 setup.cfg create mode 100644 setup.py create mode 100644 src/pikepdf/__init__.py create mode 100644 src/pikepdf/_cpphelpers.py create mode 100644 src/pikepdf/_methods.py create mode 100644 src/pikepdf/models/__init__.py create mode 100644 src/pikepdf/models/image.py create mode 100644 src/pikepdf/models/matrix.py create mode 100644 src/pikepdf/models/metadata.py create mode 100644 src/pikepdf/objects.py create mode 100644 src/qpdf/object.cpp create mode 100644 src/qpdf/object_convert.cpp create mode 100644 src/qpdf/object_repr.cpp create mode 100644 src/qpdf/pikepdf.h create mode 100644 src/qpdf/qpdf.cpp create mode 100644 src/qpdf/qpdf_inputsource.h create mode 100644 src/qpdf/qpdf_pagelist.cpp create mode 100644 src/qpdf/qpdf_pagelist.h create mode 100644 src/qpdf/shims.cpp create mode 100644 src/qpdf/shims.h create mode 100644 src/qpdf/utils.cpp create mode 100644 src/qpdf/utils.h create mode 100644 tests/conftest.py create mode 100644 tests/resources/cmyk-jpeg.pdf create mode 100644 tests/resources/congress-gray.pdf create mode 100644 tests/resources/congress.pdf create mode 100644 tests/resources/formxobject.pdf create mode 100644 tests/resources/fourpages.pdf create mode 100644 tests/resources/graph-encrypted.pdf create mode 100644 tests/resources/graph.pdf create mode 100644 tests/resources/image-mono-inline.pdf create mode 100644 tests/resources/invalid_creationdate.pdf create mode 100644 tests/resources/pal-1bit-rgb.pdf create mode 100644 tests/resources/pal-1bit-trivial.pdf create mode 100644 tests/resources/pal.pdf create mode 100644 tests/resources/pike-jp2.pdf create mode 100644 tests/resources/sandwich.pdf create mode 100644 tests/resources/veraPDF test suite 6-2-10-t02-pass-a.pdf create mode 100644 tests/resources/veraPDF test suite 6-2-3-3-t01-fail-c.pdf create mode 100644 tests/test_dictionary.py create mode 100644 
tests/test_formxobject.py create mode 100644 tests/test_image_access.py create mode 100644 tests/test_ipython.py create mode 100644 tests/test_metadata.py create mode 100644 tests/test_object.py create mode 100644 tests/test_pages.py create mode 100644 tests/test_parsers.py create mode 100644 tests/test_pdf.py create mode 100644 tests/test_pdfa.py create mode 100644 tests/test_private_pdfs.py create mode 100644 tests/test_refcount.py create mode 100644 tests/test_sanity.py diff --git a/.appveyor.yml b/.appveyor.yml new file mode 100644 index 0000000..98d4ff0 --- /dev/null +++ b/.appveyor.yml @@ -0,0 +1,61 @@ +version: '{build}' +image: Visual Studio 2015 +platform: +- x64 +- x86 +environment: + global: + DISTUTILS_USE_SDK: 1 + MSSdk: 1 + matrix: + - PYTHON: 35 + - PYTHON: 36 + - PYTHON: 37 + TWINE_PASSWORD: + secure: RZZXYbbTOzKMSE/GdzIG/x+eqehSEeu98j0Ggs+/fG8= +install: +- cmd: '"%VS140COMNTOOLS%\..\..\VC\vcvarsall.bat" %PLATFORM%' +- ps: | + $userpath = "$env:APPDATA\Python\Python$env:PYTHON\Scripts" + if ($env:PLATFORM -eq "x64") { + $env:PYTHON = "$env:PYTHON-x64" + $qpdfdll = "https://github.com/qpdf/qpdf/releases/download/release-qpdf-8.3.0/qpdf-8.3.0-bin-msvc64.zip" + } else { + $qpdfdll = "https://github.com/qpdf/qpdf/releases/download/release-qpdf-8.3.0/qpdf-8.3.0-bin-msvc32.zip" + } + $env:PATH = "C:\Python$env:PYTHON;C:\Python$env:PYTHON\Scripts;$userpath;$env:PATH" + echo $env:PATH + python -m pip install --disable-pip-version-check --upgrade pip setuptools wheel + pip install pybind11 + + Invoke-WebRequest -Uri $qpdfdll -OutFile "qpdf-release.zip" + 7z x "qpdf-release.zip" -oc:\ + $qpdfdir = Get-ChildItem c:\qpdf-* + Move-Item -Path $qpdfdir -Destination c:\qpdf + +build_script: +- ps: | + cp c:\qpdf\bin\qpdf21.dll src\pikepdf + $env:INCLUDE += ";c:\qpdf\include" + $env:LIB += ";c:\qpdf\lib" + $env:LIBPATH +=";c:\qpdf\lib" + python setup.py bdist_wheel + $wheel = Get-ChildItem -Path dist\pikepdf*.whl + pip install --verbose $wheel + +test_script: +- 
pip install -r requirements/test.txt +- pytest -n auto + +artifacts: +- path: dist\*.whl + name: pypi + +deploy_script: +- ps: | + if ($env:APPVEYOR_REPO_TAG -eq "true") + { + pip install twine + $env:TWINE_USERNAME = "ocrmypdf-travis" + twine upload dist\*.whl + } diff --git a/.git_archival.txt b/.git_archival.txt new file mode 100644 index 0000000..95cb3ee --- /dev/null +++ b/.git_archival.txt @@ -0,0 +1 @@ +ref-names: $Format:%D$ diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..59d9b13 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,4 @@ +*.pdf binary +*.png binary +*.jpg binary +.git_archival.txt export-subst diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e0d09e0 --- /dev/null +++ b/.gitignore @@ -0,0 +1,37 @@ +# Development +_build +*.py[cod] +*.so +*.egg-info +MANIFEST +.hypothesis/ +.ipynb_checkpoints/ +.idea/ +.vscode/ +_generate +build +.venv/ +*.sublime* +.eggs/ +.cache/ +dist/ +tmp/ +var/ +.pytest_cache/ +.coverage +coverage/ + +# Main directory testing +/*.pdf +/*.ipynb +/debug_tests.py +/TODO +/scratch.py + +# macOS +.DS_Store + +/__qpdf/ +/qpdf-7.0.0/ +/tests/resources/private/ +/tmp_rpms/ diff --git a/.pylintrc b/.pylintrc new file mode 100644 index 0000000..85d1076 --- /dev/null +++ b/.pylintrc @@ -0,0 +1,550 @@ +[MASTER] + +# A comma-separated list of package or module names from where C extensions may +# be loaded. Extensions are loading into the active Python interpreter and may +# run arbitrary code +extension-pkg-whitelist=pikepdf._qpdf + +# Add files or directories to the blacklist. They should be base names, not +# paths. +ignore=.git,tmp,usr,var,licenses,venv + +# Add files or directories matching the regex patterns to the blacklist. The +# regex matches against base names, not paths. +ignore-patterns= + +# Python code to execute, usually for sys.path manipulation such as +# pygtk.require(). +#init-hook= + +# Use multiple processes to speed up Pylint. 
+jobs=1 + +# List of plugins (as comma separated values of python modules names) to load, +# usually to register additional checkers. +load-plugins= + +# Pickle collected data for later comparisons. +persistent=yes + +# Specify a configuration file. +#rcfile= + +# When enabled, pylint would attempt to guess common misconfiguration and emit +# user-friendly hints instead of false-positive error messages +suggestion-mode=yes + +# Allow loading of arbitrary C extensions. Extensions are imported into the +# active Python interpreter and may run arbitrary code. +unsafe-load-any-extension=no + + +[MESSAGES CONTROL] + +# Only show warnings with the listed confidence levels. Leave empty to show +# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED +confidence= + +# Disable the message, report, category or checker with the given id(s). You +# can either give multiple identifiers separated by comma (,) or put this +# option multiple times (only on the command line, not in the configuration +# file where it should appear only once).You can also use "--disable=all" to +# disable everything first and then reenable specific checks. For example, if +# you want to run only the similarities checker, you can use "--disable=all +# --enable=similarities". 
If you want to run only the classes checker, but have +# no Warning level messages displayed, use"--disable=all --enable=classes +# --disable=W" +disable=print-statement, + parameter-unpacking, + unpacking-in-except, + old-raise-syntax, + backtick, + long-suffix, + old-ne-operator, + old-octal-literal, + import-star-module-level, + non-ascii-bytes-literal, + invalid-unicode-literal, + raw-checker-failed, + bad-inline-option, + locally-disabled, + locally-enabled, + file-ignored, + suppressed-message, + useless-suppression, + deprecated-pragma, + apply-builtin, + basestring-builtin, + buffer-builtin, + cmp-builtin, + coerce-builtin, + execfile-builtin, + file-builtin, + long-builtin, + raw_input-builtin, + reduce-builtin, + standarderror-builtin, + unicode-builtin, + xrange-builtin, + coerce-method, + delslice-method, + getslice-method, + setslice-method, + no-absolute-import, + old-division, + dict-iter-method, + dict-view-method, + next-method-called, + metaclass-assignment, + indexing-exception, + raising-string, + reload-builtin, + oct-method, + hex-method, + nonzero-method, + cmp-method, + input-builtin, + round-builtin, + intern-builtin, + unichr-builtin, + map-builtin-not-iterating, + zip-builtin-not-iterating, + range-builtin-not-iterating, + filter-builtin-not-iterating, + using-cmp-argument, + eq-without-hash, + div-method, + idiv-method, + rdiv-method, + exception-message-attribute, + invalid-str-codec, + sys-max-int, + bad-python3-import, + deprecated-string-function, + deprecated-str-translate-call, + deprecated-itertools-function, + deprecated-types-field, + next-method-defined, + dict-items-not-iterating, + dict-keys-not-iterating, + dict-values-not-iterating, + deprecated-operator-function, + deprecated-urllib-function, + xreadlines-attribute, + deprecated-sys-function, + exception-escape, + comprehension-escape, + C, + protected-access, + eval + + +# Enable the message, report, category or checker with the given id(s). 
You can +# either give multiple identifier separated by comma (,) or put this option +# multiple time (only on the command line, not in the configuration file where +# it should appear only once). See also the "--disable" option for examples. +enable=c-extension-no-member + + +[REPORTS] + +# Python expression which should return a note less than 10 (10 is the highest +# note). You have access to the variables errors warning, statement which +# respectively contain the number of errors / warnings messages and the total +# number of statements analyzed. This is used by the global evaluation report +# (RP0004). +evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) + +# Template used to display messages. This is a python new-style format string +# used to format the message information. See doc for all details +#msg-template= + +# Set the output format. Available formats are text, parseable, colorized, json +# and msvs (visual studio).You can also give a reporter class, eg +# mypackage.mymodule.MyReporterClass. +output-format=text + +# Tells whether to display a full report or only the messages +reports=no + +# Activate the evaluation score. +score=yes + + +[REFACTORING] + +# Maximum number of nested blocks for function / method body +max-nested-blocks=5 + +# Complete name of functions that never returns. When checking for +# inconsistent-return-statements if a never returning function is called then +# it will be considered as an explicit return statement and no message will be +# printed. +never-returning-functions=optparse.Values,sys.exit + + +[LOGGING] + +# Logging modules to check that the string format arguments are in logging +# function parameter format +logging-modules=logging + + +[SPELLING] + +# Limits count of emitted suggestions for spelling mistakes +max-spelling-suggestions=4 + +# Spelling dictionary name. Available dictionaries: none. To make it working +# install python-enchant package. 
+spelling-dict= + +# List of comma separated words that should not be checked. +spelling-ignore-words= + +# A path to a file that contains private dictionary; one word per line. +spelling-private-dict-file= + +# Tells whether to store unknown words to indicated private dictionary in +# --spelling-private-dict-file option instead of raising a message. +spelling-store-unknown-words=no + + +[MISCELLANEOUS] + +# List of note tags to take in consideration, separated by a comma. +notes=FIXME, + XXX, + TODO + + +[TYPECHECK] + +# List of decorators that produce context managers, such as +# contextlib.contextmanager. Add to this list to register other decorators that +# produce valid context managers. +contextmanager-decorators=contextlib.contextmanager + +# List of members which are set dynamically and missed by pylint inference +# system, and so shouldn't trigger E1101 when accessed. Python regular +# expressions are accepted. +generated-members= + +# Tells whether missing members accessed in mixin class should be ignored. A +# mixin class is detected if its name ends with "mixin" (case insensitive). +ignore-mixin-members=yes + +# This flag controls whether pylint should warn about no-member and similar +# checks whenever an opaque object is returned when inferring. The inference +# can return multiple potential results while evaluating a Python object, but +# some branches might not be evaluated, which results in partial inference. In +# that case, it might be useful to still emit no-member and other checks for +# the rest of the inferred objects. +ignore-on-opaque-inference=yes + +# List of class names for which member attributes should not be checked (useful +# for classes with dynamically set attributes). This supports the use of +# qualified names. 
+ignored-classes=optparse.Values,thread._local,_thread._local + +# List of module names for which member attributes should not be checked +# (useful for modules/projects where namespaces are manipulated during runtime +# and thus existing member attributes cannot be deduced by static analysis. It +# supports qualified module names, as well as Unix pattern matching. +ignored-modules= + +# Show a hint with possible names when a member name was not found. The aspect +# of finding the hint is based on edit distance. +missing-member-hint=yes + +# The minimum edit distance a name should have in order to be considered a +# similar match for a missing member name. +missing-member-hint-distance=1 + +# The total number of similar names that should be taken in consideration when +# showing a hint for a missing member. +missing-member-max-choices=1 + + +[VARIABLES] + +# List of additional names supposed to be defined in builtins. Remember that +# you should avoid to define new builtins when possible. +additional-builtins= + +# Tells whether unused global variables should be treated as a violation. +allow-global-unused-variables=yes + +# List of strings which can identify a callback function by name. A callback +# name must start or end with one of those strings. +callbacks=cb_, + _cb + +# A regular expression matching the name of dummy variables (i.e. expectedly +# not used). +dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_ + +# Argument names that match this expression will be ignored. Default to name +# with leading underscore +ignored-argument-names=_.*|^ignored_|^unused_ + +# Tells whether we should check for unused import in __init__ files. +init-import=no + +# List of qualified module names which can have objects that can redefine +# builtins. +redefining-builtins-modules=six.moves,past.builtins,future.builtins,io,builtins + + +[FORMAT] + +# Expected format of line ending, e.g. empty (any line ending), LF or CRLF. 
+expected-line-ending-format= + +# Regexp for a line that is allowed to be longer than the limit. +ignore-long-lines=^\s*(# )??$ + +# Number of spaces of indent required inside a hanging or continued line. +indent-after-paren=4 + +# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 +# tab). +indent-string=' ' + +# Maximum number of characters on a single line. +max-line-length=100 + +# Maximum number of lines in a module +max-module-lines=1000 + +# List of optional constructs for which whitespace checking is disabled. `dict- +# separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}. +# `trailing-comma` allows a space between comma and closing bracket: (a, ). +# `empty-line` allows space-only lines. +no-space-check=trailing-comma, + dict-separator + +# Allow the body of a class to be on the same line as the declaration if body +# contains single statement. +single-line-class-stmt=no + +# Allow the body of an if to be on the same line as the test if there is no +# else. +single-line-if-stmt=no + + +[SIMILARITIES] + +# Ignore comments when computing similarities. +ignore-comments=yes + +# Ignore docstrings when computing similarities. +ignore-docstrings=yes + +# Ignore imports when computing similarities. +ignore-imports=no + +# Minimum lines number of a similarity. +min-similarity-lines=4 + + +[BASIC] + +# Naming style matching correct argument names +argument-naming-style=snake_case + +# Regular expression matching correct argument names. Overrides argument- +# naming-style +#argument-rgx= + +# Naming style matching correct attribute names +attr-naming-style=snake_case + +# Regular expression matching correct attribute names. 
Overrides attr-naming- +# style +#attr-rgx= + +# Bad variable names which should always be refused, separated by a comma +bad-names=foo, + bar, + baz, + toto, + tutu, + tata + +# Naming style matching correct class attribute names +class-attribute-naming-style=any + +# Regular expression matching correct class attribute names. Overrides class- +# attribute-naming-style +#class-attribute-rgx= + +# Naming style matching correct class names +class-naming-style=PascalCase + +# Regular expression matching correct class names. Overrides class-naming-style +#class-rgx= + +# Naming style matching correct constant names +const-naming-style=UPPER_CASE + +# Regular expression matching correct constant names. Overrides const-naming- +# style +#const-rgx= + +# Minimum line length for functions/classes that require docstrings, shorter +# ones are exempt. +docstring-min-length=-1 + +# Naming style matching correct function names +function-naming-style=snake_case + +# Regular expression matching correct function names. Overrides function- +# naming-style +#function-rgx= + +# Good variable names which should always be accepted, separated by a comma +good-names=i, + j, + k, + ex, + Run, + _ + +# Include a hint for the correct naming format with invalid-name +include-naming-hint=no + +# Naming style matching correct inline iteration names +inlinevar-naming-style=any + +# Regular expression matching correct inline iteration names. Overrides +# inlinevar-naming-style +#inlinevar-rgx= + +# Naming style matching correct method names +method-naming-style=snake_case + +# Regular expression matching correct method names. Overrides method-naming- +# style +#method-rgx= + +# Naming style matching correct module names +module-naming-style=snake_case + +# Regular expression matching correct module names. Overrides module-naming- +# style +#module-rgx= + +# Colon-delimited sets of names that determine each other's naming style when +# the name regexes allow several styles. 
+name-group= + +# Regular expression which should only match function or class names that do +# not require a docstring. +no-docstring-rgx=^_ + +# List of decorators that produce properties, such as abc.abstractproperty. Add +# to this list to register other decorators that produce valid properties. +property-classes=abc.abstractproperty + +# Naming style matching correct variable names +variable-naming-style=snake_case + +# Regular expression matching correct variable names. Overrides variable- +# naming-style +variable-rgx=[a-z_][a-z0-9_]{0,30}$ + + +[IMPORTS] + +# Allow wildcard imports from modules that define __all__. +allow-wildcard-with-all=no + +# Analyse import fallback blocks. This can be used to support both Python 2 and +# 3 compatible code, which means that the block might have code that exists +# only in one or another interpreter, leading to false positives when analysed. +analyse-fallback-blocks=no + +# Deprecated modules which should not be used, separated by a comma +deprecated-modules=optparse,tkinter.tix + +# Create a graph of external dependencies in the given file (report RP0402 must +# not be disabled) +ext-import-graph= + +# Create a graph of every (i.e. internal and external) dependencies in the +# given file (report RP0402 must not be disabled) +import-graph= + +# Create a graph of internal dependencies in the given file (report RP0402 must +# not be disabled) +int-import-graph= + +# Force import order to recognize a module as part of the standard +# compatibility libraries. +known-standard-library= + +# Force import order to recognize a module as part of a third party library. +known-third-party=enchant + + +[CLASSES] + +# List of method names used to declare (i.e. assign) instance attributes. +defining-attr-methods=__init__, + __new__, + setUp + +# List of member names, which should be excluded from the protected access +# warning. 
+exclude-protected=_asdict, + _fields, + _replace, + _source, + _make + +# List of valid names for the first argument in a class method. +valid-classmethod-first-arg=cls + +# List of valid names for the first argument in a metaclass class method. +valid-metaclass-classmethod-first-arg=mcs + + +[DESIGN] + +# Maximum number of arguments for function / method +max-args=5 + +# Maximum number of attributes for a class (see R0902). +max-attributes=7 + +# Maximum number of boolean expressions in an if statement +max-bool-expr=5 + +# Maximum number of branches for function / method body +max-branches=12 + +# Maximum number of locals for function / method body +max-locals=15 + +# Maximum number of parents for a class (see R0901). +max-parents=7 + +# Maximum number of public methods for a class (see R0904). +max-public-methods=20 + +# Maximum number of return / yield for function / method body +max-returns=6 + +# Maximum number of statements in function / method body +max-statements=50 + +# Minimum number of public methods for a class (see R0903). +min-public-methods=2 + + +[EXCEPTIONS] + +# Exceptions that will emit a warning when being caught.
Defaults to +# "Exception" +overgeneral-exceptions=Exception diff --git a/.readthedocs.yml b/.readthedocs.yml new file mode 100644 index 0000000..d9b418c --- /dev/null +++ b/.readthedocs.yml @@ -0,0 +1,10 @@ +build: + image: latest + +python: + version: 3.6 + +formats: + - pdf + +requirements_file: requirements/docs.txt diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..104b46e --- /dev/null +++ b/.travis.yml @@ -0,0 +1,177 @@ +language: python +dist: trusty +sudo: false +cache: + ccache: true + directories: + - $HOME/Library/Caches/Homebrew + +env: + global: + - QPDF_RELEASE=https://github.com/qpdf/qpdf/releases/download/release-qpdf-8.3.0/qpdf-8.3.0.tar.gz + - TWINE_USERNAME=ocrmypdf-travis + # TWINE_PASSWORD + - secure: "d1PfCVoqvFFwAqm0QEozLLoUdEHaY0kAvawfc4lKdLSjI+yOJYoNdknU0r3TdwttNEF2VV+aY9q/4wVnYrEiF4L13E3s+UtDqIXwGk/b14JrdExIx/0yj642kbCJPycZqqRZgvGwYuhb6EF7e/QrsNYMwZ95E9oTyWa0ZaEkiPrrVJh9XSNDpw9I8REL3GecpfvF/GpHWX0VBHoaJfCgDzDDvHQPdfIXAZg+OLJOLNrR2ivvUD3gR371376fYPMPNsqMNqBghLdX8lnX2zkEc67An9ZBLi1dx46PhHjn8c06QOBQ25wcwtCxSnaXygkq5HXUXnpWmCbPcy3n98bJBE1P86M1eWo5c3KV4zwY3pvC6/ldFFAX0nC5Qr8xVpiBZIZKhqBEsX7HlCIRdN5OzmWXTkRhO05GtloH+IPuS8PH09vlaGfdCmBdJkvQjnkXL9Jdw5JJcIt9c//CgRjJ4CtHySA3I0XEnqbLHRhsYAyfJfM4ya3ou+eETpWVpnkZ4kbn8fuUkIpZL6YS9XtJCVCfh5uNpJ7BV0DzlZqdV//K3s9CTNyFac0L521YcFRwl0Nb72AlzbtGwWgWh1C8qmlJ/ENf0XI3dafvcqzPL61rwBlo0sah9DWxwUDWMUicUtp8qP5GK8VxHse+QlolJQVSb07jD6bf7+mILX0B3Mg=" + # RTD_TOKEN + - secure: 
"jD4S2pEvuf9oWv7RTWkdgrRw62WoUISiLFnPN9gPJ5ZeqdLeL8r3/+2x7tKa9xo6u6HWKVzo88+gg+HZCHIIZafWqxbhYsdl2g4dp3v807UVdp7rdqd7KkFl8/0XDR9p1J1g0RnRHp2qYtORqPZ0hustXnBQyiA0cA+a7qH9AWfhiU6iUV193YJAwYumSBeDAWgJN6OXAYKaZdLqynC/bmXYMfYM7Xg3sEnKv8eig+gQeSxyN2A/foTG/OgwUc8KVNTvO7w3xqXvfVUc7zm9P8Pdalay0V/8/0DAgZVSLjjnxqfOhmlDfE8OQeDLITBrD2GW0mPlSHP9ic0ymsltayR5Bg8opf3//+HBGoAyVDl0zwDVje4KX1QW2FzRri2ZQ+Lr25nvLkFIwQva2H+PuNUqB3Xw2JzZc4wsEqA83Y8Ijj8Cfc7Xd6hHIttWNe1D+n1tLSf123IW1QNMj7n2xr0AJi3ty1sTnr5WAgBEIR5WSy/roFdbHtLy/8wLE2AtKmhqKWndkSx17K45c35geuxmfthPrh6v1XPAgZZU0sza6dnrdD53WXwV8Z5+y4U+KD9YLf2gydfMSCrImdFWfjy3wMZci8Mn14uAZawwWyo2mfXvRXIVuUQdAx+uPnbkV4KzoAp7Xj7dJlpCxzphnX//1DLczOUJ6EdknbeIH6o=" + +matrix: + include: + - env: PYTHON=3.5 GCC=4.8 + os: linux + sudo: required + python: "3.5" + addons: &gcc48 + apt: + sources: [ubuntu-toolchain-r-test] + packages: [g++-4.8, libexempi-dev] # g++-4.8 supports c++11 but not c++14 at all; g++-4.9 won't work + config: + retries: true + + - env: PYTHON=3.6 GCC=4.8 + os: linux + sudo: required + python: "3.6" + addons: *gcc48 + + - env: PYTHON=3.6 GCC=8 + os: linux + sudo: required + python: "3.6" + addons: + apt: + sources: [ubuntu-toolchain-r-test] + packages: [g++-8, libexempi-dev] # g++-7 supports c++14 + config: + retries: true + + - env: PYTHON=3.7 GCC=5 + os: linux + sudo: required + dist: xenial + python: "3.7" + addons: &gcc5_xenial + apt: + sources: + - sourceline: 'ppa:deadsnakes/ppa' + packages: [python3.7-dev, python3.7-venv, g++-5, libexempi-dev] + config: + retries: true + + - os: osx + osx_image: xcode8.3 + language: generic + addons: + homebrew: + update: true + packages: + - ccache + - exempi + - python + - qpdf + + - stage: wheels + os: osx + osx_image: xcode8.3 + language: cpp # osx + language: python is broken + addons: + homebrew: + update: true + packages: + - python + - qpdf + install: + - clang --version + - $PYTHON_CMD --version + - echo "CC=$CC CXX=$CXX" + - $PYTHON_CMD -m pip install pybind11 cibuildwheel==0.10.0 
setuptools_scm + script: + - $PYTHON_CMD setup.py sdist -d wheelhouse + - export CCACHE_BASEDIR=`python3 -c "import tempfile; import os; print(os.path.realpath(tempfile.gettempdir()))"` + - export CIBW_BEFORE_BUILD='pip install pybind11 setuptools_scm pytest-runner' + - export CIBW_SKIP="cp27-* cp34-*" + - cibuildwheel --output-dir wheelhouse + - | + if [[ $TRAVIS_TAG ]]; then + $PYTHON_CMD -m pip install twine + $PYTHON_CMD -m twine upload wheelhouse/*.whl wheelhouse/*.tar.gz + fi + + - stage: wheels + os: linux + python: "3.6" # only need one environment to build all linux wheels + sudo: required + services: + - docker + install: + - $PYTHON_CMD -m pip install cibuildwheel==0.10.0 + #- mkdir gcc-x86_64 && wget -q https://github.com/Noctem/pogeo-toolchain/releases/download/v1.4/gcc-7.2-binutils-2.29-centos5-x86-64.tar.bz2 -O - | tar xj -C gcc-x86_64 --strip-components=1 + #- mkdir ccache && wget -q https://www.samba.org/ftp/ccache/ccache-3.3.4.tar.bz2 -O - | tar xj -C ccache --strip-components=1 + - mkdir qpdf && wget -q $QPDF_RELEASE -O - | tar xz -C qpdf --strip-components=1 + script: + #- export CIBW_ENVIRONMENT='ARCH=x86_64 PATH="`pwd`/gcc-$ARCH/bin:$PATH" CC="ccache cc" CXX="ccache c++" CXXFLAGS="-static-libstdc++" LD_LIBRARY_PATH="`pwd`/gcc-$ARCH/lib" CCACHE_BASEDIR=`python -c "import tempfile; import os; print(os.path.realpath(tempfile.gettempdir()))"` CCACHE_DIR=/host'$HOME'/.ccache' + #- export CIBW_BEFORE_BUILD='pip install pybind11 && [ -d ccache/$ARCH ] || (unset CC CXX CXXFLAGS && mkdir ccache/$ARCH && cd ccache/$ARCH && ../configure && make install)' + #- export CIBW_ENVIRONMENT='ARCH=x86_64 PATH="`pwd`/gcc-$ARCH/bin:$PATH" CC="cc" CXX="c++" CXXFLAGS="-static-libstdc++" LD_LIBRARY_PATH="`pwd`/gcc-$ARCH/lib:$LD_LIBRARY_PATH"' + - export CIBW_BEFORE_BUILD='yum install -y libjpeg-devel zlib-devel && cd qpdf && ./autogen.sh && ./configure && make install && cd .. 
&& pip install pybind11' + - export CIBW_SKIP="cp27-* cp34-* *i686" + - cibuildwheel --output-dir wheelhouse + - | + if [[ $TRAVIS_TAG ]]; then + $PYTHON_CMD -m pip install twine + $PYTHON_CMD -m twine upload wheelhouse/*.whl + fi + after_script: + - curl -X POST -d "token=$RTD_TOKEN" https://readthedocs.org/api/v2/webhook/pikepdf/39557/ + +stages: + - name: test + - name: wheels + + +before_install: +- | + if [ "$TRAVIS_OS_NAME" = "linux" ]; then + if [ -n "$GCC" ]; then + export CC="gcc-$GCC" + export CXX="g++-$GCC" + export LD_LIBRARY_PATH="/lib:/usr/lib:/usr/local/lib" + elif [ -n "$CLANG" ]; then + CC="ccache clang-$CLANG" + CXX="ccache clang++-$CLANG" + export CFLAGS="-Qunused-arguments" + export CXXFLAGS="-Qunused-arguments -stdlib=libc++" + fi + if [ "$PYTHON" == "3.5" ] || [ "$PYTHON" == "3.7" ]; then + python$PYTHON -m ensurepip + elif [ "$PYTHON" = "3.6" ]; then + pyenv global system $PYTHON # https://github.com/travis-ci/travis-ci/issues/8363 + fi + PYTHON_CMD=python$PYTHON + elif [ "$TRAVIS_OS_NAME" == "osx" ]; then + export PATH=/usr/local/opt/ccache/libexec:/usr/local/bin:$PATH + CC=clang + CXX=clang++ + PYTHON_CMD=python3 + fi + $PYTHON_CMD -m pip install --upgrade pip wheel + +install: +- | + if [ "$TRAVIS_OS_NAME" = "linux" ]; then + mkdir qpdf + wget -q $QPDF_RELEASE -O - | tar xz -C qpdf --strip-components=1 + cd qpdf/ + ./configure CC="ccache $CC" CXX="ccache $CXX" + make -j 2 + sudo make install + cd .. + elif [ "$TRAVIS_OS_NAME" = "osx" ]; then + : # no op for now + fi +- $PYTHON_CMD setup.py sdist +- $PYTHON_CMD -m pip install pybind11 +- $PYTHON_CMD -m pip install --verbose dist/*.tar.gz +- $PYTHON_CMD -m pip install -r requirements/test.txt + +script: +- python3 -m pytest diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 0000000..a612ad9 --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,373 @@ +Mozilla Public License Version 2.0 +================================== + +1. Definitions +-------------- + +1.1. 
"Contributor" + means each individual or legal entity that creates, contributes to + the creation of, or owns Covered Software. + +1.2. "Contributor Version" + means the combination of the Contributions of others (if any) used + by a Contributor and that particular Contributor's Contribution. + +1.3. "Contribution" + means Covered Software of a particular Contributor. + +1.4. "Covered Software" + means Source Code Form to which the initial Contributor has attached + the notice in Exhibit A, the Executable Form of such Source Code + Form, and Modifications of such Source Code Form, in each case + including portions thereof. + +1.5. "Incompatible With Secondary Licenses" + means + + (a) that the initial Contributor has attached the notice described + in Exhibit B to the Covered Software; or + + (b) that the Covered Software was made available under the terms of + version 1.1 or earlier of the License, but not also under the + terms of a Secondary License. + +1.6. "Executable Form" + means any form of the work other than Source Code Form. + +1.7. "Larger Work" + means a work that combines Covered Software with other material, in + a separate file or files, that is not Covered Software. + +1.8. "License" + means this document. + +1.9. "Licensable" + means having the right to grant, to the maximum extent possible, + whether at the time of the initial grant or subsequently, any and + all of the rights conveyed by this License. + +1.10. "Modifications" + means any of the following: + + (a) any file in Source Code Form that results from an addition to, + deletion from, or modification of the contents of Covered + Software; or + + (b) any new file in Source Code Form that contains any Covered + Software. + +1.11. 
"Patent Claims" of a Contributor + means any patent claim(s), including without limitation, method, + process, and apparatus claims, in any patent Licensable by such + Contributor that would be infringed, but for the grant of the + License, by the making, using, selling, offering for sale, having + made, import, or transfer of either its Contributions or its + Contributor Version. + +1.12. "Secondary License" + means either the GNU General Public License, Version 2.0, the GNU + Lesser General Public License, Version 2.1, the GNU Affero General + Public License, Version 3.0, or any later versions of those + licenses. + +1.13. "Source Code Form" + means the form of the work preferred for making modifications. + +1.14. "You" (or "Your") + means an individual or a legal entity exercising rights under this + License. For legal entities, "You" includes any entity that + controls, is controlled by, or is under common control with You. For + purposes of this definition, "control" means (a) the power, direct + or indirect, to cause the direction or management of such entity, + whether by contract or otherwise, or (b) ownership of more than + fifty percent (50%) of the outstanding shares or beneficial + ownership of such entity. + +2. License Grants and Conditions +-------------------------------- + +2.1. Grants + +Each Contributor hereby grants You a world-wide, royalty-free, +non-exclusive license: + +(a) under intellectual property rights (other than patent or trademark) + Licensable by such Contributor to use, reproduce, make available, + modify, display, perform, distribute, and otherwise exploit its + Contributions, either on an unmodified basis, with Modifications, or + as part of a Larger Work; and + +(b) under Patent Claims of such Contributor to make, use, sell, offer + for sale, have made, import, and otherwise transfer either its + Contributions or its Contributor Version. + +2.2. 
Effective Date + +The licenses granted in Section 2.1 with respect to any Contribution +become effective for each Contribution on the date the Contributor first +distributes such Contribution. + +2.3. Limitations on Grant Scope + +The licenses granted in this Section 2 are the only rights granted under +this License. No additional rights or licenses will be implied from the +distribution or licensing of Covered Software under this License. +Notwithstanding Section 2.1(b) above, no patent license is granted by a +Contributor: + +(a) for any code that a Contributor has removed from Covered Software; + or + +(b) for infringements caused by: (i) Your and any other third party's + modifications of Covered Software, or (ii) the combination of its + Contributions with other software (except as part of its Contributor + Version); or + +(c) under Patent Claims infringed by Covered Software in the absence of + its Contributions. + +This License does not grant any rights in the trademarks, service marks, +or logos of any Contributor (except as may be necessary to comply with +the notice requirements in Section 3.4). + +2.4. Subsequent Licenses + +No Contributor makes additional grants as a result of Your choice to +distribute the Covered Software under a subsequent version of this +License (see Section 10.2) or under the terms of a Secondary License (if +permitted under the terms of Section 3.3). + +2.5. Representation + +Each Contributor represents that the Contributor believes its +Contributions are its original creation(s) or it has sufficient rights +to grant the rights to its Contributions conveyed by this License. + +2.6. Fair Use + +This License is not intended to limit any rights You have under +applicable copyright doctrines of fair use, fair dealing, or other +equivalents. + +2.7. Conditions + +Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted +in Section 2.1. + +3. Responsibilities +------------------- + +3.1. 
Distribution of Source Form + +All distribution of Covered Software in Source Code Form, including any +Modifications that You create or to which You contribute, must be under +the terms of this License. You must inform recipients that the Source +Code Form of the Covered Software is governed by the terms of this +License, and how they can obtain a copy of this License. You may not +attempt to alter or restrict the recipients' rights in the Source Code +Form. + +3.2. Distribution of Executable Form + +If You distribute Covered Software in Executable Form then: + +(a) such Covered Software must also be made available in Source Code + Form, as described in Section 3.1, and You must inform recipients of + the Executable Form how they can obtain a copy of such Source Code + Form by reasonable means in a timely manner, at a charge no more + than the cost of distribution to the recipient; and + +(b) You may distribute such Executable Form under the terms of this + License, or sublicense it under different terms, provided that the + license for the Executable Form does not attempt to limit or alter + the recipients' rights in the Source Code Form under this License. + +3.3. Distribution of a Larger Work + +You may create and distribute a Larger Work under terms of Your choice, +provided that You also comply with the requirements of this License for +the Covered Software. If the Larger Work is a combination of Covered +Software with a work governed by one or more Secondary Licenses, and the +Covered Software is not Incompatible With Secondary Licenses, this +License permits You to additionally distribute such Covered Software +under the terms of such Secondary License(s), so that the recipient of +the Larger Work may, at their option, further distribute the Covered +Software under the terms of either this License or such Secondary +License(s). + +3.4. 
Notices + +You may not remove or alter the substance of any license notices +(including copyright notices, patent notices, disclaimers of warranty, +or limitations of liability) contained within the Source Code Form of +the Covered Software, except that You may alter any license notices to +the extent required to remedy known factual inaccuracies. + +3.5. Application of Additional Terms + +You may choose to offer, and to charge a fee for, warranty, support, +indemnity or liability obligations to one or more recipients of Covered +Software. However, You may do so only on Your own behalf, and not on +behalf of any Contributor. You must make it absolutely clear that any +such warranty, support, indemnity, or liability obligation is offered by +You alone, and You hereby agree to indemnify every Contributor for any +liability incurred by such Contributor as a result of warranty, support, +indemnity or liability terms You offer. You may include additional +disclaimers of warranty and limitations of liability specific to any +jurisdiction. + +4. Inability to Comply Due to Statute or Regulation +--------------------------------------------------- + +If it is impossible for You to comply with any of the terms of this +License with respect to some or all of the Covered Software due to +statute, judicial order, or regulation then You must: (a) comply with +the terms of this License to the maximum extent possible; and (b) +describe the limitations and the code they affect. Such description must +be placed in a text file included with all distributions of the Covered +Software under this License. Except to the extent prohibited by statute +or regulation, such description must be sufficiently detailed for a +recipient of ordinary skill to be able to understand it. + +5. Termination +-------------- + +5.1. The rights granted under this License will terminate automatically +if You fail to comply with any of its terms. 
However, if You become +compliant, then the rights granted under this License from a particular +Contributor are reinstated (a) provisionally, unless and until such +Contributor explicitly and finally terminates Your grants, and (b) on an +ongoing basis, if such Contributor fails to notify You of the +non-compliance by some reasonable means prior to 60 days after You have +come back into compliance. Moreover, Your grants from a particular +Contributor are reinstated on an ongoing basis if such Contributor +notifies You of the non-compliance by some reasonable means, this is the +first time You have received notice of non-compliance with this License +from such Contributor, and You become compliant prior to 30 days after +Your receipt of the notice. + +5.2. If You initiate litigation against any entity by asserting a patent +infringement claim (excluding declaratory judgment actions, +counter-claims, and cross-claims) alleging that a Contributor Version +directly or indirectly infringes any patent, then the rights granted to +You by any and all Contributors for the Covered Software under Section +2.1 of this License shall terminate. + +5.3. In the event of termination under Sections 5.1 or 5.2 above, all +end user license agreements (excluding distributors and resellers) which +have been validly granted by You or Your distributors under this License +prior to termination shall survive termination. + +************************************************************************ +* * +* 6. Disclaimer of Warranty * +* ------------------------- * +* * +* Covered Software is provided under this License on an "as is" * +* basis, without warranty of any kind, either expressed, implied, or * +* statutory, including, without limitation, warranties that the * +* Covered Software is free of defects, merchantable, fit for a * +* particular purpose or non-infringing. The entire risk as to the * +* quality and performance of the Covered Software is with You. 
* +* Should any Covered Software prove defective in any respect, You * +* (not any Contributor) assume the cost of any necessary servicing, * +* repair, or correction. This disclaimer of warranty constitutes an * +* essential part of this License. No use of any Covered Software is * +* authorized under this License except under this disclaimer. * +* * +************************************************************************ + +************************************************************************ +* * +* 7. Limitation of Liability * +* -------------------------- * +* * +* Under no circumstances and under no legal theory, whether tort * +* (including negligence), contract, or otherwise, shall any * +* Contributor, or anyone who distributes Covered Software as * +* permitted above, be liable to You for any direct, indirect, * +* special, incidental, or consequential damages of any character * +* including, without limitation, damages for lost profits, loss of * +* goodwill, work stoppage, computer failure or malfunction, or any * +* and all other commercial damages or losses, even if such party * +* shall have been informed of the possibility of such damages. This * +* limitation of liability shall not apply to liability for death or * +* personal injury resulting from such party's negligence to the * +* extent applicable law prohibits such limitation. Some * +* jurisdictions do not allow the exclusion or limitation of * +* incidental or consequential damages, so this exclusion and * +* limitation may not apply to You. * +* * +************************************************************************ + +8. Litigation +------------- + +Any litigation relating to this License may be brought only in the +courts of a jurisdiction where the defendant maintains its principal +place of business and such litigation shall be governed by laws of that +jurisdiction, without reference to its conflict-of-law provisions. 
+Nothing in this Section shall prevent a party's ability to bring +cross-claims or counter-claims. + +9. Miscellaneous +---------------- + +This License represents the complete agreement concerning the subject +matter hereof. If any provision of this License is held to be +unenforceable, such provision shall be reformed only to the extent +necessary to make it enforceable. Any law or regulation which provides +that the language of a contract shall be construed against the drafter +shall not be used to construe this License against a Contributor. + +10. Versions of the License +--------------------------- + +10.1. New Versions + +Mozilla Foundation is the license steward. Except as provided in Section +10.3, no one other than the license steward has the right to modify or +publish new versions of this License. Each version will be given a +distinguishing version number. + +10.2. Effect of New Versions + +You may distribute the Covered Software under the terms of the version +of the License under which You originally received the Covered Software, +or under the terms of any subsequent version published by the license +steward. + +10.3. Modified Versions + +If you create software not governed by this License, and you want to +create a new license for such software, you may create and use a +modified version of this License if you rename the license and remove +any references to the name of the license steward (except to note that +such modified license differs from this License). + +10.4. Distributing Source Code Form that is Incompatible With Secondary +Licenses + +If You choose to distribute Source Code Form that is Incompatible With +Secondary Licenses under the terms of this version of the License, the +notice described in Exhibit B of this License must be attached. + +Exhibit A - Source Code Form License Notice +------------------------------------------- + + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. 
If a copy of the MPL was not distributed with this + file, You can obtain one at http://mozilla.org/MPL/2.0/. + +If it is not possible or desirable to put the notice in a particular +file, then You may include the notice in a location (such as a LICENSE +file in a relevant directory) where a recipient would be likely to look +for such a notice. + +You may add additional accurate notices of copyright ownership. + +Exhibit B - "Incompatible With Secondary Licenses" Notice +--------------------------------------------------------- + + This Source Code Form is "Incompatible With Secondary Licenses", as + defined by the Mozilla Public License, v. 2.0. diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..f483ae9 --- /dev/null +++ b/Makefile @@ -0,0 +1,60 @@ +# This is really just for testing + +.PHONY: +all: build + +.PHONY: invalidate-cppcov +invalidate-cppcov: + find . -name "*.gcno" -print0 | xargs -0 rm + +.PHONY: build +build: invalidate-cppcov + python setup.py build_ext --inplace + +.PHONY: clean-coverage-pycov +clean-coverage-pycov: + rm -rf coverage/pycov + rm -f .coverage + +.PHONY: clean-coverage-cppcov +clean-coverage-cppcov: + rm -rf coverage/cppcov + find . -name "*.gcda" -print0 | xargs -0 rm + rm -f coverage/cpp.info + +.PHONY: clean-coverage +clean-coverage: clean-coverage-cppcov clean-coverage-pycov + +.PHONY: clean +clean: clean-coverage + python setup.py clean --all + +.PHONY: test +test: build + pytest -n auto + +.PHONY: pycov +pycov: clean-coverage-pycov + pytest --cov-report html --cov=src -n auto + +.PHONY: build-cppcov +build-cppcov: + env CFLAGS="-coverage" python setup.py build_ext --inplace + +coverage/cpp.info: clean-coverage-cppcov build-cppcov pycov + lcov --no-external --capture --directory . 
--output-file coverage/cppall.info + lcov --remove coverage/cppall.info '*/pybind11/*' -o coverage/cpp.info + +coverage/cppcov: coverage/cpp.info + -mkdir -p coverage/cppcov + genhtml coverage/cpp.info --output-directory coverage/cppcov + +.PHONY: cppcov +cppcov: clean-coverage-cppcov build-cppcov pycov coverage/cppcov + +.PHONY: coverage +coverage: cppcov pycov + +.PHONY: docs +docs: + $(MAKE) -C docs html diff --git a/README.md b/README.md new file mode 100644 index 0000000..3325a20 --- /dev/null +++ b/README.md @@ -0,0 +1,72 @@ +pikepdf +======= + +**pikepdf** is a Python library for reading and writing PDF files. + +[![Travis CI build status (Linux and macOS)](https://img.shields.io/travis/pikepdf/pikepdf/master.svg?label=Linux%2fmacOS%20build)](https://travis-ci.org/pikepdf/pikepdf) [![AppVeyor CI build status (Windows)](https://img.shields.io/appveyor/ci/jbarlow83/pikepdf/master.svg?label=Windows%20build)](https://ci.appveyor.com/project/jbarlow83/pikepdf) [![PyPI](https://img.shields.io/pypi/v/pikepdf.svg)](https://pypi.org/project/pikepdf/) + +pikepdf is based on [QPDF](https://github.com/qpdf/qpdf), a powerful PDF manipulation and repair library. + +Python + QPDF = "py" + "qpdf" = "pyqpdf", which looks like a dyslexia test. Say it out loud, and it sounds like "pikepdf". + +```python +# Elegant, Pythonic API +pdf = pikepdf.open('input.pdf') +num_pages = len(pdf.pages) +del pdf.pages[-1] +pdf.save('output.pdf') +``` + +**To install:** + +Python 3.5, 3.6 and 3.7 are fully supported. + +```bash +pip install pikepdf +``` + +For users who want to build from source, see [installation](https://pikepdf.readthedocs.io/en/latest/index.html). + +pikepdf is [documented](https://pikepdf.readthedocs.io/en/latest/index.html) and actively maintained. Commercial support is available. + +Features +-------- + +This library is similar to PyPDF2 and pdfrw - it provides low level access to PDF features and allows editing and content transformation of existing PDFs. 
Some knowledge of the PDF specification may be helpful. It does not have the capability to render a PDF to image. + +Python 2.7 and earlier versions of Python 3 are not currently supported but support is probably not difficult to achieve. Pull requests are welcome. + +| **Feature** | **pikepdf** | **PyPDF2** | **pdfrw** | +|---------------------------------------------------------------------|-------------------------------------|-------------------------------------------|-----------------------------------------| +| Editing, manipulation and transformation of existing PDFs | ✔ | ✔ | ✔ | +| Based on an existing, mature PDF library | QPDF | ✘ | ✘ | +| Implementation | C++ and Python | Python | Python | +| PDF versions supported | 1.1 to 1.7 | 1.3? | 1.7 | +| Python versions supported | 3.5-3.7 | 2.6-3.6 | 2.6-3.6 | +| Supports password protected (encrypted) PDFs | ✔ (except public key) | Only obsolete RC4 | ✘ | +| Save and load PDF compressed object streams (PDF 1.5) | ✔ | ✘ | ✘ | +| Creates linearized ("fast web view") PDFs | ✔ | ✘ | ✘ | +| Actively maintained | ![pikepdf commit activity][pikepdf-commits] | ![PyPDF2 commit activity][pypdf2-commits] | ![pdfrw commit activity][pdfrw-commits] | +| Test suite coverage | ~86% | very low | unknown | +| Creates PDFs that pass PDF validation tests | ✔ | ✘ | ? | +| Modifies PDF/A without breaking PDF/A compliance | ✔ | ✘ | ? 
| +| Automatically repairs PDFs with internal errors | ✔ | ✘ | ✘ | +| PDF XMP metadata editing | ✔ | read-only | ✘ | +| Documentation | ✔ | ✘ | ✔ | +| Integrates with Jupyter and IPython notebooks for rapid development | ✔ | ✘ | ✘ | + + +[pikepdf-commits]: https://img.shields.io/github/commit-activity/y/pikepdf/pikepdf.svg + +[pypdf2-commits]: https://img.shields.io/github/commit-activity/y/mstamy2/PyPDF2.svg + +[pdfrw-commits]: https://img.shields.io/github/commit-activity/y/pmaupin/pdfrw.svg + +License +------- + +pikepdf is provided under the [Mozilla Public License 2.0](https://www.mozilla.org/en-US/MPL/2.0/) license (MPL) that can be found in the LICENSE file. By using, distributing, or contributing to this project, you agree to the terms and conditions of this license. + +[Informally](https://www.mozilla.org/en-US/MPL/2.0/FAQ/), MPL 2.0 is not a "viral" license. It may be combined with other work, including commercial software. However, you must disclose your modifications *to pikepdf* in source code form. In other words, fork this repository on GitHub or elsewhere and commit your contributions there, and you've satisfied your obligations. MPL 2.0 is compatible with the GPL and LGPL - see the [guidelines](https://www.mozilla.org/en-US/MPL/2.0/combining-mpl-and-gpl/) for notes on use in GPL. + +The `tests/resources/copyright` file describes licensing terms for the test suite and the provenance of test resources. diff --git a/debian/copyright b/debian/copyright new file mode 100644 index 0000000..3317fdd --- /dev/null +++ b/debian/copyright @@ -0,0 +1,770 @@ +Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ +Upstream-Name: pikepdf +Source: https://github.com/pikepdf/pikepdf + +Files: * +Copyright: (C) 2017 James R. Barlow +License: MPL-2.0 +Comment: + The file licenses/license.wheel.txt is relevant only when a binary + artifact is produced from the combination of the source code of + pikepdf and the source code of qpdf.
Nothing in pikepdf is Apache + licensed. + +Files: debian/* +Copyright: (C) 2018 Sean Whitton +License: MPL-2.0 + +Files: docs/images/pike.jpg tests/resources/pike-jp2.pdf +Copyright: Public domain +License: public-domain + From the U.S. Fish and Wildlife Service National Image Library. + . + See: https://en.wikipedia.org/wiki/File:Esox_lucius1.jpg +Comment: Maximum resolution version is in debian/missing-sources/. + +Files: docs/images/pike-cartoon.png +Copyright: (C) 2017 creozavr +License: CC0-1.0 + See: https://pixabay.com/en/pike-fish-predator-shchuchin-2612354/ + +Files: docs/images/pikemen.jpg +Copyright: (C) 2009 Rama +License: CC-BY-SA 2.0 + See: https://commons.wikimedia.org/wiki/File:Pike_square_img_3653.jpg + +Files: tests/*.py +Copyright: (C) 2017 James R. Barlow +License: CC0-1.0 + +Files: tests/resources/* +Copyright: (C) 2017 James R. Barlow +License: CC-BY-4.0 + +Files: tests/resources/congress.pdf docs/images/congress_im0.jpg +Copyright: Public domain +License: public-domain + From US Congressional Records. +Comment: Converted from JPEG to PDF. + +Files: tests/resources/enron1.pdf +Copyright: EnronData.org +License: CC-BY-3.0 + See: https://enrondata.readthedocs.io/en/latest/data/edo-enron-email-pst-dataset/ +Comment: + enron*_gs.pdf: processed by Ghostscript 9.26. + Original obtained from http://datasets.opentestset.com/datasets/Enron_files/full/williams-b/Alstom%20Power.pdf + +Files: tests/resources/graph*.pdf +Copyright: Public domain +License: public-domain + Released into the public domain by author; see: + . +Comment: + For -encrypted.pdf, user password is "user" and owner password is "owner". 
+ +Files: tests/resources/veraPDF*.pdf +Copyright: (C) 2015 veraPDF Consortium +License: CC-BY-4.0 +Comment: + Obtained from: https://github.com/veraPDF/veraPDF-corpus + +Files: tests/resources/sandwich.pdf +Copyright: (C) 1985 Forat Electronics +License: GFDL-1.2+ or CC-BY-SA-3.0 +Comment: + Created using ocrmypdf --pdf-renderer sandwich, to test Tesseract PDF + text encoding. + . + Originally obtained from: https://commons.wikimedia.org/wiki/File:LinnSequencer_hardware_MIDI_sequencer_brochure_page_2_300dpi.jpg + . + A copy of that JPEG is included in debian/missing-sources/. + +License: MPL-2.0 + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. + . + On Debian systems the full text of the MPL-2.0 can be found in + /usr/share/common-licenses/MPL-2.0. + +License: CC0-1.0 + To the extent possible under law, the author(s) have dedicated all copyright + and related and neighboring rights to this software to the public domain + worldwide. This software is distributed without any warranty. + . + On Debian systems the full text of the CC0-1.0 license can be found + in /usr/share/common-licenses/CC0-1.0 + +License: CC-BY-4.0 + Creative Commons Attribution 4.0 International Public License + . + By exercising the Licensed Rights (defined below), You accept and agree + to be bound by the terms and conditions of this Creative Commons + Attribution 4.0 International Public License ("Public License"). To the + extent this Public License may be interpreted as a contract, You are + granted the Licensed Rights in consideration of Your acceptance of + these terms and conditions, and the Licensor grants You such rights in + consideration of benefits the Licensor receives from making the + Licensed Material available under these terms and conditions. + . + Section 1 -- Definitions. + . + a. 
Adapted Material means material subject to Copyright and Similar + Rights that is derived from or based upon the Licensed Material + and in which the Licensed Material is translated, altered, + arranged, transformed, or otherwise modified in a manner requiring + permission under the Copyright and Similar Rights held by the + Licensor. For purposes of this Public License, where the Licensed + Material is a musical work, performance, or sound recording, + Adapted Material is always produced where the Licensed Material is + synched in timed relation with a moving image. + . + b. Adapter's License means the license You apply to Your Copyright + and Similar Rights in Your contributions to Adapted Material in + accordance with the terms and conditions of this Public License. + . + c. Copyright and Similar Rights means copyright and/or similar rights + closely related to copyright including, without limitation, + performance, broadcast, sound recording, and Sui Generis Database + Rights, without regard to how the rights are labeled or + categorized. For purposes of this Public License, the rights + specified in Section 2(b)(1)-(2) are not Copyright and Similar + Rights. + . + d. Effective Technological Measures means those measures that, in the + absence of proper authority, may not be circumvented under laws + fulfilling obligations under Article 11 of the WIPO Copyright + Treaty adopted on December 20, 1996, and/or similar international + agreements. + . + e. Exceptions and Limitations means fair use, fair dealing, and/or + any other exception or limitation to Copyright and Similar Rights + that applies to Your use of the Licensed Material. + . + f. Licensed Material means the artistic or literary work, database, + or other material to which the Licensor applied this Public + License. + . + g. 
Licensed Rights means the rights granted to You subject to the + terms and conditions of this Public License, which are limited to + all Copyright and Similar Rights that apply to Your use of the + Licensed Material and that the Licensor has authority to license. + . + h. Licensor means the individual(s) or entity(ies) granting rights + under this Public License. + . + i. Share means to provide material to the public by any means or + process that requires permission under the Licensed Rights, such + as reproduction, public display, public performance, distribution, + dissemination, communication, or importation, and to make material + available to the public including in ways that members of the + public may access the material from a place and at a time + individually chosen by them. + . + j. Sui Generis Database Rights means rights other than copyright + resulting from Directive 96/9/EC of the European Parliament and of + the Council of 11 March 1996 on the legal protection of databases, + as amended and/or succeeded, as well as other essentially + equivalent rights anywhere in the world. + . + k. You means the individual or entity exercising the Licensed Rights + under this Public License. Your has a corresponding meaning. + . + Section 2 -- Scope. + . + a. License grant. + . + 1. Subject to the terms and conditions of this Public License, + the Licensor hereby grants You a worldwide, royalty-free, + non-sublicensable, non-exclusive, irrevocable license to + exercise the Licensed Rights in the Licensed Material to: + . + a. reproduce and Share the Licensed Material, in whole or + in part; and + . + b. produce, reproduce, and Share Adapted Material. + . + 2. Exceptions and Limitations. For the avoidance of doubt, where + Exceptions and Limitations apply to Your use, this Public + License does not apply, and You do not need to comply with + its terms and conditions. + . + 3. Term. The term of this Public License is specified in Section + 6(a). + . + 4. 
Media and formats; technical modifications allowed. The + Licensor authorizes You to exercise the Licensed Rights in + all media and formats whether now known or hereafter created, + and to make technical modifications necessary to do so. The + Licensor waives and/or agrees not to assert any right or + authority to forbid You from making technical modifications + necessary to exercise the Licensed Rights, including + technical modifications necessary to circumvent Effective + Technological Measures. For purposes of this Public License, + simply making modifications authorized by this Section 2(a) + (4) never produces Adapted Material. + . + 5. Downstream recipients. + . + a. Offer from the Licensor -- Licensed Material. Every + recipient of the Licensed Material automatically + receives an offer from the Licensor to exercise the + Licensed Rights under the terms and conditions of this + Public License. + . + b. No downstream restrictions. You may not offer or impose + any additional or different terms or conditions on, or + apply any Effective Technological Measures to, the + Licensed Material if doing so restricts exercise of the + Licensed Rights by any recipient of the Licensed + Material. + . + 6. No endorsement. Nothing in this Public License constitutes or + may be construed as permission to assert or imply that You + are, or that Your use of the Licensed Material is, connected + with, or sponsored, endorsed, or granted official status by, + the Licensor or others designated to receive attribution as + provided in Section 3(a)(1)(A)(i). + . + b. Other rights. + . + 1. Moral rights, such as the right of integrity, are not + licensed under this Public License, nor are publicity, + privacy, and/or other similar personality rights; however, to + the extent possible, the Licensor waives and/or agrees not to + assert any such rights held by the Licensor to the limited + extent necessary to allow You to exercise the Licensed + Rights, but not otherwise. + . + 2. 
Patent and trademark rights are not licensed under this + Public License. + . + 3. To the extent possible, the Licensor waives any right to + collect royalties from You for the exercise of the Licensed + Rights, whether directly or through a collecting society + under any voluntary or waivable statutory or compulsory + licensing scheme. In all other cases the Licensor expressly + reserves any right to collect such royalties. + . + Section 3 -- License Conditions. + . + Your exercise of the Licensed Rights is expressly made subject to the + following conditions. + . + a. Attribution. + . + 1. If You Share the Licensed Material (including in modified + form), You must: + . + a. retain the following if it is supplied by the Licensor + with the Licensed Material: + . + i. identification of the creator(s) of the Licensed + Material and any others designated to receive + attribution, in any reasonable manner requested by + the Licensor (including by pseudonym if + designated); + . + ii. a copyright notice; + . + iii. a notice that refers to this Public License; + . + iv. a notice that refers to the disclaimer of + warranties; + . + v. a URI or hyperlink to the Licensed Material to the + extent reasonably practicable; + . + b. indicate if You modified the Licensed Material and + retain an indication of any previous modifications; and + . + c. indicate the Licensed Material is licensed under this + Public License, and include the text of, or the URI or + hyperlink to, this Public License. + . + 2. You may satisfy the conditions in Section 3(a)(1) in any + reasonable manner based on the medium, means, and context in + which You Share the Licensed Material. For example, it may be + reasonable to satisfy the conditions by providing a URI or + hyperlink to a resource that includes the required + information. + . + 3. If requested by the Licensor, You must remove any of the + information required by Section 3(a)(1)(A) to the extent + reasonably practicable. + . + 4. 
If You Share Adapted Material You produce, the Adapter's + License You apply must not prevent recipients of the Adapted + Material from complying with this Public License. + . + Section 4 -- Sui Generis Database Rights. + . + Where the Licensed Rights include Sui Generis Database Rights that + apply to Your use of the Licensed Material: + . + a. for the avoidance of doubt, Section 2(a)(1) grants You the right + to extract, reuse, reproduce, and Share all or a substantial + portion of the contents of the database; + . + b. if You include all or a substantial portion of the database + contents in a database in which You have Sui Generis Database + Rights, then the database in which You have Sui Generis Database + Rights (but not its individual contents) is Adapted Material; and + . + c. You must comply with the conditions in Section 3(a) if You Share + all or a substantial portion of the contents of the database. + . + For the avoidance of doubt, this Section 4 supplements and does not + replace Your obligations under this Public License where the Licensed + Rights include other Copyright and Similar Rights. + . + Section 5 -- Disclaimer of Warranties and Limitation of Liability. + . + a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE + EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS + AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF + ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, + IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, + WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR + PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, + ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT + KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT + ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. + . + b. 
TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE + TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, + NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, + INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, + COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR + USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN + ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR + DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR + IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. + . + c. The disclaimer of warranties and limitation of liability provided + above shall be interpreted in a manner that, to the extent + possible, most closely approximates an absolute disclaimer and + waiver of all liability. + . + Section 6 -- Term and Termination. + . + a. This Public License applies for the term of the Copyright and + Similar Rights licensed here. However, if You fail to comply with + this Public License, then Your rights under this Public License + terminate automatically. + . + b. Where Your right to use the Licensed Material has terminated under + Section 6(a), it reinstates: + . + 1. automatically as of the date the violation is cured, provided + it is cured within 30 days of Your discovery of the + violation; or + . + 2. upon express reinstatement by the Licensor. + . + For the avoidance of doubt, this Section 6(b) does not affect any + right the Licensor may have to seek remedies for Your violations + of this Public License. + . + c. For the avoidance of doubt, the Licensor may also offer the + Licensed Material under separate terms or conditions or stop + distributing the Licensed Material at any time; however, doing so + will not terminate this Public License. + . + d. Sections 1, 5, 6, 7, and 8 survive termination of this Public + License. + . + Section 7 -- Other Terms and Conditions. + . + a. 
The Licensor shall not be bound by any additional or different + terms or conditions communicated by You unless expressly agreed. + . + b. Any arrangements, understandings, or agreements regarding the + Licensed Material not stated herein are separate from and + independent of the terms and conditions of this Public License. + . + Section 8 -- Interpretation. + . + a. For the avoidance of doubt, this Public License does not, and + shall not be interpreted to, reduce, limit, restrict, or impose + conditions on any use of the Licensed Material that could lawfully + be made without permission under this Public License. + . + b. To the extent possible, if any provision of this Public License is + deemed unenforceable, it shall be automatically reformed to the + minimum extent necessary to make it enforceable. If the provision + cannot be reformed, it shall be severed from this Public License + without affecting the enforceability of the remaining terms and + conditions. + . + c. No term or condition of this Public License will be waived and no + failure to comply consented to unless expressly agreed to by the + Licensor. + . + d. Nothing in this Public License constitutes or may be interpreted + as a limitation upon, or waiver of, any privileges and immunities + that apply to the Licensor or You, including from the legal + processes of any jurisdiction or authority. + +License: GFDL-1.2+ + Permission is granted to copy, distribute and/or modify this document + under the terms of the GNU Free Documentation License, Version 1.2 or + any later version published by the Free Software Foundation; with no + Invariant Sections, no Front-Cover Texts, and no Back-Cover Texts. + . + On Debian systems, the complete text of the GNU Free Documentation + License version 1.2 can be found in + "/usr/share/common-licenses/GFDL-1.2". + +License: CC-BY-SA-3.0 + THE WORK (AS DEFINED BELOW) IS PROVIDED UNDER THE TERMS OF THIS + CREATIVE COMMONS PUBLIC LICENSE ("CCPL" OR "LICENSE"). 
THE WORK IS + PROTECTED BY COPYRIGHT AND/OR OTHER APPLICABLE LAW. ANY USE OF THE + WORK OTHER THAN AS AUTHORIZED UNDER THIS LICENSE OR COPYRIGHT LAW IS + PROHIBITED. + . + BY EXERCISING ANY RIGHTS TO THE WORK PROVIDED HERE, YOU ACCEPT AND + AGREE TO BE BOUND BY THE TERMS OF THIS LICENSE. TO THE EXTENT THIS + LICENSE MAY BE CONSIDERED TO BE A CONTRACT, THE LICENSOR GRANTS YOU + THE RIGHTS CONTAINED HERE IN CONSIDERATION OF YOUR ACCEPTANCE OF SUCH + TERMS AND CONDITIONS. + . + 1. Definitions + . + a. "Adaptation" means a work based upon the Work, or upon the Work and + other pre-existing works, such as a translation, adaptation, + derivative work, arrangement of music or other alterations of a + literary or artistic work, or phonogram or performance and includes + cinematographic adaptations or any other form in which the Work may be + recast, transformed, or adapted including in any form recognizably + derived from the original, except that a work that constitutes a + Collection will not be considered an Adaptation for the purpose of + this License. For the avoidance of doubt, where the Work is a musical + work, performance or phonogram, the synchronization of the Work in + timed-relation with a moving image ("synching") will be considered an + Adaptation for the purpose of this License. + . + b. "Collection" means a collection of literary or artistic works, such + as encyclopedias and anthologies, or performances, phonograms or + broadcasts, or other works or subject matter other than works listed + in Section 1(f) below, which, by reason of the selection and + arrangement of their contents, constitute intellectual creations, in + which the Work is included in its entirety in unmodified form along + with one or more other contributions, each constituting separate and + independent works in themselves, which together are assembled into a + collective whole. 
A work that constitutes a Collection will not be + considered an Adaptation (as defined below) for the purposes of this + License. + . + c. "Creative Commons Compatible License" means a license that is + listed at http://creativecommons.org/compatiblelicenses that has been + approved by Creative Commons as being essentially equivalent to this + License, including, at a minimum, because that license: (i) contains + terms that have the same purpose, meaning and effect as the License + Elements of this License; and, (ii) explicitly permits the relicensing + of adaptations of works made available under that license under this + License or a Creative Commons jurisdiction license with the same + License Elements as this License. + . + d. "Distribute" means to make available to the public the original and + copies of the Work or Adaptation, as appropriate, through sale or + other transfer of ownership. + . + e. "License Elements" means the following high-level license + attributes as selected by Licensor and indicated in the title of this + License: Attribution, ShareAlike. + . + f. "Licensor" means the individual, individuals, entity or entities + that offer(s) the Work under the terms of this License. + . + g. "Original Author" means, in the case of a literary or artistic + work, the individual, individuals, entity or entities who created the + Work or if no individual or entity can be identified, the publisher; + and in addition (i) in the case of a performance the actors, singers, + musicians, dancers, and other persons who act, sing, deliver, declaim, + play in, interpret or otherwise perform literary or artistic works or + expressions of folklore; (ii) in the case of a phonogram the producer + being the person or legal entity who first fixes the sounds of a + performance or other sounds; and, (iii) in the case of broadcasts, the + organization that transmits the broadcast. + . + h. 
"Work" means the literary and/or artistic work offered under the + terms of this License including without limitation any production in + the literary, scientific and artistic domain, whatever may be the mode + or form of its expression including digital form, such as a book, + pamphlet and other writing; a lecture, address, sermon or other work + of the same nature; a dramatic or dramatico-musical work; a + choreographic work or entertainment in dumb show; a musical + composition with or without words; a cinematographic work to which are + assimilated works expressed by a process analogous to cinematography; + a work of drawing, painting, architecture, sculpture, engraving or + lithography; a photographic work to which are assimilated works + expressed by a process analogous to photography; a work of applied + art; an illustration, map, plan, sketch or three-dimensional work + relative to geography, topography, architecture or science; a + performance; a broadcast; a phonogram; a compilation of data to the + extent it is protected as a copyrightable work; or a work performed by + a variety or circus performer to the extent it is not otherwise + considered a literary or artistic work. + . + i. "You" means an individual or entity exercising rights under this + License who has not previously violated the terms of this License with + respect to the Work, or who has received express permission from the + Licensor to exercise rights under this License despite a previous + violation. + . + j. 
"Publicly Perform" means to perform public recitations of the Work + and to communicate to the public those public recitations, by any + means or process, including by wire or wireless means or public + digital performances; to make available to the public Works in such a + way that members of the public may access these Works from a place and + at a place individually chosen by them; to perform the Work to the + public by any means or process and the communication to the public of + the performances of the Work, including by public digital performance; + to broadcast and rebroadcast the Work by any means including signs, + sounds or images. + . + k. "Reproduce" means to make copies of the Work by any means including + without limitation by sound or visual recordings and the right of + fixation and reproducing fixations of the Work, including storage of a + protected performance or phonogram in digital form or other electronic + medium. + . + 2. Fair Dealing Rights. Nothing in this License is intended to reduce, + limit, or restrict any uses free from copyright or rights arising from + limitations or exceptions that are provided for in connection with the + copyright protection under copyright law or other applicable laws. + . + 3. License Grant. Subject to the terms and conditions of this License, + Licensor hereby grants You a worldwide, royalty-free, non-exclusive, + perpetual (for the duration of the applicable copyright) license to + exercise the rights in the Work as stated below: + . + a. to Reproduce the Work, to incorporate the Work into one or more + Collections, and to Reproduce the Work as incorporated in the + Collections; + . + b. to create and Reproduce Adaptations provided that any such + Adaptation, including any translation in any medium, takes reasonable + steps to clearly label, demarcate or otherwise identify that changes + were made to the original Work. 
For example, a translation could be + marked "The original work was translated from English to Spanish," or + a modification could indicate "The original work has been modified."; + . + c. to Distribute and Publicly Perform the Work including as + incorporated in Collections; and, + . + d. to Distribute and Publicly Perform Adaptations. + . + e. For the avoidance of doubt: + . + i. Non-waivable Compulsory License Schemes. In those jurisdictions in + which the right to collect royalties through any statutory or + compulsory licensing scheme cannot be waived, the Licensor reserves + the exclusive right to collect such royalties for any exercise by You + of the rights granted under this License; + . + ii. Waivable Compulsory License Schemes. In those jurisdictions in + which the right to collect royalties through any statutory or + compulsory licensing scheme can be waived, the Licensor waives the + exclusive right to collect such royalties for any exercise by You of + the rights granted under this License; and, + . + iii. Voluntary License Schemes. The Licensor waives the right to + collect royalties, whether individually or, in the event that the + Licensor is a member of a collecting society that administers + voluntary licensing schemes, via that society, from any exercise by + You of the rights granted under this License. + . + The above rights may be exercised in all media and formats whether now + known or hereafter devised. The above rights include the right to make + such modifications as are technically necessary to exercise the rights + in other media and formats. Subject to Section 8(f), all rights not + expressly granted by Licensor are hereby reserved. + . + 4. Restrictions. The license granted in Section 3 above is expressly + made subject to and limited by the following restrictions: + . + a. You may Distribute or Publicly Perform the Work only under the + terms of this License. 
You must include a copy of, or the Uniform + Resource Identifier (URI) for, this License with every copy of the + Work You Distribute or Publicly Perform. You may not offer or impose + any terms on the Work that restrict the terms of this License or the + ability of the recipient of the Work to exercise the rights granted to + that recipient under the terms of the License. You may not sublicense + the Work. You must keep intact all notices that refer to this License + and to the disclaimer of warranties with every copy of the Work You + Distribute or Publicly Perform. When You Distribute or Publicly + Perform the Work, You may not impose any effective technological + measures on the Work that restrict the ability of a recipient of the + Work from You to exercise the rights granted to that recipient under + the terms of the License. This Section 4(a) applies to the Work as + incorporated in a Collection, but this does not require the Collection + apart from the Work itself to be made subject to the terms of this + License. If You create a Collection, upon notice from any Licensor You + must, to the extent practicable, remove from the Collection any credit + as required by Section 4(c), as requested. If You create an + Adaptation, upon notice from any Licensor You must, to the extent + practicable, remove from the Adaptation any credit as required by + Section 4(c), as requested. + . + b. You may Distribute or Publicly Perform an Adaptation only under the + terms of: (i) this License; (ii) a later version of this License with + the same License Elements as this License; (iii) a Creative Commons + jurisdiction license (either this or a later license version) that + contains the same License Elements as this License (e.g., + Attribution-ShareAlike 3.0 US)); (iv) a Creative Commons Compatible + License. If you license the Adaptation under one of the licenses + mentioned in (iv), you must comply with the terms of that license. 
If + you license the Adaptation under the terms of any of the licenses + mentioned in (i), (ii) or (iii) (the "Applicable License"), you must + comply with the terms of the Applicable License generally and the + following provisions: (I) You must include a copy of, or the URI for, + the Applicable License with every copy of each Adaptation You + Distribute or Publicly Perform; (II) You may not offer or impose any + terms on the Adaptation that restrict the terms of the Applicable + License or the ability of the recipient of the Adaptation to exercise + the rights granted to that recipient under the terms of the Applicable + License; (III) You must keep intact all notices that refer to the + Applicable License and to the disclaimer of warranties with every copy + of the Work as included in the Adaptation You Distribute or Publicly + Perform; (IV) when You Distribute or Publicly Perform the Adaptation, + You may not impose any effective technological measures on the + Adaptation that restrict the ability of a recipient of the Adaptation + from You to exercise the rights granted to that recipient under the + terms of the Applicable License. This Section 4(b) applies to the + Adaptation as incorporated in a Collection, but this does not require + the Collection apart from the Adaptation itself to be made subject to + the terms of the Applicable License. + . + c. 
If You Distribute, or Publicly Perform the Work or any Adaptations + or Collections, You must, unless a request has been made pursuant to + Section 4(a), keep intact all copyright notices for the Work and + provide, reasonable to the medium or means You are utilizing: (i) the + name of the Original Author (or pseudonym, if applicable) if supplied, + and/or if the Original Author and/or Licensor designate another party + or parties (e.g., a sponsor institute, publishing entity, journal) for + attribution ("Attribution Parties") in Licensor's copyright notice, + terms of service or by other reasonable means, the name of such party + or parties; (ii) the title of the Work if supplied; (iii) to the + extent reasonably practicable, the URI, if any, that Licensor + specifies to be associated with the Work, unless such URI does not + refer to the copyright notice or licensing information for the Work; + and (iv) , consistent with Ssection 3(b), in the case of an + Adaptation, a credit identifying the use of the Work in the Adaptation + (e.g., "French translation of the Work by Original Author," or + "Screenplay based on original Work by Original Author"). The credit + required by this Section 4(c) may be implemented in any reasonable + manner; provided, however, that in the case of a Adaptation or + Collection, at a minimum such credit will appear, if a credit for all + contributing authors of the Adaptation or Collection appears, then as + part of these credits and in a manner at least as prominent as the + credits for the other contributing authors. 
For the avoidance of + doubt, You may only use the credit required by this Section for the + purpose of attribution in the manner set out above and, by exercising + Your rights under this License, You may not implicitly or explicitly + assert or imply any connection with, sponsorship or endorsement by the + Original Author, Licensor and/or Attribution Parties, as appropriate, + of You or Your use of the Work, without the separate, express prior + written permission of the Original Author, Licensor and/or Attribution + Parties. + . + d. Except as otherwise agreed in writing by the Licensor or as may be + otherwise permitted by applicable law, if You Reproduce, Distribute or + Publicly Perform the Work either by itself or as part of any + Adaptations or Collections, You must not distort, mutilate, modify or + take other derogatory action in relation to the Work which would be + prejudicial to the Original Author's honor or reputation. Licensor + agrees that in those jurisdictions (e.g. Japan), in which any exercise + of the right granted in Section 3(b) of this License (the right to + make Adaptations) would be deemed to be a distortion, mutilation, + modification or other derogatory action prejudicial to the Original + Author's honor and reputation, the Licensor will waive or not assert, + as appropriate, this Section, to the fullest extent permitted by the + applicable national law, to enable You to reasonably exercise Your + right under Section 3(b) of this License (right to make Adaptations) + but not otherwise. + . + 5. Representations, Warranties and Disclaimer + . 
+ UNLESS OTHERWISE MUTUALLY AGREED TO BY THE PARTIES IN WRITING, + LICENSOR OFFERS THE WORK AS-IS AND MAKES NO REPRESENTATIONS OR + WARRANTIES OF ANY KIND CONCERNING THE WORK, EXPRESS, IMPLIED, + STATUTORY OR OTHERWISE, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF + TITLE, MERCHANTIBILITY, FITNESS FOR A PARTICULAR PURPOSE, + NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, ACCURACY, + OR THE PRESENCE OF ABSENCE OF ERRORS, WHETHER OR NOT + DISCOVERABLE. SOME JURISDICTIONS DO NOT ALLOW THE EXCLUSION OF IMPLIED + WARRANTIES, SO SUCH EXCLUSION MAY NOT APPLY TO YOU. + . + 6. Limitation on Liability. EXCEPT TO THE EXTENT REQUIRED BY + APPLICABLE LAW, IN NO EVENT WILL LICENSOR BE LIABLE TO YOU ON ANY + LEGAL THEORY FOR ANY SPECIAL, INCIDENTAL, CONSEQUENTIAL, PUNITIVE OR + EXEMPLARY DAMAGES ARISING OUT OF THIS LICENSE OR THE USE OF THE WORK, + EVEN IF LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. + . + 7. Termination + . + a. This License and the rights granted hereunder will terminate + automatically upon any breach by You of the terms of this + License. Individuals or entities who have received Adaptations or + Collections from You under this License, however, will not have their + licenses terminated provided such individuals or entities remain in + full compliance with those licenses. Sections 1, 2, 5, 6, 7, and 8 + will survive any termination of this License. + . + b. Subject to the above terms and conditions, the license granted here + is perpetual (for the duration of the applicable copyright in the + Work). Notwithstanding the above, Licensor reserves the right to + release the Work under different license terms or to stop distributing + the Work at any time; provided, however that any such election will + not serve to withdraw this License (or any other license that has + been, or is required to be, granted under the terms of this License), + and this License will continue in full force and effect unless + terminated as stated above. 
+ . + 8. Miscellaneous + . + a. Each time You Distribute or Publicly Perform the Work or a + Collection, the Licensor offers to the recipient a license to the Work + on the same terms and conditions as the license granted to You under + this License. + . + b. Each time You Distribute or Publicly Perform an Adaptation, + Licensor offers to the recipient a license to the original Work on the + same terms and conditions as the license granted to You under this + License. + . + c. If any provision of this License is invalid or unenforceable under + applicable law, it shall not affect the validity or enforceability of + the remainder of the terms of this License, and without further action + by the parties to this agreement, such provision shall be reformed to + the minimum extent necessary to make such provision valid and + enforceable. + . + d. No term or provision of this License shall be deemed waived and no + breach consented to unless such waiver or consent shall be in writing + and signed by the party to be charged with such waiver or consent. + . + e. This License constitutes the entire agreement between the parties + with respect to the Work licensed here. There are no understandings, + agreements or representations with respect to the Work not specified + here. Licensor shall not be bound by any additional provisions that + may appear in any communication from You. This License may not be + modified without the mutual written agreement of the Licensor and You. + . + f. The rights granted under, and the subject matter referenced, in + this License were drafted utilizing the terminology of the Berne + Convention for the Protection of Literary and Artistic Works (as + amended on September 28, 1979), the Rome Convention of 1961, the WIPO + Copyright Treaty of 1996, the WIPO Performances and Phonograms Treaty + of 1996 and the Universal Copyright Convention (as revised on July 24, + 1971). 
These rights and subject matter take effect in the relevant + jurisdiction in which the License terms are sought to be enforced + according to the corresponding provisions of the implementation of + those treaty provisions in the applicable national law. If the + standard suite of rights granted under applicable copyright law + includes additional rights not granted under this License, such + additional rights are deemed to be included in the License; this + License is not intended to restrict the license of any rights under + applicable law. diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..4c1a0fe --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,216 @@ +# Makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +PAPER = +BUILDDIR = _build + +# User-friendly check for sphinx-build +ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) +$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) +endif + +# Internal variables. +PAPEROPT_a4 = -D latex_paper_size=a4 +PAPEROPT_letter = -D latex_paper_size=letter +ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . +# the i18n builder cannot share the environment and doctrees with the others +I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
+ +.PHONY: help +help: + @echo "Please use \`make ' where is one of" + @echo " html to make standalone HTML files" + @echo " dirhtml to make HTML files named index.html in directories" + @echo " singlehtml to make a single large HTML file" + @echo " pickle to make pickle files" + @echo " json to make JSON files" + @echo " htmlhelp to make HTML files and a HTML help project" + @echo " qthelp to make HTML files and a qthelp project" + @echo " applehelp to make an Apple Help Book" + @echo " devhelp to make HTML files and a Devhelp project" + @echo " epub to make an epub" + @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" + @echo " latexpdf to make LaTeX files and run them through pdflatex" + @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" + @echo " text to make text files" + @echo " man to make manual pages" + @echo " texinfo to make Texinfo files" + @echo " info to make Texinfo files and run them through makeinfo" + @echo " gettext to make PO message catalogs" + @echo " changes to make an overview of all changed/added/deprecated items" + @echo " xml to make Docutils-native XML files" + @echo " pseudoxml to make pseudoxml-XML files for display purposes" + @echo " linkcheck to check all external links for integrity" + @echo " doctest to run all doctests embedded in the documentation (if enabled)" + @echo " coverage to run coverage check of the documentation (if enabled)" + +.PHONY: clean +clean: + rm -rf $(BUILDDIR)/* + +.PHONY: html +html: + $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." + +.PHONY: dirhtml +dirhtml: + $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." + +.PHONY: singlehtml +singlehtml: + $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml + @echo + @echo "Build finished. 
The HTML page is in $(BUILDDIR)/singlehtml." + +.PHONY: pickle +pickle: + $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle + @echo + @echo "Build finished; now you can process the pickle files." + +.PHONY: json +json: + $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json + @echo + @echo "Build finished; now you can process the JSON files." + +.PHONY: htmlhelp +htmlhelp: + $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp + @echo + @echo "Build finished; now you can run HTML Help Workshop with the" \ + ".hhp project file in $(BUILDDIR)/htmlhelp." + +.PHONY: qthelp +qthelp: + $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp + @echo + @echo "Build finished; now you can run "qcollectiongenerator" with the" \ + ".qhcp project file in $(BUILDDIR)/qthelp, like this:" + @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/python_example.qhcp" + @echo "To view the help file:" + @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/python_example.qhc" + +.PHONY: applehelp +applehelp: + $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp + @echo + @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." + @echo "N.B. You won't be able to view it unless you put it in" \ + "~/Library/Documentation/Help or install it in your application" \ + "bundle." + +.PHONY: devhelp +devhelp: + $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp + @echo + @echo "Build finished." + @echo "To view the help file:" + @echo "# mkdir -p $$HOME/.local/share/devhelp/python_example" + @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/python_example" + @echo "# devhelp" + +.PHONY: epub +epub: + $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub + @echo + @echo "Build finished. The epub file is in $(BUILDDIR)/epub." + +.PHONY: latex +latex: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo + @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 
+ @echo "Run \`make' in that directory to run these through (pdf)latex" \ + "(use \`make latexpdf' here to do that automatically)." + +.PHONY: latexpdf +latexpdf: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through pdflatex..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +.PHONY: latexpdfja +latexpdfja: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through platex and dvipdfmx..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +.PHONY: text +text: + $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text + @echo + @echo "Build finished. The text files are in $(BUILDDIR)/text." + +.PHONY: man +man: + $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man + @echo + @echo "Build finished. The manual pages are in $(BUILDDIR)/man." + +.PHONY: texinfo +texinfo: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo + @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." + @echo "Run \`make' in that directory to run these through makeinfo" \ + "(use \`make info' here to do that automatically)." + +.PHONY: info +info: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo "Running Texinfo files through makeinfo..." + make -C $(BUILDDIR)/texinfo info + @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." + +.PHONY: gettext +gettext: + $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale + @echo + @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." + +.PHONY: changes +changes: + $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes + @echo + @echo "The overview file is in $(BUILDDIR)/changes." 
+ +.PHONY: linkcheck +linkcheck: + $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck + @echo + @echo "Link check complete; look for any errors in the above output " \ + "or in $(BUILDDIR)/linkcheck/output.txt." + +.PHONY: doctest +doctest: + $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest + @echo "Testing of doctests in the sources finished, look at the " \ + "results in $(BUILDDIR)/doctest/output.txt." + +.PHONY: coverage +coverage: + $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage + @echo "Testing of coverage in the sources finished, look at the " \ + "results in $(BUILDDIR)/coverage/python.txt." + +.PHONY: xml +xml: + $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml + @echo + @echo "Build finished. The XML files are in $(BUILDDIR)/xml." + +.PHONY: pseudoxml +pseudoxml: + $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml + @echo + @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." diff --git a/docs/_notebooks/pages.ipynb b/docs/_notebooks/pages.ipynb new file mode 100644 index 0000000..fa42f8b --- /dev/null +++ b/docs/_notebooks/pages.ipynb @@ -0,0 +1,276 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Manipulating pages" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "pikepdf presents the pages in a PDF through the ``Pdf.pages`` property, which\n", + "follows the ``list`` protocol. As such page numbers begin at 0.\n", + "\n", + "Let's look at a simple PDF that contains four pages." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pikepdf import Pdf\n", + "pdf = Pdf.open('../../tests/resources/fourpages.pdf')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "How many pages?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "len(pdf.pages)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Thanks to IPython's rich Python object representations you can view the PDF while you work on it if you execute this IPython notebook. Click the *View PDF* link below to view the file. **You can view the PDF after each change you make.** If you're reading this documentation online or as part of a distribution, you won't see the rich representation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pdf" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can also examine individual pages, which we'll explore in the next section. Suffice it to say that you can access pages by indexing them and slicing them." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pdf.pages[-1].MediaBox" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Suppose the file was scanned backwards, a common problem with automatic document scanners. We can easily reverse it in place." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pdf.pages.reverse()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pdf" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Pretty nice, isn't it? Of course, the pages in this file are in the correct order, so let's put them back." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pdf.pages.reverse()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Removing and adding pages is easy too."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "del pdf.pages[1:3] # Remove pages 2-3 labeled \"second page\" and \"third page\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pdf" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We've trimmed down the file to its essential first and last page. Now, let's add some content from another file." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "appendix = Pdf.open('../../tests/resources/sandwich.pdf')\n", + "pdf.pages.extend(appendix.pages)\n", + "graph = Pdf.open('../../tests/resources/graph.pdf')\n", + "pdf.pages.insert(1, graph.pages[0])\n", + "pdf" + ] + }, + { + "cell_type": "raw", + "metadata": { + "raw_mimetype": "text/restructuredtext" + }, + "source": [ + "Naturally, you can save your changes with ``.save(filename_or_stream)``. ``filename`` can be a :class:`pathlib.Path`, which we accept everywhere. 
(Saving is commented out to avoid upsetting the documentation generator.)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# pdf.save('output.pdf')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Using counting numbers" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Because PDF pages are usually numbered in counting numbers (1, 2, 3...), pikepdf\n", + "provides a convenience accessor ``.p()`` that uses counting numbers:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pdf.pages.p(1) # The first page in the document\n", + "pdf.pages[0] # Also the first page in the document\n", + ";" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To avoid confusion, the ``.p()`` accessor does not accept Python slices, and ``.p(0)`` raises an exception.\n", + "\n", + "PDFs may define their own numbering scheme or different numberings for\n", + "different sections. ``.pages`` does not look up this information." + ] + }, + { + "cell_type": "raw", + "metadata": { + "raw_mimetype": "text/restructuredtext" + }, + "source": [ + ".. note::\n", + "\n", + " Because of technical limitations in underlying libraries, pikepdf keeps the\n", + " original PDF from which a page came open, even if the reference to the PDF\n", + " is garbage collected." + ] + }, + { + "cell_type": "raw", + "metadata": { + "raw_mimetype": "text/restructuredtext" + }, + "source": [ + ".. warning::\n", + "\n", + " It's possible to obtain page information through the PDF ``/Root`` object as\n", + " well, but this is not recommended. The internal consistency of the various ``/Page``\n", + " and ``/Pages`` objects is not guaranteed when accessed in this manner, and in some\n", + " PDFs the data structure for these is fairly complex. Use the ``.pages``\n", + " interface."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "celltoolbar": "Raw Cell Format", + "kernelspec": { + "display_name": "pikepdf", + "language": "python", + "name": "pikepdf" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/arch.rst b/docs/arch.rst new file mode 100644 index 0000000..41afb7a --- /dev/null +++ b/docs/arch.rst @@ -0,0 +1,33 @@ +Architecture +============ + +pikepdf uses `pybind11 <https://github.com/pybind/pybind11>`_ to bind the +C++ interface of QPDF. pybind11 was selected after evaluating Cython, CFFI and +SWIG as possible binding solutions. + +In addition to bindings pikepdf includes support code written in a mix of C++ +and Python, mainly to present a clean Pythonic interface to C++ and implement +higher level functionality. + +Internals +--------- + +Internally the package presents a module named ``pikepdf`` from which objects +can be imported. The C++ extension module is currently named ``pikepdf._qpdf``. +Users of ``pikepdf`` should not directly access ``_qpdf`` since it is an +internal interface. + +Thread safety +------------- + +Because of the global interpreter lock (GIL), it is safe to read pikepdf +objects across Python threads. Also because of the GIL, there may not be much +performance gain from doing so. + +If one or more threads will be modifying pikepdf objects, you will have to +coordinate read and write access with a :class:`threading.Lock`. + +It is not currently possible to pickle pikepdf objects or marshall them across +process boundaries (as would be required to use pikepdf in +:mod:`multiprocessing`).
If this were implemented, it would not be much more +efficient than saving a full PDF and sending it to another process. diff --git a/docs/changelog.rst b/docs/changelog.rst new file mode 100644 index 0000000..0ac5e8e --- /dev/null +++ b/docs/changelog.rst @@ -0,0 +1,256 @@ +.. _changelog: + +Changelog +######### + +pikepdf releases use the `semantic versioning <https://semver.org/>`_ policy. + +The pikepdf API (as provided by ``import pikepdf``) is quite stable and is in production use. + +Note that the C++ extension module ``pikepdf._qpdf`` is a private interface within pikepdf that applications should not use directly. + +v1.0.5 +====== + +* Fixed an issue where an invalid date in XMP metadata would cause an exception when updating DocumentInfo. For now, we warn that some DocumentInfo is not convertible. (In the future, we should also check if the XMP date is valid, because it probably is not.) + +* Rebuilt the binary wheels with libqpdf 8.3.0. libqpdf 8.2.1 is still supported. + +v1.0.4 +====== + +* Updates to tests/resources (provenance of one test file, replaced another test file with a synthetic one) + +v1.0.3 +====== + +* Fixed regression on negative indexing of pages. + +v1.0.2 +====== + +* Fixed an issue where invalid values such as out of range years (e.g. 0) in DocumentInfo would raise exceptions when using DocumentInfo to populate XMP metadata with ``.load_from_docinfo``. + +v1.0.1 +====== + +* Fixed an exception with handling metadata that contains the invalid XML entity ``&#0;`` (an escaped NUL) + +v1.0.0 +====== + +* Changed version to 1.0. + +v0.10.2 +======= + +Fixes +----- + +* Fixed segfault when overwriting the pikepdf file that is currently open on Linux. + +* Fixed removal of an attribute metadata value when values were present on the same node. + +v0.10.1 +======= + +Fixes +----- + +* Avoid canonical XML since it is apparently too strict for XMP.
+ +v0.10.0 +======= + +Fixes +----- + +* Fixed several issues related to generating XMP metadata that passed veraPDF validation. + +* Fixed a random test suite failure for very large negative integers. + +* The lxml library is now required. + +v0.9.2 +====== + +Fixes +----- + +* Added all of the commonly used XML namespaces to XMP metadata handling, so we are less likely to name something 'ns1', etc. + +* Skip a test that fails on Windows. + +* Fixed build errors in documentation. + +v0.9.1 +====== + +Fixes +----- + +* Fix ``Object.write()`` accepting positional arguments it wouldn't use + +* Fix handling of XMP data with timezones (or missing timezone information) in a few cases + +* Fix generation of XMP with invalid XML characters if the invalid characters were inside a non-scalar object + +v0.9.0 +====== + +Updates +------- + +* New API to access and edit PDF metadata and make consistent edits to the new and old style of PDF metadata. + +* 32-bit binary wheels are now available for Windows + +* PDFs can now be saved in QPDF's "qdf" mode + +* The Python package defusedxml is now required + +* The Python package python-xmp-toolkit and its dependency libexempi are suggested for testing, but not required + +Fixes +----- + +* Fixed handling of filenames that contain multibyte characters on non-UTF-8 systems + +Breaking +-------- + +* The ``Pdf.metadata`` property was removed, and replaced with the new metadata API + +* ``Pdf.attach()`` has been removed, because the interface as implemented had no way to deal with existing attachments. + +v0.3.7 +====== + +* Add API for inline images to unparse themselves + +v0.3.6 +====== + +* Performance of reading files from memory improved to avoid unnecessary copies. + +* It is finally possible to use ``for key in pdfobj`` to iterate contents of PDF Dictionary, Stream and Array objects. Generally these objects behave more like Python containers should now. + +* Package API declared beta. 
+ +v0.3.5 +====== + +Breaking +-------- + +* ``Pdf.save(...stream_data_mode=...)`` has been dropped in favor of the newer ``compress_streams=`` and ``stream_decode_level`` parameters. + +Fixes +----- + +* A use-after-free memory error that caused occasional segfaults and "QPDFFakeName" errors when opening from stream objects has been resolved. + +v0.3.4 +====== + +Updates +------- + +* pybind11 vendoring has ended now that v2.2.4 has been released + +v0.3.3 +====== + +Breaking +-------- + +* libqpdf 8.2.1 is now required + +Updates +------- + +* Improved support for working with JPEG2000 images in PDFs +* Added progress callback for saving files, ``Pdf.save(..., progress=)`` +* Updated pybind11 subtree + +Fixes +----- + +* ``del obj.AttributeName`` was not implemented. The attribute interface is now consistent +* Deleting named attributes now defers to the attribute dictionary for Stream objects, as get/set do +* Fixed handling of JPEG2000 images where metadata must be retrieved from the file + +v0.3.2 +====== + +Updates +------- + +* Added support for direct image extraction of CMYK and grayscale JPEGs, where previously only RGB (internally YUV) was supported +* ``Array()`` now creates an empty array properly +* The syntax ``Name.Foo in Dictionary()``, e.g. ``Name.XObject in page.Resources``, now works + +v0.3.1 +====== + +Breaking +-------- + +* ``pikepdf.open`` now validates its keyword arguments properly, potentially breaking code that passed invalid arguments +* libqpdf 8.1.0 is now required - libqpdf 8.1.0 API is now used for creating Unicode strings +* If a non-existent file is opened with ``pikepdf.open``, a ``FileNotFoundError`` is raised instead of a generic error +* We are now *temporarily* vendoring a copy of pybind11 since its master branch contains unreleased and important fixes for Python 3.7. + +Updates +------- + +* The syntax ``Name.Thing`` (e.g. 
``Name.DecodeParms``) is now supported as equivalent to ``Name('/Thing')`` and is the recommended way to refer names within a PDF +* New API ``Pdf.remove_unneeded_resources()`` which removes objects from each page's resource dictionary that are not used in the page. This can be used to create smaller files. + +Fixes +----- + +* Fixed an error parsing inline images that have masks +* Fixed several instances of catching C++ exceptions by value instead of by reference + +v0.3.0 +====== + +Breaking +-------- + +* Modified ``Object.write`` method signature to require ``filter`` and ``decode_parms`` as keyword arguments +* Implement automatic type conversion from the PDF Null type to ``None`` +* Removed ``Object.unparse_resolved`` in favor of ``Object.unparse(resolved=True)`` +* libqpdf 8.0.2 is now required at minimum + +Updates +------- + +* Improved IPython/Jupyter interface to directly export temporary PDFs +* Updated to qpdf 8.1.0 in wheels +* Added Python 3.7 support for Windows +* Added a number of missing options from QPDF to ``Pdf.open`` and ``Pdf.save`` +* Added ability to delete a slice of pages +* Began using Jupyter notebooks for documentation + +v0.2.2 +====== + +* Added Python 3.7 support to build and test (not yet available for Windows, due to lack of availability on Appveyor) +* Removed setter API from ``PdfImage`` because it never worked anyway +* Improved handling of ``PdfImage`` with trivial palettes + +v0.2.1 +====== + +* ``Object.check_owner`` renamed to ``Object.is_owned_by`` +* ``Object.objgen`` and ``Object.get_object_id`` are now public functions +* Major internal reorganization with ``pikepdf.models`` becoming the submodule that holds support code to ease access to PDF objects as opposed to wrapping QPDF. + +v0.2.0 +====== + +* Implemented automatic type conversion for ``int``, ``bool`` and ``Decimal``, eliminating the ``pikepdf.{Integer,Boolean,Real}`` types. Removed a lot of associated numerical code. 
+ +Everything before v0.2.0 can be considered too old to document. diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..e0f367e --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,333 @@ +# -*- coding: utf-8 -*- +# +# pikepdf documentation build configuration file, created by +# sphinx-quickstart on Fri Feb 26 00:29:33 2016. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +import sys +import os +from pkg_resources import get_distribution +import subprocess +from unittest.mock import MagicMock + +on_rtd = os.environ.get('READTHEDOCS') == 'True' +if on_rtd: + # Borrowed from https://github.com/YannickJadoul/Parselmouth/blob/master/docs/conf.py + rtd_version = os.environ.get('READTHEDOCS_VERSION') + setup_py_version = subprocess.check_output([sys.executable, 'setup.py', '--version'], cwd='..').decode('ascii').strip() + + if rtd_version == 'stable': + branch = None + try: + subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'pikepdf=={}'.format(setup_py_version)]) + except subprocess.CalledProcessError: + branch = 'master' + else: + branch = 'master' if rtd_version == 'latest' else rtd_version + + if branch is not None: + subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--only-binary', 'pikepdf', 'pikepdf']) + + class Mock(MagicMock): + @classmethod + def __getattr__(cls, name): + return MagicMock() + + MOCK_MODULES = ['libxmp'] + sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES) + +else: + sys.path.insert(0, os.path.abspath(os.path.join('..', 'installed'))) + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. 
If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +sys.path.insert(0, os.path.join(os.path.abspath('.'), '..')) + +import pikepdf + +# -- General configuration ------------------------------------------------ + +# If your documentation needs a minimal Sphinx version, state it here. +#needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.intersphinx', + 'sphinx.ext.autosummary', + 'sphinx.ext.napoleon', + 'IPython.sphinxext.ipython_console_highlighting', + 'IPython.sphinxext.ipython_directive' +] + +ipython_execlines = [ + 'import pikepdf', + 'from pikepdf import Pdf' +] + +autosummary_generate = True + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# source_suffix = ['.rst', '.md'] +source_suffix = '.rst' + +# The encoding of source files. +#source_encoding = 'utf-8-sig' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = u'pikepdf' +copyright = u'2018, James R. Barlow' +author = u'James R. Barlow' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. + +release = get_distribution('pikepdf').version +version = '.'.join(release.split('.')[:2]) + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. 
+language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +#today = '' +# Else, today_fmt is used as the format for a strftime call. +#today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +exclude_patterns = ['_build', '**.ipynb_checkpoints', '_notebooks'] + +# The reST default role (used for this markup: `text`) to use for all +# documents. +#default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +#add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +#add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +#show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +#modindex_common_prefix = [] + +# If true, keep warnings as "system message" paragraphs in the built documents. +#keep_warnings = False + +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = False + + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +html_theme = 'sphinx_rtd_theme' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +#html_theme_options = {} + +# Add any paths that contain custom themes here, relative to this directory. +#html_theme_path = [] + +# The name for this set of Sphinx documents. If None, it defaults to +# " v documentation". 
+#html_title = None + +# A shorter title for the navigation bar. Default is the same as html_title. +#html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +#html_logo = None + +# The name of an image file (within the static path) to use as favicon of the +# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +#html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# Add any extra paths that contain custom files (such as robots.txt or +# .htaccess) here, relative to this directory. These files are copied +# directly to the root of the documentation. +#html_extra_path = [] + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. +#html_last_updated_fmt = '%b %d, %Y' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +#html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +#html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +#html_additional_pages = {} + +# If false, no module index is generated. +#html_domain_indices = True + +# If false, no index is generated. +#html_use_index = True + +# If true, the index is split into individual pages for each letter. +#html_split_index = False + +# If true, links to the reST sources are added to the pages. +#html_show_sourcelink = True + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +#html_show_sphinx = True + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 
+#html_show_copyright = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +#html_use_opensearch = '' + +# This is the file name suffix for HTML files (e.g. ".xhtml"). +#html_file_suffix = None + +# Language to be used for generating the HTML full-text search index. +# Sphinx supports the following languages: +# 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja' +# 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr' +#html_search_language = 'en' + +# A dictionary with options for the search language support, empty by default. +# Now only 'ja' uses this config value +#html_search_options = {'type': 'default'} + +# The name of a javascript file (relative to the configuration directory) that +# implements a search results scorer. If empty, the default will be used. +#html_search_scorer = 'scorer.js' + +# Output file base name for HTML help builder. +htmlhelp_basename = 'pikepdfdoc' + +# -- Options for LaTeX output --------------------------------------------- + +latex_elements = { +# The paper size ('letterpaper' or 'a4paper'). +#'papersize': 'letterpaper', + +# The font size ('10pt', '11pt' or '12pt'). +#'pointsize': '10pt', + +# Additional stuff for the LaTeX preamble. +#'preamble': '', + +# Latex figure (float) alignment +#'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, 'pikepdf.tex', u'pikepdf Documentation', + u'James R. Barlow', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +#latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +#latex_use_parts = False + +# If true, show page references after internal links. 
+#latex_show_pagerefs = False + +# If true, show URL addresses after external links. +#latex_show_urls = False + +# Documents to append as an appendix to all manuals. +#latex_appendices = [] + +# If false, no module index is generated. +#latex_domain_indices = True + + +# -- Options for manual page output --------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + (master_doc, 'pikepdf', u'pikepdf Documentation', + [author], 1) +] + +# If true, show URL addresses after external links. +#man_show_urls = False + + +# -- Options for Texinfo output ------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + (master_doc, 'pikepdf', u'pikepdf Documentation', + author, 'pikepdf', 'Python bindings for QPDF.', + 'Miscellaneous'), +] + +# Documents to append as an appendix to all manuals. +#texinfo_appendices = [] + +# If false, no module index is generated. +#texinfo_domain_indices = True + +# How to display URL addresses: 'footnote', 'no', or 'inline'. +#texinfo_show_urls = 'footnote' + +# If true, do not generate a @detailmenu in the "Top" node's menu. +#texinfo_no_detailmenu = False + + +# Example configuration for intersphinx: refer to the Python standard library. 
+intersphinx_mapping = {'https://docs.python.org/': None} diff --git a/docs/images/congress_im0.jpg b/docs/images/congress_im0.jpg new file mode 100644 index 0000000..fd93a60 Binary files /dev/null and b/docs/images/congress_im0.jpg differ diff --git a/docs/images/pike-cartoon.png b/docs/images/pike-cartoon.png new file mode 100644 index 0000000..1bae2b3 Binary files /dev/null and b/docs/images/pike-cartoon.png differ diff --git a/docs/images/pike.jpg b/docs/images/pike.jpg new file mode 100644 index 0000000..1596943 Binary files /dev/null and b/docs/images/pike.jpg differ diff --git a/docs/images/pikemen.jpg b/docs/images/pikemen.jpg new file mode 100644 index 0000000..b38a759 Binary files /dev/null and b/docs/images/pikemen.jpg differ diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000..4ec5ad1 --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,136 @@ +pikepdf Documentation +===================== + +.. figure:: /images/pike.jpg + :align: right + :alt: A northern pike + :figwidth: 30% + + A northern pike, or *Esox lucius*. [#img1]_ + +**pikepdf** is a Python library allowing creation, manipulation and repair of +PDFs. It provides a Pythonic wrapper around the C++ PDF content transformation +library, `QPDF `_. + +Python + QPDF = "py" + "qpdf" = "pyqpdf", which looks like a dyslexia test and +is no fun to type. But say "pyqpdf" out loud, and it sounds like "pikepdf". + +At a glance +----------- + +pikepdf is a library intended for developers who want to create, manipulate, parse, +repair, and abuse the PDF format. It supports reading and writing PDFs, including +creating from scratch. Thanks to QPDF, it supports linearizing PDFs and access +to encrypted PDFs. + +.. 
code-block:: python + + # Rotate all pages in a file by 180 degrees + import pikepdf + my_pdf = pikepdf.Pdf.open('test.pdf') + for page in my_pdf.pages: + page.Rotate = 180 + my_pdf.save('test-rotated.pdf') + +It is a low level library that requires knowledge of PDF internals and some +familiarity with the PDF specification [#pdfrm]_. + +pikepdf would help you build apps that do things like: + +.. figure:: /images/pike-cartoon.png + :align: right + :alt: A cartoon sketch of a pike + :figwidth: 30% + + Pike fish are tough, hard-fighting, aggressive predators. [#img3]_ + +* :ref:`Copy pages ` from one PDF into another +* :ref:`Split ` and :ref:`merge ` PDFs +* Extract content from a PDF such as text or :ref:`images ` +* Replace content, such as :ref:`replacing an image ` without + altering the rest of the file +* Repair, reformat or :meth:`linearize ` PDFs +* Change the size of pages and reposition content +* Optimize PDFs similar to Acrobat's features by downsampling images, + deduplicating +* Calculate how much to charge for a scanning project based on the materials + scanned +* Alter a PDF to meet a target specification such as PDF/A or PDF/X +* Add or modify PDF :ref:`metadata ` +* Create well-formed but invalid PDFs for testing purposes + +What it cannot do: + +.. figure:: /images/pikemen.jpg + :align: right + :alt: A square of pikemen, carrying pikes + :figwidth: 30% + + Pikemen bracing for a cavalry charge, carrying pikes. [#img2]_ + +* Rasterize PDF pages for display (that is, produce an image that shows what + a PDF page looks like at a particular resolution/zoom level) – use + Ghostscript instead +* Convert from PDF to other similar paper capture formats like epub, XPS, DjVu, + Postscript – use MuPDF or PyMuPDF +* Print to paper + +If you only want to generate PDFs and not read or modify them, consider +reportlab (a "write-only" PDF generator). + +Requirements +~~~~~~~~~~~~ + +pikepdf currently requires **Python 3.5+**. 
As this is a new library there are +no plans to support Python 2.7 or older versions in the 3.x family, but pull +requests to backport would be considered. + +Similar libraries +~~~~~~~~~~~~~~~~~ + +Unlike similar Python libraries such as PyPDF2 and pdfrw, pikepdf is not pure +Python. Both were designed prior to Python wheels, which have made Python +extension libraries much easier to work with. By leveraging the existing mature +code base of QPDF, despite being new, pikepdf is already more capable than both +in many respects – for example, it can read compressed object streams, repair +damaged PDFs in many cases, and linearize PDFs. Unlike those libraries, it's not +pure Python: it is impure and proud of it. + +In use +~~~~~~ + +pikepdf is used by the same author's `OCRmyPDF +`_ to inspect input PDFs, graft the +generated OCR layers on to page content, and output PDFs. Its code contains many +practical examples, particularly in ``pdfinfo.py``, ``_weave.py``, and +``optimize.py``. pikepdf is also used in the test suite. + + +.. toctree:: + :maxdepth: 2 + :caption: Introduction + :name: intro_toc + + installation + changelog + tutorial + objects + +.. toctree:: + :maxdepth: 2 + :caption: Reference + :name: reference_toc + + pikepdf + arch + resources + +.. rubric:: References + +.. [#img1] `Public domain image `_. + +.. [#img3] `CC0 image `_. + +.. [#img2] `CC-BY-SA 2.0 image `_. + +.. [#pdfrm] `PDF 32000-1:2008 `_. diff --git a/docs/installation.rst b/docs/installation.rst new file mode 100644 index 0000000..75e51ed --- /dev/null +++ b/docs/installation.rst @@ -0,0 +1,153 @@ +Installation +============ + +.. |latest| image:: https://img.shields.io/pypi/v/pikepdf.svg + :alt: pikepdf latest released version on PyPI + +|latest| + +Most users on Linux, macOS or Windows with x64 systems should take advantage of +the binary wheels. + +.. code-block:: bash + + pip install pikepdf + +64-bit wheels are available for Windows, Linux and macOS. 
+ +32-bit wheels are available for Windows, for use with the 32-bit version of +Python (regardless of the bitness of Windows). 32-bit wheels for Linux will be +added if anyone uses them. + +Binary wheels should work on most systems: Linux distributions from 2007 +and newer, macOS 10.11 and newer (for Homebrew), and Windows 7 and newer. + +Managed distributions +--------------------- + +pikepdf is not yet widely distributed, but a few Linux distributions do make it +available. + +**Debian** + +.. |deb-experimental| image:: https://repology.org/badge/version-for-repo/debian_experimental/pikepdf.svg + :alt: Debian experimental + +|deb-experimental| + +.. code-block:: bash + + apt-get -t experimental install pikepdf + +**Fedora 29** + +.. |fedora| image:: https://repology.org/badge/version-only-for-repo/fedora_29/python:pikepdf.svg + :alt: Fedora 29 + +|fedora| + +.. code-block:: bash + + dnf install python-pikepdf + +**ArchLinux** + +Available in `ArchLinux User Repository `_. + +.. code-block:: bash + + pacman -S pikepdf + +Building from source +-------------------- + +**Requirements** + +.. |qpdf-version| replace:: 8.2.1 + +pikepdf requires: + +- a C++11 compliant compiler - GCC (4.8 and up) and clang (3.3 and up); C++14 + is recommended and will produce smaller binaries +- `pybind11 `_ +- libqpdf |qpdf-version| or higher from the + `QPDF `_ project. +- defusedxml - Python package + +On Linux the library and headers for libqpdf must be installed because pikepdf +compiles code against it and links to it. + +Check `Repology for QPDF `_ to +see if a recent version of QPDF is available for your platform. Otherwise you +must +`build QPDF from source `_. +(Consider using the binary wheels, which bundle the required version of +libqpdf.) + +**GCC and Clang** + +- clone this repository +- install libjpeg, zlib and libqpdf on your platform, including headers +- ``pip install .`` + +.. 
note:: + + pikepdf should be built with the same compiler and linker as libqpdf; to be + precise both **must** use the same C++ ABI. On some platforms, setup.py may + not pick the correct compiler so one may need to set environment variables + ``CC`` and ``CXX`` to redirect it. If the wrong compiler is selected, + ``import pikepdf._qpdf`` will throw an ``ImportError`` about a missing + symbol. + +**On Windows (requires Visual Studio 2015)** + +.. |msvc-zip| replace:: qpdf-|qpdf-version|-bin-msvc64.zip + +pikepdf requires a C++11 compliant compiler (i.e. Visual Studio 2015 on +Windows). See our continuous integration build script in ``.appveyor.yml`` +for detailed and current instructions. Or use the wheels which save this pain. + +These instructions require the precompiled binary ``qpdf.dll``. See the QPDF +documentation if you also need to build this DLL from source. Both should be +built with the same compiler. You may not mix and match MinGW and Visual C++, +for example. + +Running a regular ``pip install`` command will detect the +version of the compiler used to build Python and attempt to build the +extension with it. We must force the use of Visual Studio 2015. + +- clone this repository +- ``"%VS140COMNTOOLS%\..\..\VC\vcvarsall.bat" x64`` +- ``set DISTUTILS_USE_SDK=1`` +- ``set MSSdk=1`` +- download |msvc-zip| from the `QPDF releases page `_ +- extract ``bin\qpdfXX.dll`` from the zip file above, where XX is the version + of the ABI, and copy it to the ``src/pikepdf`` folder in the repository +- run ``pip install .`` in the root directory of the repository + +.. note:: + + The user compiling ``pikepdf`` must have registry editing rights on the + machine to be able to run the ``vcvarsall.bat`` script. + +.. note:: + + If you are attempting to build pikepdf because you want to use OCRmyPDF, + **OCRmyPDF is not supported on Windows** at this time. 
+ +Windows runtime requirements +---------------------------- + +On Windows, the Visual C++ 2015 redistributable packages are a runtime +requirement for this project. It can be found +`here `__. + +Building the documentation +-------------------------- + +Documentation is generated using Sphinx and you are currently reading it. To +regenerate it: + +- ``pip install -r requirements/docs.txt`` +- ``cd pikepdf/docs`` +- ``make html`` diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..460bef9 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,263 @@ +@ECHO OFF + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set BUILDDIR=_build +set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . +set I18NSPHINXOPTS=%SPHINXOPTS% . +if NOT "%PAPER%" == "" ( + set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% + set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% +) + +if "%1" == "" goto help + +if "%1" == "help" ( + :help + echo.Please use `make ^` where ^ is one of + echo. html to make standalone HTML files + echo. dirhtml to make HTML files named index.html in directories + echo. singlehtml to make a single large HTML file + echo. pickle to make pickle files + echo. json to make JSON files + echo. htmlhelp to make HTML files and a HTML help project + echo. qthelp to make HTML files and a qthelp project + echo. devhelp to make HTML files and a Devhelp project + echo. epub to make an epub + echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter + echo. text to make text files + echo. man to make manual pages + echo. texinfo to make Texinfo files + echo. gettext to make PO message catalogs + echo. changes to make an overview over all changed/added/deprecated items + echo. xml to make Docutils-native XML files + echo. pseudoxml to make pseudoxml-XML files for display purposes + echo. linkcheck to check all external links for integrity + echo. 
doctest to run all doctests embedded in the documentation if enabled + echo. coverage to run coverage check of the documentation if enabled + goto end +) + +if "%1" == "clean" ( + for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i + del /q /s %BUILDDIR%\* + goto end +) + + +REM Check if sphinx-build is available and fallback to Python version if any +%SPHINXBUILD% 1>NUL 2>NUL +if errorlevel 9009 goto sphinx_python +goto sphinx_ok + +:sphinx_python + +set SPHINXBUILD=python -m sphinx.__init__ +%SPHINXBUILD% 2> nul +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +:sphinx_ok + + +if "%1" == "html" ( + %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/html. + goto end +) + +if "%1" == "dirhtml" ( + %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. + goto end +) + +if "%1" == "singlehtml" ( + %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. + goto end +) + +if "%1" == "pickle" ( + %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can process the pickle files. + goto end +) + +if "%1" == "json" ( + %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can process the JSON files. 
+ goto end +) + +if "%1" == "htmlhelp" ( + %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can run HTML Help Workshop with the ^ +.hhp project file in %BUILDDIR%/htmlhelp. + goto end +) + +if "%1" == "qthelp" ( + %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can run "qcollectiongenerator" with the ^ +.qhcp project file in %BUILDDIR%/qthelp, like this: + echo.^> qcollectiongenerator %BUILDDIR%\qthelp\python_example.qhcp + echo.To view the help file: + echo.^> assistant -collectionFile %BUILDDIR%\qthelp\python_example.ghc + goto end +) + +if "%1" == "devhelp" ( + %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. + goto end +) + +if "%1" == "epub" ( + %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The epub file is in %BUILDDIR%/epub. + goto end +) + +if "%1" == "latex" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "latexpdf" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + cd %BUILDDIR%/latex + make all-pdf + cd %~dp0 + echo. + echo.Build finished; the PDF files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "latexpdfja" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + cd %BUILDDIR%/latex + make all-pdf-ja + cd %~dp0 + echo. + echo.Build finished; the PDF files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "text" ( + %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The text files are in %BUILDDIR%/text. + goto end +) + +if "%1" == "man" ( + %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man + if errorlevel 1 exit /b 1 + echo. 
+ echo.Build finished. The manual pages are in %BUILDDIR%/man. + goto end +) + +if "%1" == "texinfo" ( + %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. + goto end +) + +if "%1" == "gettext" ( + %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The message catalogs are in %BUILDDIR%/locale. + goto end +) + +if "%1" == "changes" ( + %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes + if errorlevel 1 exit /b 1 + echo. + echo.The overview file is in %BUILDDIR%/changes. + goto end +) + +if "%1" == "linkcheck" ( + %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck + if errorlevel 1 exit /b 1 + echo. + echo.Link check complete; look for any errors in the above output ^ +or in %BUILDDIR%/linkcheck/output.txt. + goto end +) + +if "%1" == "doctest" ( + %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest + if errorlevel 1 exit /b 1 + echo. + echo.Testing of doctests in the sources finished, look at the ^ +results in %BUILDDIR%/doctest/output.txt. + goto end +) + +if "%1" == "coverage" ( + %SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage + if errorlevel 1 exit /b 1 + echo. + echo.Testing of coverage in the sources finished, look at the ^ +results in %BUILDDIR%/coverage/python.txt. + goto end +) + +if "%1" == "xml" ( + %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The XML files are in %BUILDDIR%/xml. + goto end +) + +if "%1" == "pseudoxml" ( + %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 
+ goto end +) + +:end diff --git a/docs/objects.rst b/docs/objects.rst new file mode 100644 index 0000000..c08f460 --- /dev/null +++ b/docs/objects.rst @@ -0,0 +1,69 @@ +pikepdf Object Model +******************** + +This section covers the object model pikepdf uses in more detail. + +A :class:`pikepdf.Object` is a Python wrapper around a C++ ``QPDFObjectHandle`` +which, as the name suggests, is a handle (or pointer) to a data structure in +memory, or possibly a reference to data that exists in a file. Importantly, an +object can be a scalar quantity (like a string) or a compound quantity (like a +list or dict, that contains other objects). The fact that the C++ class involved +here is an object *handle* is an implementation detail; it shouldn't matter for +a pikepdf user. + +The simplest types in PDFs are directly represented as Python types: ``int``, +``bool``, and ``None`` stand for PDF integers, booleans and the "null". +:class:`~decimal.Decimal` is used for floating point numbers in PDFs. If a +value in a PDF is assigned to a Python ``float``, pikepdf will convert it to +``Decimal``. + +Types that are not directly convertible to Python are represented as +:class:`pikepdf.Object`, a compound object that offers a superset of methods, +some of which work only if the underlying type is suitable. You can use the EAFP +idiom or ``isinstance`` to determine the type more precisely. This partly +reflects the fact that the PDF specification allows many data fields to be +one of several types. + +For convenience, the ``repr()`` of a ``pikepdf.Object`` will display a +Python expression that replicates the existing object (when possible), so it +will say: + +.. 
code-block:: python + + >>> catalog_name = pdf.root.Type + pikepdf.Name("/Catalog") + >>> isinstance(catalog_name, pikepdf.Name) + True + >>> isinstance(catalog_name, pikepdf.Object) + True + + +Making PDF objects +================== + +You may construct a new object with one of the classes: + +* :class:`pikepdf.Array` +* :class:`pikepdf.Dictionary` +* :class:`pikepdf.Name` - the type used for keys in PDF Dictionary objects +* :class:`pikepdf.String` - a text string + (treated as ``bytes`` or ``str`` depending on context) + +These may be thought of as subclasses of ``pikepdf.Object``. (Internally they +**are** ``pikepdf.Object``.) + +There are a few other classes for special PDF objects that don't +map to Python as neatly. + +* ``pikepdf.Operator`` - a special object involved in processing content + streams +* ``pikepdf.Stream`` - a special object similar to a ``Dictionary`` with + binary data attached +* ``pikepdf.InlineImage`` - an image that is embedded in content streams + +The great news is that it's often unnecessary to construct ``pikepdf.Object`` +objects when working with pikepdf. Python types are transparently *converted* to +the appropriate pikepdf object when passed to pikepdf APIs – when possible. +However, pikepdf sends ``pikepdf.Object`` types back to Python on return calls, +in most cases, because pikepdf needs to keep track of objects that came from +PDFs originally. diff --git a/docs/pikepdf.rst b/docs/pikepdf.rst new file mode 100644 index 0000000..750a3db --- /dev/null +++ b/docs/pikepdf.rst @@ -0,0 +1,128 @@ +pikepdf API +*********** + +Primary objects +=============== + +.. autoclass:: pikepdf.Pdf + :members: + +.. autofunction:: pikepdf.open + +.. autoclass:: pikepdf.ObjectStreamMode + + Options for saving object streams within PDFs, which are a more compact + way of saving certain types of data that was added in PDF 1.5. All + modern PDF viewers support object streams, but some third party tools + and libraries cannot read them. + + .. 
attribute:: disable + + Disable the use of object streams. If any object streams exist in the + file, remove them when the file is saved. + + .. attribute:: preserve + + Preserve any existing object streams in the original file. This is + the default behavior. + + .. attribute:: generate + + Generate object streams. + +.. autoclass:: pikepdf.StreamDecodeLevel + + .. attribute:: none + + Do not attempt to apply any filters. Streams + remain as they appear in the original file. Note that + uncompressed streams may still be compressed on output. You can + disable that by calling setCompressStreams(false). + + .. attribute:: generalized + + This is the default. libqpdf will apply + LZWDecode, ASCII85Decode, ASCIIHexDecode, and FlateDecode + filters on the input. When combined with + setCompressStreams(true), which is the default, the effect of this + is that streams filtered with these older and less efficient + filters will be recompressed with the Flate filter. As a + special case, if a stream is already compressed with + FlateDecode and setCompressStreams is enabled, the original + compressed data will be preserved. + + .. attribute:: specialized + + In addition to uncompressing the + generalized compression formats, supported non-lossy + compression will also be decoded. At present, this includes + the RunLengthDecode filter. + + .. attribute:: all + + In addition to generalized and non-lossy + specialized filters, supported lossy compression filters will + be applied. At present, this includes DCTDecode (JPEG) + compression. Note that compressing the resulting data with + DCTDecode again will accumulate loss, so avoid multiple + compression and decompression cycles. This is mostly useful for + retrieving image data. + +.. autoexception:: pikepdf.PdfError + +.. autoexception:: pikepdf.PasswordError + +Object construction +=================== + +.. autoclass:: pikepdf.Object + :members: + +.. autoclass:: pikepdf.Name + :members: __new__ + +.. 
autoclass:: pikepdf.String + :members: __new__ + +.. autoclass:: pikepdf.Array + :members: __new__ + +.. autoclass:: pikepdf.Dictionary + :members: __new__ + +.. autoclass:: pikepdf.Stream + :members: __new__ + +.. autoclass:: pikepdf.Operator + :members: + +Support models +============== + +.. autofunction:: pikepdf.parse_content_stream + +.. autoclass:: pikepdf.PdfMatrix + :members: + + .. attribute:: a + + .. attribute:: b + + .. attribute:: c + + .. attribute:: d + + .. attribute:: e + + .. attribute:: f + + Return one of the six "active values" of the matrix. + +.. autoclass:: pikepdf.PdfImage + :members: + +.. autoclass:: pikepdf.PdfInlineImage + :members: + +.. autoclass:: pikepdf.models.PdfMetadata + :members: diff --git a/docs/resources.rst b/docs/resources.rst new file mode 100644 index 0000000..0bcc3c7 --- /dev/null +++ b/docs/resources.rst @@ -0,0 +1,14 @@ +Resources +========= + +* `QPDF Manual`_ + +* `PDF 1.7`_ ISO Specification PDF 32000-1:2008 + +* `Adobe extensions`_ to the PDF specification + +.. _QPDF Manual: http://qpdf.sourceforge.net/files/qpdf-manual.html + +.. _PDF 1.7: https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf + +.. _Adobe extensions: https://www.adobe.com/devnet/pdf/pdf_reference.html diff --git a/docs/tutorial.rst b/docs/tutorial.rst new file mode 100644 index 0000000..98a721d --- /dev/null +++ b/docs/tutorial.rst @@ -0,0 +1,42 @@ +Tutorial +******** + +**Opening and saving** + +In contrast to better known PDF libraries, pikepdf uses a single object to +represent a PDF, whether reading, writing or merging. We have cleverly named +this :class:`pikepdf.Pdf`. + +.. code-block:: python + + from pikepdf import Pdf + new_pdf = Pdf.new() + sample_pdf = Pdf.open('sample.pdf') + sample_pdf.save('sample2.pdf') + +You may of course use ``from pikepdf import Pdf as ...`` if the short class +name conflicts or ``from pikepdf import Pdf as PDF`` if you prefer uppercase. 
+ +:func:`pikepdf.open` is a shorthand for ``Pdf.open``. + +The PDF class API follows the example of the widely-used +`Pillow image library `_. For clarity +there is no default constructor since the arguments used for creation and +opening are different. ``Pdf.open()`` also accepts seekable streams as input, +and ``Pdf.save()`` accepts streams as output. + +**Topics** + +This tutorial begins on the assumption that working with pages - splitting +and merging, saving and loading - is the most basic thing users want to do. +(The ``qpdf`` commandline tool, on which pikepdf is based, also does an +excellent job of file level PDF handling.) What pikepdf does is make qpdf's +powerful API more accessible. + +.. toctree:: + :maxdepth: 1 + + tutorial/pages + tutorial/page + tutorial/streams + tutorial/metadata diff --git a/docs/tutorial/metadata.rst b/docs/tutorial/metadata.rst new file mode 100644 index 0000000..a0745f1 --- /dev/null +++ b/docs/tutorial/metadata.rst @@ -0,0 +1,111 @@ +.. _metadata: + +PDF Metadata +============ + +The primary metadata in a PDF is stored in an XMP (Extensible Metadata +Platform) Metadata stream, where XMP is a metadata specification in XML format. +For full information on XMP, see Adobe's `XMP Developer Center +`_. It supersedes the older Document Info +dictionaries, which are removed in the PDF 2.0 specification. The XMP data entry +is optional and does not appear in all PDFs. + +The `XMP Specification`_ also provides useful information. + +pikepdf provides an interface to simplify viewing and making minor edits to XMP. +In particular, compound quantities may be read, but only scalar quantities can +be modified. + +For more complex changes consider using the ``python-xmp-toolkit`` library and +its libexempi dependency; but note that it is not capable of synchronizing +changes to the older DocumentInfo metadata. + +.. 
_XMP Specification: https://wwwimages2.adobe.com/content/dam/acom/en/devnet/xmp/pdfs/XMP%20SDK%20Release%20cc-2016-08/XMPSpecificationPart1.pdf + +.. _accessmetadata: + +Accessing metadata +------------------ + +The XMP metadata stream is attached to the PDF's root object, but to simplify +management of this, use :meth:`pikepdf.Pdf.open_metadata`. The returned +:class:`pikepdf.models.PdfMetadata` object may be used for reading, or entered +with a ``with`` block to modify and commit changes. If you use this interface, +pikepdf will synchronize changes to new and old metadata. + +A PDF must still be saved after metadata is changed. + +.. ipython:: + + In [1]: pdf = pikepdf.open('../tests/resources/sandwich.pdf') + + In [2]: meta = pdf.open_metadata() + + In [3]: meta['xmp:CreatorTool'] + Out[3]: 'ocrmypdf 5.3.3 / Tesseract OCR-PDF 3.05.01' + +If no XMP metadata exists, an empty XMP metadata container will be created. + +Open metadata in a ``with`` block to open it for editing. When the block is +exited, changes are committed (updating XMP and the Document Info dictionary) +and attached to the PDF object. The PDF must still be saved. If an exception +occurs in the block, changes are discarded. + +.. ipython:: + + In [4]: with pdf.open_metadata() as meta: + ...: meta['dc:title'] = "Let's change the title" + ...: + +The list of available metadata fields may be found in the `XMP Specification`_. + +Checking PDF/A conformance +-------------------------- + +The metadata interface can also test if a file **claims** to be conformant +to the PDF/A specification. + +.. ipython:: + + In [9]: pdf = pikepdf.open('../tests/resources/veraPDF test suite 6-2-10-t02-pass-a.pdf') + + In [10]: meta = pdf.open_metadata() + + In [11]: meta.pdfa_status + Out[11]: '1B' + +.. note:: + + Note that this property merely *tests* if the file claims to be conformant to + the PDF/A standard. Use a tool such as veraPDF to verify conformance.
+ +The Document Info dictionary +---------------------------- + +The Document Info block is an older, now deprecated object in which metadata +may be stored. The Document Info is not attached to the /Root object. +It may be accessed using the ``.docinfo`` property. If no Document Info exists, +touching the ``.docinfo`` will properly initialize an empty one. + +Here is an example of a Document Info block. + +.. ipython:: + + In [12]: pdf = pikepdf.open('../tests/resources/sandwich.pdf') + + In [12]: pdf.docinfo + Out[12]: + pikepdf.Dictionary({ + "/CreationDate": "D:20170911132748-07'00'", + "/Creator": "ocrmypdf 5.3.3 / Tesseract OCR-PDF 3.05.01", + "/ModDate": "D:20170911132748-07'00'", + "/Producer": "GPL Ghostscript 9.21" + }) + +It is permitted in pikepdf to directly interact with Document Info as with +other PDF dictionaries. However, it is better to use ``.open_metadata()`` +because that interface will apply changes to both XMP and Document Info in a +consistent manner. + +You may copy data from a Document Info object in the current PDF or another +PDF into XMP metadata using :meth:`~pikepdf.models.PdfMetadata.load_from_docinfo`. diff --git a/docs/tutorial/page.rst b/docs/tutorial/page.rst new file mode 100644 index 0000000..73a2dc4 --- /dev/null +++ b/docs/tutorial/page.rst @@ -0,0 +1,175 @@ +Examining a page +================ + +Pages are dictionaries +---------------------- + +In PDFs, the main data structure is the **dictionary**, a key-value data +structure much like a Python ``dict`` or ``attrdict``. The major difference is +that the keys can only be **names**, while values can be any type, including +other dictionaries. + +PDF dictionaries are represented as :class:`pikepdf.Dictionary`, and names +are of type :class:`pikepdf.Name`. A page is just another dictionary, with a +few required fields that give it special status as a page.
+ +A :class:`pikepdf.Name` is, usually, an ASCII-encoded string beginning with +"/" followed by a capital letter. + +.. ipython:: + + In [1]: from pikepdf import Pdf + + In [1]: example = Pdf.open('../tests/resources/congress.pdf') + + In [1]: page1 = example.pages[0] + + In [1]: page1 + +Item and attribute notation +--------------------------- + +Dictionary keys may be looked up using keys (``page1['/MediaBox']``) or +attributes (``page1.MediaBox``). The two conventions are equivalent. + +.. ipython:: + + In [1]: page1.MediaBox + + In [1]: page1['/MediaBox'] + +By convention, pikepdf uses attribute notation for keys in the PDF +specification and item notation for internal names within a PDF. For example + +.. ipython:: + :verbatim: + + In [1]: page1.Resources.XObject['/Im0'] + +Here ``'/Im0'`` is an arbitrary name generated by the program that produced this +PDF, rather than a name in the specification like ``Resources`` and ``XObject``. +Item notation here would be quite cumbersome: +``['/Resources']['/XObject']['/Im0']`` (not recommended). + +Attribute notation is convenient, but not robust if elements are missing. For +elements that are not always present, you can use ``.get()``, which behaves like +``dict.get()`` in core Python. A library such as `glom +<https://github.com/mahmoud/glom>`_ might help when working with complex +structured data that is not always present. + +repr() output +------------- + +Returning to the page's output: + +.. ipython:: + + In [1]: page1 + +The angle brackets in the output indicate that this object cannot be +constructed with a Python expression because it contains a reference. When +angle brackets are omitted from the ``repr()`` of a pikepdf object, then the +object can be replicated with a Python expression, such as +``eval(repr(x)) == x``. + +In Jupyter and IPython, pikepdf will instead attempt to display a preview of +the PDF page. An explicit ``repr(page)`` will show the text representation. + +This page's MediaBox is a direct object.
The MediaBox describes +the size of the page in PDF coordinates (1/72 inch multiplied by the value of +the page's ``/UserUnit``, if present). + +.. ipython:: + + In [1]: import pikepdf + + In [1]: page1.MediaBox + + In [1]: pikepdf.Array([ 0, 0, 200, 304 ]) + +The page's ``/Contents`` key contains instructions for drawing the page content. +Also attached to this page is a ``/Resources`` dictionary, which contains a +single XObject image. The image is compressed with the ``/DCTDecode`` filter, +meaning it is encoded with the :abbr:`DCT (discrete cosine transform)`, so it is +a JPEG. [#]_ + +.. [#] Without the JFIF header. + + +Viewing images +-------------- + +pikepdf provides a helper class :class:`~pikepdf.PdfImage` for manipulating +PDF images. + +.. ipython:: + + In [1]: from pikepdf import PdfImage + + In [1]: pdfimage = PdfImage(page1.Resources.XObject['/Im0']) + + In [1]: pdfimage + Out[1]: + +In Jupyter (or IPython with a suitable configuration) the image will be +displayed. + +|im0| + +.. |im0| image:: /images/congress_im0.jpg + :width: 2in + +You can also inspect the properties of the image. The parameters are similar +to Pillow's. + +.. ipython:: + + In [1]: pdfimage.colorspace + + In [1]: pdfimage.width, pdfimage.height + +.. note:: + + ``.width`` and ``.height`` are the resolution of the image in pixels, not + the size of the image in page coordinates. + +.. _extract_image: + +Extracting images +----------------- + +Extracting images is straightforward. :meth:`~pikepdf.PdfImage.extract_to` will +extract images to streams, such as an open file. Where possible, ``extract_to`` +writes compressed data directly to the stream without transcoding. The return +value is the file extension that was extracted. + +.. ipython:: + :verbatim: + + In [1]: pdfimage.extract_to(stream=open('file.jpg', 'wb')) + +You can also retrieve the image as a Pillow image: + +.. ipython:: + :verbatim: + + In [1]: pdfimage.as_pil_image() + +..
note:: + + This simple example PDF displays a single full page image. Some PDF creators + will paint a page using multiple images, and features such as layers, + transparency and image masks. Accessing the first image on a page is like an + HTML parser that scans for the first ``<img>`` tag it finds. A lot + more could be happening. There can be multiple images drawn multiple times + on a page, vector art, overdrawing, masking, and transparency. A set of + resources can be grouped together in a "Form XObject" (not to be confused + with a PDF Form), and drawn all at once. Images can be referenced by + multiple pages. + +.. _replace_image: + +Replacing an image +------------------ + +See ``test_image_access.py::test_image_replace``. diff --git a/docs/tutorial/pages.rst b/docs/tutorial/pages.rst new file mode 100644 index 0000000..b36cdb5 --- /dev/null +++ b/docs/tutorial/pages.rst @@ -0,0 +1,214 @@ +Manipulating pages +------------------ + +pikepdf presents the pages in a PDF through the :attr:`pikepdf.Pdf.pages` +property, which follows the ``list`` protocol. As such, page numbers begin at 0. + +Since one of the most common things people want to do is split and merge PDF pages, +we'll begin by exploring that. + +Let’s look at a simple PDF that contains four pages. + +.. ipython:: + + In [1]: from pikepdf import Pdf + + In [2]: pdf = Pdf.open('../tests/resources/fourpages.pdf') + +How many pages? + +.. ipython:: + + In [2]: len(pdf.pages) + +Thanks to IPython’s rich Python object representations you can view the PDF +while you work on it if you execute this example in a Jupyter notebook. Click +the *View PDF* link below to view the file. **You can view the PDF after each +change you make.** If you’re reading this documentation online or as part of +a distribution, you won’t see the rich representation. + +.. ipython:: + :verbatim: + + In [1]: pdf + Out[1]: View PDF + +You can also examine individual pages, which we’ll explore in the next +section.
Suffice to say that you can access pages by indexing them and +slicing them. + +.. ipython:: + + In [1]: pdf.pages[-1].MediaBox + +Reversing the order of pages +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Suppose the file was scanned backwards, a common problem with +automatic document scanners. We can easily reverse it in +place. + +.. ipython:: + + In [1]: pdf.pages.reverse() + +.. ipython:: + + In [1]: pdf + +Pretty nice, isn’t it? Of course, the pages in this file are in correct +order, so let’s put them back. + +.. ipython:: + + In [1]: pdf.pages.reverse() + +Deleting pages +~~~~~~~~~~~~~~ + +Removing and adding pages is easy too. + +.. ipython:: + + In [1]: del pdf.pages[1:3] # Remove pages 2-3 labeled "second page" and "third page" + +.. ipython:: + + In [1]: pdf + +We’ve trimmed down the file to its essential first and last page. + +.. _copyother: + +Copying pages from other PDFs +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Now, let’s add some content from another file. Because ``pdf.pages`` behaves +like a list, we can use ``pages.extend()`` on another file's pages. + +.. ipython:: + + In [1]: pdf = Pdf.open('../tests/resources/fourpages.pdf') + + In [1]: appendix = Pdf.open('../tests/resources/sandwich.pdf') + + In [2]: pdf.pages.extend(appendix.pages) + +We can use ``pages.insert()`` to insert one or more pages into a specific +position, bumping everything else ahead. + +.. ipython:: + + In [3]: graph = Pdf.open('../tests/resources/graph.pdf') + + In [4]: pdf.pages.insert(1, graph.pages[0]) + + In [5]: len(pdf.pages) + +We can also replace specific pages with assignment (or slicing). + +.. ipython:: + + In [1]: congress = Pdf.open('../tests/resources/congress.pdf') + + In [1]: pdf.pages[2] = congress.pages[0] + +Saving changes +~~~~~~~~~~~~~~ + +Naturally, you can save your changes with :meth:`pikepdf.Pdf.save`. +``filename`` can be a :class:`pathlib.Path`, which we accept everywhere.
(Saving +is commented out to avoid upsetting the documentation generator.) + +.. ipython:: + :verbatim: + + In [1]: pdf.save('output.pdf') + +You may save a file multiple times, and you may continue modifying it after +saving. + +.. _splitpdf: + +Split a PDF into one page PDFs +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +All we need is a new PDF to hold the destination page. + +.. ipython:: + :verbatim: + + In [1]: pdf = Pdf.open('../tests/resources/fourpages.pdf') + + In [5]: for n, page in enumerate(pdf.pages): + ...: dst = Pdf.new() + ...: dst.pages.append(page) + ...: dst.save('{:02d}.pdf'.format(n)) + +.. note:: + + This example will transfer data associated with each page, so + that every page stands on its own. It will *not* transfer some metadata + associated with the PDF as a whole, such as the list of bookmarks. + +.. _mergepdf: + +Merging a PDF from several files +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +You might be able to guess. + +.. ipython:: + :verbatim: + + In [1]: from glob import glob + + In [1]: pdf = Pdf.new() + + In [1]: for file in glob('*.pdf'): + ...: src = Pdf.open(file) + ...: pdf.pages.extend(src.pages) + + In [1]: pdf.save('merged.pdf') + +.. note:: + + This code sample does not deduplicate objects. The resulting file may be + large if the source files have content in common. + +Using counting numbers +~~~~~~~~~~~~~~~~~~~~~~ + +Because PDF pages are usually numbered in counting numbers (1, 2, 3…), +pikepdf provides a convenience accessor ``.p()`` that uses counting +numbers: + +.. ipython:: + :verbatim: + + In [1]: pdf.pages.p(1) # The first page in the document + + In [1]: pdf.pages[0] # Also the first page in the document + +To avoid confusion, the ``.p()`` accessor does not accept Python slices, +and ``.p(0)`` raises an exception. It is also not possible to delete using it. + +PDFs may define their own numbering scheme or different numberings for +different sections, such as using Roman numerals for an introductory section.
+``.pages`` does not look up this information. + +.. note:: + + Because of technical limitations in underlying libraries, pikepdf keeps the + source PDF open when content is copied from it to another PDF, even when + all Python variables pointing to the source are removed. If a PDF is being assembled from many sources, then + all of those sources are held open in memory. This memory can be released + by saving and re-opening the PDF. + +.. warning:: + + It's possible to obtain page information through the PDF ``/Root`` object as + well, but this is not recommended. The internal consistency of the various ``/Page`` + and ``/Pages`` is not guaranteed when accessed in this manner, and in some + PDFs the data structure for these is fairly complex. Use the ``.pages`` + interface. diff --git a/docs/tutorial/streams.rst b/docs/tutorial/streams.rst new file mode 100644 index 0000000..830578d --- /dev/null +++ b/docs/tutorial/streams.rst @@ -0,0 +1,89 @@ +Working with PDF Streams +======================== + +A :class:`pikepdf.Stream` object works like a PDF dictionary with some encoded +bytes attached. The dictionary is metadata that describes how the stream is +encoded. PDF can, and regularly does, use a variety of encoding filters. A +stream can be encoded with one or more filters. Images are a type of stream +object. + +Most of the interesting content in a PDF (images and content streams) is +inside page objects. + +Because the PDF specification unfortunately defines several terms that involve the +word stream, let's attempt to clarify: + +stream object + A PDF object that contains binary data and a metadata dictionary that describes + it, represented as :class:`pikepdf.Stream`. In HTML this is equivalent to + an ``<img>`` with inline image data. + +object stream + A stream object (not a typo, an object stream really is a type of stream + object) in a PDF that contains a number of other objects in a + PDF, grouped together for better compression.
In pikepdf there is an option + to save PDFs with this feature enabled to improve compression. Otherwise, + this is just a detail about how PDF files are encoded. + +content stream + A stream object that contains some instructions to draw graphics + and text on a page, or inside a Form XObject. In HTML this is equivalent to + the HTML file itself. Content streams do not cross pages. + +Form XObject + A group of images, text and drawing commands that can be rendered elsewhere + in a PDF as a group. This is often used when a group of objects are needed + at different scales or on multiple pages. In HTML this is like an ``<svg>``. + +Reading stream objects +---------------------- + +Fortunately, :meth:`pikepdf.Stream.read_bytes` will apply all filters +and decode the uncompressed bytes, or throw an error if this is not possible. +:meth:`pikepdf.Stream.read_raw_bytes` provides access to the compressed bytes. + +For example, we can read the XMP metadata, however it is encoded, from a PDF +with the following: + +.. code-block:: python + + >>> xmp = example.root.Metadata.read_bytes() + >>> type(xmp) + bytes + >>> print(xmp.decode()) + + + + + + 2017-09-11T13:27:48-07:00 + 2017-09-11T13:27:48-07:00 + ocrmypdf 5.3.3 / Tesseract OCR-PDF 3.05.01 + + Untitled + + + + +That lets us see a few facts about this file. It was created by OCRmyPDF +and Tesseract OCR's PDF generator. Ghostscript was used to convert it to +PDF/A (the ``xmlns:pdfaid`` tag). + +Of course, it would be far more convenient to use the pikepdf +:ref:`metadata` interface than to manually parse this XML object. It just +so happens this is a human readable object found in most PDFs. + +Parsing content streams +----------------------- + +When a stream object is a content stream, you probably want to parse the +content stream to interpret it. + +pikepdf provides a C++ optimized content stream parser. + +..
code-block:: python + + >>> pdf = pikepdf.open(input_pdf) + >>> page = pdf.pages[0] + >>> for operands, command in parse_content_stream(page): + >>> print(command) diff --git a/examples/find_links.py b/examples/find_links.py new file mode 100644 index 0000000..4d69c17 --- /dev/null +++ b/examples/find_links.py @@ -0,0 +1,78 @@ +# Copyright (c) 2019, James R. Barlow + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +"""Use pikepdf to find links in a PDF""" + +import argparse +import pikepdf +from pikepdf import Name + +parser = argparse.ArgumentParser(description="Find URIs in a PDF") +parser.add_argument('input_file') + + +def check_action(action): + if action.Type != Name.Action: + return + if action.S == Name.URI: + yield str(bytes(action.URI), encoding='ascii') + + +def check_object_aa(obj): + if Name.AA in obj: + for name, action in obj.AA.items(): + yield from check_action(action) + + +def check_page_annots(pdf, page): + if Name.Annots not in page: + return + annots = page.Annots + for annot in annots: + if annot.Type != Name.Annot: + continue + if annot.Subtype == Name.Link: + link_annot = annot + if Name.A in link_annot: + action = link_annot.A + yield from check_action(action) + yield from check_object_aa(annot) + + +def check_page(pdf, page): + yield from check_object_aa(page) + + +def gather_links(pdf): + for page in pdf.pages: + yield from check_page(pdf, page) + yield from check_page_annots(pdf, page) + + +def main(): + args = parser.parse_args() + pdf = pikepdf.open(args.input_file) + links = gather_links(pdf) + for link in links: + print(link) + + +if __name__ == "__main__": + main() diff --git a/licenses/license.wheel.txt b/licenses/license.wheel.txt new file mode 100644 index 0000000..0a79b2f --- /dev/null +++ b/licenses/license.wheel.txt @@ -0,0 +1,659 @@ +This license text file is for Python wheels built from the pikepdf source +distribution. See LICENSE.txt in the pikepdf source distribution for its +primary source license. + +pikepdf wheel license +===================== + +This Python wheel is an Executable Form and/or Object form distribution +of pikepdf and qpdf, prepared for ease of use.
+ +It includes primarily code from two open source projects: +- pikepdf (https://github.com/pikepdf/pikepdf), licensed under +Mozilla Public License Version 2.0 +- qpdf (https://github.com/qpdf/qpdf), licensed under +Apache Software License Version 2.0 + +Both licenses apply to the installation, use and distribution of this +wheel. Refer to their web pages for full details of license and +copyright information. + +The full text of each license agreement follows, along with the Apache NOTICE +file from QPDF. + +Mozilla Public License Version 2.0 +================================== + +1. Definitions +-------------- + +1.1. "Contributor" + means each individual or legal entity that creates, contributes to + the creation of, or owns Covered Software. + +1.2. "Contributor Version" + means the combination of the Contributions of others (if any) used + by a Contributor and that particular Contributor's Contribution. + +1.3. "Contribution" + means Covered Software of a particular Contributor. + +1.4. "Covered Software" + means Source Code Form to which the initial Contributor has attached + the notice in Exhibit A, the Executable Form of such Source Code + Form, and Modifications of such Source Code Form, in each case + including portions thereof. + +1.5. "Incompatible With Secondary Licenses" + means + + (a) that the initial Contributor has attached the notice described + in Exhibit B to the Covered Software; or + + (b) that the Covered Software was made available under the terms of + version 1.1 or earlier of the License, but not also under the + terms of a Secondary License. + +1.6. "Executable Form" + means any form of the work other than Source Code Form. + +1.7. "Larger Work" + means a work that combines Covered Software with other material, in + a separate file or files, that is not Covered Software. + +1.8. "License" + means this document. + +1.9. 
"Licensable" + means having the right to grant, to the maximum extent possible, + whether at the time of the initial grant or subsequently, any and + all of the rights conveyed by this License. + +1.10. "Modifications" + means any of the following: + + (a) any file in Source Code Form that results from an addition to, + deletion from, or modification of the contents of Covered + Software; or + + (b) any new file in Source Code Form that contains any Covered + Software. + +1.11. "Patent Claims" of a Contributor + means any patent claim(s), including without limitation, method, + process, and apparatus claims, in any patent Licensable by such + Contributor that would be infringed, but for the grant of the + License, by the making, using, selling, offering for sale, having + made, import, or transfer of either its Contributions or its + Contributor Version. + +1.12. "Secondary License" + means either the GNU General Public License, Version 2.0, the GNU + Lesser General Public License, Version 2.1, the GNU Affero General + Public License, Version 3.0, or any later versions of those + licenses. + +1.13. "Source Code Form" + means the form of the work preferred for making modifications. + +1.14. "You" (or "Your") + means an individual or a legal entity exercising rights under this + License. For legal entities, "You" includes any entity that + controls, is controlled by, or is under common control with You. For + purposes of this definition, "control" means (a) the power, direct + or indirect, to cause the direction or management of such entity, + whether by contract or otherwise, or (b) ownership of more than + fifty percent (50%) of the outstanding shares or beneficial + ownership of such entity. + +2. License Grants and Conditions +-------------------------------- + +2.1. 
Grants + +Each Contributor hereby grants You a world-wide, royalty-free, +non-exclusive license: + +(a) under intellectual property rights (other than patent or trademark) + Licensable by such Contributor to use, reproduce, make available, + modify, display, perform, distribute, and otherwise exploit its + Contributions, either on an unmodified basis, with Modifications, or + as part of a Larger Work; and + +(b) under Patent Claims of such Contributor to make, use, sell, offer + for sale, have made, import, and otherwise transfer either its + Contributions or its Contributor Version. + +2.2. Effective Date + +The licenses granted in Section 2.1 with respect to any Contribution +become effective for each Contribution on the date the Contributor first +distributes such Contribution. + +2.3. Limitations on Grant Scope + +The licenses granted in this Section 2 are the only rights granted under +this License. No additional rights or licenses will be implied from the +distribution or licensing of Covered Software under this License. +Notwithstanding Section 2.1(b) above, no patent license is granted by a +Contributor: + +(a) for any code that a Contributor has removed from Covered Software; + or + +(b) for infringements caused by: (i) Your and any other third party's + modifications of Covered Software, or (ii) the combination of its + Contributions with other software (except as part of its Contributor + Version); or + +(c) under Patent Claims infringed by Covered Software in the absence of + its Contributions. + +This License does not grant any rights in the trademarks, service marks, +or logos of any Contributor (except as may be necessary to comply with +the notice requirements in Section 3.4). + +2.4. 
Subsequent Licenses + +No Contributor makes additional grants as a result of Your choice to +distribute the Covered Software under a subsequent version of this +License (see Section 10.2) or under the terms of a Secondary License (if +permitted under the terms of Section 3.3). + +2.5. Representation + +Each Contributor represents that the Contributor believes its +Contributions are its original creation(s) or it has sufficient rights +to grant the rights to its Contributions conveyed by this License. + +2.6. Fair Use + +This License is not intended to limit any rights You have under +applicable copyright doctrines of fair use, fair dealing, or other +equivalents. + +2.7. Conditions + +Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted +in Section 2.1. + +3. Responsibilities +------------------- + +3.1. Distribution of Source Form + +All distribution of Covered Software in Source Code Form, including any +Modifications that You create or to which You contribute, must be under +the terms of this License. You must inform recipients that the Source +Code Form of the Covered Software is governed by the terms of this +License, and how they can obtain a copy of this License. You may not +attempt to alter or restrict the recipients' rights in the Source Code +Form. + +3.2. 
Distribution of Executable Form + +If You distribute Covered Software in Executable Form then: + +(a) such Covered Software must also be made available in Source Code + Form, as described in Section 3.1, and You must inform recipients of + the Executable Form how they can obtain a copy of such Source Code + Form by reasonable means in a timely manner, at a charge no more + than the cost of distribution to the recipient; and + +(b) You may distribute such Executable Form under the terms of this + License, or sublicense it under different terms, provided that the + license for the Executable Form does not attempt to limit or alter + the recipients' rights in the Source Code Form under this License. + +3.3. Distribution of a Larger Work + +You may create and distribute a Larger Work under terms of Your choice, +provided that You also comply with the requirements of this License for +the Covered Software. If the Larger Work is a combination of Covered +Software with a work governed by one or more Secondary Licenses, and the +Covered Software is not Incompatible With Secondary Licenses, this +License permits You to additionally distribute such Covered Software +under the terms of such Secondary License(s), so that the recipient of +the Larger Work may, at their option, further distribute the Covered +Software under the terms of either this License or such Secondary +License(s). + +3.4. Notices + +You may not remove or alter the substance of any license notices +(including copyright notices, patent notices, disclaimers of warranty, +or limitations of liability) contained within the Source Code Form of +the Covered Software, except that You may alter any license notices to +the extent required to remedy known factual inaccuracies. + +3.5. Application of Additional Terms + +You may choose to offer, and to charge a fee for, warranty, support, +indemnity or liability obligations to one or more recipients of Covered +Software. 
However, You may do so only on Your own behalf, and not on +behalf of any Contributor. You must make it absolutely clear that any +such warranty, support, indemnity, or liability obligation is offered by +You alone, and You hereby agree to indemnify every Contributor for any +liability incurred by such Contributor as a result of warranty, support, +indemnity or liability terms You offer. You may include additional +disclaimers of warranty and limitations of liability specific to any +jurisdiction. + +4. Inability to Comply Due to Statute or Regulation +--------------------------------------------------- + +If it is impossible for You to comply with any of the terms of this +License with respect to some or all of the Covered Software due to +statute, judicial order, or regulation then You must: (a) comply with +the terms of this License to the maximum extent possible; and (b) +describe the limitations and the code they affect. Such description must +be placed in a text file included with all distributions of the Covered +Software under this License. Except to the extent prohibited by statute +or regulation, such description must be sufficiently detailed for a +recipient of ordinary skill to be able to understand it. + +5. Termination +-------------- + +5.1. The rights granted under this License will terminate automatically +if You fail to comply with any of its terms. However, if You become +compliant, then the rights granted under this License from a particular +Contributor are reinstated (a) provisionally, unless and until such +Contributor explicitly and finally terminates Your grants, and (b) on an +ongoing basis, if such Contributor fails to notify You of the +non-compliance by some reasonable means prior to 60 days after You have +come back into compliance. 
Moreover, Your grants from a particular +Contributor are reinstated on an ongoing basis if such Contributor +notifies You of the non-compliance by some reasonable means, this is the +first time You have received notice of non-compliance with this License +from such Contributor, and You become compliant prior to 30 days after +Your receipt of the notice. + +5.2. If You initiate litigation against any entity by asserting a patent +infringement claim (excluding declaratory judgment actions, +counter-claims, and cross-claims) alleging that a Contributor Version +directly or indirectly infringes any patent, then the rights granted to +You by any and all Contributors for the Covered Software under Section +2.1 of this License shall terminate. + +5.3. In the event of termination under Sections 5.1 or 5.2 above, all +end user license agreements (excluding distributors and resellers) which +have been validly granted by You or Your distributors under this License +prior to termination shall survive termination. + +************************************************************************ +* * +* 6. Disclaimer of Warranty * +* ------------------------- * +* * +* Covered Software is provided under this License on an "as is" * +* basis, without warranty of any kind, either expressed, implied, or * +* statutory, including, without limitation, warranties that the * +* Covered Software is free of defects, merchantable, fit for a * +* particular purpose or non-infringing. The entire risk as to the * +* quality and performance of the Covered Software is with You. * +* Should any Covered Software prove defective in any respect, You * +* (not any Contributor) assume the cost of any necessary servicing, * +* repair, or correction. This disclaimer of warranty constitutes an * +* essential part of this License. No use of any Covered Software is * +* authorized under this License except under this disclaimer. 
* +* * +************************************************************************ + +************************************************************************ +* * +* 7. Limitation of Liability * +* -------------------------- * +* * +* Under no circumstances and under no legal theory, whether tort * +* (including negligence), contract, or otherwise, shall any * +* Contributor, or anyone who distributes Covered Software as * +* permitted above, be liable to You for any direct, indirect, * +* special, incidental, or consequential damages of any character * +* including, without limitation, damages for lost profits, loss of * +* goodwill, work stoppage, computer failure or malfunction, or any * +* and all other commercial damages or losses, even if such party * +* shall have been informed of the possibility of such damages. This * +* limitation of liability shall not apply to liability for death or * +* personal injury resulting from such party's negligence to the * +* extent applicable law prohibits such limitation. Some * +* jurisdictions do not allow the exclusion or limitation of * +* incidental or consequential damages, so this exclusion and * +* limitation may not apply to You. * +* * +************************************************************************ + +8. Litigation +------------- + +Any litigation relating to this License may be brought only in the +courts of a jurisdiction where the defendant maintains its principal +place of business and such litigation shall be governed by laws of that +jurisdiction, without reference to its conflict-of-law provisions. +Nothing in this Section shall prevent a party's ability to bring +cross-claims or counter-claims. + +9. Miscellaneous +---------------- + +This License represents the complete agreement concerning the subject +matter hereof. If any provision of this License is held to be +unenforceable, such provision shall be reformed only to the extent +necessary to make it enforceable. 
Any law or regulation which provides +that the language of a contract shall be construed against the drafter +shall not be used to construe this License against a Contributor. + +10. Versions of the License +--------------------------- + +10.1. New Versions + +Mozilla Foundation is the license steward. Except as provided in Section +10.3, no one other than the license steward has the right to modify or +publish new versions of this License. Each version will be given a +distinguishing version number. + +10.2. Effect of New Versions + +You may distribute the Covered Software under the terms of the version +of the License under which You originally received the Covered Software, +or under the terms of any subsequent version published by the license +steward. + +10.3. Modified Versions + +If you create software not governed by this License, and you want to +create a new license for such software, you may create and use a +modified version of this License if you rename the license and remove +any references to the name of the license steward (except to note that +such modified license differs from this License). + +10.4. Distributing Source Code Form that is Incompatible With Secondary +Licenses + +If You choose to distribute Source Code Form that is Incompatible With +Secondary Licenses under the terms of this version of the License, the +notice described in Exhibit B of this License must be attached. + +Exhibit A - Source Code Form License Notice +------------------------------------------- + + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. If a copy of the MPL was not distributed with this + file, You can obtain one at http://mozilla.org/MPL/2.0/. + +If it is not possible or desirable to put the notice in a particular +file, then You may include the notice in a location (such as a LICENSE +file in a relevant directory) where a recipient would be likely to look +for such a notice. 
+ +You may add additional accurate notices of copyright ownership. + +Exhibit B - "Incompatible With Secondary Licenses" Notice +--------------------------------------------------------- + + This Source Code Form is "Incompatible With Secondary Licenses", as + defined by the Mozilla Public License, v. 2.0. + + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. 
Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative 
Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + +QPDF NOTICE +=========== + +QPDF is copyright (c) 2005-2018 Jay Berkenbilt + +Licensed under the Apache License, Version 2.0 (the "License"); you may not use +this file except in compliance with the License. You may obtain a copy of the +License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed +under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +CONDITIONS OF ANY KIND, either express or implied. See the License for the +specific language governing permissions and limitations under the License. + +Versions of qpdf prior to version 7 were released under the terms of version 2.0 +of the Artistic License. At your option, you may continue to consider qpdf to be +licensed under those terms. 
Please see the manual for additional information. + +The qpdf distribution includes a copy of [qtest](http://qtest.qbilt.org), which +is released under the terms of the [version 2.0 of the Artistic +license](https://opensource.org/licenses/Artistic-2.0), which can be found at +https://opensource.org/licenses/Artistic-2.0. + +The Rijndael encryption implementation used as the basis for AES encryption and +decryption support comes from Philip J. Erdelsky's public domain implementation. +The files `libqpdf/rijndael.cc` and `libqpdf/qpdf/rijndael.h` remain in the +public domain. They were obtained from +* http://www.efgh.com/software/rijndael.htm +* http://www.efgh.com/software/rijndael.txt + +The embedded sha2 code comes from sphlib 3.0 +* http://www.saphir2.com/sphlib/ + +That code has the following license: + ``` + Copyright (c) 2007-2011 Projet RNRT SAPHIR + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ ``` diff --git a/requirements/docs.txt b/requirements/docs.txt new file mode 100644 index 0000000..e683077 --- /dev/null +++ b/requirements/docs.txt @@ -0,0 +1,7 @@ +defusedxml +ipython +matplotlib +pybind11 # not strictly necessary if pybind11 is vendored +setuptools_scm +sphinx >= 1.4 +sphinx-rtd-theme diff --git a/requirements/test.txt b/requirements/test.txt new file mode 100644 index 0000000..e798212 --- /dev/null +++ b/requirements/test.txt @@ -0,0 +1,7 @@ +attrs >= 17.4.0 +hypothesis >= 3.56.9 +Pillow >= 5.0.0 +pytest >= 3.6.0, < 4.1.0 +pytest-xdist >= 1.22.2 +pytest-helpers-namespace >= 2017.11.11 +pytest-timeout >= 1.3.0 diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..50244a4 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,31 @@ +[metadata] +license_file = licenses/license.wheel.txt + +[aliases] +test=pytest + +[tool:pytest] +norecursedirs = lib .pc .git venv +testpaths = tests + +[coverage:run] + +[coverage:report] +# Regexes for lines to exclude from consideration +exclude_lines = + # Have to re-enable the standard pragma + pragma: no cover + + # Don't complain if tests don't hit defensive assertion code: + raise AssertionError + raise NotImplementedError + + # Don't complain if non-runnable code isn't run: + if 0: + if __name__ == .__main__.: + +omit = + src/pikepdf/_boneyard.py + +[coverage:html] +directory = coverage/pycov diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..ab5c9c2 --- /dev/null +++ b/setup.py @@ -0,0 +1,166 @@ +from setuptools import setup, Extension +from setuptools.command.build_ext import build_ext +import sys +import setuptools +from os.path import join, dirname, exists +from glob import glob + + +class get_pybind_include(object): + """Helper class to determine the pybind11 include path + + The purpose of this class is to postpone importing pybind11 + until it is actually installed, so that the ``get_include()`` + method can be invoked. 
""" + + def __init__(self, user=False): + self.user = user + + def __str__(self): + # If we are vendoring use the vendored version + if exists('src/vendor/pybind11'): + return 'src/vendor/pybind11/include' + import pybind11 + return pybind11.get_include(self.user) + + +ext_modules = [ + Extension( + 'pikepdf._qpdf', + glob('src/qpdf/*.cpp'), + depends=glob('src/qpdf/*.h'), + include_dirs=[ + # Path to pybind11 headers + get_pybind_include(), + get_pybind_include(user=True) + ], + libraries=['qpdf'], + language='c++' + ), +] + + +# As of Python 3.6, CCompiler has a `has_flag` method. +# cf http://bugs.python.org/issue26689 +def has_flag(compiler, flagname): + """Return a boolean indicating whether a flag name is supported on + the specified compiler. + """ + import tempfile + with tempfile.NamedTemporaryFile('w', suffix='.cpp') as tmpf: + tmpf.write('int main (int argc, char **argv) { return 0; }') + try: + compiler.compile([tmpf.name], extra_postargs=[flagname]) + except setuptools.distutils.errors.CompileError: + return False + return True + + +def cpp_flag(compiler): + """Return the -std=c++[11/14] compiler flag. + + The c++14 is preferred over c++11 (when it is available). 
+ """ + if has_flag(compiler, '-std=c++14'): + return '-std=c++14' + elif has_flag(compiler, '-std=c++11'): + return '-std=c++11' + else: + raise RuntimeError('Unsupported compiler -- at least C++11 support ' + 'is needed!') + + +class BuildExt(build_ext): + """A custom build extension for adding compiler-specific options.""" + c_opts = { + 'msvc': ['/EHsc'], + 'unix': [], + } + + if sys.platform == 'darwin': + c_opts['unix'] += ['-stdlib=libc++', '-mmacosx-version-min=10.7'] + + def build_extensions(self): + ct = self.compiler.compiler_type + opts = self.c_opts.get(ct, []) + if ct == 'unix': + opts.append('-DVERSION_INFO="%s"' % self.distribution.get_version()) + opts.append(cpp_flag(self.compiler)) + if has_flag(self.compiler, '-fvisibility=hidden'): + opts.append('-fvisibility=hidden') + elif ct == 'msvc': + opts.append('/DVERSION_INFO=\\"%s\\"' % self.distribution.get_version()) + for ext in self.extensions: + ext.extra_compile_args = opts + build_ext.build_extensions(self) + +setup_py_cwd = dirname(__file__) + +with open(join(setup_py_cwd, 'requirements/docs.txt')) as f: + docs_require = [ + line.strip() for line in f + if line.strip() and not line.strip().startswith('#') + ] + + +with open(join(setup_py_cwd, 'requirements/test.txt')) as f: + tests_require = [ + line.strip() for line in f + if line.strip() and not line.strip().startswith('#') + ] + +with open(join(setup_py_cwd, 'README.md'), encoding='utf-8') as f: + readme = f.read() + +setup( + name='pikepdf', + author='James R. 
Barlow', + author_email='jim@purplerock.ca', + url='https://github.com/pikepdf/pikepdf', + description='Read and write PDFs with Python, powered by qpdf', + long_description=readme, + long_description_content_type='text/markdown', + ext_modules=ext_modules, + install_requires=[ + 'defusedxml >= 0.5.0', + 'lxml >= 4.0', + ], + extras_require={ + 'docs': docs_require + }, + cmdclass={'build_ext': BuildExt}, + zip_safe=False, + python_requires='>=3.5', + setup_requires=[ + 'pytest-runner', + 'setuptools_scm', + 'setuptools_scm_git_archive', + 'pybind11 >= 2.2.4, < 3' + ], + use_scm_version=True, + tests_require=tests_require, + package_dir={'': 'src'}, + packages=setuptools.find_packages('src'), + package_data={ + '': ['*.txt'], + 'pikepdf': ['qpdf21.dll'] + }, + classifiers=[ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "Intended Audience :: Information Technology", + "License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)", + "Programming Language :: Python :: 3.5", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: C++", + "Topic :: Multimedia :: Graphics", + "Topic :: Software Development :: Libraries", + ], + project_urls={ + 'Documentation': 'https://pikepdf.readthedocs.io/', + 'Source': 'https://github.com/pikepdf/pikepdf', + 'Tracker': 'https://github.com/pikepdf/pikepdf/issues' + } +) diff --git a/src/pikepdf/__init__.py b/src/pikepdf/__init__.py new file mode 100644 index 0000000..2e42605 --- /dev/null +++ b/src/pikepdf/__init__.py @@ -0,0 +1,41 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. +# +# Copyright (C) 2017, James R. 
Barlow (https://github.com/jbarlow83/) + + +from pkg_resources import ( + get_distribution as _get_distribution, + DistributionNotFound +) + +try: + from . import _qpdf +except ImportError: + raise ImportError("pikepdf's extension library failed to import") + +from ._qpdf import ( + PdfError, Pdf, PasswordError, ObjectStreamMode, StreamDecodeLevel +) +from .objects import ( + Object, ObjectType, Name, String, Array, Dictionary, Stream, Operator +) +from .models import ( + PdfImage, PdfInlineImage, UnsupportedImageTypeError, PdfMatrix, + parse_content_stream +) + +from . import _methods + +try: + __version__ = _get_distribution(__name__).version +except DistributionNotFound: + __version__ = "Not installed" + +__libqpdf_version__ = _qpdf.qpdf_version() + + +def open(*args, **kwargs): # pylint: disable=redefined-builtin + "Alias for :func:`pikepdf.Pdf.open`." + return Pdf.open(*args, **kwargs) diff --git a/src/pikepdf/_cpphelpers.py b/src/pikepdf/_cpphelpers.py new file mode 100644 index 0000000..d975657 --- /dev/null +++ b/src/pikepdf/_cpphelpers.py @@ -0,0 +1,47 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. +# +# Copyright (C) 2017, James R. Barlow (https://github.com/jbarlow83/) + +""" +Support functions called by the C++ library binding layer. Not intended to be +called from Python, and subject to change at any time. +""" + +import os +import sys + + +# Provide os.fspath equivalent for Python <3.6 +if sys.version_info[0:2] <= (3, 5): # pragma: no cover + def fspath(path): + '''https://www.python.org/dev/peps/pep-0519/#os''' + import pathlib + if isinstance(path, (str, bytes)): + return path + + # Work from the object's type to match method resolution of other magic + # methods. + path_type = type(path) + try: + path = path_type.__fspath__(path) + except AttributeError: + # Added for Python 3.5 support. 
+ if isinstance(path, pathlib.Path): + return str(path) + elif hasattr(path_type, '__fspath__'): + raise + else: + if isinstance(path, (str, bytes)): + return path + else: + raise TypeError("expected __fspath__() to return str or bytes, " + "not " + type(path).__name__) + + raise TypeError( + "expected str, bytes, pathlib.Path or os.PathLike object, not " + + path_type.__name__) + +else: + fspath = os.fspath diff --git a/src/pikepdf/_methods.py b/src/pikepdf/_methods.py new file mode 100644 index 0000000..6c2b90b --- /dev/null +++ b/src/pikepdf/_methods.py @@ -0,0 +1,270 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. +# +# Copyright (C) 2017, James R. Barlow (https://github.com/jbarlow83/) + +""" +In several cases the implementation of some higher levels features might as +well be in Python. Fortunately we can attach Python methods to C++ class +bindings after the fact. + +We can also move the implementation to C++ if desired. +""" + +from tempfile import NamedTemporaryFile +from subprocess import run, PIPE +from io import BytesIO + +from collections.abc import KeysView + +import inspect + +from . import Pdf, Dictionary, Array, Name, Stream, Object +from ._qpdf import _ObjectMapping +from .models import PdfMetadata + + +# pylint: disable=no-member,unsupported-membership-test,unsubscriptable-object + +def extends(cls_cpp): + """Attach methods of a Python support class to an existing class + + This monkeypatches all methods defined in the support class onto an + existing class. Example: + + .. code-block:: python + + @extends(ClassDefinedInCpp) + class SupportClass: + def foo(self): + pass + + The method 'foo' will be monkeypatched on ClassDefinedInCpp. SupportClass + has no meaning on its own and should not be used, but gets returned from + this function so IDE code inspection doesn't get too confused. 
+ + We don't subclass because it's much more convenient to monkeypatch Python + methods onto the existing Python binding of the C++ class. For one thing, + this allows the implementation to be moved from Python to C++ or vice + versa. It saves having to implement an intermediate subclass and then + ensures that the superclass never 'leaks' to pikepdf users. + + Any existing methods may be used, regardless of whether they defined + elsewhere in the support class or in the target class. + """ + + def real_class_extend(cls, cls_cpp=cls_cpp): + for name, fn in inspect.getmembers(cls, inspect.isfunction): + fn.__qualname__ = fn.__qualname__.replace( + cls.__name__, cls_cpp.__name__) + setattr(cls_cpp, name, fn) + for name, fn in inspect.getmembers(cls, inspect.isdatadescriptor): + setattr(cls_cpp, name, fn) + def block_init(self): + raise NotImplementedError(self.__class__.__name__ + '.__init__') + cls.__init__ = block_init + return cls + return real_class_extend + + +def _single_page_pdf(page): + """Construct a single page PDF from the provided page in memory""" + pdf = Pdf.new() + pdf.pages.append(page) + bio = BytesIO() + pdf.save(bio) + bio.seek(0) + return bio.read() + + +def _mudraw(buffer, fmt): + """Use mupdf draw to rasterize the PDF in the memory buffer""" + with NamedTemporaryFile(suffix='.pdf') as tmp_in: + tmp_in.write(buffer) + tmp_in.seek(0) + tmp_in.flush() + + proc = run( + ['mudraw', '-F', fmt, '-o', '-', tmp_in.name], + stdout=PIPE, stderr=PIPE + ) + if proc.stderr: + raise RuntimeError(proc.stderr.decode()) + return proc.stdout + + +@extends(Object) +class Extend_Object: + + def _repr_mimebundle_(self, **kwargs): + """Present options to IPython for rich display of this object + + See https://ipython.readthedocs.io/en/stable/config/integrating.html#rich-display + """ + + include = kwargs['include'] + exclude = kwargs['exclude'] + include = set() if include else include + exclude = set() if exclude is None else exclude + + data = {} + if '/Type' not in 
self: + return data + + if self.Type == '/Page': + bundle = {'application/pdf', 'image/png'} + if include: + bundle = bundle & include + bundle = bundle - exclude + pagedata = _single_page_pdf(self) + if 'application/pdf' in bundle: + data['application/pdf'] = pagedata + if 'image/png' in bundle: + try: + data['image/png'] = _mudraw(pagedata, 'png') + except (FileNotFoundError, RuntimeError): + pass + return data + + +@extends(Pdf) +class Extend_Pdf: + + def _repr_mimebundle_(self, **kwargs): + """ + Present options to IPython for rich display of this object + + See https://ipython.readthedocs.io/en/stable/config/integrating.html#rich-display + """ + + bio = BytesIO() + self.save(bio) + bio.seek(0) + + data = {'application/pdf': bio.read()} + return data + + def open_metadata( + self, + set_pikepdf_as_editor=True, + update_docinfo=True + ): + """ + Open the PDF's XMP metadata for editing + + Recommend for use in a ``with`` block. Changes are committed to the + PDF when the block exits. + + Example: + >>> with pdf.open_metadata() as meta: + meta['dc:title'] = 'Set the Dublic Core Title' + meta['dc:description'] = 'Put the Abstract here' + + Args: + set_pikepdf_as_editor (bool): Update the metadata to show that this + version of pikepdf is the most software to modify the metadata. + Recommended, except for testing. + + update_docinfo (bool): Update the deprecated PDF DocumentInfo block + to be consistent with XMP. + + Returns: + pikepdf.models.PdfMetadata + """ + return PdfMetadata( + self, + pikepdf_mark=set_pikepdf_as_editor, + sync_docinfo=update_docinfo + ) + + def _attach(self, *, basename, filebytes, mime=None, desc=''): + """ + Attach a file to this PDF + + Args: + basename (str): The basename (filename withouth path) to name the + file. Not necessarily the name of the file on disk. Will be s + hown to the user by the PDF viewer. filebytes (bytes): The file + contents. + + mime (str or None): A MIME type for the filebytes. 
If omitted, we try + to guess based on the standard library's + :func:`mimetypes.guess_type`. If this cannot be determined, the + generic value `application/octet-stream` is used. This value is + used by PDF viewers to decide how to present the information to + the user. + + desc (str): A extended description of the file contents. PDF viewers + also display this information to the user. In Acrobat DC this is + hidden in a context menu. + + The PDF will also be modified to request the PDF viewer to display the + list of attachments when opened, as opposed to other viewing modes. Some + PDF viewers will not make it obvious to the user that attachments are + present unless this is done. This behavior may be overridden by changing + ``pdf.Root.PageMode`` to some other valid value. + + """ + + if '/Names' not in self.Root: + self.Root.Names = self.make_indirect(Dictionary()) + if '/EmbeddedFiles' not in self.Root: + self.Root.Names.EmbeddedFiles = self.make_indirect(Dictionary()) + if '/Names' not in self.Root.Names.EmbeddedFiles: + self.Root.Names.EmbeddedFiles.Names = Array() + + if '/' in basename or '\\' in basename: + raise ValueError("basename should be a basename (no / or \\)") + + if not mime: + from mimetypes import guess_type + mime, _encoding = guess_type(basename) + if not mime: + mime = 'application/octet-stream' + + filestream = Stream(self, filebytes) + filestream.Subtype = Name('/' + mime) + + filespec = Dictionary({ + '/Type': Name.Filespec, + '/F': basename, + '/UF': basename, + '/Desc': desc, + '/EF': Dictionary({ + '/F': filestream + }) + }) + + # names = self.Root.Names.EmbeddedFiles.Names.as_list() + # names.append(filename) # Key + # names.append(self.make_indirect(filespec)) + self.Root.Names.EmbeddedFiles.Names = Array([ + basename, # key + self.make_indirect(filespec) + ]) + + if '/PageMode' not in self.Root: + self.Root.PageMode = Name.UseAttachments + +@extends(_ObjectMapping) +class Extend_ObjectMapping: + def __contains__(self, key): + try: 
def parse_content_stream(page_or_stream, operators=''):
    """
    Parse a PDF content stream into a sequence of instructions.

    A PDF content stream is a list of instructions that describe where to
    render the text and graphics in a PDF. This is the starting point for
    analyzing PDFs.

    If the input is a page and page.Contents is an array, then the content
    stream is automatically treated as one coalesced stream.

    Each instruction contains at least one operator and zero or more operands.

    Args:
        page_or_stream (pikepdf.Object): A page object, or the content
            stream attached to another object such as a Form XObject.
        operators (str): A space-separated string of operators to whitelist.
            For example 'q Q cm Do' will return only operators
            that pertain to drawing images. Use 'BI ID EI' for inline images.
            All other operators and associated tokens are ignored. If blank,
            all tokens are accepted.

    Returns:
        list: List of ``(operands, command)`` tuples where ``command`` is an
            operator (str) and ``operands`` is a tuple of str; the PDF drawing
            command and the command's operands, respectively.

    Raises:
        TypeError: If the argument is not a PDF object, is neither a page
            nor a stream, or is reported by the parser as a non-stream.
        PdfError: Any other parsing failure, re-raised unchanged.

    Example:

        >>> pdf = pikepdf.Pdf.open(input_pdf)
        >>> page = pdf.pages[0]
        >>> for operands, command in parse_content_stream(page):
        ...     print(command)

    """

    if not isinstance(page_or_stream, Object):
        raise TypeError("stream must be a PDF object")

    is_page = page_or_stream.get('/Type') == '/Page'
    if page_or_stream._type_code != ObjectType.stream and not is_page:
        raise TypeError(
            "parse_content_stream called on non-page, non-stream object")

    try:
        if is_page:
            instructions = page_or_stream._parse_page_contents_grouped(
                operators)
        else:
            instructions = Object._parse_stream_grouped(
                page_or_stream, operators)
    except PdfError as e:
        # This is the error message for qpdf >= 7.0. It was different in 6.x
        # but we no longer support 6.x
        if 'ignoring non-stream while parsing' in str(e):
            raise TypeError(
                "parse_content_stream called on non-stream Object") from e
        # A bare raise keeps the original traceback; "raise e from e" would
        # make the exception its own cause.
        raise

    return instructions
+ + This cannot detect raster text (text in a bitmap), text rendered as + curves. It also cannot determine if the text is visible to the user. + + :return: True if there is text + """ + text_showing_operators = """TJ " ' Tj""" + text_showing_insts = parse_content_stream( + self.obj, text_showing_operators) + if len(text_showing_insts) > 0: + return True + return False diff --git a/src/pikepdf/models/image.py b/src/pikepdf/models/image.py new file mode 100644 index 0000000..8ecb571 --- /dev/null +++ b/src/pikepdf/models/image.py @@ -0,0 +1,626 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. +# +# Copyright (C) 2017, James R. Barlow (https://github.com/jbarlow83/) + +from io import BytesIO +from itertools import zip_longest +from abc import ABC, abstractmethod +import struct + +from decimal import Decimal + +from .. import ( + Object, Array, PdfError, Name, Dictionary, Stream +) + +class DependencyError(Exception): + pass + +class UnsupportedImageTypeError(Exception): + pass + + +def array_str(value): + if isinstance(value, (list, Array)): + return [str(item) for item in value] + if isinstance(value, Name): + return [str(value)] + raise NotImplementedError(value) + + +def array_str_colorspace(value): + if isinstance(value, (list, Array)): + items = [item for item in value] + if len(items) == 4 and items[0] == '/Indexed': + result = [str(items[n]) for n in range(3)] + result.append(bytes(items[3])) + return result + return array_str(items) + return array_str(value) + + +def dict_or_array_dict(value): + if isinstance(value, list): + return value + if isinstance(value, Dictionary): + return [value.as_dict()] + if isinstance(value, Array): + return [v.as_list() for v in value] + raise NotImplementedError(value) + + +def metadata_from_obj(obj, name, type_, default): + val = getattr(obj, name, default) + try: + return 
class PdfImageBase(ABC):
    """Read-only view of the metadata shared by all PDF image wrappers.

    Subclasses provide :meth:`_metadata`, :attr:`is_inline` and
    :meth:`as_pil_image`; every other property here is derived from
    :meth:`_metadata`.
    """

    SIMPLE_COLORSPACES = ('/DeviceRGB', '/DeviceGray', '/CalRGB', '/CalGray')

    @abstractmethod
    def _metadata(self, name, type_, default):
        """Return image dictionary entry *name* converted with *type_*.

        *default* is the value to use when the entry is absent.
        """

    @property
    def width(self):
        """Width of the image data in pixels"""
        return self._metadata('Width', int, None)

    @property
    def height(self):
        """Height of the image data in pixels"""
        return self._metadata('Height', int, None)

    @property
    def image_mask(self):
        """``True`` if this is an image mask"""
        return self._metadata('ImageMask', bool, False)

    @property
    def _bpc(self):
        """Bits per component for this image (low-level)"""
        return self._metadata('BitsPerComponent', int, None)

    @property
    def _colorspaces(self):
        """Colorspace (low-level)"""
        return self._metadata('ColorSpace', array_str_colorspace, [])

    @property
    def filters(self):
        """List of names of the filters that we applied to encode this image"""
        return self._metadata('Filter', array_str, [])

    @property
    def decode_parms(self):
        """List of the /DecodeParms, arguments to filters"""
        return self._metadata('DecodeParms', dict_or_array_dict, [])

    @property
    def colorspace(self):
        """PDF name of the colorspace that best describes this image

        Returns ``None`` for image masks.

        Raises:
            NotImplementedError: If the colorspace entry is missing or is
                not one of the layouts handled here.
        """
        if self.image_mask:
            return None  # Undefined for image masks

        # Read once: each access to the property repeats the metadata lookup
        # and conversion.
        spaces = self._colorspaces
        if spaces:
            first = spaces[0]
            if first in self.SIMPLE_COLORSPACES or first == '/DeviceCMYK':
                return first
            if first == '/Indexed' and spaces[1] in self.SIMPLE_COLORSPACES:
                return spaces[1]
            if first == '/ICCBased':
                # NOTE(review): this expects spaces[1] to still be a stream
                # object exposing .stream_dict - confirm that the
                # ColorSpace conversion does not stringify it first.
                icc = spaces[1]
                return icc.stream_dict.get('/Alternate', '')
        raise NotImplementedError(
            "not sure how to get colorspace: " + repr(spaces))

    @property
    def bits_per_component(self):
        """Bits per component of this image

        When the entry is absent, image masks report 1 and all other
        images report 8.
        """
        bpc = self._bpc
        if bpc is None:
            return 1 if self.image_mask else 8
        return bpc

    @property
    @abstractmethod
    def is_inline(self):
        """Whether the image is an inline image (defined by subclasses)."""

    @property
    def indexed(self):
        """``True`` if the image has a defined color palette"""
        return '/Indexed' in self._colorspaces

    @property
    def size(self):
        """Size of image as (width, height)"""
        return self.width, self.height

    @property
    def mode(self):
        """``PIL.Image.mode`` equivalent for this image

        Raises:
            NotImplementedError: If no Pillow mode corresponds to this
                combination of palette, bit depth and colorspace.
        """
        m = ''
        if self.indexed:
            m = 'P'
        else:
            bpc = self.bits_per_component
            if bpc == 1:
                m = '1'
            elif bpc == 8:
                # Only 8-bit images need the colorspace; look it up once.
                colorspace = self.colorspace
                if colorspace == '/DeviceRGB':
                    m = 'RGB'
                elif colorspace == '/DeviceGray':
                    m = 'L'
                elif colorspace == '/DeviceCMYK':
                    m = 'CMYK'
        if m == '':
            raise NotImplementedError("Not sure how to handle PDF image of this type")
        return m

    @property
    def filter_decodeparms(self):
        """PDF has a lot of optional data structures concerning /Filter and
        /DecodeParms. /Filter can be absent or a name or an array, /DecodeParms
        can be absent or a dictionary (if /Filter is a name) or an array (if
        /Filter is an array). When both are arrays the lengths match.

        Normalize this into:
        [(/FilterName, {/DecodeParmName: Value, ...}), ...]

        The order of /Filter matters as it indicates the encoding/decoding
        sequence.

        """
        return list(zip_longest(self.filters, self.decode_parms, fillvalue={}))

    @property
    def palette(self):
        """Retrieves the color palette for this image

        :returns: (base_colorspace: str, palette: bytes), or ``None`` when
            the image is not indexed
        :rtype: tuple
        :raises ValueError: if the colorspace does not have four entries
        :raises NotImplementedError: if the base colorspace is not simple
        """

        if not self.indexed:
            return None
        try:
            _idx, base, hival, lookup = self._colorspaces
        except ValueError as e:
            raise ValueError('Not sure how to interpret this palette') from e
        base = str(base)
        int(hival)  # validation only: hival must be an integer
        lookup = bytes(lookup)
        if base not in self.SIMPLE_COLORSPACES:
            raise NotImplementedError("not sure how to interpret this palette")
        if base == '/DeviceRGB':
            base = 'RGB'
        elif base == '/DeviceGray':
            base = 'L'
        return base, lookup

    @abstractmethod
    def as_pil_image(self):
        """Return the image as a Pillow image (defined by subclasses)."""
+ """ + + def __new__(cls, obj): + instance = super().__new__(cls) + instance.__init__(obj) + if '/JPXDecode' in instance.filters: + instance = super().__new__(PdfJpxImage) + instance.__init__(obj) + return instance + + def __init__(self, obj): + """Construct a PDF image from a Image XObject inside a PDF + + ``pim = PdfImage(page.Resources.XObject['/ImageNN'])`` + + Args: + obj (pikepdf.Object): an Image XObject + + """ + if isinstance(obj, Stream) and \ + obj.stream_dict.get("/Subtype") != "/Image": + raise TypeError("can't construct PdfImage from non-image") + self.obj = obj + + @classmethod + def _from_pil_image(cls, *, pdf, page, name, image): # pragma: no cover + """Insert a PIL image into a PDF (rudimentary) + + :param pdf: the PDF to attach the image to + :type pdf: pikepdf.Pdf + :param page: the page to attach the image to + :param name: the name to set the image + :param image: image + :type image: PIL.Image.Image + """ + + data = image.tobytes() + + imstream = Stream(pdf, data) + imstream.Type = Name('/XObject') + imstream.Subtype = Name('/Image') + if image.mode == 'RGB': + imstream.ColorSpace = Name('/DeviceRGB') + elif image.mode in ('1', 'L'): + imstream.ColorSpace = Name('/DeviceGray') + imstream.BitsPerComponent = 1 if image.mode == '1' else 8 + imstream.Width = image.width + imstream.Height = image.height + + page.Resources.XObject[name] = imstream + + return cls(imstream) + + def _metadata(self, name, type_, default): + return metadata_from_obj(self.obj, name, type_, default) + + @property + def is_inline(self): + """``False`` for image XObject""" + return False + + def _extract_direct(self, *, stream): + """Attempt to extract the image directly to a usable image file + + If there is no way to extract the image without decompressing or + transcoding then raise an exception. The type and format of image + generated will vary. 
+ + Args: + stream: Writable stream to write data to + """ + + def normal_dct_rgb(): + # Normal DCTDecode RGB images have the default value of + # /ColorTransform 1 and are actually in YUV. Such a file can be + # saved as a standard JPEG. RGB JPEGs without YUV conversion can't + # be saved as JPEGs, and are probably bugs. Some software in the + # wild actually produces RGB JPEGs in PDFs (probably a bug). + return (self.mode == 'RGB' and + self.filter_decodeparms[0][1].get('/ColorTransform', 1)) + + def normal_dct_cmyk(): + # Normal DCTDecode CMYKs have /ColorTransform 0 and can be saved. + # There is a YUVK colorspace but CMYK JPEGs don't generally use it + return (self.mode == 'CMYK' and + self.filter_decodeparms[0][1].get('/ColorTransform', 0)) + + if self.filters == ['/CCITTFaxDecode']: + data = self.obj.read_raw_bytes() + stream.write(self._generate_ccitt_header(data)) + stream.write(data) + return '.tif' + elif self.filters == ['/DCTDecode'] and ( + self.mode == 'L' or normal_dct_rgb() or normal_dct_cmyk): + buffer = self.obj.get_raw_stream_buffer() + stream.write(buffer) + return '.jpg' + + raise UnsupportedImageTypeError() + + def _extract_transcoded(self): + from PIL import Image + im = None + if self.mode == 'RGB' and self.bits_per_component == 8: + # No point in accessing the buffer here, size qpdf decodes to 3-byte + # RGB and Pillow needs RGBX for raw access + data = self.read_bytes() + im = Image.frombytes('RGB', self.size, data) + elif self.mode in ('L', 'P') and self.bits_per_component == 8: + buffer = self.get_stream_buffer() + stride = 0 # tell Pillow to calculate stride from line width + ystep = 1 # image is top to bottom in memory + im = Image.frombuffer('L', self.size, buffer, "raw", 'L', stride, + ystep) + if self.mode == 'P': + base_mode, palette = self.palette + if base_mode in ('RGB', 'L'): + im.putpalette(palette, rawmode=base_mode) + else: + raise NotImplementedError('palette with ' + base_mode) + elif self.mode == '1' and 
self.bits_per_component == 1: + data = self.read_bytes() + im = Image.frombytes('1', self.size, data) + + elif self.mode == 'P' and self.bits_per_component == 1: + data = self.read_bytes() + im = Image.frombytes('1', self.size, data) + + base_mode, palette = self.palette + if not (palette == b'\x00\x00\x00\xff\xff\xff' + or palette == b'\x00\xff'): + raise NotImplementedError( + 'monochrome image with nontrivial palette') + + return im + + def extract_to(self, *, stream): + """Attempt to extract the image directly to a usable image file + + If possible, the compressed data is extracted and inserted into + a compressed image file format without transcoding the compressed + content. If this is not possible, the data will be decompressed + and extracted to an appropriate format. + + Because it is not known until attempted what image format will be + extracted, users should not assume what format they are getting back. + When saving the image to a file, use a temporary filename, and then + rename the file to its final name based on the returned file extension. 
+ + Args: + stream: Writable stream to write data to + + Returns: + str: The file format extension + """ + + try: + return self._extract_direct(stream=stream) + except UnsupportedImageTypeError: + pass + + im = self._extract_transcoded() + if im: + im.save(stream, format='png') + return '.png' + + raise UnsupportedImageTypeError(repr(self)) + + def read_bytes(self): + """Decompress this image and return it as unencoded bytes""" + return self.obj.read_bytes() + + def get_stream_buffer(self): + """Access this image with the buffer protocol""" + return self.obj.get_stream_buffer() + + def as_pil_image(self): + """Extract the image as a Pillow Image, using decompression as necessary + + Returns: + PIL.Image.Image + """ + from PIL import Image + + try: + bio = BytesIO() + self._extract_direct(stream=bio) + bio.seek(0) + return Image.open(bio) + except UnsupportedImageTypeError: + pass + + im = self._extract_transcoded() + if not im: + raise UnsupportedImageTypeError(repr(self)) + + return im + + def _generate_ccitt_header(self, data): + """Construct a CCITT G3 or G4 header from the PDF metadata""" + # https://stackoverflow.com/questions/2641770/ + # https://www.itu.int/itudoc/itu-t/com16/tiff-fx/docs/tiff6.pdf + + if not self.decode_parms: + raise ValueError("/CCITTFaxDecode without /DecodeParms") + + if self.decode_parms[0].get("/K", 1) < 0: + ccitt_group = 4 # Pure two-dimensional encoding (Group 4) + else: + ccitt_group = 3 + black_is_one = self.decode_parms[0].get("/BlackIs1", False) + white_is_zero = 1 if black_is_one else 0 + + img_size = len(data) + tiff_header_struct = '<' + '2s' + 'H' + 'L' + 'H' + 'HHLL' * 8 + 'L' + tiff_header = struct.pack( + tiff_header_struct, + b'II', # Byte order indication: Little endian + 42, # Version number (always 42) + 8, # Offset to first IFD + 8, # Number of tags in IFD + 256, 4, 1, self.width, # ImageWidth, LONG, 1, width + 257, 4, 1, self.height, # ImageLength, LONG, 1, length + 258, 3, 1, 1, # BitsPerSample, SHORT, 1, 1 + 259, 
3, 1, ccitt_group, # Compression, SHORT, 1, 4 = CCITT Group 4 fax encoding + 262, 3, 1, int(white_is_zero), # Thresholding, SHORT, 1, 0 = WhiteIsZero + 273, 4, 1, struct.calcsize(tiff_header_struct), # StripOffsets, LONG, 1, length of header + 278, 4, 1, self.height, + 279, 4, 1, img_size, # StripByteCounts, LONG, 1, size of image + 0 # last IFD + ) + return tiff_header + + def show(self): + """Show the image however PIL wants to""" + self.as_pil_image().show() + + def __repr__(self): + return ''.format( + self.mode, self.width, self.height, hex(id(self))) + + def _repr_png_(self): + """Display hook for IPython/Jupyter""" + b = BytesIO() + im = self.as_pil_image() + im.save(b, 'PNG') + return b.getvalue() + + +class PdfJpxImage(PdfImage): + + def __init__(self, obj): + super().__init__(obj) + self.pil = self.as_pil_image() + + def _extract_direct(self, *, stream): + buffer = self.obj.get_raw_stream_buffer() + stream.write(buffer) + return '.jp2' + + @property + def _colorspaces(self): + # (PDF 1.7 Table 89) If ColorSpace is present, any colour space + # specifications in the JPEG2000 data shall be ignored. + super_colorspaces = super()._colorspaces + if super_colorspaces: + return super_colorspaces + if self.pil.mode == 'L': + return ['/DeviceGray'] + elif self.pil.mode == 'RGB': + return ['/DeviceRGB'] + raise NotImplementedError('Complex JP2 colorspace') + + @property + def _bpc(self): + # (PDF 1.7 Table 89) If the image stream uses the JPXDecode filter, this + # entry is optional and shall be ignored if present. The bit depth is + # determined by the conforming reader in the process of decoding the + # JPEG2000 image. + return 8 + + @property + def indexed(self): + # Nothing in the spec precludes an Indexed JPXDecode image, except for + # the fact that doing so is madness. Let's assume it no one is that + # insane. 
+ return False + + def __repr__(self): + return ''.format( + self.mode, self.width, self.height, hex(id(self))) + + +class PdfInlineImage(PdfImageBase): + """Support class for PDF inline images""" + + # Inline images can contain abbreviations that we write automatically + ABBREVS = { + b'/W': b'/Width', + b'/H': b'/Height', + b'/BPC': b'/BitsPerComponent', + b'/IM': b'/ImageMask', + b'/CS': b'/ColorSpace', + b'/F': b'/Filter', + b'/DP': b'/DecodeParms', + b'/G': b'/DeviceGray', + b'/RGB': b'/DeviceRGB', + b'/CMYK': b'/DeviceCMYK', + b'/I': b'/Indexed', + b'/AHx': b'/ASCIIHexDecode', + b'/A85': b'/ASCII85Decode', + b'/LZW': b'/LZWDecode', + b'/RL': b'/RunLengthDecode', + b'/CCF': b'/CCITTFaxDecode', + b'/DCT': b'/DCTDecode' + } + + def __init__(self, *, image_data, image_object: tuple): + """ + Args: + image_data: data stream for image, extracted from content stream + image_object: the metadata for image, also from content stream + """ + + # Convert the sequence of pikepdf.Object from the content stream into + # a dictionary object by unparsing it (to bytes), eliminating inline + # image abbreviations, and constructing a bytes string equivalent to + # what an image XObject would look like. 
Then retrieve data from there + + self._data = image_data + self._image_object = image_object + + reparse = b' '.join(self._unparse_obj(obj) for obj in image_object) + try: + reparsed_obj = Object.parse(b'<< ' + reparse + b' >>') + except PdfError as e: + raise PdfError( + "parsing inline " + reparse.decode('unicode_escape')) from e + self.obj = reparsed_obj + self.pil = None + + @classmethod + def _unparse_obj(cls, obj): + if isinstance(obj, Object): + if isinstance(obj, Name): + name = obj.unparse(resolved=True) + assert isinstance(name, bytes) + return cls.ABBREVS.get(name, name) + else: + return obj.unparse(resolved=True) + elif isinstance(obj, bool): + return b'true' if obj else b'false' # Lower case for PDF spec + elif isinstance(obj, (int, Decimal, float)): + return str(obj).encode('ascii') + else: + raise NotImplementedError(repr(obj)) + + + def _metadata(self, name, type_, default): + return metadata_from_obj(self.obj, name, type_, default) + + def unparse(self): + tokens = [] + tokens.append(b'BI') + metadata = [] + for metadata_obj in self._image_object: + unparsed = self._unparse_obj(metadata_obj) + assert isinstance(unparsed, bytes) + metadata.append(unparsed) + tokens.append(b' '.join(metadata)) + tokens.append(b'ID') + tokens.append(self._data._inline_image_raw_bytes()) + tokens.append(b'EI') + return b'\n'.join(tokens) + + @property + def is_inline(self): + return True + + def __repr__(self): + mode = '?' 
class PdfMatrix:
    """
    Support class for PDF content stream matrices

    PDF content stream matrices are 3x3 matrices summarized by a shorthand
    ``(a, b, c, d, e, f)`` which correspond to the first two column vectors.
    The final column vector is always ``(0, 0, 1)`` since this is using
    homogeneous coordinates.

    PDF uses row vectors.  That is, ``vr @ A'`` gives the effect of transforming
    a row vector ``vr=(x, y, 1)`` by the matrix ``A'``.  Most textbook
    treatments use ``A @ vc`` where the column vector ``vc=(x, y, 1)'``.

    (``@`` is the Python matrix multiplication operator added in Python 3.5.)

    Addition and other operations are not implemented because they're not that
    meaningful in a PDF context (they can be defined and are mathematically
    meaningful in general).

    PdfMatrix objects are immutable. All transformations on them produce a new
    matrix.

    """

    def __init__(self, *args):
        """Construct a matrix.

        Accepted forms:

        * no arguments: the identity matrix
        * six numbers ``a, b, c, d, e, f``
        * another :class:`PdfMatrix` (its values are shared)
        * one sequence of six numbers
        * one sequence of three rows with three entries each

        Raises:
            ValueError: If the arguments match none of these forms.
        """
        if not args:
            self.values = ((1, 0, 0), (0, 1, 0), (0, 0, 1))
        elif len(args) == 6:
            a, b, c, d, e, f = map(float, args)
            self.values = ((a, b, 0),
                           (c, d, 0),
                           (e, f, 1))
        elif isinstance(args[0], PdfMatrix):
            self.values = args[0].values
        elif len(args[0]) == 6:
            a, b, c, d, e, f = map(float, args[0])
            self.values = ((a, b, 0),
                           (c, d, 0),
                           (e, f, 1))
        elif len(args[0]) == 3 and all(len(row) == 3 for row in args[0]):
            # Check every row, not just the row count, so a ragged
            # "matrix" is rejected here.
            self.values = (tuple(args[0][0]),
                           tuple(args[0][1]),
                           tuple(args[0][2]))
        else:
            raise ValueError('arguments')

    @staticmethod
    def identity():
        """Constructs and returns an identity matrix"""
        return PdfMatrix()

    def __matmul__(self, other):
        """Multiply this matrix by another matrix

        Can be used to concatenate transformations.

        """
        a = self.values
        b = other.values
        # zip(*b) iterates over the columns of b.
        return PdfMatrix(
            [[sum(float(i) * float(j) for i, j in zip(row, col))
              for col in zip(*b)]
             for row in a]
        )

    def scaled(self, x, y):
        """Concatenates a scaling matrix on this matrix"""
        return self @ PdfMatrix((x, 0, 0, y, 0, 0))

    def rotated(self, angle_degrees_ccw):
        """Concatenates a rotation matrix on this matrix"""
        angle = angle_degrees_ccw / 180.0 * pi
        c, s = cos(angle), sin(angle)
        return self @ PdfMatrix((c, s, -s, c, 0, 0))

    def translated(self, x, y):
        """Translates this matrix"""
        return self @ PdfMatrix((1, 0, 0, 1, x, y))

    @property
    def shorthand(self):
        """Return the 6-tuple (a,b,c,d,e,f) that describes this matrix"""
        return (self.a, self.b, self.c, self.d, self.e, self.f)

    @property
    def a(self):
        """Entry in row 0, column 0."""
        return self.values[0][0]

    @property
    def b(self):
        """Entry in row 0, column 1."""
        return self.values[0][1]

    @property
    def c(self):
        """Entry in row 1, column 0."""
        return self.values[1][0]

    @property
    def d(self):
        """Entry in row 1, column 1."""
        return self.values[1][1]

    @property
    def e(self):
        """Entry in row 2, column 0."""
        return self.values[2][0]

    @property
    def f(self):
        """Entry in row 2, column 1."""
        return self.values[2][1]

    def encode(self):
        """Encode this matrix in binary suitable for including in a PDF"""
        return '{:.6f} {:.6f} {:.6f} {:.6f} {:.6f} {:.6f}'.format(
            self.a, self.b, self.c, self.d, self.e, self.f
        ).encode()

    def __repr__(self):
        return 'pikepdf.Matrix(' + repr(self.values) + ')'
import Stream, Name, String, PdfError + +XMP_NS_DC = "http://purl.org/dc/elements/1.1/" +XMP_NS_PDF = "http://ns.adobe.com/pdf/1.3/" +XMP_NS_PDFA_ID = "http://www.aiim.org/pdfa/ns/id/" +XMP_NS_PDFX_ID = "http://www.npes.org/pdfx/ns/id/" +XMP_NS_PHOTOSHOP = "http://ns.adobe.com/photoshop/1.0/" +XMP_NS_PRISM2 = "http://prismstandard.org/namespaces/basic/2.0/" +XMP_NS_PRISM3 = "http://prismstandard.org/namespaces/basic/3.0/" +XMP_NS_RDF = "http://www.w3.org/1999/02/22-rdf-syntax-ns#" +XMP_NS_XMP = "http://ns.adobe.com/xap/1.0/" +XMP_NS_XMP_MM = "http://ns.adobe.com/xap/1.0/mm/" +XMP_NS_XMP_RIGHTS = "http://ns.adobe.com/xap/1.0/rights/" + +DEFAULT_NAMESPACES = [ + ('adobe:ns:meta/', 'x'), + (XMP_NS_DC, 'dc'), + (XMP_NS_PDF, 'pdf'), + (XMP_NS_PDFA_ID, 'pdfaid'), + (XMP_NS_PDFX_ID, 'pdfxid'), + (XMP_NS_PHOTOSHOP, 'photoshop'), + (XMP_NS_PRISM2, 'prism2'), + (XMP_NS_PRISM3, 'prism3'), + (XMP_NS_RDF, 'rdf'), + (XMP_NS_XMP, 'xmp'), + (XMP_NS_XMP_MM, 'xmpMM'), + (XMP_NS_XMP_RIGHTS, 'xmpRights'), +] + +for _uri, _prefix in DEFAULT_NAMESPACES: + etree.register_namespace(_prefix, _uri) + +# This one should not be registered +XMP_NS_XML = "http://www.w3.org/XML/1998/namespace" + +XPACKET_BEGIN = b"""\n""" + +XMP_EMPTY = b""" + + + +""" + +XPACKET_END = b"""\n\n""" + +XmpContainer = namedtuple('XmpContainer', ['rdf_type', 'py_type', 'insert_fn']) + + +class AltList(list): + pass + + +XMP_CONTAINERS = [ + XmpContainer('Alt', AltList, AltList.append), + XmpContainer('Bag', set, set.add), + XmpContainer('Seq', list, list.append), +] + +LANG_ALTS = frozenset([ + str(QName(XMP_NS_DC, 'title')), + str(QName(XMP_NS_DC, 'description')), + str(QName(XMP_NS_DC, 'rights')), + str(QName(XMP_NS_XMP_RIGHTS, 'UsageTerms')), +]) + +# These are the illegal characters in XML 1.0. (XML 1.1 is a bit more permissive, +# but we'll be strict to ensure wider compatibility.) 
+re_xml_illegal_chars = re.compile( + r"(?u)[^\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD\u10000-\u10FFFF]" +) +re_xml_illegal_bytes = re.compile( + br"[^\x09\x0A\x0D\x20-\xFF]|�" +) + +# Repeat this to avoid a circular import from the top package's pikepdf.__version__ +try: + pikepdf_version = _get_distribution(__name__).version +except DistributionNotFound: + pikepdf_version = "" + + +def encode_pdf_date(d: datetime) -> str: + """Encode Python datetime object as PDF date string + + From Adobe pdfmark manual: + (D:YYYYMMDDHHmmSSOHH'mm') + D: is an optional prefix. YYYY is the year. All fields after the year are + optional. MM is the month (01-12), DD is the day (01-31), HH is the + hour (00-23), mm are the minutes (00-59), and SS are the seconds + (00-59). The remainder of the string defines the relation of local + time to GMT. O is either + for a positive difference (local time is + later than GMT) or - (minus) for a negative difference. HH' is the + absolute value of the offset from GMT in hours, and mm' is the + absolute value of the offset in minutes. If no GMT information is + specified, the relation between the specified time and GMT is + considered unknown. Regardless of whether or not GMT + information is specified, the remainder of the string should specify + the local time. + """ + + pdfmark_date_fmt = r'%Y%m%d%H%M%S' + s = d.strftime(pdfmark_date_fmt) + tz = d.strftime('%z') + if tz: + sign, tz_hours, tz_mins = tz[0], tz[1:3], tz[3:5] + s += "{}{}'{}'".format(sign, tz_hours, tz_mins) + return s + + +def decode_pdf_date(s: str) -> datetime: + """Decode a pdfmark date to a Python datetime object + + A pdfmark date is a string in a particular format. See the pdfmark + Reference for the specification.
+ """ + if isinstance(s, String): + s = str(s) + if s.startswith('D:'): + s = s[2:] + + # Literal Z00'00', is incorrect but found in the wild, + # probably made by OS X Quartz -- standardize + if s.endswith("Z00'00'"): + s = s.replace("Z00'00'", '+0000') + elif s.endswith('Z'): + s = s.replace('Z', '+0000') + s = s.replace("'", "") # Remove apos from PDF time strings + try: + return datetime.strptime(s, r'%Y%m%d%H%M%S%z') + except ValueError: + return datetime.strptime(s, r'%Y%m%d%H%M%S') + + +class AuthorConverter: + @staticmethod + def xmp_from_docinfo(docinfo_val): + return [docinfo_val] + + @staticmethod + def docinfo_from_xmp(xmp_val): + if isinstance(xmp_val, str): + return xmp_val + else: + return '; '.join(xmp_val) + + +if sys.version_info < (3, 7): + def fromisoformat(datestr): + # strptime %z can't parse a timezone with punctuation + if re.search(r'[+-]\d{2}[-:]\d{2}$', datestr): + datestr = datestr[:-3] + datestr[-2:] + try: + return datetime.strptime(datestr, "%Y-%m-%dT%H:%M:%S%z") + except ValueError: + return datetime.strptime(datestr, "%Y-%m-%dT%H:%M:%S") +else: + fromisoformat = datetime.fromisoformat + +class DateConverter: + @staticmethod + def xmp_from_docinfo(docinfo_val): + if docinfo_val == '': + return '' + return decode_pdf_date(docinfo_val).isoformat() + + @staticmethod + def docinfo_from_xmp(xmp_val): + if xmp_val.endswith('Z'): + xmp_val = xmp_val[:-1] + '+00:00' + dateobj = fromisoformat(xmp_val) + return encode_pdf_date(dateobj) + + +def ensure_loaded(fn): + @wraps(fn) + def wrapper(self, *args, **kwargs): + if not self._xmp: + self._load() + return fn(self, *args, **kwargs) + return wrapper + + +class PdfMetadata(MutableMapping): + """Read and edit the metadata associated with a PDF + + The PDF specification contain two types of metadata, the newer XMP + (Extensible Metadata Platform, XML-based) and older DocumentInformation + dictionary. The PDF 2.0 specification removes the DocumentInformation + dictionary. 
+ + This primarily works with XMP metadata, but includes methods to generate + XMP from DocumentInformation and will also coordinate updates to + DocumentInformation so that the two are kept consistent. + + XMP metadata fields may be accessed using the full XML namespace URI or + the short name. For example ``metadata['dc:description']`` + and ``metadata['{http://purl.org/dc/elements/1.1/}description']`` + both refer to the same field. Several common XML namespaces are registered + automatically. + + See the XMP specification for details of allowable fields. + + To update metadata, use a with block. + + .. code-block:: python + + with pdf.open_metadata() as records: + records['dc:title'] = 'New Title' + + See Also: + :meth:`pikepdf.Pdf.open_metadata` + """ + + DOCINFO_MAPPING = [ + (XMP_NS_DC, 'creator', Name.Author, AuthorConverter), + (XMP_NS_DC, 'description', Name.Subject, None), + (XMP_NS_DC, 'title', Name.Title, None), + (XMP_NS_PDF, 'Keywords', Name.Keywords, None), + (XMP_NS_PDF, 'Producer', Name.Producer, None), + (XMP_NS_XMP, 'CreateDate', Name.CreationDate, DateConverter), + (XMP_NS_XMP, 'CreatorTool', Name.Creator, None), + (XMP_NS_XMP, 'ModifyDate', Name.ModDate, DateConverter), + ] + + NS = {prefix: uri for uri, prefix in DEFAULT_NAMESPACES} + REVERSE_NS = {uri: prefix for uri, prefix in DEFAULT_NAMESPACES} + + def __init__(self, pdf, pikepdf_mark=True, sync_docinfo=True): + self._pdf = pdf + self._xmp = None + self.mark = pikepdf_mark + self.sync_docinfo = sync_docinfo + self._updating = False + + def load_from_docinfo(self, docinfo, delete_missing=False, raise_failure=False): + """Populate the XMP metadata object with DocumentInfo + + Arguments: + docinfo: a DocumentInfo, e.g pdf.docinfo + delete_missing: if the entry is not DocumentInfo, delete the equivalent + from XMP + raise_failure: if True, raise any failure to convert docinfo; + otherwise warn and continue + + A few entries in the deprecated DocumentInfo dictionary are considered + 
approximately equivalent to certain XMP records. This method copies + those entries into the XMP metadata. + """ + for uri, shortkey, docinfo_name, converter in self.DOCINFO_MAPPING: + qname = QName(uri, shortkey) + # docinfo might be a dict or pikepdf.Dictionary, so lookup keys + # by str(Name) + val = docinfo.get(str(docinfo_name)) + if val is None: + if delete_missing and qname in self: + del self[qname] + continue + try: + val = str(val) + if converter: + val = converter.xmp_from_docinfo(val) + if not val: + continue + self[qname] = val + except (ValueError, AttributeError) as e: + msg = "The metadata field {} could not be copied to XMP".format(docinfo_name) + if raise_failure: + raise ValueError(msg) from e + else: + warn(msg) + + def _load(self): + try: + data = self._pdf.Root.Metadata.read_bytes() + except AttributeError: + data = XMP_EMPTY + self._load_from(data) + + def _load_from(self, data): + try: + self._xmp = parse(BytesIO(data)) + except XMLSyntaxError: + data = re_xml_illegal_bytes.sub(b'', data) + try: + self._xmp = parse(BytesIO(data)) + except XMLSyntaxError as e: + raise PdfError() from e + pis = self._xmp.xpath('/processing-instruction()') + for pi in pis: + etree.strip_tags(self._xmp, pi.tag) + + @ensure_loaded + def __enter__(self): + self._updating = True + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + try: + if exc_type is not None: + return + self._apply_changes() + finally: + self._updating = False + + def _update_docinfo(self): + """Update the PDF's DocumentInfo dictionary to match XMP metadata + + The standard mapping is described here: + https://www.pdfa.org/pdfa-metadata-xmp-rdf-dublin-core/ + """ + self._pdf.docinfo # Touch object to ensure it exists + for uri, element, docinfo_name, converter in self.DOCINFO_MAPPING: + qname = QName(uri, element) + try: + value = self[qname] + except KeyError: + if docinfo_name in self._pdf.docinfo: + del self._pdf.docinfo[docinfo_name] + continue + if converter: + try: + value = 
converter.docinfo_from_xmp(value) + except ValueError: + warn("The DocumentInfo field {} could not be updated from XMP".format(docinfo_name)) + value = None + if value is None: + if docinfo_name in self._pdf.docinfo: + del self._pdf.docinfo[docinfo_name] + continue + value = re_xml_illegal_chars.sub('', value) + try: + # Try to save pure ASCII + self._pdf.docinfo[docinfo_name] = value.encode('ascii') + except UnicodeEncodeError: + # qpdf will serialize this as a UTF-16 with BOM string + self._pdf.docinfo[docinfo_name] = value + + def _get_xml_bytes(self, xpacket=True): + data = BytesIO() + if xpacket: + data.write(XPACKET_BEGIN) + self._xmp.write(data, encoding='utf-8', pretty_print=True) + if xpacket: + data.write(XPACKET_END) + data.seek(0) + xml_bytes = data.read() + return xml_bytes + + def _apply_changes(self): + """Serialize our changes back to the PDF in memory + + Depending how we are initialized, leave our metadata mark and producer. + """ + if self.mark: + self[QName(XMP_NS_XMP, 'MetadataDate')] = datetime.now().isoformat() + self[QName(XMP_NS_PDF, 'Producer')] = 'pikepdf ' + pikepdf_version + xml = self._get_xml_bytes() + self._pdf.Root.Metadata = Stream(self._pdf, xml) + self._pdf.Root.Metadata[Name.Type] = Name.Metadata + self._pdf.Root.Metadata[Name.Subtype] = Name.XML + if self.sync_docinfo: + self._update_docinfo() + + def _qname(self, name): + """Convert name to an XML QName + + e.g. pdf:Producer -> {http://ns.adobe.com/pdf/1.3/}Producer + """ + if isinstance(name, QName): + return name + if not isinstance(name, str): + raise TypeError("{} must be str".format(name)) + if name == '': + return name + if name.startswith('{'): + return name + prefix, tag = name.split(':', maxsplit=1) + uri = self.NS[prefix] + return QName(uri, tag) + + def _prefix_from_uri(self, uriname): + """Given a fully qualified XML name, find a prefix + + e.g. 
{http://ns.adobe.com/pdf/1.3/}Producer -> pdf:Producer + """ + uripart, tag = uriname.split('}', maxsplit=1) + uri = uripart.replace('{', '') + return self.REVERSE_NS[uri] + ':' + tag + + def _get_subelements(self, node): + """Gather the sub-elements attached to a node + + Gather rdf:Bag and rdf:Seq into set and list respectively. For + alternate language values, take the first language only for + simplicity. + """ + items = node.find('rdf:Alt', self.NS) + if items is not None: + try: + return items[0].text + except IndexError: + return '' + + for xmlcontainer, container, insertfn in XMP_CONTAINERS: + items = node.find('rdf:{}'.format(xmlcontainer), self.NS) + if items is None: + continue + result = container() + for item in items: + insertfn(result, item.text) + return result + return '' + + def _get_elements(self, name=''): + """Get elements from XMP + + Core routine to find elements matching name within the XMP and yield + them. + + For XMP spec 7.9.2.2, rdf:Description with property attributes, + we yield the node which will have the desired property as one of its attributes. + qname is returned so that the node.attrib can be used to locate the + source. + + For XMP spec 7.5, simple valued XMP properties, we yield the node, + None, and the value. For structure or array valued properties we gather + the elements. We ignore qualifiers.
+ + Args: + name (str): a prefixed name or QName to look for within the + data section of the XMP; looks for all data keys if omitted + + Yields: + tuple: (node, qname_attrib, value, parent_node) + + """ + qname = self._qname(name) + rdf = self._xmp.find('.//rdf:RDF', self.NS) + for rdfdesc in rdf.findall('rdf:Description[@rdf:about=""]', self.NS): + if qname and qname in rdfdesc.keys(): + yield (rdfdesc, qname, rdfdesc.get(qname), rdf) + elif not qname: + for k, v in rdfdesc.items(): + if v: + yield (rdfdesc, k, v, rdf) + xpath = qname if name else '*' + for node in rdfdesc.findall(xpath, self.NS): + if node.text and node.text.strip(): + yield (node, None, node.text, rdfdesc) + continue + values = self._get_subelements(node) + yield (node, None, values, rdfdesc) + + def _get_element_values(self, name=''): + yield from (v[2] for v in self._get_elements(name)) + + @ensure_loaded + def __contains__(self, key): + try: + return any(self._get_element_values(key)) + except KeyError: + return False + + @ensure_loaded + def __getitem__(self, key): + try: + return next(self._get_element_values(key)) + except StopIteration: + raise KeyError(key) + + @ensure_loaded + def __iter__(self): + for node, attrib, _val, _parents in self._get_elements(): + if attrib: + yield attrib + else: + yield node.tag + + @ensure_loaded + def __len__(self): + return len(list(iter(self))) + + @ensure_loaded + def __setitem__(self, key, val): + if not self._updating: + raise RuntimeError("Metadata not opened for editing, use with block") + + def clean(s): + return re_xml_illegal_chars.sub('', s) + + def add_array(node, items): + rdf_type = next( + c.rdf_type for c in XMP_CONTAINERS if isinstance(items, c.py_type) + ) + seq = etree.SubElement(node, QName(XMP_NS_RDF, rdf_type)) + if rdf_type == 'Alt': + attrib = {QName(XMP_NS_XML, 'lang'): 'x-default'} + else: + attrib = None + for item in items: + el = etree.SubElement(seq, QName(XMP_NS_RDF, 'li'), attrib=attrib) + el.text = clean(item) + + try: + # 
Locate existing node to replace + node, attrib, _oldval, _parent = next(self._get_elements(key)) + if attrib: + if not isinstance(val, str): + raise TypeError(val) + node.set(attrib, clean(val)) + elif isinstance(val, (list, set)): + for child in node.findall('*'): + node.remove(child) + add_array(node, val) + elif isinstance(val, str): + for child in node.findall('*'): + node.remove(child) + if str(self._qname(key)) in LANG_ALTS: + add_array(node, AltList([clean(val)])) + else: + node.text = clean(val) + else: + raise TypeError(val) + except StopIteration: + # Insert a new node + rdf = self._xmp.find('.//rdf:RDF', self.NS) + if str(self._qname(key)) in LANG_ALTS: + val = AltList([clean(val)]) + if isinstance(val, (list, set)): + rdfdesc = etree.SubElement( + rdf, QName(XMP_NS_RDF, 'Description'), + attrib={ + QName(XMP_NS_RDF, 'about'): '', + }, + ) + node = etree.SubElement(rdfdesc, self._qname(key)) + add_array(node, val) + elif isinstance(val, str): + rdfdesc = etree.SubElement( + rdf, QName(XMP_NS_RDF, 'Description'), + attrib={ + QName(XMP_NS_RDF, 'about'): '', + self._qname(key): clean(val) + }, + ) + else: + raise TypeError(val) + + @ensure_loaded + def __delitem__(self, key): + if not self._updating: + raise RuntimeError("Metadata not opened for editing, use with block") + try: + node, attrib, _oldval, parent = next(self._get_elements(key)) + if attrib: # Inline + del node.attrib[attrib] + if len(node.attrib) == 1 and len(node) == 0 and QName(XMP_NS_RDF, 'about') in node.attrib: + # The only thing left on this node is rdf:about="", so remove it + parent.remove(node) + else: + parent.remove(node) + except StopIteration: + raise KeyError(key) + + @property + @ensure_loaded + def pdfa_status(self): + """Returns the PDF/A conformance level claimed by this PDF, or False + + A PDF may claim to PDF/A compliant without this being true. Use an + independent verifier such as veraPDF to test if a PDF is truly + conformant. 
+ + Returns: + str: The conformance level of the PDF/A, or an empty string if the + PDF does not claim PDF/A conformance. Possible valid values + are: 1A, 1B, 2A, 2B, 2U, 3A, 3B, 3U. + """ + key_part = QName(XMP_NS_PDFA_ID, 'part') + key_conformance = QName(XMP_NS_PDFA_ID, 'conformance') + try: + return self[key_part] + self[key_conformance] + except KeyError: + return '' + + @property + @ensure_loaded + def pdfx_status(self): + """Returns the PDF/X conformance level claimed by this PDF, or an empty string + + A PDF may claim to be PDF/X compliant without this being true. Use an + independent verifier such as veraPDF to test if a PDF is truly + conformant. + + Returns: + str: The conformance level of the PDF/X, or an empty string if the + PDF does not claim PDF/X conformance. + """ + pdfx_version = QName(XMP_NS_PDFX_ID, 'GTS_PDFXVersion') + try: + return self[pdfx_version] + except KeyError: + return '' + + @ensure_loaded + def __str__(self): + return self._get_xml_bytes(xpacket=False).decode('utf-8') diff --git a/src/pikepdf/objects.py b/src/pikepdf/objects.py new file mode 100644 index 0000000..a888b97 --- /dev/null +++ b/src/pikepdf/objects.py @@ -0,0 +1,172 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. +# +# Copyright (C) 2017, James R. Barlow (https://github.com/jbarlow83/) + +"""Provide classes to stand in for PDF objects + +The purpose of these is to provide nice-looking classes to allow explicit +construction of PDF objects and more pythonic idioms and facilitate discovery +by documentation generators. + +It's also a place to narrow the scope of input types to those more easily +converted to C++. + +In reality all of these return objects of class pikepdf.Object or rather +QPDFObjectHandle which is a generic type. + +""" + +from .
import _qpdf +from ._qpdf import Object, ObjectType + +# pylint: disable=unused-import +from ._qpdf import Operator + + +class _ObjectMeta(type): + """Supports instance checking""" + + def __instancecheck__(cls, instance): + if type(instance) != Object: + return False + return cls.object_type == instance._type_code + + +class _NameObjectMeta(_ObjectMeta): + """Supports usage pikepdf.Name.Whatever -> Name('/Whatever')""" + + def __getattr__(self, attr): + return Name('/' + attr) + + def __setattr__(self, name, value): + raise TypeError("Attributes may not be set on pikepdf.Name") + + def __getitem__(self, item): + if item.startswith('/'): + item = item[1:] + raise TypeError( + ( + "pikepdf.Name is not subscriptable. You probably meant:\n" + " pikepdf.Name.{}\n" + "or\n" + " pikepdf.Name('/{}')\n" + ).format(item, item) + ) + + +class Name(metaclass=_NameObjectMeta): + """Constructs a PDF Name object + + Names can be constructed with two notations: + + 1. ``Name.Resources`` + + 2. ``Name('/Resources')`` + + The two are semantically equivalent. The former is preferred for names + that are normally expected to be in a PDF. The latter is preferred for + dynamic names and attributes. + """ + object_type = ObjectType.name + + def __new__(cls, name): + # QPDF_Name::unparse ensures that names are always saved in a UTF-8 + # compatible way, so we only need to guard the input. + if isinstance(name, bytes): + raise TypeError("Name should be str") + return _qpdf._new_name(name) + + +class String(metaclass=_ObjectMeta): + """Constructs a PDF String object""" + object_type = ObjectType.string + + def __new__(cls, s): + """ + Args: + s (str or bytes): The string to use. String will be encoded for + PDF, bytes will be constructed without encoding. 
+ + Returns: + pikepdf.Object + """ + if isinstance(s, bytes): + return _qpdf._new_string(s) + return _qpdf._new_string_utf8(s) + + +class Array(metaclass=_ObjectMeta): + """Constructs a PDF Array object""" + object_type = ObjectType.array + + def __new__(cls, a=None): + """ + Args: + a (iterable): A list of objects. All objects must be either + `pikepdf.Object` or convertible to `pikepdf.Object`. + + Returns: + pikepdf.Object + """ + + if isinstance(a, (str, bytes)): + raise TypeError('Strings cannot be converted to arrays of chars') + if a is None: + a = [] + return _qpdf._new_array(a) + + +class Dictionary(metaclass=_ObjectMeta): + """Constructs a PDF Dictionary object""" + object_type = ObjectType.dictionary + + def __new__(cls, d=None, **kwargs): + """ + Constructs a PDF Dictionary from either a Python ``dict`` or keyword + arguments. + + These two examples are equivalent: + + .. code-block:: python + + pikepdf.Dictionary({'/NameOne': 1, '/NameTwo': 'Two'}) + + pikepdf.Dictionary(NameOne=1, NameTwo='Two') + + In either case, the keys must be strings, and the strings + correspond to the desired Names in the PDF Dictionary. The values + must all be convertible to `pikepdf.Object`. + + Returns: + pikepdf.Object + """ + if kwargs and d is not None: + raise ValueError('Unsupported parameters') + if kwargs: + # Add leading slash + # Allows Dictionary(MediaBox=(0,0,1,1), Type=Name('/Page')... + return _qpdf._new_dictionary( + {('/' + k) : v for k, v in kwargs.items()}) + if not d: + d = {} + return _qpdf._new_dictionary(d) + + +class Stream(metaclass=_ObjectMeta): + """Constructs a PDF Stream object""" + object_type = ObjectType.stream + + def __new__(cls, owner, obj): + """ + Args: + owner (pikepdf.Pdf): The Pdf to which this stream shall be attached. + obj (bytes or list): If ``bytes``, the data bytes for the stream. + If ``list``, a list of ``(operands, operator)`` tuples such + as returned by :func:`pikepdf.parse_content_stream`. 
+ + Returns: + pikepdf.Object + """ + return _qpdf._new_stream(owner, obj) diff --git a/src/qpdf/object.cpp b/src/qpdf/object.cpp new file mode 100644 index 0000000..392d9ff --- /dev/null +++ b/src/qpdf/object.cpp @@ -0,0 +1,943 @@ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Copyright (C) 2017, James R. Barlow (https://github.com/jbarlow83/) + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "pikepdf.h" + +/* +Type table + +See objects.rst. In short and with technical details: + +These QPDF types are directly mapped to a native Python equivalent. The C++ +object is never returned to Python; a Python object is returned instead. +Adding one of these to a QPDF container type causes the appropriate conversion. + Boolean <-> bool + Integer <-> int + Real <-> Decimal + Real <- float + Null <-> None + +PDF semantics dictate that setting a dictionary key to Null deletes the key. + + d['/Key'] = None # would delete /Key + +For Python users this appears to have an unexpected side effect, so this action is +prohibited. You cannot set keys to None. + +pikepdf.String is a "type" that can be converted with str() or bytes() as +needed.
+ +*/ + +class PyParserCallbacks : public QPDFObjectHandle::ParserCallbacks { +public: + using QPDFObjectHandle::ParserCallbacks::ParserCallbacks; + virtual ~PyParserCallbacks() = default; + + void handleObject(QPDFObjectHandle h) override { + PYBIND11_OVERLOAD_PURE_NAME( + void, + QPDFObjectHandle::ParserCallbacks, + "handle_object", /* Python name */ + handleObject, /* C++ name */ + h + ); + } + + void handleEOF() override { + PYBIND11_OVERLOAD_PURE_NAME( + void, + QPDFObjectHandle::ParserCallbacks, + "handle_eof", /* Python name */ + handleEOF, /* C++ name; trailing comma needed for macro */ + ); + } +}; + + +class OperandGrouper : public QPDFObjectHandle::ParserCallbacks { +public: + OperandGrouper(const std::string& operators) + : parsing_inline_image(false), count(0) + { + std::istringstream f(operators); + std::string s; + while (std::getline(f, s, ' ')) { + this->whitelist.insert(s); + } + } + + void handleObject(QPDFObjectHandle obj) override + { + this->count++; + if (obj.getTypeCode() == QPDFObject::object_type_e::ot_operator) { + std::string op = obj.getOperatorValue(); + + // If we have a whitelist and this operator is not on the whitelist, + // discard it and all the tokens we collected + if (!this->whitelist.empty()) { + if (op[0] == 'q' || op[0] == 'Q') { + // We have token with multiple stack push/pops + if (this->whitelist.count("q") == 0 && this->whitelist.count("Q") == 0) { + this->tokens.clear(); + return; + } + } else if (this->whitelist.count(op) == 0) { + this->tokens.clear(); + return; + } + } + if (op == "BI") { + this->parsing_inline_image = true; + } else if (this->parsing_inline_image) { + if (op == "ID") { + this->inline_metadata = this->tokens; + } else if (op == "EI") { + auto PdfInlineImage = py::module::import("pikepdf").attr("PdfInlineImage"); + auto kwargs = py::dict(); + kwargs["image_data"] = this->tokens.at(0); + kwargs["image_object"] = this->inline_metadata; + auto iimage = PdfInlineImage(**kwargs); + + // Package as list 
with single element for consistency + auto iimage_list = py::list(); + iimage_list.append(iimage); + + auto instruction = py::make_tuple( + iimage_list, + QPDFObjectHandle::newOperator("INLINE IMAGE") + ); + this->instructions.append(instruction); + + this->parsing_inline_image = false; + this->inline_metadata.clear(); + } + } else { + py::list operand_list = py::cast(this->tokens); + auto instruction = py::make_tuple(operand_list, obj); + this->instructions.append(instruction); + } + this->tokens.clear(); + } else { + this->tokens.push_back(obj); + } + } + + void handleEOF() override + { + if (!this->tokens.empty()) + this->warning = "Unexpected end of stream"; + } + + py::list getInstructions() const + { + return this->instructions; + } + + std::string getWarning() const + { + return this->warning; + } + +private: + std::set whitelist; + std::vector tokens; + bool parsing_inline_image; + std::vector inline_metadata; + py::list instructions; + uint count; + std::string warning; +}; + + +size_t list_range_check(QPDFObjectHandle h, int index) +{ + if (!h.isArray()) + throw py::value_error("object is not an array"); + if (index < 0) + index += h.getArrayNItems(); // Support negative indexing + if (!(0 <= index && index < h.getArrayNItems())) + throw py::index_error("index out of range"); + return (size_t)index; +} + + +bool objecthandle_equal(QPDFObjectHandle self, QPDFObjectHandle other) +{ + StackGuard sg(" objecthandle_equal"); + + // Uninitialized objects are never equal + if (!self.isInitialized() || !other.isInitialized()) + return false; + + // Indirect objects (objid != 0) with the same obj-gen are equal and same owner + // are equal (in fact, they are identical; they reference the same underlying + // QPDFObject, even if the handles are different). + // This lets us compare deeply nested and cyclic structures without recursing + // into them. 
+ if (self.getObjectID() != 0 + && other.getObjectID() != 0 + && self.getOwningQPDF() == other.getOwningQPDF()) { + return self.getObjGen() == other.getObjGen(); + } + + // If 'self' is a numeric type, convert both to Decimal objects + // and compare them as such. + if (self.getTypeCode() == QPDFObject::object_type_e::ot_integer || + self.getTypeCode() == QPDFObject::object_type_e::ot_real || + self.getTypeCode() == QPDFObject::object_type_e::ot_boolean) { + try { + auto a = decimal_from_pdfobject(self); + auto b = decimal_from_pdfobject(other); + py::object pyresult = a.attr("__eq__")(b); + bool result = pyresult.cast(); + return result; + } catch (const py::type_error&) { + return false; + } + } + + // Apart from numeric types, dissimilar types are never equal + if (self.getTypeCode() != other.getTypeCode()) + return false; + + switch (self.getTypeCode()) { + case QPDFObject::object_type_e::ot_null: + return true; // Both must be null + case QPDFObject::object_type_e::ot_boolean: + return self.getBoolValue() == other.getBoolValue(); + case QPDFObject::object_type_e::ot_name: + return self.getName() == other.getName(); + case QPDFObject::object_type_e::ot_operator: + return self.getOperatorValue() == other.getOperatorValue(); + case QPDFObject::object_type_e::ot_string: + { + // We don't know what encoding the string is in + // This ensures UTF-16 coded ASCII strings will compare equal to + // UTF-8/ASCII coded.
+ return self.getStringValue() == other.getStringValue() || + self.getUTF8Value() == other.getUTF8Value(); + } + case QPDFObject::object_type_e::ot_array: + { + // Call operator==() on each element of the arrays, meaning this + // recurses into this function + return (self.getArrayAsVector() == other.getArrayAsVector()); + } + case QPDFObject::object_type_e::ot_dictionary: + { + // Call operator==() on each element of the arrays, meaning this + // recurses into this function + return (self.getDictAsMap() == other.getDictAsMap()); + } + default: + break; + } + return false; +} + + +bool operator==(QPDFObjectHandle self, QPDFObjectHandle other) +{ + // A lot of functions in QPDFObjectHandle are not tagged const where they + // should be, but are const-safe + return objecthandle_equal(self, other); +} + + +bool object_has_key(QPDFObjectHandle h, std::string const& key) +{ + if (!h.isDictionary() && !h.isStream()) + throw py::value_error("object is not a dictionary or a stream"); + QPDFObjectHandle dict = h.isStream() ? h.getDict() : h; + return dict.hasKey(key); +} + + +QPDFObjectHandle object_get_key(QPDFObjectHandle h, std::string const& key) +{ + if (!h.isDictionary() && !h.isStream()) + throw py::value_error("object is not a dictionary or a stream"); + QPDFObjectHandle dict = h.isStream() ? h.getDict() : h; + if (!dict.hasKey(key)) + throw py::key_error(key); + return dict.getKey(key); +} + +void object_set_key(QPDFObjectHandle h, std::string const& key, QPDFObjectHandle& value) +{ + if (!h.isDictionary() && !h.isStream()) + throw py::value_error("object is not a dictionary or a stream"); + if (value.isNull()) + throw py::value_error("PDF Dictionary keys may not be set to None - use 'del' to remove"); + + // For streams, the actual dictionary is attached to stream object + QPDFObjectHandle dict = h.isStream() ? 
h.getDict() : h; + + // A stream dictionary has no owner, so use the stream object in this comparison + dict.replaceKey(key, value); +} + +void object_del_key(QPDFObjectHandle h, std::string const& key) +{ + if (!h.isDictionary() && !h.isStream()) + throw py::value_error("object is not a dictionary or a stream"); + // For streams, the actual dictionary is attached to stream object + QPDFObjectHandle dict = h.isStream() ? h.getDict() : h; + + if (!dict.hasKey(key)) + throw py::key_error(key); + + dict.removeKey(key); +} + +std::pair object_get_objgen(QPDFObjectHandle h) +{ + auto objgen = h.getObjGen(); + return std::pair(objgen.getObj(), objgen.getGen()); +} + + +void init_object(py::module& m) +{ + py::enum_(m, "ObjectType") + .value("uninitialized", QPDFObject::object_type_e::ot_uninitialized) + .value("reserved", QPDFObject::object_type_e::ot_reserved) + .value("null", QPDFObject::object_type_e::ot_null) + .value("boolean", QPDFObject::object_type_e::ot_boolean) + .value("integer", QPDFObject::object_type_e::ot_integer) + .value("real", QPDFObject::object_type_e::ot_real) + .value("string", QPDFObject::object_type_e::ot_string) + .value("name", QPDFObject::object_type_e::ot_name) + .value("array", QPDFObject::object_type_e::ot_array) + .value("dictionary", QPDFObject::object_type_e::ot_dictionary) + .value("stream", QPDFObject::object_type_e::ot_stream) + .value("operator", QPDFObject::object_type_e::ot_operator) + .value("inlineimage", QPDFObject::object_type_e::ot_inlineimage); + + + py::class_>(m, "Buffer", py::buffer_protocol()) + .def_buffer([](Buffer &b) -> py::buffer_info { + return py::buffer_info( + b.getBuffer(), + sizeof(unsigned char), + py::format_descriptor::format(), + 1, + { b.getSize() }, + { sizeof(unsigned char) } + ); + }); + + py::bind_vector>(m, "_ObjectList"); + py::bind_map>(m, "_ObjectMapping"); + + py::class_(m, "Object") + .def_property_readonly("_type_code", &QPDFObjectHandle::getTypeCode) + .def_property_readonly("_type_name", 
&QPDFObjectHandle::getTypeName) + .def("is_owned_by", + [](QPDFObjectHandle &h, std::shared_ptr possible_owner) { + return (h.getOwningQPDF() == possible_owner.get()); + }, + "Test if this object is owned by the indicated *possible_owner*." + ) + .def_property_readonly("is_indirect", &QPDFObjectHandle::isIndirect) + .def("__repr__", &objecthandle_repr) + .def("__hash__", + [](QPDFObjectHandle &self) -> py::int_ { + py::object hash = py::module::import("builtins").attr("hash"); + + //Objects which compare equal must have the same hash value + switch (self.getTypeCode()) { + case QPDFObject::object_type_e::ot_null: + return py::int_(0); + case QPDFObject::object_type_e::ot_string: + { + return hash(py::bytes(self.getUTF8Value())); + } + case QPDFObject::object_type_e::ot_name: + return hash(py::bytes(self.getName())); + case QPDFObject::object_type_e::ot_operator: + return hash(py::bytes(self.getOperatorValue())); + case QPDFObject::object_type_e::ot_array: + case QPDFObject::object_type_e::ot_dictionary: + case QPDFObject::object_type_e::ot_stream: + case QPDFObject::object_type_e::ot_inlineimage: + throw py::value_error("Can't hash mutable object"); + default: + break; + } + throw std::logic_error("don't know how to hash this"); + } + ) + .def("__eq__", + [](QPDFObjectHandle &self, QPDFObjectHandle &other) { + return (self == other); // overloaded + } + ) + .def("__eq__", + [](QPDFObjectHandle &self, py::str other) { + std::string utf8_other = other.cast(); + switch (self.getTypeCode()) { + case QPDFObject::object_type_e::ot_string: + return self.getUTF8Value() == utf8_other; + case QPDFObject::object_type_e::ot_name: + return self.getName() == utf8_other; + default: + return false; + } + } + ) + .def("__eq__", + [](QPDFObjectHandle &self, py::bytes other) { + std::string bytes_other = other.cast(); + switch (self.getTypeCode()) { + case QPDFObject::object_type_e::ot_string: + return self.getStringValue() == bytes_other; + case QPDFObject::object_type_e::ot_name: + 
return self.getName() == bytes_other; + default: + return false; + } + } + ) + .def("__eq__", + [](QPDFObjectHandle &self, py::object other) -> py::object { + QPDFObjectHandle q_other; + try { + q_other = objecthandle_encode(other); + } catch (const py::cast_error&) { + return py::globals()["__builtins__"].attr("NotImplemented"); + } + bool result = (self == objecthandle_encode(other)); + return py::bool_(result); + } + ) + .def("__len__", + [](QPDFObjectHandle &h) { + if (h.isDictionary()) + return (Py_ssize_t)h.getDictAsMap().size(); // getKeys constructs a new object, so this is better + else if (h.isArray()) + return (Py_ssize_t)h.getArrayNItems(); + throw py::value_error("length not defined for object"); + } + ) + .def("__getitem__", + [](QPDFObjectHandle &h, std::string const& key) { + return object_get_key(h, key); + } + ) + .def("__getitem__", + [](QPDFObjectHandle &h, QPDFObjectHandle &name) { + return object_get_key(h, name.getName()); + } + ) + .def("__setitem__", + [](QPDFObjectHandle &h, std::string const& key, QPDFObjectHandle &value) { + object_set_key(h, key, value); + }, + "assign dictionary key to new object", + py::keep_alive<1, 3>() + ) + .def("__setitem__", + [](QPDFObjectHandle &h, QPDFObjectHandle &name, QPDFObjectHandle &value) { + object_set_key(h, name.getName(), value); + }, + "assign dictionary key to new object", + py::keep_alive<1, 3>() + ) + .def("__setitem__", + [](QPDFObjectHandle &h, std::string const& key, py::object pyvalue) { + auto value = objecthandle_encode(pyvalue); + object_set_key(h, key, value); + } + ) + .def("__setitem__", + [](QPDFObjectHandle &h, QPDFObjectHandle &name, py::object pyvalue) { + auto value = objecthandle_encode(pyvalue); + object_set_key(h, name.getName(), value); + } + ) + .def("__delitem__", + [](QPDFObjectHandle &h, std::string const& key) { + object_del_key(h, key); + }, + "delete a dictionary key" + ) + .def("__delitem__", + [](QPDFObjectHandle &h, QPDFObjectHandle &name) { + object_del_key(h, 
name.getName()); + }, + "delete a dictionary key" + ) + .def("__getattr__", + [](QPDFObjectHandle &h, std::string const& name) { + QPDFObjectHandle value; + std::string key = "/" + name; + try { + value = object_get_key(h, key); + } catch (const py::key_error &e) { + if (std::isupper(name[0])) + throw py::attr_error(e.what()); + else + throw py::attr_error(name); + } + return value; + }, + "attribute lookup name" + ) + .def_property("stream_dict", + &QPDFObjectHandle::getDict, &QPDFObjectHandle::replaceDict, + py::return_value_policy::reference_internal + ) + .def("__setattr__", + [](QPDFObjectHandle &h, std::string const& name, py::object pyvalue) { + std::string key = "/" + name; + auto value = objecthandle_encode(pyvalue); + object_set_key(h, key, value); + }, + "attribute access" + ) + .def("__delattr__", + [](QPDFObjectHandle &h, std::string const& name) { + std::string key = "/" + name; + object_del_key(h, key); + } + ) + .def("__dir__", + [](QPDFObjectHandle &h) { + py::list result; + py::object obj = py::cast(h); + py::object class_keys = obj.attr("__class__").attr("__dict__").attr("keys")(); + for (auto attr: class_keys) { + result.append(attr); + } + if (h.isDictionary() || h.isStream()) { + for (auto key_attr: h.getKeys()) { + std::string s = key_attr.substr(1); + result.append(py::str(s)); + } + } + return result; + } + ) + .def("get", + [](QPDFObjectHandle &h, std::string const& key, py::object default_) { + QPDFObjectHandle value; + try { + value = object_get_key(h, key); + } catch (const py::key_error &e) { + return default_; + } + return py::cast(value); + }, + "for dictionary objects, behave as dict.get(key, default=None)", + py::arg("key"), + py::arg("default_") = py::none(), + py::return_value_policy::reference_internal + ) + .def("get", + [](QPDFObjectHandle &h, QPDFObjectHandle &name, py::object default_) { + QPDFObjectHandle value; + try { + value = object_get_key(h, name.getName()); + } catch (const py::key_error &e) { + return default_; + } 
+ return py::cast(value); + }, + "for dictionary objects, behave as dict.get(key, default=None)", + py::arg("key"), + py::arg("default_") = py::none(), + py::return_value_policy::reference_internal + ) + .def("keys", &QPDFObjectHandle::getKeys) + .def("__contains__", + [](QPDFObjectHandle &h, QPDFObjectHandle &key) { + if (!key.isName()) + throw py::type_error("Dictionaries can only contain Names"); + return object_has_key(h, key.getName()); + } + ) + .def("__contains__", + [](QPDFObjectHandle &h, std::string const& key) { + return object_has_key(h, key); + } + ) + .def("as_list", &QPDFObjectHandle::getArrayAsVector) + .def("as_dict", &QPDFObjectHandle::getDictAsMap) + .def("__iter__", + [](QPDFObjectHandle &h) -> py::iterable { + if (h.isArray()) { + auto vec = h.getArrayAsVector(); + auto pyvec = py::cast(vec); + return pyvec.attr("__iter__")(); + } else if (h.isDictionary()) { + auto vec = h.getKeys(); + auto pyvec = py::cast(vec); + return pyvec.attr("__iter__")(); + } else { + throw py::type_error("__iter__ not available on this type"); + } + }, + py::return_value_policy::reference_internal + ) + .def("items", + [](QPDFObjectHandle &h) -> py::iterable { + if (!h.isDictionary()) + throw py::type_error("items() not available on this type"); + auto dict = h.getDictAsMap(); + auto pydict = py::cast(dict); + return pydict.attr("items")(); + }, + py::return_value_policy::reference_internal + ) + .def("__str__", + [](QPDFObjectHandle &h) -> py::str { + if (h.isName()) + return h.getName(); + else if (h.isOperator()) + return h.getOperatorValue(); + else if (h.isString()) + return h.getUTF8Value(); + throw py::notimpl_error("don't know how to __str__ this object"); + } + ) + .def("__bytes__", + [](QPDFObjectHandle &h) { + if (h.isName()) + return py::bytes(h.getName()); + if (h.isStream()) { + PointerHolder buf = h.getStreamData(); + // py::bytes will make a copy of the buffer, so releasing is fine + return py::bytes((const char*)buf->getBuffer(), buf->getSize()); + } 
+ return py::bytes(h.getStringValue()); + } + ) + .def("__getitem__", + [](QPDFObjectHandle &h, int index) { + size_t u_index = list_range_check(h, index); + return h.getArrayItem(u_index); + } + ) + .def("__setitem__", + [](QPDFObjectHandle &h, int index, QPDFObjectHandle &value) { + size_t u_index = list_range_check(h, index); + h.setArrayItem(u_index, value); + } + ) + .def("__setitem__", + [](QPDFObjectHandle &h, int index, py::object pyvalue) { + size_t u_index = list_range_check(h, index); + auto value = objecthandle_encode(pyvalue); + h.setArrayItem(u_index, value); + } + ) + .def("__delitem__", + [](QPDFObjectHandle &h, int index) { + size_t u_index = list_range_check(h, index); + h.eraseItem(u_index); + } + ) + .def("get_stream_buffer", + [](QPDFObjectHandle &h) { + PointerHolder phbuf = h.getStreamData(); + return phbuf; + }, + "Return a buffer protocol buffer describing the decoded stream" + ) + .def("get_raw_stream_buffer", + [](QPDFObjectHandle &h) { + PointerHolder phbuf = h.getRawStreamData(); + return phbuf; + }, + "Return a buffer protocol buffer describing the raw, encoded stream" + ) + .def("read_bytes", + [](QPDFObjectHandle &h) { + PointerHolder buf = h.getStreamData(); + // py::bytes will make a copy of the buffer, so releasing is fine + return py::bytes((const char*)buf->getBuffer(), buf->getSize()); + }, + "Decode and read the content stream associated with this object" + ) + .def("read_raw_bytes", + [](QPDFObjectHandle &h) { + PointerHolder buf = h.getRawStreamData(); + // py::bytes will make a copy of the buffer, so releasing is fine + return py::bytes((const char*)buf->getBuffer(), buf->getSize()); + }, + "Read the content stream associated with this object without decoding" + ) + .def("write", + [](QPDFObjectHandle &h, py::bytes data, py::args args, py::kwargs kwargs) { + std::string sdata = data; + QPDFObjectHandle filter = QPDFObjectHandle::newNull(); + QPDFObjectHandle decode_parms = QPDFObjectHandle::newNull(); + if (args.size() != 
0) + throw py::value_error("Too many positional arguments"); + if (kwargs.contains("filter")) + filter = objecthandle_encode(kwargs["filter"]); + if (kwargs.contains("decode_parms")) + decode_parms = objecthandle_encode(kwargs["decode_parms"]); + h.replaceStreamData(sdata, filter, decode_parms); + }, + R"~~~( + Replace the content stream with `data`, compressed according to `filter` and `decode_parms` + + :param data: the new data to use for replacement + :type data: bytes + :param filter: The filter(s) with which the data is (already) encoded + :param decode_parms: Parameters for the filters with which the object is encode + + If only one `filter` is specified, it may be a name such as + `Name('/FlateDecode')`. If there are multiple filters, then array + of names should be given. + + If there is only one filter, `decode_parms` is a Dictionary of + parameters for that filter. If there are multiple filters, then + `decode_parms` is an Array of Dictionary, where each array index + is corresponds to the filter. + + )~~~" + ) + .def_property_readonly("images", + [](QPDFObjectHandle &h) { + if (!h.isPageObject()) + throw py::type_error("Not a Page"); + return h.getPageImages(); + } + ) + .def("_inline_image_raw_bytes", + [](QPDFObjectHandle &h) { + return py::bytes(h.getInlineImageValue()); + } + ) + .def("page_contents_add", + [](QPDFObjectHandle &h, QPDFObjectHandle &contents, bool prepend) { + if (!h.isPageObject()) + throw py::type_error("Not a Page"); + h.addPageContents(contents, prepend); + }, + "Append or prepend to an existing page's content stream.", + py::arg("contents"), + py::arg("prepend") = false, + py::keep_alive<1, 2>() + ) + .def("page_contents_coalesce", &QPDFObjectHandle::coalesceContentStreams) + .def_property_readonly("_objgen", + &object_get_objgen + ) + .def_property_readonly("objgen", + &object_get_objgen, + R"~~~( + Return the object-generation number pair for this object + + If this is a direct object, then the returned value is ``(0, 0)``. 
+ By definition, if this is an indirect object, it has a "objgen", + and can be looked up using this in the cross-reference (xref) table. + Direct objects cannot necessarily be looked up. + + The generation number is usually 0, except for PDFs that have been + incrementally updated. + + )~~~" + ) + .def_static("parse", + [](std::string const& stream, std::string const& description) { + return QPDFObjectHandle::parse(stream, description); + }, + "Parse PDF binary representation into PDF objects.", + py::arg("stream"), + py::arg("description") = "" + ) + .def("_parse_page_contents", + &QPDFObjectHandle::parsePageContents, + "Helper for parsing page contents; use ``pikepdf.parse_content_stream``." + ) + .def("_parse_page_contents_grouped", + [](QPDFObjectHandle &h, std::string const& whitelist) { + OperandGrouper og(whitelist); + h.parsePageContents(&og); + return og.getInstructions(); + } + ) + .def_static("_parse_stream", + &QPDFObjectHandle::parseContentStream, + "Helper for parsing PDF content stream; use ``pikepdf.parse_content_stream``." + ) + .def_static("_parse_stream_grouped", + [](QPDFObjectHandle &h, std::string const& whitelist) { + OperandGrouper og(whitelist); + QPDFObjectHandle::parseContentStream(h, &og); + if (!og.getWarning().empty()) { + auto warn = py::module::import("warnings").attr("warn"); + warn(og.getWarning()); + } + return og.getInstructions(); + } + ) + .def("unparse", + [](QPDFObjectHandle &h, bool resolved) -> py::bytes { + if (resolved) + return h.unparseResolved(); + return h.unparse(); + }, + py::arg("resolved") = false, + "Convert PDF objects into their binary representation, optionally resolving indirect objects." 
+ ) + ; // end of QPDFObjectHandle bindings + + m.def("_new_boolean", &QPDFObjectHandle::newBool, "Construct a PDF Boolean object"); + m.def("_new_integer", &QPDFObjectHandle::newInteger, "Construct a PDF Integer object"); + m.def("_new_real", + [](const std::string& value) { + return QPDFObjectHandle::newReal(value); + }, + "Construct a PDF Real value, that is, a decimal number" + ); + m.def("_new_real", + [](double value, uint places) { + return QPDFObjectHandle::newReal(value, places); + }, + "Construct PDF real", + py::arg("value"), + py::arg("places") = 0 + ); + m.def("_new_name", + [](const std::string& s) { + if (s.at(0) != '/') + throw py::value_error("Name objects must begin with '/'"); + if (s.length() < 2) + throw py::value_error("Name must be at least one character long"); + return QPDFObjectHandle::newName(s); + }, + "Create a Name from a string. Must begin with '/'. All other characters except null are valid." + ); + m.def("_new_string", + [](const std::string& s) { + return QPDFObjectHandle::newString(s); + }, + "Construct a PDF String object." + ); + m.def("_new_string_utf8", + [](const std::string& utf8) { + return QPDFObjectHandle::newUnicodeString(utf8); + }, + "Construct a PDF String object from UTF-8 bytes." + ); + m.def("_new_array", + [](py::iterable iterable) { + return QPDFObjectHandle::newArray(array_builder(iterable)); + }, + "Construct a PDF Array object from an iterable of PDF objects or types that can be coerced to PDF objects." + ); + m.def("_new_dictionary", + [](py::dict dict) { + return QPDFObjectHandle::newDictionary(dict_builder(dict)); + }, + "Construct a PDF Dictionary from a mapping of PDF objects or Python types that can be coerced to PDF objects." 
+ ); + m.def("_new_stream", + [](std::shared_ptr owner, py::bytes data) { + std::string s = data; + return QPDFObjectHandle::newStream(owner.get(), data); // This makes a copy of the data + }, + "Construct a PDF Stream object from binary data", + py::keep_alive<0, 1>() // returned object references the owner + ); + m.def("_new_stream", + [](std::shared_ptr owner, py::iterable content_stream) { + std::stringstream data; + + for (auto handle_command : content_stream) { + py::tuple command = py::reinterpret_borrow(handle_command); + + if (command.size() != 2) + throw py::value_error("Each item in stream data must be a tuple(operands, operator)"); + + py::object operands = command[0]; + py::object operator_ = command[1]; + for (auto operand : operands) { + QPDFObjectHandle h = objecthandle_encode(operand); + data << h.unparse(); + data << " "; + } + data << objecthandle_encode(operator_).unparse(); + data << "\n"; + } + return QPDFObjectHandle::newStream(owner.get(), data.str()); + }, + "Construct a PDF Stream object from a list of operand-operator tuples [((operands,), operator)]", + py::keep_alive<0, 1>() // returned object references the owner + ); + m.def("Operator", + [](const std::string& op) { + return QPDFObjectHandle::newOperator(op); + }, + "Construct a PDF Operator object for use in content streams" + ); + m.def("_Null", &QPDFObjectHandle::newNull, + "Construct a PDF Null object" + ); + + py::class_ parsercallbacks(m, "StreamParser"); + parsercallbacks + .def(py::init<>()) + .def("handle_object", &QPDFObjectHandle::ParserCallbacks::handleObject) + .def("handle_eof", &QPDFObjectHandle::ParserCallbacks::handleEOF); + + m.def("_encode", + [](py::handle handle) { + return objecthandle_encode(handle); + } + ); + m.def("_roundtrip", + [](py::object obj) { + return obj; + } + ); + m.def("_roundtrip", + [](QPDFObjectHandle &h) { + return h; + } + ); + m.def("unparse", + [](py::object obj) -> py::bytes { + return objecthandle_encode(obj).unparseBinary(); + } + ); + 
m.def("unparse", + [](QPDFObjectHandle &h) -> py::bytes { + return h.unparseBinary(); + } + ); + + + +} // init_object diff --git a/src/qpdf/object_convert.cpp b/src/qpdf/object_convert.cpp new file mode 100644 index 0000000..3a3e1ce --- /dev/null +++ b/src/qpdf/object_convert.cpp @@ -0,0 +1,138 @@ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Copyright (C) 2017, James R. Barlow (https://github.com/jbarlow83/) + */ + +/* + * Convert Python types <-> QPDFObjectHandle types + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "pikepdf.h" + + +std::map +dict_builder(const py::dict dict) +{ + StackGuard sg(" dict_builder"); + std::map result; + + for (const auto& item: dict) { + std::string key = item.first.cast(); + + auto value = objecthandle_encode(item.second); + result[key] = value; + } + return result; +} + +std::vector +array_builder(const py::iterable iter) +{ + StackGuard sg(" array_builder"); + std::vector result; + int narg = 0; + + for (const auto& item: iter) { + narg++; + + auto value = objecthandle_encode(item); + result.push_back(value); + } + return result; +} + + +QPDFObjectHandle objecthandle_encode(const py::handle handle) +{ + if (handle.is_none()) + return QPDFObjectHandle::newNull(); + + // Ensure that when we return QPDFObjectHandle/pikepdf.Object to the Py + // environment, that we can recover it + try { + auto as_qobj = handle.cast(); + return as_qobj; + } catch (const py::cast_error&) {} + + // Special-case booleans since pybind11 coerces nonzero integers to boolean + if (py::isinstance(handle)) { + bool as_bool = handle.cast(); + return QPDFObjectHandle::newBool(as_bool); + } + + auto Decimal = py::module::import("decimal").attr("Decimal"); + + if 
(py::isinstance(handle, Decimal)) { + return QPDFObjectHandle::newReal(py::str(handle)); + } else if (py::isinstance(handle)) { + auto as_int = handle.cast(); + return QPDFObjectHandle::newInteger(as_int); + } else if (py::isinstance(handle)) { + auto as_double = handle.cast(); + return QPDFObjectHandle::newReal(as_double); + } + + py::object obj = py::reinterpret_borrow(handle); + + if (py::isinstance(obj)) { + py::bytes py_bytes = obj; + return QPDFObjectHandle::newString(static_cast(py_bytes)); + } else if (py::isinstance(obj)) { + py::str py_str = obj; + return QPDFObjectHandle::newUnicodeString(static_cast(py_str)); + } + + if (py::hasattr(obj, "__iter__")) { + //py::print(py::repr(obj)); + bool is_mapping = false; // PyMapping_Check is unreliable in Py3 + if (py::hasattr(obj, "keys")) + is_mapping = true; + + bool is_sequence = PySequence_Check(obj.ptr()); + if (is_mapping) { + return QPDFObjectHandle::newDictionary(dict_builder(obj)); + } else if (is_sequence) { + return QPDFObjectHandle::newArray(array_builder(obj)); + } + } + + throw py::cast_error(std::string("don't know how to encode value ") + std::string(py::repr(obj))); +} + + +py::object decimal_from_pdfobject(QPDFObjectHandle h) +{ + auto decimal_constructor = py::module::import("decimal").attr("Decimal"); + + if (h.getTypeCode() == QPDFObject::object_type_e::ot_integer) { + auto value = h.getIntValue(); + return decimal_constructor(py::cast(value)); + } else if (h.getTypeCode() == QPDFObject::object_type_e::ot_real) { + auto value = h.getRealValue(); + return decimal_constructor(py::cast(value)); + } else if (h.getTypeCode() == QPDFObject::object_type_e::ot_boolean) { + auto value = h.getBoolValue(); + return decimal_constructor(py::cast(value)); + } + throw py::type_error("object has no Decimal() representation"); +} diff --git a/src/qpdf/object_repr.cpp b/src/qpdf/object_repr.cpp new file mode 100644 index 0000000..a915c53 --- /dev/null +++ b/src/qpdf/object_repr.cpp @@ -0,0 +1,244 @@ +/* + * 
This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Copyright (C) 2017, James R. Barlow (https://github.com/jbarlow83/) + */ + +/* + * Implement repr() for QPDFObjectHandle + * + * Since qpdf largely ignores const, it is not possible to use const here, + * even though repr() is const throughout. + * + * References are used for functions that are just passing handles around. + * objecthandle_repr_inner cannot cannot use references because it calls itself. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "pikepdf.h" + + +std::string objecthandle_scalar_value(QPDFObjectHandle h, bool escaped) +{ + std::stringstream ss; + switch (h.getTypeCode()) { + case QPDFObject::object_type_e::ot_null: + ss << "None"; + break; + case QPDFObject::object_type_e::ot_boolean: + ss << (h.getBoolValue() ? 
"True" : "False"); + break; + case QPDFObject::object_type_e::ot_integer: + ss << std::to_string(h.getIntValue()); + break; + case QPDFObject::object_type_e::ot_real: + ss << "Decimal('" + h.getRealValue() + "')"; + break; + case QPDFObject::object_type_e::ot_name: + ss << std::quoted(h.getName()); + break; + case QPDFObject::object_type_e::ot_string: + ss << std::quoted(h.getUTF8Value()); + break; + case QPDFObject::object_type_e::ot_operator: + ss << std::quoted(h.getOperatorValue()); + break; + default: + return ""; + } + return ss.str(); +} + +std::string objecthandle_pythonic_typename(QPDFObjectHandle h, std::string prefix) +{ + std::string s; + + s += prefix; + switch (h.getTypeCode()) { + case QPDFObject::object_type_e::ot_null: + s += "NoneType"; + break; + case QPDFObject::object_type_e::ot_boolean: + s += "Boolean"; + break; + case QPDFObject::object_type_e::ot_integer: + s += "Integer"; + break; + case QPDFObject::object_type_e::ot_real: + s += "Real"; + break; + case QPDFObject::object_type_e::ot_name: + s += "Name"; + break; + case QPDFObject::object_type_e::ot_string: + s += "String"; + break; + case QPDFObject::object_type_e::ot_operator: + s += "Operator"; + break; + case QPDFObject::object_type_e::ot_inlineimage: + s += "InlineImage"; + break; + case QPDFObject::object_type_e::ot_array: + s += "Array"; + break; + case QPDFObject::object_type_e::ot_dictionary: + if (h.hasKey("/Type")) { + s += std::string("Dictionary(type_=\"") + h.getKey("/Type").getName() + "\")"; + } else { + s += "Dictionary"; + } + break; + case QPDFObject::object_type_e::ot_stream: + s += "Stream"; + break; + default: + s += ""; + break; + } + + return s; +} + + +std::string objecthandle_repr_typename_and_value(QPDFObjectHandle h) +{ + if (h.isNull()) + return "None"; + return objecthandle_pythonic_typename(h) + \ + "(" + objecthandle_scalar_value(h) + ")"; +} + + +static +std::string objecthandle_repr_inner(QPDFObjectHandle h, uint depth, std::set* visited, bool* pure_expr) 
+{ + StackGuard sg(" objecthandle_repr_inner"); + std::ostringstream oss; + + if (!h.isScalar()) { + if (visited->count(h.getObjGen()) > 0) { + *pure_expr = false; + oss << "<.get_object(" << h.getObjGen().getObj() << ", " << h.getObjGen().getGen() << ")>"; + return oss.str(); + } + + if (!(h.getObjGen() == QPDFObjGen(0, 0))) + visited->insert(h.getObjGen()); + } + + switch (h.getTypeCode()) { + case QPDFObject::object_type_e::ot_null: + case QPDFObject::object_type_e::ot_boolean: + case QPDFObject::object_type_e::ot_integer: + case QPDFObject::object_type_e::ot_real: + case QPDFObject::object_type_e::ot_name: + case QPDFObject::object_type_e::ot_string: + oss << objecthandle_scalar_value(h); + break; + case QPDFObject::object_type_e::ot_operator: + oss << objecthandle_repr_typename_and_value(h); + break; + case QPDFObject::object_type_e::ot_inlineimage: + oss << objecthandle_pythonic_typename(h); + oss << "("; + oss << "data=<...>"; + oss << ")"; + break; + case QPDFObject::object_type_e::ot_array: + oss << "["; + { + bool first = true; + oss << " "; + for (auto item: h.getArrayAsVector()) { + if (!first) oss << ", "; + first = false; + oss << objecthandle_repr_inner(item, depth, visited, pure_expr); + } + oss << " "; + } + oss << "]"; + break; + case QPDFObject::object_type_e::ot_dictionary: + oss << "{"; // This will end the line + { + bool first = true; + oss << "\n"; + for (auto item: h.getDictAsMap()) { + if (!first) oss << ",\n"; + first = false; + oss << std::string((depth + 1) * 2, ' '); // Indent each line + if (item.first == "/Parent" && item.second.isPagesObject()) { + // Don't visit /Parent keys since that just puts every page on the repr() of a single page + oss << std::quoted(item.first) << ": "; + } else { + oss << std::quoted(item.first) << ": " << objecthandle_repr_inner(item.second, depth + 1, visited, pure_expr); + } + } + oss << "\n"; + } + oss << std::string(depth * 2, ' '); // Restore previous indent level + oss << "}"; + break; + case 
QPDFObject::object_type_e::ot_stream: + *pure_expr = false; + oss << objecthandle_pythonic_typename(h); + oss << "("; + oss << "stream_dict="; + oss << objecthandle_repr_inner(h.getDict(), depth + 1, visited, pure_expr); + oss << ", "; + oss << "data=<...>"; + oss << ")"; + break; + default: + oss << "???"; + break; + } + + return oss.str(); +} + +std::string objecthandle_repr(QPDFObjectHandle h) +{ + if (h.isScalar() || h.isOperator()) { + // qpdf does not consider Operator a scalar but it is as far we + // are concerned here + return objecthandle_repr_typename_and_value(h); + } + + std::set visited; + bool pure_expr = true; + std::string inner = objecthandle_repr_inner(h, 0, &visited, &pure_expr); + std::string output; + + if (h.isScalar() || h.isDictionary() || h.isArray()) { + output = objecthandle_pythonic_typename(h) + "(" + inner + ")"; + } else { + output = inner; + pure_expr = false; + } + + if (pure_expr) { + // The output contains no external or parent objects so this object + // can be output as a Python expression and rebuild with repr(output) + return output; + } + // Output cannot be fully described in a Python expression + return std::string("<") + output + ">"; +} diff --git a/src/qpdf/pikepdf.h b/src/qpdf/pikepdf.h new file mode 100644 index 0000000..7fbd6e8 --- /dev/null +++ b/src/qpdf/pikepdf.h @@ -0,0 +1,200 @@ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Copyright (C) 2017, James R. 
Barlow (https://github.com/jbarlow83/) + */ + +#pragma once + +#include +#include +#include + +#include "shims.h" + +#include +#include +#include + +#include +#include +#include + +using uint = unsigned int; + +namespace pybind11 { + PYBIND11_RUNTIME_EXCEPTION(attr_error, PyExc_AttributeError); + PYBIND11_RUNTIME_EXCEPTION(notimpl_error, PyExc_NotImplementedError); + PYBIND11_RUNTIME_EXCEPTION(filenotfound_error, PyExc_FileNotFoundError); +}; + +// Declare PointerHolder as a smart pointer +// https://pybind11.readthedocs.io/en/stable/advanced/smart_ptrs.html#custom-smart-pointers +PYBIND11_DECLARE_HOLDER_TYPE(T, PointerHolder); +namespace pybind11 { namespace detail { + template + struct holder_helper> { + static const T *get(const PointerHolder &p) { return p.getPointer(); } + }; +}} + +#define CUSTOM_TYPE_CONVERSION 1 +#if CUSTOM_TYPE_CONVERSION + +// From object_convert.cpp +pybind11::object decimal_from_pdfobject(QPDFObjectHandle h); + +namespace pybind11 { namespace detail { + template <> struct type_caster : public type_caster_base { + using base = type_caster_base; + protected: + QPDFObjectHandle value; + public: + + /** + * Conversion part 1 (Python->C++): convert a PyObject into a Object + */ + bool load(handle src, bool convert) { + // if (src.is_none()) { + // if (!convert) return false; + // value = QPDFObjectHandle::newNull(); + // return true; + // } + // Attempting to construct these does not work... + // if (convert) { + // if (PYBIND11_LONG_CHECK(src.ptr())) { + // auto as_int = src.cast(); + // value = QPDFObjectHandle::newInteger(as_int); + // } /*else if (PyFloat_Check(src.ptr())) { + // auto as_double = src.cast(); + // value = QPDFObjectHandle::newReal(as_double); + // } */ else { + // return base::load(src, convert); + // } + // return true; + // } + return base::load(src, convert); + } + + /** + * Conversion part 2 (C++ -> Python): convert an instance into + * a Python object. 
+ * Purpose of this is to establish the indirect keep_alive relationship + * between QPDF and objects that refer back to in ways that pybind11 + * can't trace on its own. + * We also convert several QPDFObjectHandle types to native Python + * objects here. + * The ==take_ownership code paths are currently unused but present + * for completeness. They are unused because pybind11 only sets + * take_ownership when a binding returns raw pointers to Python, and + * by making this caster private we prohibit that. + */ + private: + // 'private': disallow returning pointers to QPDFObjectHandle from bindings + static handle cast(const QPDFObjectHandle *csrc, return_value_policy policy, handle parent) { + QPDFObjectHandle *src = const_cast(csrc); + if (!csrc) + return none().release(); + + bool primitive = true; + handle h; + + switch (src->getTypeCode()) { + case QPDFObject::object_type_e::ot_null: + h = pybind11::none().release(); + break; + case QPDFObject::object_type_e::ot_integer: + h = pybind11::int_(src->getIntValue()).release(); + break; + case QPDFObject::object_type_e::ot_boolean: + h = pybind11::bool_(src->getBoolValue()).release(); + break; + case QPDFObject::object_type_e::ot_real: + h = decimal_from_pdfobject(*src).release(); + break; + default: + primitive = false; + break; + } + if (primitive && h) { + if (policy == return_value_policy::take_ownership) + delete csrc; + return h; + } + + QPDF *owner = src->getOwningQPDF(); + if (policy == return_value_policy::take_ownership) { + h = base::cast(std::move(*csrc), policy, parent); + delete csrc; + } else { + h = base::cast(*csrc, policy, parent); + } + if (owner) { + // Find the Python object that refers to our owner + // Can do that by casting or more direct lookup + //auto pyqpdf = pybind11::cast(owner); + auto tinfo = get_type_info(typeid(QPDF)); + handle pyqpdf = get_object_handle(owner, tinfo); + + // Tell pybind11 that it must keep pyqpdf alive as long as h is + // alive + keep_alive_impl(h, pyqpdf); + } + 
return h; + } + + public: + static handle cast(QPDFObjectHandle &&src, return_value_policy policy, handle parent) { + return cast(&src, return_value_policy::move, parent); + } + + static handle cast(const QPDFObjectHandle &src, return_value_policy policy, handle parent) { + if (policy == return_value_policy::automatic || policy == return_value_policy::automatic_reference) + policy = return_value_policy::copy; + return cast(&src, policy, parent); + } + }; +}} // namespace pybind11::detail +#endif + +namespace py = pybind11; + +PYBIND11_MAKE_OPAQUE(std::vector); + +typedef std::map ObjectMap; +PYBIND11_MAKE_OPAQUE(ObjectMap); + + +// From object.cpp +size_t list_range_check(QPDFObjectHandle h, int index); +void init_object(py::module& m); + +// From object_repr.cpp +std::string objecthandle_scalar_value(QPDFObjectHandle h, bool escaped=true); +std::string objecthandle_pythonic_typename(QPDFObjectHandle h, std::string prefix = "pikepdf."); +std::string objecthandle_repr_typename_and_value(QPDFObjectHandle h); +std::string objecthandle_repr(QPDFObjectHandle h); + +// From object_convert.cpp +py::object decimal_from_pdfobject(QPDFObjectHandle h); +QPDFObjectHandle objecthandle_encode(const py::handle handle); +std::vector array_builder(const py::iterable iter); +std::map dict_builder(const py::dict dict); + +// Support for recursion checks +class StackGuard +{ +public: + StackGuard(const char *where) { + Py_EnterRecursiveCall(where); + } + StackGuard(const StackGuard&) = delete; + StackGuard& operator= (const StackGuard&) = delete; + StackGuard(StackGuard&&) = delete; + StackGuard& operator= (StackGuard&&) = delete; + ~StackGuard() { + Py_LeaveRecursiveCall(); + } +}; diff --git a/src/qpdf/qpdf.cpp b/src/qpdf/qpdf.cpp new file mode 100644 index 0000000..5bb8ea9 --- /dev/null +++ b/src/qpdf/qpdf.cpp @@ -0,0 +1,582 @@ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. 
If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Copyright (C) 2017, James R. Barlow (https://github.com/jbarlow83/) + */ + +#include +#include +#include +#include + +#include "pikepdf.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "qpdf_pagelist.h" +#include "qpdf_inputsource.h" +#include "utils.h" + +extern "C" const char* qpdf_get_qpdf_version(); + + +void check_stream_is_usable(py::object stream) +{ + auto TextIOBase = py::module::import("io").attr("TextIOBase"); + + if (py::isinstance(stream, TextIOBase)) { + throw py::type_error("stream must be binary (no transcoding) and seekable"); + } +} + +std::shared_ptr +open_pdf( + py::object filename_or_stream, + std::string password, + bool hex_password=false, + bool ignore_xref_streams=false, + bool suppress_warnings=true, + bool attempt_recovery=true, + bool inherit_page_attributes=true) +{ + auto q = std::make_shared(); + + q->setSuppressWarnings(suppress_warnings); + q->setPasswordIsHexKey(hex_password); + q->setIgnoreXRefStreams(ignore_xref_streams); + q->setAttemptRecovery(attempt_recovery); + + if (py::hasattr(filename_or_stream, "read") && py::hasattr(filename_or_stream, "seek")) { + // Python code gave us an object with a stream interface + py::object stream = filename_or_stream; + + check_stream_is_usable(stream); + + // The PythonInputSource object will be owned by q + InputSource* input_source = new PythonInputSource(stream); + py::gil_scoped_release release; + q->processInputSource(input_source, password.c_str()); + } else { + auto filename = filename_or_stream; + std::string description = py::str(filename); + FILE* file = portable_fopen(filename_or_stream, "rb"); + + // We can release GIL because Python knows nothing about q at this + // point; this could also take a moment for large files + py::gil_scoped_release release; + q->processFile( + 
description.c_str(), + file, // transferring ownership + true, // QPDF will close the file + password.c_str() + ); + file = nullptr; // QPDF owns the file and will close it + } + + if (inherit_page_attributes) { + // This could be expensive for a large file, plausibly (not tested), + // so release the GIL again. + py::gil_scoped_release release; + q->pushInheritedAttributesToPage(); + } + + return q; +} + + +class PikeProgressReporter : public QPDFWriter::ProgressReporter { +public: + PikeProgressReporter(py::function callback) + { + this->callback = callback; + } + + virtual ~PikeProgressReporter() = default; + + virtual void reportProgress(int percent) override + { + py::gil_scoped_acquire acquire; + this->callback(percent); + } +private: + py::function callback; +}; + + +void save_pdf( + QPDF& q, + py::object filename_or_stream, + bool static_id=false, + bool preserve_pdfa=true, + std::string min_version="", + std::string force_version="", + bool compress_streams=true, + qpdf_stream_decode_level_e stream_decode_level=qpdf_dl_generalized, + qpdf_object_stream_e object_stream_mode=qpdf_o_preserve, + bool normalize_content=false, + bool linearize=false, + bool qdf=false, + py::object progress=py::none()) +{ + QPDFWriter w(q); + + // Parameters + if (static_id) { + w.setStaticID(true); + } + w.setNewlineBeforeEndstream(preserve_pdfa); + if (!min_version.empty()) { + w.setMinimumPDFVersion(min_version, 0); + } + if (!force_version.empty()) { + w.forcePDFVersion(force_version, 0); + } + w.setCompressStreams(compress_streams); + w.setDecodeLevel(stream_decode_level); + w.setObjectStreamMode(object_stream_mode); + + if (normalize_content && linearize) { + throw py::value_error("cannot save with both normalize_content and linearize"); + } + w.setContentNormalization(normalize_content); + w.setLinearization(linearize); + w.setQDFMode(qdf); + + if (!progress.is_none()) { + auto reporter = PointerHolder(new PikeProgressReporter(progress)); + 
w.registerProgressReporter(reporter); + } + + if (py::hasattr(filename_or_stream, "write") && py::hasattr(filename_or_stream, "seek")) { + // Python code gave us an object with a stream interface + py::object stream = filename_or_stream; + check_stream_is_usable(stream); + + // TODO might be able to improve this by streaming rather than buffering + // using subclass of Pipeline that routes calls to Python. + w.setOutputMemory(); + + // It would be kind to release the GIL here, but this is not possible if + // another thread has an object and tries to mess with it. Correctness + // is more important than performance. + w.write(); + + // But now that we've held the GIL forever, we can release it and take + // it back again; at least in theory giving other threads a chance to + // to do something. + { + py::gil_scoped_release release; + } + + // getBuffer returns Buffer* and qpdf says we are responsible for + // deleting it, so capture it in a unique_ptr + std::unique_ptr output_buffer(w.getBuffer()); + + // Create a memoryview of the buffer that libqpdf created + // Awkward API alert: + // QPDFWriter::getBuffer -> Buffer* (caller frees memory) + // and Buffer::getBuffer -> unsigned char* (caller does not own memory) + py::buffer_info output_buffer_info( + output_buffer->getBuffer(), + output_buffer->getSize()); + py::memoryview view_output_buffer(output_buffer_info); + + // Send it to the stream object (probably copying) + stream.attr("write")(view_output_buffer); + } else { + py::object filename = filename_or_stream; + std::string description = py::str(filename); + // Delete the intended filename, in case it is the same as the input file. + // This ensures that the input file will continue to exist in memory on Linux. 
+ portable_unlink(filename); + FILE* file = portable_fopen(filename, "wb"); + w.setOutputFile(description.c_str(), file, true); + w.write(); + file = nullptr; // QPDF will close it + } +} + + +PYBIND11_MODULE(_qpdf, m) { + //py::options options; + //options.disable_function_signatures(); + + m.doc() = "pikepdf provides a Pythonic interface for QPDF"; + + m.def("qpdf_version", &qpdf_get_qpdf_version, "Get libqpdf version"); + + static py::exception exc_main(m, "PdfError"); + static py::exception exc_password(m, "PasswordError"); + py::register_exception_translator([](std::exception_ptr p) { // maps QPDFExc to PasswordError (password failures) or PdfError (everything else) + try { + if (p) std::rethrow_exception(p); + } catch (const QPDFExc &e) { + if (e.getErrorCode() == qpdf_e_password) { + exc_password(e.what()); + } else { + exc_main(e.what()); + } + } + }); + + py::enum_(m, "ObjectStreamMode") + .value("disable", qpdf_object_stream_e::qpdf_o_disable) + .value("preserve", qpdf_object_stream_e::qpdf_o_preserve) + .value("generate", qpdf_object_stream_e::qpdf_o_generate); + + py::enum_(m, "StreamDecodeLevel") + .value("none", qpdf_stream_decode_level_e::qpdf_dl_none) + .value("generalized", qpdf_stream_decode_level_e::qpdf_dl_generalized) + .value("specialized", qpdf_stream_decode_level_e::qpdf_dl_specialized) + .value("all", qpdf_stream_decode_level_e::qpdf_dl_all); + + init_pagelist(m); + + py::class_>(m, "Pdf", "In-memory representation of a PDF") + .def_static("new", + []() { + auto q = std::make_shared(); + q->emptyPDF(); + q->setSuppressWarnings(true); + return q; + }, + "create a new empty PDF from scratch" + ) + .def_static("open", open_pdf, + R"~~~( + Open an existing file at `filename_or_stream`. + + If `filename_or_stream` is path-like, the file will be opened. The + file should not be modified by another process while it is open in + pikepdf. + + If `filename_or_stream` has `.read()` and `.seek()` methods, the file + will be accessed as a readable binary stream. pikepdf will read the + entire stream into a private buffer. 
+ + Args: + filename_or_stream (os.PathLike): Filename of PDF to open + password (str or bytes): User or owner password to open an + encrypted PDF. If a str is given it will be converted to + UTF-8. + hex_password (bool): If True, interpret the password as a + hex-encoded version of the exact encryption key to use, without + performing the normal key computation. Useful in forensics. + ignore_xref_streams (bool): If True, ignore cross-reference + streams. See qpdf documentation. + suppress_warnings (bool): If True (default), warnings are not + printed to stderr. Use `get_warnings()` to retrieve warnings. + attempt_recovery (bool): If True (default), attempt to recover + from PDF parsing errors. + inherit_page_attributes (bool): If True (default), push attributes + set on a group of pages to individual pages + + Raises: + pikepdf.PasswordError: If the password failed to open the + file. + pikepdf.PdfError: If for other reasons we could not open + the file. + TypeError: If the type of `filename_or_stream` is not + usable. + FileNotFoundError: If the file was not found. 
+ )~~~", + py::arg("filename_or_stream"), + py::arg("password") = "", + py::arg("hex_password") = false, + py::arg("ignore_xref_streams") = false, + py::arg("suppress_warnings") = true, + py::arg("attempt_recovery") = true, + py::arg("inherit_page_attributes") = true + ) + .def("__repr__", + [](QPDF& q) { + return std::string(""); + } + ) + .def_property_readonly("filename", &QPDF::getFilename, + "the source filename of an existing PDF, when available") + .def_property_readonly("pdf_version", &QPDF::getPDFVersion, + "the PDF standard version, such as '1.7'") + .def_property_readonly("extension_level", &QPDF::getExtensionLevel) + .def_property_readonly("Root", &QPDF::getRoot, + "the /Root object of the PDF" + ) + .def_property_readonly("root", &QPDF::getRoot, + "alias for .Root, the /Root object of the PDF" + ) + .def_property("docinfo", + [](QPDF& q) { + if (!q.getTrailer().hasKey("/Info")) { + auto info = q.makeIndirectObject(QPDFObjectHandle::newDictionary()); + q.getTrailer().replaceKey("/Info", info); + } + return q.getTrailer().getKey("/Info"); + }, + [](QPDF& q, QPDFObjectHandle& replace) { + if (!replace.isIndirect()) + throw py::value_error("docinfo must be an indirect object - use Pdf.make_indirect"); + q.getTrailer().replaceKey("/Info", replace); + }, + "access the document information dictionary" + ) + .def_property_readonly("trailer", &QPDF::getTrailer, + R"~~~( + Provides access to the PDF trailer object. + + See section 7.5.5 of the PDF reference manual. Generally speaking, + the trailer should not be modified with pikepdf, and modifying it + may not work. Some of the values in the trailer are automatically + changed when a file is saved. 
+ )~~~" + ) + .def_property_readonly("pages", + [](std::shared_ptr q) { + return PageList(q); + }, + py::keep_alive<0, 1>() + ) + .def_property_readonly("_pages", &QPDF::getAllPages) + .def_property_readonly("is_encrypted", &QPDF::isEncrypted) + .def_property_readonly("is_linearized", &QPDF::isLinearized, + R"~~~( + Returns True if the PDF is linearized. + + Specifically returns True iff the file starts with a linearization + parameter dictionary. Does no additional validation. + )~~~" + ) + .def("check_linearization", + [](QPDF& q, py::object stream) { + py::scoped_estream_redirect redirector( + std::cerr, + stream + ); + q.checkLinearization(); + }, + R"~~~( + Reports information on the PDF's linearization + + Args: + stream: A stream to write this information to; must + implement ``.write()`` and ``.flush()`` methods. Defaults to + :data:`sys.stderr`. + + )~~~", + py::arg_v("stream", py::module::import("sys").attr("stderr"), "sys.stderr") + ) + .def("get_warnings", &QPDF::getWarnings) // this is a def because it modifies state by clearing warnings + .def("show_xref_table", &QPDF::showXRefTable, + R"~~~( + Pretty-print the Pdf's xref (cross-reference table) + )~~~", + py::call_guard() + ) + .def("_add_page", + [](QPDF& q, QPDFObjectHandle& page, bool first=false) { + q.addPage(page, first); + }, + R"~~~( + Attach a page to this PDF. + + The page can either be a newly constructed PDF object or it can + be obtained from another PDF. 
+ + :param pikepdf.Object page: The page object to attach + :param bool first: If True, prepend this before the first page; if False append after last page + )~~~", + py::arg("page"), + py::arg("first")=false, + py::keep_alive<1, 2>() + ) + .def("_add_page_at", &QPDF::addPageAt, py::keep_alive<1, 2>()) + .def("_remove_page", &QPDF::removePage) + .def("remove_unreferenced_resources", + [](QPDF& q) { + QPDFPageDocumentHelper helper(q); + helper.removeUnreferencedResources(); + }, + R"~~~( + Remove from /Resources of each page any object not referenced in page's contents + + PDF pages may share resource dictionaries with other pages. If + pikepdf is used for page splitting, pages may reference resources + in their /Resources dictionary that are not actually required. + This purges all unnecessary resource entries. + + Suggested before saving. + + )~~~" + ) + .def("save", + save_pdf, + R"~~~( + Save all modifications to this :class:`pikepdf.Pdf` + + Args: + filename (str or stream): Where to write the output. If a file + exists in this location it will be overwritten. + + static_id (bool): Indicates that the ``/ID`` metadata, normally + calculated as a hash of certain PDF contents and metadata + including the current time, should instead be generated + deterministically. Normally for debugging. + preserve_pdfa (bool): Ensures that the file is generated in a + manner compliant with PDF/A and other stricter variants. + This should be True, the default, in most cases. + + min_version (str): Sets the minimum version of PDF + specification that should be required. If left alone QPDF + will decide. + force_version (str): Override the version recommend by QPDF, + potentially creating an invalid file that does not display + in old versions. See QPDF manual for details. + + object_stream_mode (pikepdf.ObjectStreamMode): + ``disable`` prevents the use of object streams. + ``preserve`` keeps object streams from the input file. 
+ ``generate`` uses object streams wherever possible, + creating the smallest files but requiring PDF 1.5+. + + compress_streams (bool): Enables or disables the compression of + stream objects in the PDF. Metadata is never compressed. + By default this is set to ``True``, and should be except + for debugging. + + stream_decode_level (pikepdf.StreamDecodeLevel): Specifies how + to encode stream objects. See documentation for + ``StreamDecodeLevel``. + + normalize_content (bool): Enables parsing and reformatting the + content stream within PDFs. This may make debugging PDFs easier. + + linearize (bool): Enables creating linear or "fast web view", + where the file's contents are organized sequentially so that + a viewer can begin rendering before it has the whole file. + As a drawback, it tends to make files larger. + + qdf (bool): Save output in QDF mode. QDF mode is a special output + mode in QPDF to allow editing of PDFs in a text editor. Use + the program ``fix-qdf`` to convert back to a standard + PDF. + + You may call ``.save()`` multiple times with different parameters + to generate different versions of a file, and you *may* continue + to modify the file after saving it. ``.save()`` does not modify + the ``Pdf`` object in memory. + + .. note:: + + :meth:`pikepdf.Pdf.remove_unreferenced_resources` before saving + may eliminate unnecessary resources from the output file, so + calling this method before saving is recommended. This is not + done automatically because ``.save()`` is intended to be + idempotent. 
+ + )~~~", + py::arg("filename"), + py::arg("static_id")=false, + py::arg("preserve_pdfa")=true, + py::arg("min_version")="", + py::arg("force_version")="", + py::arg("compress_streams")=true, + py::arg("stream_decode_level")=qpdf_stream_decode_level_e::qpdf_dl_generalized, + py::arg("object_stream_mode")=qpdf_object_stream_e::qpdf_o_preserve, + py::arg("normalize_content")=false, + py::arg("linearize")=false, + py::arg("qdf")=false, + py::arg("progress")=py::none() + ) + .def("_get_object_id", &QPDF::getObjectByID) + .def("get_object", + [](QPDF &q, std::pair objgen) { + return q.getObjectByID(objgen.first, objgen.second); + }, + R"~~~( + Look up an object by ID and generation number + + Returns: + pikepdf.Object + )~~~", + py::return_value_policy::reference_internal + ) + .def("get_object", + [](QPDF &q, int objid, int gen) { + return q.getObjectByID(objid, gen); + }, + R"~~~( + Look up an object by ID and generation number + + Returns: + pikepdf.Object + )~~~", + py::return_value_policy::reference_internal + ) + .def("make_indirect", &QPDF::makeIndirectObject, + R"~~~( + Attach an object to the Pdf as an indirect object + + Direct objects appear inline in the binary encoding of the PDF. + Indirect objects appear inline as references (in English, "look + up object 4 generation 0") and then read from another location in + the file. The PDF specification requires that certain objects + are indirect - consult the PDF specification to confirm. + + Generally a resource that is shared should be attached as an + indirect object. :class:`pikepdf.Stream` objects are always + indirect, and creating them will automatically attach it to the + Pdf. 
+ + See Also: + :meth:`pikepdf.Object.is_indirect` + + Returns: + pikepdf.Object + )~~~" + ) + .def("make_indirect", + [](QPDF &q, py::object obj) -> QPDFObjectHandle { + return q.makeIndirectObject(objecthandle_encode(obj)); + }, + R"~~~( + Encode a Python object and attach to this Pdf as an indirect object + + Returns: + pikepdf.Object + )~~~" + ) + .def("copy_foreign", + [](QPDF &q, QPDFObjectHandle &h) -> QPDFObjectHandle { + return q.copyForeignObject(h); + }, + "Copy object from foreign PDF to this one.", + py::return_value_policy::reference_internal, + py::keep_alive<1, 2>() + ) + .def("_replace_object", + [](QPDF &q, int objid, int gen, QPDFObjectHandle &h) { + q.replaceObject(objid, gen, h); + } + ) + ; // class Pdf + + init_object(m); + +#ifdef VERSION_INFO + m.attr("__version__") = VERSION_INFO; +#else + m.attr("__version__") = "dev"; +#endif +} diff --git a/src/qpdf/qpdf_inputsource.h b/src/qpdf/qpdf_inputsource.h new file mode 100644 index 0000000..dc26267 --- /dev/null +++ b/src/qpdf/qpdf_inputsource.h @@ -0,0 +1,137 @@ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Copyright (C) 2017, James R. 
Barlow (https://github.com/jbarlow83/) + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + + +#include +#include + +#include "pikepdf.h" + + +class PythonInputSource : public InputSource +{ +public: + PythonInputSource(py::object stream) : stream(stream) + { + if (!stream.attr("readable")()) + throw py::value_error("not readable"); + if (!stream.attr("seekable")()) + throw py::value_error("not seekable"); + this->name = py::cast(py::repr(stream)); + } + virtual ~PythonInputSource() = default; + PythonInputSource(const PythonInputSource&) = delete; + PythonInputSource& operator= (const PythonInputSource&) = delete; + PythonInputSource(PythonInputSource&&) = delete; + PythonInputSource& operator= (PythonInputSource&&) = delete; + + std::string const& getName() const override + { + return this->name; + } + + qpdf_offset_t tell() override + { + py::gil_scoped_acquire gil; + return py::cast(this->stream.attr("tell")()); + } + + void seek(qpdf_offset_t offset, int whence) override + { + py::gil_scoped_acquire gil; + this->stream.attr("seek")(offset, whence); + } + + void rewind() override + { + this->seek(0, SEEK_SET); + } + + size_t read(char* buffer, size_t length) override + { + py::gil_scoped_acquire gil; + + py::buffer_info buffer_info(buffer, length); + py::memoryview view_buffer_info(buffer_info); + + this->last_offset = this->tell(); + py::object result = this->stream.attr("readinto")(view_buffer_info); + if (result.is_none()) + return 0; + size_t bytes_read = py::cast(result); + + if (bytes_read == 0) { + if (length > 0) { + // EOF + this->seek(0, SEEK_END); + this->last_offset = this->tell(); + } + } + return bytes_read; + } + + void unreadCh(char ch) override + { + this->seek(-1, SEEK_CUR); + } + + qpdf_offset_t findAndSkipNextEOL() override + { + py::gil_scoped_acquire gil; + + qpdf_offset_t result = 0; + bool done = false; + bool eol_straddles_buf = false; + std::string buf(4096, '\0'); + std::string 
line_endings = "\r\n"; + + while (!done) { + qpdf_offset_t cur_offset = this->tell(); + size_t len = this->read(const_cast(buf.data()), buf.size()); + if (len == 0) { + done = true; + result = this->tell(); + } else { + size_t found; + if (!eol_straddles_buf) { + found = buf.find_first_of(line_endings); + if (found == std::string::npos) + continue; + } else { + found = 0; + } + + size_t found_end = buf.find_first_not_of(line_endings, found); + if (found_end == std::string::npos) { + eol_straddles_buf = true; + continue; + } + result = cur_offset + found_end; + this->seek(result, SEEK_SET); + done = true; + } + } + return result; + } + +private: + py::object stream; + std::string name; +}; diff --git a/src/qpdf/qpdf_pagelist.cpp b/src/qpdf/qpdf_pagelist.cpp new file mode 100644 index 0000000..d8222dd --- /dev/null +++ b/src/qpdf/qpdf_pagelist.cpp @@ -0,0 +1,295 @@ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Copyright (C) 2017, James R. 
Barlow (https://github.com/jbarlow83/) + */ + +#include "pikepdf.h" +#include "qpdf_pagelist.h" + +static void assert_pyobject_is_page(py::handle obj) +{ // throws TypeError unless obj casts to a QPDFObjectHandle that is a page object + QPDFObjectHandle h; + try { + h = obj.cast(); + } catch (const py::cast_error&) { + throw py::type_error("only pikepdf pages can be assigned to a page list"); + } + if (!h.isPageObject()) { + throw py::type_error("only pages can be assigned to a page list"); + } +} + +size_t uindex_from_index(PageList &pl, ssize_t index) +{ // resolves a Python-style (possibly negative) index; the upper bound is not checked here + if (index < 0) + index += pl.count(); + if (index < 0) // Still + throw py::index_error("Accessing nonexistent PDF page number"); + size_t uindex = index; + return uindex; +} + +QPDFObjectHandle PageList::get_page(size_t index) const +{ + auto pages = this->qpdf->getAllPages(); + // `index` is unsigned, so it can never be negative here; the former + // `index < 0` wrap-around checks were always false and are dropped. + // Callers that accept Python-style negative indices must resolve + // them before calling (uindex_from_index does this). + // Positions at or past the end are rejected with IndexError. + if (index < pages.size()) + return pages.at(index); + throw py::index_error("Accessing nonexistent PDF page number"); +} + +std::vector PageList::get_pages_impl(py::slice slice) const +{ + size_t start, stop, step, slicelength; + if (!slice.compute(this->count(), &start, &stop, &step, &slicelength)) + throw py::error_already_set(); + std::vector result; + for (size_t i = 0; i < slicelength; ++i) { + QPDFObjectHandle oh = this->get_page(start); + result.push_back(oh); + start += step; + } + return result; +} + +py::list PageList::get_pages(py::slice slice) const +{ + return py::cast(this->get_pages_impl(slice)); +} + +void PageList::set_page(size_t index, py::object page) +{ + if (index >= this->count()) // validate before mutating: assigning past the end used to append the page and only then fail + throw py::index_error("Accessing nonexistent PDF page number"); + this->insert_page(index, page); // the new page now sits at `index` ... + this->delete_page(index + 1); // ... and the page it replaces was shifted to index + 1 +} + +void PageList::set_pages_from_iterable(py::slice slice, py::iterable other) +{ + size_t start, stop, step, slicelength; + if (!slice.compute(this->count(), &start, &stop, &step, &slicelength)) + throw py::error_already_set(); + py::list results; + py::iterator it = 
other.attr("__iter__")(); + + // Unpack list into iterable, check that each object is a page but + // don't save the handles yet + for(; it != py::iterator::sentinel(); ++it) { + assert_pyobject_is_page(*it); + results.append(*it); + } + + if (step != 1) { + // For an extended slice we must be replace an equal number of pages + if (results.size() != slicelength) { + throw py::value_error( + std::string("attempt to assign sequence of length ") + + std::to_string(results.size()) + + std::string(" to extended slice of size ") + + std::to_string(slicelength) + ); + } + for (size_t i = 0; i < slicelength; ++i) { + this->set_page(start + (i * step), results[i]); + } + } else { + // For simple slices, we can replace differing sizes + // meaning results.size() could be slicelength, or not + // so insert all pages first (to ensure nothing is freed yet) + // and then delete all pages we no longer need + + // Insert first to ensure we don't delete any pages we will need + for (size_t i = 0; i < results.size(); ++i) { + this->insert_page(start + i, results[i]); + } + + size_t del_start = start + results.size(); + for (size_t i = 0; i < slicelength; ++i) { + this->delete_page(del_start); + } + } +} + +void PageList::delete_page(size_t index) +{ + auto page = this->get_page(index); + /* + // Need a dec_ref to match the inc_ref in insert_page, but it's unclear + // how to do that. The item will be set the current QPDF always. 
+ // Accessing data from another PDF seems to involve some pipeline + // magic in QPDF around libqpdf/QPDFWriter.cc:1614 + if (original page owner != &this->getQPDF()) { + // If we are removing a page not originally owned by our QPDF, + // remove the reference count we put it in insert_page() + py::object pyqpdf = py::cast(page_owner); + pyqpdf.dec_ref(); + } + */ + this->qpdf->removePage(page); +} + +void PageList::delete_pages_from_iterable(py::slice slice) +{ + // See above: need a way to dec_ref pages with another owner + // Get handles for all pages, then remove them, since page numbers shift + // after delete + auto kill_list = this->get_pages_impl(slice); + for (auto page : kill_list) { + this->qpdf->removePage(page); + } +} + +size_t PageList::count() const +{ + return this->qpdf->getAllPages().size(); +} + +void PageList::insert_page(size_t index, py::handle obj) +{ + QPDFObjectHandle page; + try { + page = obj.cast(); + } catch (const py::cast_error&) { + throw py::type_error("only pages can be inserted"); + } + if (!page.isPageObject()) + throw py::type_error("only pages can be inserted"); + + this->insert_page(index, page); +} + +void PageList::insert_page(size_t index, QPDFObjectHandle page) +{ + // Find out who owns us + QPDF *page_owner = page.getOwningQPDF(); + + if (page_owner == this->qpdf.get()) { + // qpdf does not accept duplicating pages within the same file, + // so manually create a copy + page = this->qpdf->makeIndirectObject(page); + } else { + // libqpdf does not transfer a page's contents to the new QPDF. + // Instead WHEN ASKED TO WRITE it will go back and get the data + // from objecthandle->getOwningQPDF(). Therefore we must ensure + // our previous owner is kept alive. 
+#if 1 + auto tinfo = py::detail::get_type_info(typeid(QPDF)); + py::handle pyqpdf = py::detail::get_object_handle(page_owner, tinfo); + py::handle pypage = py::cast(page); + py::detail::keep_alive_impl(pypage, pyqpdf); +#else + // MSVC++ complains about the symbol + // QPDF::Members::~Members() not being exported when this version + // is used, but it works for GCC and Clang. + py::handle pyqpdf = py::cast(page_owner); + py::handle pypage = py::cast(page); + py::detail::keep_alive_impl(pypage, pyqpdf); +#endif + } + if (index != this->count()) { + QPDFObjectHandle refpage = this->get_page(index); + this->qpdf->addPageAt(page, true, refpage); + } else { + this->qpdf->addPage(page, false); + } +} + + +void init_pagelist(py::module &m) +{ + py::class_(m, "PageList") + .def("__getitem__", + [](PageList &pl, ssize_t index) { + size_t uindex = uindex_from_index(pl, index); + return pl.get_page(uindex); + } + ) + .def("__getitem__", &PageList::get_pages) + .def("__setitem__", + [](PageList &pl, ssize_t index, py::object page) { + size_t uindex = uindex_from_index(pl, index); + pl.set_page(uindex, page); + } + ) + .def("__setitem__", &PageList::set_pages_from_iterable) + .def("__delitem__", + [](PageList &pl, ssize_t index) { + size_t uindex = uindex_from_index(pl, index); + pl.delete_page(uindex); + } + ) + .def("__delitem__", &PageList::delete_pages_from_iterable) + .def("__len__", &PageList::count) + .def("p", + [](PageList &pl, size_t index) { + if (index == 0) // Indexing past end is checked in .get_page + throw py::index_error("page access out of range in 1-based indexing"); + return pl.get_page(index - 1); + }, + "convenience - look up page number in ordinal numbering, .p(1) is first page" + ) + .def("__iter__", + [](PageList &pl) { + return PageList(pl.qpdf, 0); + } + ) + .def("__next__", + [](PageList &pl) { + if (pl.iterpos < pl.count()) + return pl.get_page(pl.iterpos++); + throw py::stop_iteration(); + } + ) + .def("insert", + [](PageList &pl, ssize_t index, 
py::object obj) { + size_t uindex = uindex_from_index(pl, index); + pl.insert_page(uindex, obj); + }, py::keep_alive<1, 3>() + ) + .def("reverse", + [](PageList &pl) { + py::slice ordinary_indices(0, pl.count(), 1); + py::int_ step(-1); + py::slice reversed = py::reinterpret_steal( + PySlice_New(Py_None, Py_None, step.ptr())); + py::list reversed_pages = pl.get_pages(reversed); + pl.set_pages_from_iterable(ordinary_indices, reversed_pages); + } + ) + .def("append", + [](PageList &pl, py::object page) { + pl.insert_page(pl.count(), page); + }, + py::keep_alive<1, 2>() + ) + .def("extend", + [](PageList &pl, PageList &other) { + size_t other_count = other.count(); + for (size_t i = 0; i < other_count; i++) { + if (other_count != other.count()) + throw py::value_error("source page list modified during iteration"); + pl.insert_page(pl.count(), other.get_page(i)); + } + }, + py::keep_alive<1, 2>() + ) + .def("extend", + [](PageList &pl, py::iterable iterable) { + py::iterator it = iterable.attr("__iter__")(); + while (it != py::iterator::sentinel()) { + assert_pyobject_is_page(*it); + pl.insert_page(pl.count(), *it); + ++it; + } + }, + py::keep_alive<1, 2>() + ); +} diff --git a/src/qpdf/qpdf_pagelist.h b/src/qpdf/qpdf_pagelist.h new file mode 100644 index 0000000..975e259 --- /dev/null +++ b/src/qpdf/qpdf_pagelist.h @@ -0,0 +1,37 @@ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Copyright (C) 2017, James R. 
Barlow (https://github.com/jbarlow83/) + */ + +#pragma once + +#include "pikepdf.h" + +#include + + +void init_pagelist(py::module &m); + +class PageList { +public: + PageList(std::shared_ptr q, size_t iterpos = 0) : iterpos(iterpos), qpdf(q) {}; + + QPDFObjectHandle get_page(size_t index) const; + py::list get_pages(py::slice slice) const; + void set_page(size_t index, py::object page); + void set_pages_from_iterable(py::slice slice, py::iterable other); + void delete_page(size_t index); + void delete_pages_from_iterable(py::slice slice); + size_t count() const; + void insert_page(size_t index, py::handle obj); + void insert_page(size_t index, QPDFObjectHandle page); +public: + size_t iterpos; + std::shared_ptr qpdf; + +private: + std::vector get_pages_impl(py::slice slice) const; +}; diff --git a/src/qpdf/shims.cpp b/src/qpdf/shims.cpp new file mode 100644 index 0000000..0020f12 --- /dev/null +++ b/src/qpdf/shims.cpp @@ -0,0 +1,45 @@ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Copyright (C) 2017, James R. 
Barlow (https://github.com/jbarlow83/) + */ + +/* Support for features missing from C++11, minimal versions */ + +#if __cplusplus < 201402L // If C++11 + +#include +#include + +#include "pikepdf.h" + +namespace std { + +string quoted(const string &s) +{ + stringstream ss; + ss << '"'; + for (const char &c : s) { + if (c == '"') { + ss << "\\\""; + } else if (c == '\\') { + ss << "\\\\"; + } else { + ss << c; + } + } + ss << '"'; + return ss.str(); +} + +string quoted(const char* s) +{ + return quoted(string(s)); +} + + +}; + +#endif // End C++11 \ No newline at end of file diff --git a/src/qpdf/shims.h b/src/qpdf/shims.h new file mode 100644 index 0000000..1f14388 --- /dev/null +++ b/src/qpdf/shims.h @@ -0,0 +1,30 @@ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Copyright (C) 2017, James R. Barlow (https://github.com/jbarlow83/) + */ + +#if __cplusplus < 201402L // If C++11 + +#include +#include +#include +#include + +namespace std { + // Provide make_unique for C++11 (not array-capable) + // See https://stackoverflow.com/questions/17902405/how-to-implement-make-unique-function-in-c11/17902439#17902439 for full version if needed + template + unique_ptr make_unique( Args&& ...args ) + { + return unique_ptr( new T( std::forward(args)... ) ); + } + + // Provide basic std::quoted for C++11 + string quoted(const char* s); + string quoted(const string &s); +}; + +#endif // }} diff --git a/src/qpdf/utils.cpp b/src/qpdf/utils.cpp new file mode 100644 index 0000000..e938f8c --- /dev/null +++ b/src/qpdf/utils.cpp @@ -0,0 +1,105 @@ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Copyright (C) 2017, James R. 
Barlow (https://github.com/jbarlow83/) + */ + +#include +#include + +#include "utils.h" + +/* Convert a Python object to a filesystem encoded path + * Use Python's os.fspath() which accepts os.PathLike (str, bytes, pathlib.Path) + * and returns bytes encoded in the filesystem encoding. + * Cast to a string without transcoding. + */ + +#if PY_VERSION_HEX < 0x03060000 + + py::object fspath(py::object filename) + { + auto py_fspath = py::module::import("pikepdf._cpphelpers").attr("fspath"); + return py_fspath(filename); + } + +#else + + py::object fspath(py::object filename) + { + py::handle handle = PyOS_FSPath(filename.ptr()); + if (!handle) + throw py::error_already_set(); + return py::reinterpret_steal(handle); + } + +#endif + + +/* Open a file, accounting for encoding of the filename + * + * First use fspath to resolve the object to a str/bytes, if it is a fancy + * path like pathlib.Path. Then ask Python to open it. + * + * This is surprisingly hard to get right. Filename could be PathLike or str + * or bytes. If on Windows, filename needs to be wchar_t and we need to use + * _wfopen. Some environment variables factor in. So this awkward approach + * let us delegate all the details to Python. + * + * Ideally we would just use _Py_fopen_obj, but that is a private API. + */ +FILE *portable_fopen(py::object filename, const char* mode) +{ + auto path = fspath(filename); + auto io_open = py::module::import("io").attr("open"); + py::object pyfile; + py::int_ filedes = {-1}; + py::int_ filedes_dup = {-1}; + + // Use Python's builtin open to open the file, since it takes care of + // all of filename encoding issues and interprets mode + pyfile = io_open(path, mode); + try { + // Get file descriptor, and dup() it + filedes = pyfile.attr("fileno")(); + filedes_dup = py::module::import("os").attr("dup")(filedes); + } catch (const std::exception &e) { + pyfile.attr("close")(); + throw; + } + + try { + // Close original, releasing Python's buffers. 
We still have the duplicate + // descriptor. + pyfile.attr("close")(); + + // Now use stdlib to wrap descriptor as a FILE + FILE *file = fdopen(filedes_dup, mode); + if (!file) + throw std::system_error(errno, std::generic_category()); + return file; + } catch (const std::exception &e) { + if (filedes_dup.cast() >= 0) + close(filedes_dup); + throw; + } +} + +/* Delete a filename + * + * equivalent to + * with suppress(FileNotFoundError): + * os.unlink(f) + */ +void portable_unlink(py::object filename) +{ + auto path = fspath(filename); + auto os_unlink = py::module::import("os").attr("unlink"); + try { + os_unlink(path); + } catch (const std::exception &e) { // py::filenotfound_error doesn't work; pybind11 issue? + // Discard exception + } +} diff --git a/src/qpdf/utils.h b/src/qpdf/utils.h new file mode 100644 index 0000000..433bbda --- /dev/null +++ b/src/qpdf/utils.h @@ -0,0 +1,15 @@ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Copyright (C) 2017, James R. 
Barlow (https://github.com/jbarlow83/) + */ + +#pragma once + +#include "pikepdf.h" + +py::object fspath(py::object filename); +FILE *portable_fopen(py::object filename, const char* mode); +void portable_unlink(py::object filename); diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..8a67e83 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,28 @@ +import os +import sys +import platform + +pytest_plugins = ['helpers_namespace'] + +import pytest +from pathlib import Path +from subprocess import Popen, PIPE + + +if sys.version_info < (3, 4): + print("Requires Python 3.4+") + sys.exit(1) + + +TESTS_ROOT = os.path.abspath(os.path.dirname(__file__)) +PROJECT_ROOT = os.path.dirname(TESTS_ROOT) + + +@pytest.fixture +def resources(): + return Path(TESTS_ROOT) / 'resources' + + +@pytest.fixture(scope="function") +def outdir(tmpdir): + return Path(str(tmpdir)) diff --git a/tests/resources/cmyk-jpeg.pdf b/tests/resources/cmyk-jpeg.pdf new file mode 100644 index 0000000..c9ce390 Binary files /dev/null and b/tests/resources/cmyk-jpeg.pdf differ diff --git a/tests/resources/congress-gray.pdf b/tests/resources/congress-gray.pdf new file mode 100644 index 0000000..a124bdf Binary files /dev/null and b/tests/resources/congress-gray.pdf differ diff --git a/tests/resources/congress.pdf b/tests/resources/congress.pdf new file mode 100644 index 0000000..1ad8cb7 Binary files /dev/null and b/tests/resources/congress.pdf differ diff --git a/tests/resources/formxobject.pdf b/tests/resources/formxobject.pdf new file mode 100644 index 0000000..7c9044d Binary files /dev/null and b/tests/resources/formxobject.pdf differ diff --git a/tests/resources/fourpages.pdf b/tests/resources/fourpages.pdf new file mode 100644 index 0000000..63bad75 --- /dev/null +++ b/tests/resources/fourpages.pdf @@ -0,0 +1,82 @@ +%PDF-1.3 +% ReportLab Generated PDF document http://www.reportlab.com +1 0 obj +<< /F1 2 0 R >> +endobj +2 0 obj +<< /BaseFont /Helvetica /Encoding 
/WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font >> +endobj +3 0 obj +<< /Contents 10 0 R /MediaBox [ 0 0 612 792 ] /Parent 9 0 R /Resources << /Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] >> /Rotate 0 /Trans << >> + /Type /Page >> +endobj +4 0 obj +<< /Contents 11 0 R /MediaBox [ 0 0 612 792 ] /Parent 9 0 R /Resources << /Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] >> /Rotate 0 /Trans << >> + /Type /Page >> +endobj +5 0 obj +<< /Contents 12 0 R /MediaBox [ 0 0 612 792 ] /Parent 9 0 R /Resources << /Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] >> /Rotate 0 /Trans << >> + /Type /Page >> +endobj +6 0 obj +<< /Contents 13 0 R /MediaBox [ 0 0 612 792 ] /Parent 9 0 R /Resources << /Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] >> /Rotate 0 /Trans << >> + /Type /Page >> +endobj +7 0 obj +<< /Outlines 14 0 R /PageMode /UseNone /Pages 9 0 R /Type /Catalog >> +endobj +8 0 obj +<< /Author (anonymous) /CreationDate (D:20170104164857+08'00') /Creator (ReportLab PDF Library - www.reportlab.com) /Keywords () /ModDate (D:20170104164857+08'00') /Producer (ReportLab PDF Library - www.reportlab.com) + /Subject (unspecified) /Title (untitled) /Trapped /False >> +endobj +9 0 obj +<< /Count 4 /Kids [ 3 0 R 4 0 R 5 0 R 6 0 R ] /Type /Pages >> +endobj +10 0 obj +<< /Filter [ /ASCII85Decode /FlateDecode ] /Length 119 >> +stream +GapQh0E=F,0U\H3T\pNYT^QKk?tc>IP,;W#U1^23ihPEM_TN+H0EFh6kukeI%&##8)Z^Pl&?o>*,]c=n'f=]$Mi(.A$51fd2&/H`'LK1BH51\8!1sZendstream +endobj +11 0 obj +<< /Filter [ /ASCII85Decode /FlateDecode ] /Length 110 >> +stream +GapQh0E=F,0U\H3T\pNYT^QKk?tc>IP,;W#U1^23ihPEM_TQKrX5]O8D6FO'f:M@POnBD'FGK%;`,:VZ4oX4^mZ!_\H#l0(%_~>endstream +endobj +12 0 obj +<< /Filter [ /ASCII85Decode /FlateDecode ] /Length 109 >> +stream +GapQh0E=F,0U\H3T\pNYT^QKk?tc>IP,;W#U1^23ihPEM_TQKrX5]O8D6FO'f86415f$E$51fd2&/H`'LK1BH51\8!(0KrP5~>endstream +endobj +13 0 obj +<< /Filter [ /ASCII85Decode /FlateDecode ] /Length 107 >> +stream 
+GapQh0E=F,0U\H3T\pNYT^QKk?tc>IP,;W#U1^23ihPEM_TQKrX5]O8D6FO'f::NMb=`k'%9c:'gf2?6F=O3'mR7h.JX]0~>endstream +endobj +14 0 obj +<< /Count 0 /Type /Outlines >> +endobj +xref +0 15 +0000000000 65535 f +0000000075 00000 n +0000000109 00000 n +0000000219 00000 n +0000000417 00000 n +0000000615 00000 n +0000000813 00000 n +0000001011 00000 n +0000001099 00000 n +0000001399 00000 n +0000001479 00000 n +0000001694 00000 n +0000001900 00000 n +0000002105 00000 n +0000002308 00000 n +trailer +<< /ID + % ReportLab generated PDF document -- digest (http://www.reportlab.com) + [(sc\\\343Gt\023\377\352\355/\266'\263I\372) (sc\\\343Gt\023\377\352\355/\266'\263I\372)] + /Info 8 0 R /Root 7 0 R /Size 15 >> +startxref +2358 +%%EOF diff --git a/tests/resources/graph-encrypted.pdf b/tests/resources/graph-encrypted.pdf new file mode 100644 index 0000000..6e086af Binary files /dev/null and b/tests/resources/graph-encrypted.pdf differ diff --git a/tests/resources/graph.pdf b/tests/resources/graph.pdf new file mode 100644 index 0000000..9b48e60 Binary files /dev/null and b/tests/resources/graph.pdf differ diff --git a/tests/resources/image-mono-inline.pdf b/tests/resources/image-mono-inline.pdf new file mode 100644 index 0000000..6099821 --- /dev/null +++ b/tests/resources/image-mono-inline.pdf @@ -0,0 +1,74 @@ +%PDF-1.3 +% ReportLab Generated PDF document http://www.reportlab.com +1 0 obj +<< +/F1 2 0 R +>> +endobj +2 0 obj +<< +/BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font +>> +endobj +3 0 obj +<< +/Contents 7 0 R /MediaBox [ 0 0 576 432 ] /Parent 6 0 R /Resources << +/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] +>> /Rotate 0 /Trans << + +>> + /Type /Page +>> +endobj +4 0 obj +<< +/Outlines 8 0 R /PageMode /UseNone /Pages 6 0 R /Type /Catalog +>> +endobj +5 0 obj +<< +/Author (anonymous) /CreationDate (D:20180523163359+08'00') /Creator (ReportLab PDF Library - www.reportlab.com) /Keywords () /ModDate (D:20180523163359+08'00') 
/Producer (ReportLab PDF Library - www.reportlab.com) + /Subject (unspecified) /Title (untitled) /Trapped /False +>> +endobj +6 0 obj +<< +/Count 1 /Kids [ 3 0 R ] /Type /Pages +>> +endobj +7 0 obj +<< +/Filter [ /ASCII85Decode /FlateDecode ] /Length 160 +>> +stream +GapQh0E=F,0U\H3T\pNYT^QKk?tc>IP,;W#U1^23ihPEM_H!OBD?NuCV35o1S34@S_\F^:oK>Hm;h-Z[EXl-s%rujq)PReFmq?,:AE:[LH+It*-RhJ''s"89(Znk7AqhhifU*t"6lX_1EOgC9"`c&2!endstream +endobj +8 0 obj +<< +/Count 0 /Type /Outlines +>> +endobj +xref +0 9 +0000000000 65535 f +0000000073 00000 n +0000000104 00000 n +0000000211 00000 n +0000000404 00000 n +0000000488 00000 n +0000000784 00000 n +0000000843 00000 n +0000001093 00000 n +trailer +<< +/ID +[] +% ReportLab generated PDF document -- digest (http://www.reportlab.com) + +/Info 5 0 R +/Root 4 0 R +/Size 9 +>> +startxref +1139 +%%EOF diff --git a/tests/resources/invalid_creationdate.pdf b/tests/resources/invalid_creationdate.pdf new file mode 100644 index 0000000..8c46dd7 Binary files /dev/null and b/tests/resources/invalid_creationdate.pdf differ diff --git a/tests/resources/pal-1bit-rgb.pdf b/tests/resources/pal-1bit-rgb.pdf new file mode 100644 index 0000000..bd889fc Binary files /dev/null and b/tests/resources/pal-1bit-rgb.pdf differ diff --git a/tests/resources/pal-1bit-trivial.pdf b/tests/resources/pal-1bit-trivial.pdf new file mode 100644 index 0000000..bb9fb87 Binary files /dev/null and b/tests/resources/pal-1bit-trivial.pdf differ diff --git a/tests/resources/pal.pdf b/tests/resources/pal.pdf new file mode 100644 index 0000000..acc2706 Binary files /dev/null and b/tests/resources/pal.pdf differ diff --git a/tests/resources/pike-jp2.pdf b/tests/resources/pike-jp2.pdf new file mode 100644 index 0000000..588fcd6 Binary files /dev/null and b/tests/resources/pike-jp2.pdf differ diff --git a/tests/resources/sandwich.pdf b/tests/resources/sandwich.pdf new file mode 100644 index 0000000..99fc27e Binary files /dev/null and b/tests/resources/sandwich.pdf differ diff --git 
a/tests/resources/veraPDF test suite 6-2-10-t02-pass-a.pdf b/tests/resources/veraPDF test suite 6-2-10-t02-pass-a.pdf new file mode 100644 index 0000000..86d42b2 Binary files /dev/null and b/tests/resources/veraPDF test suite 6-2-10-t02-pass-a.pdf differ diff --git a/tests/resources/veraPDF test suite 6-2-3-3-t01-fail-c.pdf b/tests/resources/veraPDF test suite 6-2-3-3-t01-fail-c.pdf new file mode 100644 index 0000000..8157a0b --- /dev/null +++ b/tests/resources/veraPDF test suite 6-2-3-3-t01-fail-c.pdf @@ -0,0 +1,151 @@ +%PDF-1.4 +%вгПу +1 0 obj +<< +/Type /Catalog +/Outlines 2 0 R +/OutputIntents [] +/Pages 3 0 R +/Metadata 4 0 R +/PageMode /UseOutlines +>> +endobj +2 0 obj +<< +/Type /Outlines +/First 5 0 R +/Count 1 +/Last 5 0 R +>> +endobj +3 0 obj +<< +/Type /Pages +/Count 1 +/Kids [6 0 R] +>> +endobj +4 0 obj +<< +/Type /Metadata +/Subtype /XML +/Length 870>> +stream + veraPDF Consortium + +endstream +endobj +5 0 obj +<< +/Title (veraPDF test suite: 6-2-3-3-t01-fail-c) +/First 7 0 R +/Count 5 +/Last 8 0 R +/Parent 2 0 R +>> +endobj +6 0 obj +<< +/CropBox [0 0 500 500] +/ProcSet [/PDF /Text] +/Parent 3 0 R +/Contents 9 0 R +/Type /Page +/Resources << +/XObject << +/X0 10 0 R +>> +>> +/MediaBox [0 0 500 500] +>> +endobj +7 0 obj +<< +/Title (clause 6-2-3-3) +/Parent 5 0 R +/Next 11 0 R +>> +endobj +8 0 obj +<< +/Title (expected message: DeviceRGB colour space is used in a Form XObject, but the file does not define the OutputIntent) +/Parent 5 0 R +/Prev 12 0 R +>> +endobj +9 0 obj +<< +/Length 30>> +stream +q +1 0 0 1 50 350 cm +/X0 Do +Q +endstream +endobj +10 0 obj +<< +/Type /XObject +/Subtype /Form +/BBox [0 0 1000 1000] +/Length 46>> +stream +q +0 0.7 0.7 RG +0 0.7 0.7 rg +50 50 50 50 re +B +Q +endstream +endobj +11 0 obj +<< +/Title (topic 01) +/Parent 5 0 R +/Prev 7 0 R +/Next 13 0 R +>> +endobj +12 0 obj +<< +/Title (expected result: fail) +/Parent 5 0 R +/Prev 13 0 R +/Next 8 0 R +>> +endobj +13 0 obj +<< +/Title (instance c) +/Parent 5 0 R +/Prev 11 0 
R +/Next 12 0 R +>> +endobj + +xref +0 14 +0000000000 65535 f +0000000019 00000 n +0000000141 00000 n +0000000212 00000 n +0000000269 00000 n +0000001221 00000 n +0000001338 00000 n +0000001513 00000 n +0000001585 00000 n +0000001756 00000 n +0000001837 00000 n +0000001987 00000 n +0000002066 00000 n +0000002158 00000 n + +trailer +<< +/Size 14 +/Root 1 0 R +/ID [ <9BB96B2659C401CF3D8C09D7F8039C2B>] +>> +startxref +2243 +%%EOF \ No newline at end of file diff --git a/tests/test_dictionary.py b/tests/test_dictionary.py new file mode 100644 index 0000000..5341968 --- /dev/null +++ b/tests/test_dictionary.py @@ -0,0 +1,37 @@ +from pikepdf import Pdf +import pytest + + +@pytest.fixture +def congress(resources): + pdf = Pdf.open(resources / 'congress.pdf') + pdfimage = pdf.pages[0].Resources.XObject['/Im0'] + return pdfimage, pdf + + +def test_get_equality_stream(congress): + image = congress[0] + assert image.ColorSpace == image['/ColorSpace'] == image.get('/ColorSpace') + assert image.ColorSpace == image.stream_dict.ColorSpace + + with pytest.raises(AttributeError): + image.NoSuchKey + with pytest.raises(KeyError): + image['/NoSuchKey'] + + image.get('/NoSuchKey', 42) == 42 + + +def test_get_equality_dict(congress): + page = congress[1].pages[0] + + assert page.MediaBox == page['/MediaBox'] == page.get('/MediaBox') + + with pytest.raises(RuntimeError): + page.stream_dict + with pytest.raises(AttributeError): + page.NoSuchKey + with pytest.raises(KeyError): + page['/NoSuchKey'] + + page.get('/NoSuchKey', 42) == 42 diff --git a/tests/test_formxobject.py b/tests/test_formxobject.py new file mode 100644 index 0000000..f402d76 --- /dev/null +++ b/tests/test_formxobject.py @@ -0,0 +1,76 @@ +import pytest +from pikepdf import Pdf, Object, Stream, Name, Dictionary + +# pylint: disable=e1137 + + +def test_create_form_xobjects(outdir): + pdf = Pdf.new() + + font = pdf.make_indirect( + Object.parse(b""" + << + /Type /Font + /Subtype /Type1 + /Name /F1 + /BaseFont /Helvetica + 
/Encoding /WinAnsiEncoding + >> + """) + ) + + width, height = 100, 100 + image_data = b"\xff\x7f\x00" * (width * height) + + image = Stream(pdf, image_data) + image.stream_dict = Object.parse(""" + << + /Type /XObject + /Subtype /Image + /ColorSpace /DeviceRGB + /BitsPerComponent 8 + /Width 100 + /Height 100 + >> + """) + xobj_image = Dictionary({'/Im1': image}) + + form_xobj_res = Dictionary({ + '/XObject': xobj_image + }) + form_xobj = Stream(pdf, b""" + /Im1 Do + """) + form_xobj['/Type'] = Name('/XObject') + form_xobj['/Subtype'] = Name('/Form') + form_xobj['/FormType'] = 1 + form_xobj['/Matrix'] = [1, 0, 0, 1, 0, 0] + form_xobj['/BBox'] = [0, 0, 1, 1] + form_xobj['/Resources'] = form_xobj_res + + rfont = {'/F1': font} + + resources = { + '/Font': rfont, + '/XObject': {'/Form1': form_xobj}, + } + + mediabox = [0, 0, 612, 792] + + stream = b""" + BT /F1 24 Tf 72 720 Td (Hi there) Tj ET + q 144 0 0 144 234 324 cm /Form1 Do Q + q 72 0 0 72 378 180 cm /Form1 Do Q + """ + + contents = Stream(pdf, stream) + + page = pdf.make_indirect({ + '/Type': Name('/Page'), + '/MediaBox': mediabox, + '/Contents': contents, + '/Resources': resources + }) + + pdf.pages.append(page) + pdf.save(outdir / 'formxobj.pdf') diff --git a/tests/test_image_access.py b/tests/test_image_access.py new file mode 100644 index 0000000..05fa010 --- /dev/null +++ b/tests/test_image_access.py @@ -0,0 +1,260 @@ +import pytest +import imghdr +from io import BytesIO +from PIL import Image, features as PIL_features +import zlib + +# pylint: disable=w0621 + + +from pikepdf import ( + Pdf, PdfImage, PdfError, Name, + parse_content_stream, PdfInlineImage, Stream, StreamDecodeLevel +) + + +def first_image_in(filename): + pdf = Pdf.open(filename) + pdfimagexobj = next(iter(pdf.pages[0].images.values())) + return pdfimagexobj, pdf + + +@pytest.fixture +def congress(resources): + return first_image_in(resources / 'congress.pdf') + + +@pytest.fixture +def sandwich(resources): + return first_image_in(resources / 
'sandwich.pdf') + + +def test_image_from_nonimage(resources): + pdf = Pdf.open(resources / 'congress.pdf') + resources = pdf.pages[0].Contents + with pytest.raises(TypeError): + PdfImage(resources) + + +def test_image(congress): + pdfimage = PdfImage(congress[0]) + pillowimage = pdfimage.as_pil_image() + + assert pillowimage.mode == pdfimage.mode + assert pillowimage.size == pdfimage.size + + +def test_imagemask(congress): + assert PdfImage(congress[0]).image_mask == False + + +def test_image_replace(congress, outdir): + pdfimage = PdfImage(congress[0]) + pillowimage = pdfimage.as_pil_image() + + grayscale = pillowimage.convert('L') + grayscale = grayscale.resize((4, 4)) # So it is not obnoxious on error + + congress[0].write( + zlib.compress(grayscale.tobytes()), + filter=Name("/FlateDecode") + ) + congress[0].ColorSpace = Name("/DeviceGray") + pdf = congress[1] + pdf.save(outdir / 'congress_gray.pdf') + + +def test_lowlevel_jpeg(congress): + raw_bytes = congress[0].read_raw_bytes() + with pytest.raises(PdfError): + congress[0].read_bytes() + + assert imghdr.what('', h=raw_bytes) == 'jpeg' + + pim = PdfImage(congress[0]) + b = BytesIO() + pim.extract_to(stream=b) + b.seek(0) + im = Image.open(b) + assert im.size == (congress[0].Width, congress[0].Height) + assert im.mode == 'RGB' + + +def test_lowlevel_replace_jpeg(congress, outdir): + # This test will modify the PDF so needs its own image + raw_bytes = congress[0].read_raw_bytes() + + im = Image.open(BytesIO(raw_bytes)) + grayscale = im.convert('L') + grayscale = grayscale.resize((4, 4)) # So it is not obnoxious on error + + congress[0].write( + zlib.compress(grayscale.tobytes()[:10]), + filter=Name("/FlateDecode") + ) + congress[0].ColorSpace = Name('/DeviceGray') + + pdf = congress[1] + pdf.save(outdir / 'congress_gray.pdf') + + +@pytest.fixture +def inline(resources): + pdf = Pdf.open(resources / 'image-mono-inline.pdf') + for operands, _command in parse_content_stream(pdf.pages[0]): + if operands and 
isinstance(operands[0], PdfInlineImage): + return operands[0], pdf + + +def test_inline(inline): + iimage, _pdf = inline + assert iimage.width == 8 + assert iimage.image_mask == False + assert iimage.mode == 'RGB' + assert iimage.is_inline + assert iimage.colorspace == '/DeviceRGB' + + +def test_bits_per_component_missing(congress): + cong_im = congress[0] + del cong_im.stream_dict['/BitsPerComponent'] + assert PdfImage(congress[0]).bits_per_component == 8 + + +@pytest.mark.parametrize('w,h,pixeldata,cs,bpc', [ + (1, 1, b'\xff', '/DeviceGray', 1), + (1, 1, b'\xf0', '/DeviceGray', 8), + (1, 1, b'\xff\x00\xff', '/DeviceRGB', 8) +]) +def test_image_roundtrip(outdir, w, h, pixeldata, cs, bpc): + pdf = Pdf.new() + + image_data = pixeldata * (w * h) + + image = Stream(pdf, image_data) + image.Type = Name('/XObject') + image.Subtype = Name('/Image') + image.ColorSpace = Name(cs) + image.BitsPerComponent = bpc + image.Width = w + image.Height = h + + xobj = {'/Im1': image} + resources = {'/XObject': xobj} + mediabox = [0, 0, 100, 100] + stream = b'q 100 0 0 100 0 0 cm /Im1 Do Q' + contents = Stream(pdf, stream) + + page_dict = { + '/Type': Name('/Page'), + '/MediaBox': mediabox, + '/Contents': contents, + '/Resources': resources + } + page = pdf.make_indirect(page_dict) + + pdf.pages.append(page) + outfile = outdir / 'test{w}{h}{cs}{bpc}.pdf'.format( + w=w, h=h, cs=cs[1:], bpc=bpc + ) + pdf.save(outfile, compress_streams=False, + stream_decode_level=StreamDecodeLevel.none) + + p2 = pdf.open(outfile) + pim = PdfImage(p2.pages[0].Resources.XObject['/Im1']) + + assert pim.bits_per_component == bpc + assert pim.colorspace == cs + assert pim.width == w + assert pim.height == h + if cs == '/DeviceRGB': + assert pim.mode == 'RGB' + elif cs == '/DeviceGray' and bpc == 8: + assert pim.mode == 'L' + elif bpc == 1: + assert pim.mode == '1' + assert not pim.palette + + assert pim.filters == [] + assert pim.read_bytes() == pixeldata + + outstream = BytesIO() + 
pim.extract_to(stream=outstream) + outstream.seek(0) + im = Image.open(outstream) + assert pim.mode == im.mode + + +@pytest.mark.parametrize('filename,bpc,filters,ext,mode,format', + [ + ('sandwich.pdf', 1, ['/CCITTFaxDecode'], '.tif', '1', 'TIFF'), + ('congress-gray.pdf', 8, ['/DCTDecode'], '.jpg', 'L', 'JPEG'), + ('congress.pdf', 8, ['/DCTDecode'], '.jpg', 'RGB', 'JPEG'), + ('cmyk-jpeg.pdf', 8, ['/DCTDecode'], '.jpg', 'CMYK', 'JPEG') + ] +) +def test_direct_extract(resources, filename, bpc, filters, ext, mode, format): + xobj, pdf = first_image_in(resources / filename) + pim = PdfImage(xobj) + + assert pim.bits_per_component == bpc + assert pim.filters == filters + + outstream = BytesIO() + outext = pim.extract_to(stream=outstream) + assert outext == ext, 'unexpected output file' + outstream.seek(0) + + im = Image.open(outstream) + assert im.mode == mode + assert im.format == format + + +@pytest.mark.parametrize('filename,bpc', [ + ('pal.pdf', 8), + ('pal-1bit-trivial.pdf', 1), + pytest.param('pal-1bit-rgb.pdf', 1, marks=pytest.mark.xfail(raises=NotImplementedError)), +]) +def test_image_palette(resources, filename, bpc): + pdf = Pdf.open(resources / filename) + pim = PdfImage(next(iter(pdf.pages[0].images.values()))) + + assert pim.palette[0] == 'RGB' + assert pim.colorspace == '/DeviceRGB' + assert not pim.is_inline + assert pim.mode == 'P' + assert pim.bits_per_component == bpc + + outstream = BytesIO() + pim.extract_to(stream=outstream) + + +def test_bool_in_inline_image(): + piim = PdfInlineImage(image_data=b'', image_object=(Name.IM, True)) + assert piim.image_mask + + +@pytest.mark.skipif(not PIL_features.check_codec('jpg_2000'), + reason='no JPEG2000 codec') +def test_jp2(resources): + pdf = Pdf.open(resources / 'pike-jp2.pdf') + xobj = next(iter(pdf.pages[0].images.values())) + pim = PdfImage(xobj) + + assert '/JPXDecode' in pim.filters + assert pim.colorspace == '/DeviceRGB' + assert not pim.is_inline + assert not pim.indexed + assert pim.mode == 'RGB' 
+ assert pim.bits_per_component == 8 + + outstream = BytesIO() + pim.extract_to(stream=outstream) + del pim + del xobj.ColorSpace + + # If there is no explicit ColorSpace metadata we should get it from the + # compressed data stream + pim = PdfImage(xobj) + assert pim.colorspace == '/DeviceRGB' + assert pim.bits_per_component == 8 diff --git a/tests/test_ipython.py b/tests/test_ipython.py new file mode 100644 index 0000000..4f616c8 --- /dev/null +++ b/tests/test_ipython.py @@ -0,0 +1,24 @@ +""" +Test IPython/Jupyter display hooks +""" + +import pikepdf +import pytest + + +@pytest.fixture +def graph(resources): + return pikepdf.open(resources / 'graph.pdf') + + +def test_display_page(graph): + page0 = graph.pages[0] + mimebundle = page0._repr_mimebundle_(include=None, exclude=None) + assert 'application/pdf' in mimebundle + + +def test_display_image(graph): + im0 = graph.pages[0].Resources.XObject['/Im0'] + pim = pikepdf.PdfImage(im0) + result = pim._repr_png_() + assert result[1:4] == b'PNG' diff --git a/tests/test_metadata.py b/tests/test_metadata.py new file mode 100644 index 0000000..db54463 --- /dev/null +++ b/tests/test_metadata.py @@ -0,0 +1,338 @@ +from pathlib import Path +from datetime import datetime, timezone, timedelta +import re + +import pytest +from hypothesis import given +from hypothesis.strategies import integers +import pikepdf +from pikepdf import Pdf, Dictionary, Name, PasswordError +from pikepdf.models.metadata import ( + decode_pdf_date, encode_pdf_date, + XMP_NS_DC, XMP_NS_PDF, XMP_NS_XMP, + DateConverter +) + +import defusedxml.ElementTree as ET + +try: + from libxmp import XMPMeta +except ImportError: + XMPMeta = None + +pytestmark = pytest.mark.filterwarnings('ignore:.*XMLParser.*:DeprecationWarning') + +# pylint: disable=w0621 + + +@pytest.fixture +def vera(resources): + # Has XMP but no docinfo + return Pdf.open(resources / 'veraPDF test suite 6-2-10-t02-pass-a.pdf') + + +@pytest.fixture +def graph(resources): + # Has XMP and docinfo, 
all standard format XMP + return Pdf.open(resources / 'graph.pdf') + + +@pytest.fixture +def sandwich(resources): + # Has XMP, docinfo, , shorthand attribute XMP + return Pdf.open(resources / 'sandwich.pdf') + + +@pytest.fixture +def trivial(resources): + # Has no XMP or docinfo + return Pdf.open(resources / 'pal-1bit-trivial.pdf') + + +@pytest.fixture +def enron1(resources): + # Has nuls in docinfo, old PDF + return Pdf.open(resources / 'enron1_gs.pdf') + + +@pytest.fixture +def invalid_creationdate(resources): + # Has nuls in docinfo, old PDF + return Pdf.open(resources / 'invalid_creationdate.pdf') + + +def test_lowlevel(sandwich): + meta = sandwich.open_metadata() + assert meta._qname('pdf:Producer') == '{http://ns.adobe.com/pdf/1.3/}Producer' + assert meta._prefix_from_uri('{http://ns.adobe.com/pdf/1.3/}Producer') == 'pdf:Producer' + assert 'pdf:Producer' in meta + assert '{http://ns.adobe.com/pdf/1.3/}Producer' in meta + assert 'xmp:CreateDate' in meta + assert meta['xmp:ModifyDate'].startswith('2017') + assert len(meta) > 0 + assert meta['dc:title'] == 'Untitled' + + assert 'pdf:invalid' not in meta + assert '{http://ns.adobe.com/pdf/1.3/}invalid' not in meta + with pytest.raises(TypeError): + assert ['hi'] in meta + + with pytest.raises(KeyError): + meta['dc:invalid'] + with pytest.raises(KeyError): + meta['{http://ns.adobe.com/pdf/1.3/}invalid'] + with pytest.raises(KeyError): + meta['{http://invalid.com/ns/}doublyinvalid'] + + +def test_no_info(vera, outdir): + assert vera.trailer.get('/Info') is None, 'need a test file with no /Info' + + assert len(vera.docinfo) == 0 + creator = 'pikepdf test suite' + vera.docinfo['/Creator'] = creator + assert vera.docinfo.is_indirect, "/Info must be an indirect object" + vera.save(outdir / 'out.pdf') + + new = Pdf.open(outdir / 'out.pdf') + assert new.docinfo['/Creator'] == creator + + +def test_update_info(graph, outdir): + new_title = '我敢打赌,你只是想看看这意味着什么' + graph.docinfo['/Title'] = new_title + graph.save(outdir / 
'out.pdf') + + new = Pdf.open(outdir / 'out.pdf') + assert new.docinfo['/Title'] == new_title + assert graph.docinfo['/Author'] == new.docinfo['/Author'] + + with pytest.raises(ValueError): + new.docinfo = Dictionary({'/Keywords': 'bob'}) + + new.docinfo = graph.make_indirect(Dictionary({'/Keywords': 'bob'})) + assert new.docinfo.is_indirect, "/Info must be an indirect object" + + +def test_copy_info(vera, graph, outdir): + vera.docinfo = vera.copy_foreign(graph.docinfo) + assert vera.docinfo.is_indirect, "/Info must be an indirect object" + vera.save(outdir / 'out.pdf') + + +def test_add_new_xmp_and_mark(trivial): + with trivial.open_metadata( + set_pikepdf_as_editor=False, update_docinfo=False + ) as xmp_view: + assert not xmp_view + + with trivial.open_metadata(update_docinfo=False + ) as xmp: + assert not xmp # No changes at this point + del xmp + + print(trivial.Root.Metadata.read_bytes()) + + with trivial.open_metadata(update_docinfo=False + ) as xmp: + assert 'pikepdf' in xmp['pdf:Producer'] + assert 'xmp:MetadataDate' in xmp + + +def test_update_docinfo(vera): + with vera.open_metadata(set_pikepdf_as_editor=False, update_docinfo=True) as xmp: + pass + assert xmp['pdf:Producer'] == vera.docinfo[Name.Producer] + assert xmp['xmp:CreatorTool'] == vera.docinfo[Name.Creator] + assert xmp['dc:creator'][0] == vera.docinfo[Name.Author] + + # Test delete propagation + with vera.open_metadata(set_pikepdf_as_editor=False, update_docinfo=True) as xmp: + del xmp['dc:creator'] + assert 'dc:creator' not in xmp + assert Name.Author not in vera.docinfo + + +@pytest.mark.parametrize('filename', list((Path(__file__).parent / 'resources').glob('*.pdf'))) +def test_roundtrip(filename): + try: + pdf = Pdf.open(filename) + except PasswordError: + return + with pdf.open_metadata() as xmp: + for k in xmp.keys(): + if not 'Date' in k: + xmp[k] = 'A' + assert 'BobDoug' in xmp_str + assert 'Mackenzie' in xmp_str + + if not XMPMeta: + pytest.skip(msg='needs libxmp') + + xmpmeta = 
XMPMeta(xmp_str=str(xmp)) + DC = XMP_NS_DC + assert xmpmeta.does_array_item_exist(DC, 'creator', 'Bob') + assert xmpmeta.does_array_item_exist(DC, 'creator', 'Doug') + assert xmpmeta.get_localized_text(DC, 'title', None, 'x-default') == 'Title' + assert xmpmeta.does_array_item_exist(DC, 'publisher', 'Mackenzie') + + +def test_python_xmp_validate_change_list(graph): + with graph.open_metadata() as xmp: + assert 'dc:creator' in xmp + xmp['dc:creator'] = ['Dobby', 'Kreacher'] + assert str(xmp) + if not XMPMeta: + pytest.skip(msg='needs libxmp') + xmpmeta = XMPMeta(xmp_str=str(xmp)) + DC = XMP_NS_DC + assert xmpmeta.does_array_item_exist(DC, 'creator', 'Dobby') + assert xmpmeta.does_array_item_exist(DC, 'creator', 'Kreacher') + + +def test_python_xmp_validate_change(sandwich): + with sandwich.open_metadata() as xmp: + assert 'xmp:CreatorTool' in xmp + xmp['xmp:CreatorTool'] = 'Creator' # Exists as a xml tag text + xmp['pdf:Producer'] = 'Producer' # Exists as a tag node + assert str(xmp) + if not XMPMeta: + pytest.skip(msg='needs libxmp') + xmpmeta = XMPMeta(xmp_str=str(xmp)) + assert xmpmeta.does_property_exist(XMP_NS_XMP, 'CreatorTool') + assert xmpmeta.does_property_exist(XMP_NS_PDF, 'Producer') + + +def test_decode_pdf_date(): + VALS = [ + ('20160220040559', datetime(2016, 2, 20, 4, 5, 59)), + ("20180101010101Z00'00'", datetime(2018, 1, 1, 1, 1, 1, tzinfo=timezone.utc)), + ("20180101010101Z", datetime(2018, 1, 1, 1, 1, 1, tzinfo=timezone.utc)), + ("20180101010101+0000", datetime(2018, 1, 1, 1, 1, 1, tzinfo=timezone.utc)), + ("20180101010101+0100", datetime(2018, 1, 1, 1, 1, 1, tzinfo=timezone(timedelta(hours=1)))), + ] + for s, d in VALS: + assert decode_pdf_date(s) == d + + +def test_date_docinfo_from_xmp(): + VALS = [ + ('2018-12-04T03:02:01', "20181204030201"), + ('2018-12-15T07:36:43Z', "20181215073643+00'00'"), + ('2018-12-04T03:02:01-01:00', "20181204030201-01'00'"), + ] + for xmp_val, docinfo_val in VALS: + assert DateConverter.docinfo_from_xmp(xmp_val) == 
docinfo_val + + +@given( + integers(-9999, 9999), + integers(0, 99), + integers(0, 99), + integers(0, 99), + integers(0, 99), + integers(0, 99), +) +def test_random_dates(year, month, day, hour, mins, sec): + date_args = year, month, day, hour, mins, sec + xmp = '{:04d}-{:02d}-{:02d}T{:02d}:{:02d}:{:02d}'.format(*date_args) + docinfo = '{:04d}{:02d}{:02d}{:02d}{:02d}{:02d}'.format(*date_args) + + try: + converted = DateConverter.docinfo_from_xmp(xmp) + except ValueError: + pass + else: + assert converted == docinfo + + try: + converted = DateConverter.xmp_from_docinfo(docinfo) + except ValueError: + pass + else: + assert converted == xmp + + +def test_bad_char_rejection(trivial): + with trivial.open_metadata() as xmp: + xmp['dc:description'] = 'Bad characters \x00 \x01 \x02' + xmp['dc:creator'] = ['\ue001bad', '\ufff0bad'] + ET.fromstring(str(xmp)) + + +def test_xpacket(sandwich): + xmpstr1 = sandwich.Root.Metadata.read_bytes() + xpacket_begin = b' len(strnum): + strnum = strnum[:radix] + '.' 
+ strnum[radix:] + + d = Decimal(strnum) + assert encode(d) == d + + +@given(floats()) +def test_decimal_from_float(f): + d = Decimal(f) + if isfinite(f) and d.is_finite(): + try: + # PDF is limited to ~5 sig figs + decstr = str(d.quantize(Decimal('1.000000'))) + except InvalidOperation: + return # PDF doesn't support exponential notation + try: + py_d = Object.parse(decstr) + except RuntimeError as e: + if 'overflow' in str(e) or 'underflow' in str(e): + py_d = Object.parse(str(f)) + + assert isclose(py_d, d, abs_tol=1e-5), (d, f.hex()) + else: + with pytest.raises(PdfError, message=repr(f)): + Object.parse(str(d)) + + +@given(lists(integers(-10, 10), min_size=0, max_size=10)) +def test_list(array): + a = pikepdf.Array(array) + assert a == array + + +@given(lists(lists(integers(1,10), min_size=1, max_size=5),min_size=1,max_size=5)) +def test_nested_list(array): + a = pikepdf.Array(array) + assert a == array + + +@given(recursive(integers(1,10) | booleans(), lambda children: lists(children), max_leaves=20)) +def test_nested_list2(array): + assume(isinstance(array, list)) + a = pikepdf.Array(array) + assert a == array + + +def test_list_apis(): + a = pikepdf.Array([1, 2, 3]) + a[1] = None + assert a[1] is None + assert len(a) == 3 + del a[1] + assert len(a) == 2 + a[-1] = Name('/Foo') + + +def test_stack_depth(): + a = [42] + for _ in range(100): + a = [a] + rlimit = sys.getrecursionlimit() + try: + sys.setrecursionlimit(100) + with pytest.raises(RecursionError, message="recursion"): + assert encode(a) == a + with pytest.raises(RecursionError, message="recursion"): + encode(a) == encode(a) # pylint: disable=expression-not-assigned + with pytest.raises(RecursionError, message="recursion"): + repr(a) + finally: + sys.setrecursionlimit(rlimit) # So other tests are not affected + + +def test_bytes(): + b = b'\x79\x78\x77\x76' + qs = String(b) + assert bytes(qs) == b + + s = 'é' + qs = String(s) + assert str(qs) == s + + +def test_len_array(): + assert len(Array([])) == 
0 + assert len(Array()) == 0 + assert len(Array([3])) == 1 + + +def test_name_equality(): + # Who needs transitivity? :P + # While this is less than ideal ('/Foo' != b'/Foo') it allows for slightly + # sloppy tests like if colorspace == '/Indexed' without requiring + # Name('/Indexed') everywhere + assert Name('/Foo') == '/Foo' + assert Name('/Foo') == b'/Foo' + assert Name.Foo == Name('/Foo') + + +def test_unslashed_name(): + with pytest.raises(ValueError, match='must begin with'): + Name('Monty') not in d + + +def test_forbidden_name_usage(): + with pytest.raises(TypeError): + Name.Monty = Name.Python + with pytest.raises(TypeError): + Name['/Monty'] + + +class TestHashViolation: + + def check(self, a, b): + assert a == b, "invalid test case" + assert hash(a) == hash(b), "hash violation" + + def test_unequal_but_similar(self): + assert Name('/Foo') != String('/Foo') + + def test_numbers(self): + self.check(Object.parse('1.0'), 1) + self.check(Object.parse('42'), 42) + + def test_bool_comparison(self): + self.check(Object.parse('0.0'), False) + self.check(True, 1) + + def test_string(self): + utf16 = b'\xfe\xff' + 'hello'.encode('utf-16be') + self.check(String(utf16), String('hello')) + + def test_name(self): + self.check(Name.This, Name('/This')) + + def test_operator(self): + self.check(Operator('q'), Operator('q')) + + +def test_not_constructible(): + with pytest.raises(TypeError, message="constructor"): + Object() + + +class TestRepr: + + def test_repr_dict(self): + d = Dictionary({ + '/Boolean': True, + '/Integer': 42, + '/Real': Decimal('42.42'), + '/String': String('hi'), + '/Array': Array([1, 2, 3.14]), + '/Operator': Operator('q'), + '/Dictionary': Dictionary({'/Color': 'Red'}) + }) + expected = """\ + pikepdf.Dictionary({ + "/Array": [ 1, 2, Decimal('3.140000') ], + "/Boolean": True, + "/Dictionary": { + "/Color": "Red" + }, + "/Integer": 42, + "/Operator": pikepdf.Operator("q"), + "/Real": Decimal('42.42'), + "/String": "hi" + }) + """ + + def 
strip_all_whitespace(s): + return ''.join(s.split()) + + assert strip_all_whitespace(repr(d)) == strip_all_whitespace(expected) + assert eval(repr(d)) == d + + def test_repr_scalar(self): + scalars = [ + False, + 666, + Decimal('3.14'), + String('scalar'), + Name('/Bob'), + Operator('Q') + ] + for s in scalars: + assert eval(repr(s)) == s + + def test_repr_indirect(self, resources): + graph = pikepdf.open(resources / 'graph.pdf') + repr_page0 = repr(graph.pages[0]) + assert repr_page0[0] == '<', 'should not be constructible' + + +def test_utf16_error(): + with pytest.raises((UnicodeEncodeError, RuntimeError)): + str(encode('\ud801')) + + +class TestDictionary: + + def test_dictionary_contains(self): + d = Dictionary({ + '/Monty': 'Python', + '/Flying': 'Circus' + }) + assert Name.Flying in d + assert Name('/Monty') in d + assert Name.Brian not in d + + def test_dictionary_none(self): + d = pikepdf.Dictionary({'/One': 1, '/Two': 2}) + with pytest.raises(ValueError): + d['/Two'] = None + + def test_dictionary_init(self): + d1 = pikepdf.Dictionary({'/Animal': 'Dog'}) + d2 = pikepdf.Dictionary(Animal='Dog') + assert d1 == d2 + + def test_dictionary_kwargs(self): + d = pikepdf.Dictionary(A='a', B='b', C='c') + assert '/B' in d + assert 'B' in dir(d) + + def test_dictionary_iter(self): + d = pikepdf.Dictionary(A='a') + for k in d: + assert k == '/A' + assert d[k] == 'a' + + def test_dictionary_items(self): + d = pikepdf.Dictionary(A='a') + for k in d.items(): + pass + +def test_not_convertible(): + class PurePythonObj: + def __repr__(self): + return 'PurePythonObj()' + c = PurePythonObj() + with pytest.raises(RuntimeError): + encode(c) + with pytest.raises(RuntimeError): + pikepdf.Array([1, 2, c]) + + d = pikepdf.Dictionary() + with pytest.raises(RuntimeError): + d.SomeKey = c diff --git a/tests/test_pages.py b/tests/test_pages.py new file mode 100644 index 0000000..a542250 --- /dev/null +++ b/tests/test_pages.py @@ -0,0 +1,242 @@ +import pytest +from pikepdf import Pdf, 
Stream, PdfMatrix + +from contextlib import suppress +from shutil import copy +import gc + +from sys import getrefcount as refcount + +@pytest.fixture +def graph(resources): + return Pdf.open(resources / 'graph.pdf') + + +@pytest.fixture +def fourpages(resources): + return Pdf.open(resources / 'fourpages.pdf') + + +@pytest.fixture +def sandwich(resources): + return Pdf.open(resources / 'sandwich.pdf') + + +def test_split_pdf(fourpages, outdir): + for n, page in enumerate(fourpages.pages): + outpdf = Pdf.new() + outpdf.pages.append(page) + outpdf.save(outdir / "page{}.pdf".format(n + 1)) + + assert len([f for f in outdir.iterdir() if f.name.startswith('page')]) == 4 + + +def test_empty_pdf(outdir): + q = Pdf.new() + with pytest.raises(IndexError): + q.pages[0] + q.save(outdir / 'empty.pdf') + + +def test_delete_last_page(graph, outdir): + q = graph + del q.pages[0] + q.save(outdir / 'empty.pdf') + + +def test_replace_page(graph, fourpages): + q = fourpages + q2 = graph + + assert len(q.pages) == 4 + q.pages[1] = q2.pages[0] + assert len(q.pages) == 4 + assert q.pages[1].Resources.XObject.keys() == \ + q2.pages[0].Resources.XObject.keys() + + +def test_hard_replace_page(fourpages, graph, sandwich, outdir): + q = fourpages + q2 = graph + + q2_page = q2.pages[0] + del q2 + q.pages[1] = q2_page + + q2 = sandwich + q2_page = q2.pages[0] + q.pages[2] = q2_page + del q2 + del q2_page + gc.collect() + + q.save(outdir / 'out.pdf') + + +def test_reverse_pages(resources, outdir): + q = Pdf.open(resources / "fourpages.pdf") + qr = Pdf.open(resources / "fourpages.pdf") + + lengths = [int(page.Contents.stream_dict.Length) for page in q.pages] + + qr.pages.reverse() + qr.save(outdir / "reversed.pdf") + + for n, length in enumerate(lengths): + assert q.pages[n].Contents.stream_dict.Length == length + + for n, length in enumerate(reversed(lengths)): + assert qr.pages[n].Contents.stream_dict.Length == length + + +def test_evil_page_deletion(resources, outdir): + # str needed for 
py<3.6 + copy(str(resources / 'sandwich.pdf'), str(outdir / 'sandwich.pdf')) + + src = Pdf.open(outdir / 'sandwich.pdf') + pdf = Pdf.open(resources / 'graph.pdf') + + assert refcount(src) == 2 + pdf.pages.append(src.pages[0]) + assert refcount(src) == 3 + + del src.pages[0] + gc.collect() + assert refcount(src) == 3 + + with suppress(PermissionError): # Fails on Windows + (outdir / 'sandwich.pdf').unlink() + pdf.save(outdir / 'out.pdf') + + del pdf.pages[0] + pdf.save(outdir / 'out2.pdf') + + del pdf.pages[0] + pdf.save(outdir / 'out_nopages.pdf') + del pdf + gc.collect() + # Ideally we'd see the check_refcount(src, 2) at this point, but we don't + # have a way to find out when a PDF can be closed if a page was copied out + # of it to another PDF + + +def test_append_all(sandwich, fourpages, outdir): + pdf = sandwich + pdf2 = fourpages + + for page in pdf2.pages: + pdf.pages.append(page) + + assert len(pdf.pages) == 5 + pdf.save(outdir / 'out.pdf') + + +def test_extend_delete(sandwich, fourpages, outdir): + pdf = sandwich + pdf2 = fourpages + pdf.pages.extend(pdf2.pages) + + assert len(pdf.pages) == 5 + + del pdf.pages[2:4] + + pdf.save(outdir / 'out.pdf') + + +def test_slice_unequal_replacement(fourpages, sandwich, outdir): + pdf = fourpages + pdf2 = sandwich + + assert len(pdf.pages[1:]) != len(pdf2.pages) + page0_content_len = int(pdf.pages[0].Contents.Length) + page1_content_len = int(pdf.pages[1].Contents.Length) + pdf.pages[1:] = pdf2.pages + + assert len(pdf.pages) == 2, "number of pages must be changed" + pdf.save(outdir / 'out.pdf') + assert pdf.pages[0].Contents.Length == page0_content_len, \ + "page 0 should be unchanged" + assert pdf.pages[1].Contents.Length != page1_content_len, \ + "page 1's contents should have changed" + + +def test_slice_with_step(fourpages, sandwich, outdir): + pdf = fourpages + pdf2 = sandwich + + pdf2.pages.extend(pdf2.pages[:]) + assert len(pdf2.pages) == 2 + pdf2_content_len = int(pdf2.pages[0].Contents.Length) + + 
pdf.pages[0::2] = pdf2.pages + pdf.save(outdir / 'out.pdf') + + assert all(page.Contents.Length == pdf2_content_len + for page in pdf.pages[0::2]) + + +def test_slice_differing_lengths(fourpages, sandwich): + pdf = fourpages + pdf2 = sandwich + + with pytest.raises(ValueError, + message="attempt to assign"): + pdf.pages[0::2] = pdf2.pages[0:1] + + +@pytest.mark.timeout(1) +def test_self_extend(fourpages): + pdf = fourpages + with pytest.raises(ValueError, + message="source page list modified during iteration"): + pdf.pages.extend(pdf.pages) + + +def test_one_based_pages(fourpages): + pdf = fourpages + assert pdf.pages.p(1) == pdf.pages[0] + assert pdf.pages.p(4) == pdf.pages[-1] + with pytest.raises(IndexError): + pdf.pages.p(5) + with pytest.raises(IndexError): + pdf.pages.p(0) + + +def test_page_contents_add(graph, outdir): + pdf = graph + + mat = PdfMatrix().rotated(45) + + stream1 = Stream(pdf, b'q ' + mat.encode() + b' cm') + stream2 = Stream(pdf, b'Q') + + pdf.pages[0].page_contents_add(stream1, True) + pdf.pages[0].page_contents_add(stream2, False) + pdf.save(outdir / 'out.pdf') + + +def test_bad_access(fourpages): + pdf = fourpages + with pytest.raises(IndexError): + pdf.pages[-100] + with pytest.raises(IndexError): + pdf.pages[500] + + +def test_bad_insert(fourpages): + pdf = fourpages + with pytest.raises(TypeError): + pdf.pages.insert(0, 'this is a string not a page') + + +def test_negative_indexing(fourpages, graph): + fourpages.pages[-1] + fourpages.pages[-1] = graph.pages[-1] + del fourpages.pages[-1] + fourpages.pages.insert(-2, graph.pages[-1]) + with pytest.raises(IndexError): + fourpages.pages[-42] + with pytest.raises(IndexError): + fourpages.pages[-42] = graph.pages[0] + with pytest.raises(IndexError): + del fourpages.pages[-42] diff --git a/tests/test_parsers.py b/tests/test_parsers.py new file mode 100644 index 0000000..fac0ccd --- /dev/null +++ b/tests/test_parsers.py @@ -0,0 +1,101 @@ +import sys + +import pytest +from pikepdf import ( + 
parse_content_stream, Pdf, Stream, Operator, Object, + Dictionary +) +from pikepdf.models import _Page as Page +from pikepdf._qpdf import StreamParser +from subprocess import run, PIPE +import shutil + + +# pylint: disable=useless-super-delegation + +class PrintParser(StreamParser): + def __init__(self): + super().__init__() + + def handle_object(self, obj): + print(repr(obj)) + + def handle_eof(self): + print("--EOF--") + + +class ExceptionParser(StreamParser): + def __init__(self): + super().__init__() + + def handle_object(self, obj): # pylint: disable=unused-argument + raise ValueError("I take exception to this") + + def handle_eof(self): + print("--EOF--") + + +def test_open_pdf(resources): + pdf = Pdf.open(resources / 'graph.pdf') + page = pdf.pages[0] + Object._parse_stream(page, PrintParser()) + + +def test_parser_exception(resources): + pdf = Pdf.open(resources / 'graph.pdf') + stream = pdf.pages[0]['/Contents'] + with pytest.raises(ValueError): + Object._parse_stream(stream, ExceptionParser()) + + +@pytest.mark.skipif( + shutil.which('pdftotext') is None, + reason="poppler not installed") +def test_text_filter(resources, outdir): + input_pdf = resources / 'veraPDF test suite 6-2-10-t02-pass-a.pdf' + + # Ensure the test PDF has detect we can find + proc = run(['pdftotext', str(input_pdf), '-'], + check=True, stdout=PIPE, encoding='utf-8') + assert proc.stdout.strip() != '', "Need input test file that contains text" + + pdf = Pdf.open(input_pdf) + page = pdf.pages[0] + + keep = [] + for operands, command in parse_content_stream(page): + if command == Operator('Tj'): + print("skipping Tj") + continue + keep.append((operands, command)) + + new_stream = Stream(pdf, keep) + print(new_stream.read_bytes()) # pylint: disable=no-member + page['/Contents'] = new_stream + page['/Rotate'] = 90 + + pdf.save(outdir / 'notext.pdf', True) + + proc = run(['pdftotext', str(outdir / 'notext.pdf'), '-'], + check=True, stdout=PIPE, encoding='utf-8') + + assert 
proc.stdout.strip() == '', "Expected text to be removed" + + +def test_invalid_stream_object(): + with pytest.raises(TypeError): + parse_content_stream(Dictionary({"/Hi": 3})) + + +@pytest.mark.parametrize("test_file,expected", [ + ("fourpages.pdf", True), + ("graph.pdf", False), + ("veraPDF test suite 6-2-10-t02-pass-a.pdf", True), + ("veraPDF test suite 6-2-3-3-t01-fail-c.pdf", False), + ('sandwich.pdf', True) +]) +def test_has_text(resources, test_file, expected): + pdf = Pdf.open(resources / test_file) + for p in pdf.pages: + page = Page(p) + assert page.has_text() == expected diff --git a/tests/test_pdf.py b/tests/test_pdf.py new file mode 100644 index 0000000..33b949b --- /dev/null +++ b/tests/test_pdf.py @@ -0,0 +1,177 @@ +""" +Testing focused on pikepdf.Pdf +""" + +import pytest +from pikepdf import Pdf, PasswordError, Stream, PdfError + +import sys +import os +from io import StringIO +from unittest.mock import Mock, patch +import shutil +from pikepdf._cpphelpers import fspath # For py35 + + +@pytest.fixture +def trivial(resources): + return Pdf.open(resources / 'pal-1bit-trivial.pdf') + + +def test_non_filename(): + with pytest.raises(TypeError): + Pdf.open(42) + + +def test_not_existing_file(): + with pytest.raises(FileNotFoundError): + Pdf.open('does_not_exist.pdf') + + +def test_empty(outdir): + target = outdir / 'empty.pdf' + target.touch() + with pytest.raises(PdfError): + Pdf.open(target) + + +class TestLinearization: + def test_linearization(self, resources, outdir): + pdf = Pdf.open(resources / 'graph.pdf') + assert not pdf.is_linearized + + pdf.save(outdir / 'lin.pdf', linearize=True) + + pdf = Pdf.open(outdir / 'lin.pdf') + assert pdf.is_linearized + + sio = StringIO() + pdf.check_linearization(sio) + + +def test_objgen(resources): + src = Pdf.open(resources / 'graph.pdf') + im0 = src.pages[0].Resources.XObject['/Im0'] + assert im0.objgen == (5, 0) + object5 = src.get_object((5, 0)) + assert object5.is_owned_by(src) + assert object5 == im0 + + 
+class TestPasswords: + def test_open_pdf_wrong_password(self, resources): + # The correct passwords are "owner" and "user" + with pytest.raises(PasswordError): + Pdf.open(resources / 'graph-encrypted.pdf', password='wrong') + + def test_open_pdf_password_encoding(self, resources): + with pytest.raises(PasswordError): + Pdf.open(resources / 'graph-encrypted.pdf', password=b'\x01\xfe') + + def test_open_pdf_no_password_but_needed(self, resources): + with pytest.raises(PasswordError): + Pdf.open(resources / 'graph-encrypted.pdf') + + +class TestStreams: + def test_stream(self, resources): + with (resources / 'pal-1bit-trivial.pdf').open('rb') as stream: + pdf = Pdf.open(stream) + assert pdf.root.Pages.Count == 1 + + def test_no_text_stream(self, resources): + with pytest.raises(TypeError): + with (resources / 'pal-1bit-trivial.pdf').open('r') as stream: + Pdf.open(stream) + + def test_save_stream(self, trivial, outdir): + from io import BytesIO + pdf = trivial + pdf.save(outdir / 'nostream.pdf', static_id=True) + + bio = BytesIO() + pdf.save(bio, static_id=True) + bio.seek(0) + + with (outdir / 'nostream.pdf').open('rb') as saved_file: + saved_file_contents = saved_file.read() + assert saved_file_contents == bio.read() + + +class TestMemory: + def test_memory(self, resources): + pdf = (resources / 'pal-1bit-trivial.pdf').read_bytes() + with pytest.raises(Exception): + pdf = Pdf.open(pdf) + + +def test_remove_unreferenced(resources, outdir): + in_ = resources / 'sandwich.pdf' + out1 = outdir / 'out1.pdf' + out2 = outdir / 'out2.pdf' + pdf = Pdf.open(in_) + pdf.pages[0].Contents = Stream(pdf, b' ') + pdf.save(out1) + + pdf.remove_unreferenced_resources() + pdf.save(out2) + + assert out2.stat().st_size < out1.stat().st_size + + +def test_show_xref(trivial): + trivial.show_xref_table() + + +@pytest.mark.skipif(sys.version_info < (3, 6), + reason='missing mock.assert_called') +def test_progress(trivial, outdir): + pdf = trivial + mock = Mock() + pdf.save(outdir / 
'out.pdf', progress=mock) + mock.assert_called() + + +def test_unicode_filename(resources, outdir): + target1 = outdir / '测试.pdf' + target2 = outdir / '通过考试.pdf' + shutil.copy( + fspath(resources / 'pal-1bit-trivial.pdf'), + fspath(target1) + ) + pdf = Pdf.open(target1) + pdf.save(target2) + assert target2.exists() + + +@pytest.mark.skipif(os.name == 'nt', reason='os.dup hackery not supported') +def test_fileno_fails(resources): + with patch('os.dup') as dup: + dup.side_effect = OSError('assume dup fails') + with pytest.raises(OSError): + pdf = Pdf.open(resources / 'pal-1bit-trivial.pdf') + + with patch('os.dup') as dup: + dup.return_value = -1 + with pytest.raises(RuntimeError): + pdf = Pdf.open(resources / 'pal-1bit-trivial.pdf') + + +def test_min_and_force_version(trivial, outdir): + pdf = trivial + pdf.save(outdir / '1.7.pdf', min_version='1.7') + + pdf17 = Pdf.open(outdir / '1.7.pdf') + assert pdf17.pdf_version == '1.7' + + with pytest.raises(RuntimeError): + pdf.save('notaversion.pdf', min_version='foo') + + pdf.save(outdir / '1.2.pdf', force_version='1.2') + pdf12 = Pdf.open(outdir / '1.2.pdf') + assert pdf12.pdf_version == '1.2' + + +def test_normalize_linearize(trivial, outdir): + with pytest.raises(ValueError): + trivial.save(outdir / 'no.pdf', linearize=True, normalize_content=True) diff --git a/tests/test_pdfa.py b/tests/test_pdfa.py new file mode 100644 index 0000000..975b258 --- /dev/null +++ b/tests/test_pdfa.py @@ -0,0 +1,64 @@ +import pytest +from pikepdf import Pdf +import os +from pathlib import Path +from subprocess import run, PIPE, STDOUT +import xml.etree.ElementTree as ET + +try: + VERAPDF = Path(os.environ['HOME']) / 'verapdf' / 'verapdf' + if not VERAPDF.is_file(): + VERAPDF = None +except Exception: # pylint: disable=w0703 + VERAPDF = None + +pytestmark = pytest.mark.skipif(not VERAPDF, reason="verapdf not found") + + +def verapdf_validate(filename): + proc = run([VERAPDF, filename], stdout=PIPE, stderr=STDOUT, check=True) + result = 
proc.stdout.decode('utf-8') + xml_start = result.find('pdf + assert refcount(page0) == 2 + + pdf2 = Pdf.open(resources / 'fourpages.pdf') + pdf2.pages.insert(2, page0) + p2p2 = pdf2.pages[2] + + assert refcount(pdf) == 4 # this, pdf, page0->pdf, pdf2's page0 + + assert refcount(p2p2) == 2 + del pdf + del page0 + assert refcount(p2p2) == 2 + + del pdf2.pages[2] + assert before == p2p2.Contents.read_bytes() diff --git a/tests/test_sanity.py b/tests/test_sanity.py new file mode 100644 index 0000000..df1f387 --- /dev/null +++ b/tests/test_sanity.py @@ -0,0 +1,137 @@ +""" +A bunch of quick tests that confirm nothing is horribly wrong +""" + +import pytest + +import gc +from contextlib import suppress +from shutil import copy +import sys + +import pikepdf +from pikepdf import Pdf, Object, Name, Stream + + +def test_minimum_qpdf_version(): + from pikepdf import _qpdf + assert _qpdf.qpdf_version() >= '7.0.0' + + +def test_open_pdf(resources): + pdf = pikepdf.open(resources / 'graph.pdf') + assert '1.3' <= pdf.pdf_version <= '1.7' + + assert pdf.root['/Pages']['/Count'] == 1 + + +def test_open_pdf_password(resources): + pdf = Pdf.open(resources / 'graph-encrypted.pdf', password='owner') + assert pdf.root['/Pages']['/Count'] == 1 + + +def test_attr_access(resources): + pdf = Pdf.open(resources / 'graph.pdf') + assert int(pdf.root.Pages.Count) == 1 + + +def test_create_pdf(outdir): + pdf = Pdf.new() + + font = pdf.make_indirect( + Object.parse(b""" + << + /Type /Font + /Subtype /Type1 + /Name /F1 + /BaseFont /Helvetica + /Encoding /WinAnsiEncoding + >>""")) + + width, height = 100, 100 + image_data = b"\xff\x7f\x00" * (width * height) + + image = Stream(pdf, image_data) + image.stream_dict = Object.parse(b""" + << + /Type /XObject + /Subtype /Image + /ColorSpace /DeviceRGB + /BitsPerComponent 8 + /Width 100 + /Height 100 + >>""") + + rfont = {'/F1': font} + + xobj = {'/Im1': image} + + resources = { + '/Font': rfont, + '/XObject': xobj + } + + mediabox = [0, 0, 612, 792] + + 
stream = b""" + BT /F1 24 Tf 72 720 Td (Hi there) Tj ET + q 144 0 0 144 234 324 cm /Im1 Do Q + """ + + contents = Stream(pdf, stream) + + page_dict = { + '/Type': Name('/Page'), + '/MediaBox': mediabox, + '/Contents': contents, + '/Resources': resources + } + qpdf_page_dict = page_dict + page = pdf.make_indirect(qpdf_page_dict) + + pdf.pages.append(page) + pdf.save(outdir / 'hi.pdf') + + +def test_copy_semantics(resources): + pdf = Pdf.open(resources / 'graph.pdf') + + # Ensure that we can name a reference to a child object and view the + # changes from the parent + page = pdf.pages[0] + mediabox = page['/MediaBox'] + assert mediabox[2] != 0 + mediabox[2] = 0 + assert page['/MediaBox'][2] == mediabox[2] + + +def test_copy_page_keepalive(resources, outdir): + # str for py<3.6 + copy(str(resources / 'sandwich.pdf'), str(outdir / 'sandwich.pdf')) + src = Pdf.open(outdir / 'sandwich.pdf') + pdf = Pdf.open(resources / 'graph.pdf') + + pdf.pages.append(src.pages[0]) + + del src + src = None + gc.collect() + with suppress(PermissionError): + (outdir / 'sandwich.pdf').unlink() + pdf.save(outdir / 'out.pdf') + + +def test_open_save(resources, outdir): + out = str(outdir / 'graph.pdf') + copy(str(resources / 'graph.pdf'), out) + src = Pdf.open(out) + src.save(out) + + +def test_readme_example(resources, outdir): + # Elegant, Pythonic API + pdf = pikepdf.open(resources / 'fourpages.pdf') + assert len(pdf.pages) == 4 + del pdf.pages[-1] + assert len(pdf.pages) == 3 + pdf.save(outdir / 'output.pdf') -- cgit v1.2.3 From af37061ca196a087f40a504244d548dab3a0b798 Mon Sep 17 00:00:00 2001 From: Felix Geyer Date: Wed, 27 Feb 2019 15:33:07 -0700 Subject: Import pikepdf_1.0.5+dfsg-2.debian.tar.xz [dgit import tarball pikepdf 1.0.5+dfsg-2 pikepdf_1.0.5+dfsg-2.debian.tar.xz] --- changelog | 70 + compat | 1 + control | 59 + copyright | 1596 ++++++++++++++++++++ missing-sources/Esox_lucius1.jpg | Bin 0 -> 488754 bytes ...dware_MIDI_sequencer_brochure_page_2_300dpi.jpg | Bin 0 -> 1519590 
bytes patches/disable-test_docinfo_problems.patch | 59 + patches/docs-build-use-DEB_VERSION_UPSTREAM.patch | 28 + patches/drop-installation-from-docs-contents.patch | 18 + ...-setuptools_scm_git_archive-from-setup.py.patch | 19 + .../fix_xmp_metadata_without_xmpmeta_wrapper.patch | 110 ++ patches/series | 5 + pikepdf-doc.doc-base | 9 + pikepdf-doc.install | 1 + python3-pikepdf.examples | 1 + rules | 31 + source/format | 1 + source/include-binaries | 2 + source/lintian-overrides | 2 + tests/control | 11 + tests/test-suite | 12 + watch | 3 + 22 files changed, 2038 insertions(+) create mode 100644 changelog create mode 100644 compat create mode 100644 control create mode 100644 copyright create mode 100644 missing-sources/Esox_lucius1.jpg create mode 100644 missing-sources/LinnSequencer_hardware_MIDI_sequencer_brochure_page_2_300dpi.jpg create mode 100644 patches/disable-test_docinfo_problems.patch create mode 100644 patches/docs-build-use-DEB_VERSION_UPSTREAM.patch create mode 100644 patches/drop-installation-from-docs-contents.patch create mode 100644 patches/drop-setuptools_scm_git_archive-from-setup.py.patch create mode 100644 patches/fix_xmp_metadata_without_xmpmeta_wrapper.patch create mode 100644 patches/series create mode 100644 pikepdf-doc.doc-base create mode 100644 pikepdf-doc.install create mode 100644 python3-pikepdf.examples create mode 100755 rules create mode 100644 source/format create mode 100644 source/include-binaries create mode 100644 source/lintian-overrides create mode 100644 tests/control create mode 100755 tests/test-suite create mode 100644 watch diff --git a/changelog b/changelog new file mode 100644 index 0000000..d4360b1 --- /dev/null +++ b/changelog @@ -0,0 +1,70 @@ +pikepdf (1.0.5+dfsg-2) unstable; urgency=medium + + * Team upload. + * Fix handling of XMP metadata with no wrapper. 
+ - Cherry-pick upstream fix as fix_xmp_metadata_without_xmpmeta_wrapper.patch + + -- Felix Geyer Wed, 27 Feb 2019 23:33:07 +0100 + +pikepdf (1.0.5+dfsg-1) unstable; urgency=medium + + * New upstream release. + * Refresh patches. + + -- Sean Whitton Sat, 26 Jan 2019 12:54:11 -0700 + +pikepdf (1.0.4+dfsg-1) unstable; urgency=medium + + * New upstream release. + - Add Files-Excluded to d/copyright. + See https://github.com/pikepdf/pikepdf/issues/21 + - Add disable-test_docinfo_problems.patch + * Install examples/find_links.py + + -- Sean Whitton Thu, 10 Jan 2019 08:44:33 -0700 + +pikepdf (0.10.1-2) unstable; urgency=medium + + * Upload to unstable. + Upstream considers the API to be stable. + + -- Sean Whitton Tue, 01 Jan 2019 09:19:08 +0000 + +pikepdf (0.10.1-1) experimental; urgency=medium + + * New upstream release. + - Add python3-defusedxml, python3-lxml build-deps + - Add python3-attr autopkgtest dep + + -- Sean Whitton Mon, 31 Dec 2018 23:25:17 +0000 + +pikepdf (0.3.7-1) experimental; urgency=medium + + * New upstream release. + + -- Sean Whitton Sun, 11 Nov 2018 14:29:02 -0700 + +pikepdf (0.3.5-1) experimental; urgency=medium + + * New upstream release. + + -- Sean Whitton Sat, 20 Oct 2018 13:06:33 -0700 + +pikepdf (0.3.4-1) experimental; urgency=medium + + * New upstream release. + * Add build-dep on python3-setuptools-scm-git-archive. + * Update d/copyright for new files. + * Drop 0001-Restore-Exhibit-B-text-clarify-license-comments-in-r.patch + Included in this upstream release. + * Add drop-setuptools_scm_git_archive-from-setup.py.patch. + * Refresh drop-installation-from-docs-contents.patch. + + -- Sean Whitton Wed, 10 Oct 2018 08:28:28 -0700 + +pikepdf (0.3.0-1) experimental; urgency=medium + + * Initial upload, to experimental (Closes: #903625). + API not yet finalised. 
+ + -- Sean Whitton Thu, 30 Aug 2018 16:21:39 -0700 diff --git a/compat b/compat new file mode 100644 index 0000000..f599e28 --- /dev/null +++ b/compat @@ -0,0 +1 @@ +10 diff --git a/control b/control new file mode 100644 index 0000000..ee1d2cb --- /dev/null +++ b/control @@ -0,0 +1,59 @@ +Source: pikepdf +Section: python +Priority: optional +Maintainer: Debian Python Modules Team +Uploaders: Sean Whitton +Build-Depends: + debhelper (>= 10), + dh-python, + libqpdf-dev, + python3-all-dev, + python3-defusedxml, + python3-pybind11, + python3-pytest-runner, + python3-setuptools, + python3-setuptools-scm, + python3-setuptools-scm-git-archive, + python3-sphinx, + python3-sphinx-rtd-theme, + python3-matplotlib, + python3-ipython, + python3-lxml, +Standards-Version: 4.1.5 +Homepage: https://github.com/pikepdf/pikepdf +Vcs-Browser: https://salsa.debian.org/python-team/modules/pikepdf +Vcs-Git: https://salsa.debian.org/python-team/modules/pikepdf.git + +Package: python3-pikepdf +Architecture: any +Depends: ${misc:Depends}, ${python3:Depends}, ${shlibs:Depends} +Provides: ${python3:Provides} +XB-Python-Version: ${python3:Versions} +Description: Python library to read and write PDFs with QPDF + pikepdf is a Python library to read and write PDFs with QPDF. + Features include: + . 
+ * Editing, manipulation and transformation of existing PDFs + * Based on the mature, proven QPDF C++ library + * Works with encrypted PDFs + * Supports all PDF compression filters + * Can create "fast web view" (linearized) PDFs + * Creates standards compliant PDFs that pass validation in other tools + * Automatically repairs damaged PDFs, just like QPDF + * Implements more of the PDF specification than existing Python PDF tools + * IPython notebook and Jupyter integration + +Package: pikepdf-doc +Section: doc +Architecture: all +Depends: + ${misc:Depends}, + ${sphinxdoc:Depends}, +Built-Using: ${sphinxdoc:Built-Using} +Description: Python library to read and write PDFs with QPDF - documentation + pikepdf is a Python library to read and write PDFs with QPDF. + . + This package includes pikepdf's HTML documentation. + . + See the description for the pikepdf binary package for more + information about pikepdf. diff --git a/copyright b/copyright new file mode 100644 index 0000000..815795c --- /dev/null +++ b/copyright @@ -0,0 +1,1596 @@ +Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ +Upstream-Name: pikepdf +Source: https://github.com/pikepdf/pikepdf +Files-Excluded: tests/resources/enron1_gs.pdf + +Files: * +Copyright: (C) 2017 James R. Barlow +License: MPL-2.0 +Comment: + The file licenses/license.wheel.txt is relevant only when a binary + artifact is produced from the combination of the source code of + pikepdf and the source code of qpdf. Nothing in pikepdf is Apache + licensed. + . + pikepdf is licensed under the (unmodified) MPL-2.0. This was always + upstream's intention; the removal of the text of Exhibit B from the + copy of the MPL included in the source package was a + misunderstanding. 
An upstream commit more recent than the currently + packaged upstream version restores the usual text of the MPL-2.0; + that commit is backported into the Debian delta queue, file + debian/patches/0001-Restore-Exhibit-B-text-clarify-license-comments-in-r.patch + . + Also see https://github.com/pikepdf/pikepdf/issues/8 + +Files: debian/* +Copyright: (C) 2018 Sean Whitton +License: MPL-2.0 + +Files: docs/images/pike.jpg tests/resources/pike-jp2.pdf +Copyright: Public domain +License: public-domain + From the U.S. Fish and Wildlife Service National Image Library. + . + See: https://en.wikipedia.org/wiki/File:Esox_lucius1.jpg +Comment: Maximum resolution version is in debian/missing-sources/. + +Files: tests/*.py +Copyright: (C) 2017 James R. Barlow +License: CC0-1.0 + +Files: tests/resources/* +Copyright: (C) 2017 James R. Barlow +License: CC-BY-4.0 + +Files: test/resources/congress.pdf docs/images/congress_im0.jpg tests/resources/congress-gray.pdf +Copyright: Public domain +License: public-domain + From US Congressional Records. +Comment: Converted from JPEG to PDF. + +Files: tests/resources/graph*.pdf +Copyright: Public domain +License: public-domain + Released into the public domain by author; see: + . +Comment: + For -encrypted.pdf, user password is "user" and owner password is "owner". + +Files: tests/resources/veraPDF*.pdf +Copyright: (C) 2015 veraPDF Consortium +License: CC-BY-4.0 +Comment: + Obtained from: https://github.com/veraPDF/veraPDF-corpus + +Files: tests/resources/sandwich.pdf +Copyright: (C) 1985 Forat Electronics +License: GFDL-1.2+ or CC-BY-SA-3.0 +Comment: + Created using ocrmypdf --pdf-renderer sandwich, to test Tesseract PDF + text encoding. + . + Originally obtained from: https://commons.wikimedia.org/wiki/File:LinnSequencer_hardware_MIDI_sequencer_brochure_page_2_300dpi.jpg + . + A copy of that JPEG is included in debian/missing-sources/. 
+ +Files: docs/images/pike-cartoon.png +Copyright: (C) 2017 creozavr +License: CC0-1.0 +Comment: + Obtained from: https://pixabay.com/en/pike-fish-predator-shchuchin-2612354/ + +Files: docs/images/pikemen.jpg +Copyright: (C) 2009 Rama +License: CeCILL-2.0 or CC-BY-SA-2.0-FR +Comment: + Obtained from: https://commons.wikimedia.org/wiki/File:Pike_square_img_3653.jpg + +License: MPL-2.0 + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. + . + On Debian systems the full text of the MPL-2.0 can be found in + /usr/share/common-licenses/MPL-2.0. + +License: CC0-1.0 + To the extent possible under law, the author(s) have dedicated all copyright + and related and neighboring rights to this software to the public domain + worldwide. This software is distributed without any warranty. + . + On Debian systems the full text of the CC0-1.0 license can be found + in /usr/share/common-licenses/CC0-1.0 + +License: CC-BY-4.0 + Creative Commons Attribution 4.0 International Public License + . + By exercising the Licensed Rights (defined below), You accept and agree + to be bound by the terms and conditions of this Creative Commons + Attribution 4.0 International Public License ("Public License"). To the + extent this Public License may be interpreted as a contract, You are + granted the Licensed Rights in consideration of Your acceptance of + these terms and conditions, and the Licensor grants You such rights in + consideration of benefits the Licensor receives from making the + Licensed Material available under these terms and conditions. + . + Section 1 -- Definitions. + . + a. Adapted Material means material subject to Copyright and Similar + Rights that is derived from or based upon the Licensed Material + and in which the Licensed Material is translated, altered, + arranged, transformed, or otherwise modified in a manner requiring + permission under the Copyright and Similar Rights held by the + Licensor. 
For purposes of this Public License, where the Licensed + Material is a musical work, performance, or sound recording, + Adapted Material is always produced where the Licensed Material is + synched in timed relation with a moving image. + . + b. Adapter's License means the license You apply to Your Copyright + and Similar Rights in Your contributions to Adapted Material in + accordance with the terms and conditions of this Public License. + . + c. Copyright and Similar Rights means copyright and/or similar rights + closely related to copyright including, without limitation, + performance, broadcast, sound recording, and Sui Generis Database + Rights, without regard to how the rights are labeled or + categorized. For purposes of this Public License, the rights + specified in Section 2(b)(1)-(2) are not Copyright and Similar + Rights. + . + d. Effective Technological Measures means those measures that, in the + absence of proper authority, may not be circumvented under laws + fulfilling obligations under Article 11 of the WIPO Copyright + Treaty adopted on December 20, 1996, and/or similar international + agreements. + . + e. Exceptions and Limitations means fair use, fair dealing, and/or + any other exception or limitation to Copyright and Similar Rights + that applies to Your use of the Licensed Material. + . + f. Licensed Material means the artistic or literary work, database, + or other material to which the Licensor applied this Public + License. + . + g. Licensed Rights means the rights granted to You subject to the + terms and conditions of this Public License, which are limited to + all Copyright and Similar Rights that apply to Your use of the + Licensed Material and that the Licensor has authority to license. + . + h. Licensor means the individual(s) or entity(ies) granting rights + under this Public License. + . + i. 
Share means to provide material to the public by any means or + process that requires permission under the Licensed Rights, such + as reproduction, public display, public performance, distribution, + dissemination, communication, or importation, and to make material + available to the public including in ways that members of the + public may access the material from a place and at a time + individually chosen by them. + . + j. Sui Generis Database Rights means rights other than copyright + resulting from Directive 96/9/EC of the European Parliament and of + the Council of 11 March 1996 on the legal protection of databases, + as amended and/or succeeded, as well as other essentially + equivalent rights anywhere in the world. + . + k. You means the individual or entity exercising the Licensed Rights + under this Public License. Your has a corresponding meaning. + . + Section 2 -- Scope. + . + a. License grant. + . + 1. Subject to the terms and conditions of this Public License, + the Licensor hereby grants You a worldwide, royalty-free, + non-sublicensable, non-exclusive, irrevocable license to + exercise the Licensed Rights in the Licensed Material to: + . + a. reproduce and Share the Licensed Material, in whole or + in part; and + . + b. produce, reproduce, and Share Adapted Material. + . + 2. Exceptions and Limitations. For the avoidance of doubt, where + Exceptions and Limitations apply to Your use, this Public + License does not apply, and You do not need to comply with + its terms and conditions. + . + 3. Term. The term of this Public License is specified in Section + 6(a). + . + 4. Media and formats; technical modifications allowed. The + Licensor authorizes You to exercise the Licensed Rights in + all media and formats whether now known or hereafter created, + and to make technical modifications necessary to do so. 
The + Licensor waives and/or agrees not to assert any right or + authority to forbid You from making technical modifications + necessary to exercise the Licensed Rights, including + technical modifications necessary to circumvent Effective + Technological Measures. For purposes of this Public License, + simply making modifications authorized by this Section 2(a) + (4) never produces Adapted Material. + . + 5. Downstream recipients. + . + a. Offer from the Licensor -- Licensed Material. Every + recipient of the Licensed Material automatically + receives an offer from the Licensor to exercise the + Licensed Rights under the terms and conditions of this + Public License. + . + b. No downstream restrictions. You may not offer or impose + any additional or different terms or conditions on, or + apply any Effective Technological Measures to, the + Licensed Material if doing so restricts exercise of the + Licensed Rights by any recipient of the Licensed + Material. + . + 6. No endorsement. Nothing in this Public License constitutes or + may be construed as permission to assert or imply that You + are, or that Your use of the Licensed Material is, connected + with, or sponsored, endorsed, or granted official status by, + the Licensor or others designated to receive attribution as + provided in Section 3(a)(1)(A)(i). + . + b. Other rights. + . + 1. Moral rights, such as the right of integrity, are not + licensed under this Public License, nor are publicity, + privacy, and/or other similar personality rights; however, to + the extent possible, the Licensor waives and/or agrees not to + assert any such rights held by the Licensor to the limited + extent necessary to allow You to exercise the Licensed + Rights, but not otherwise. + . + 2. Patent and trademark rights are not licensed under this + Public License. + . + 3. 
To the extent possible, the Licensor waives any right to + collect royalties from You for the exercise of the Licensed + Rights, whether directly or through a collecting society + under any voluntary or waivable statutory or compulsory + licensing scheme. In all other cases the Licensor expressly + reserves any right to collect such royalties. + . + Section 3 -- License Conditions. + . + Your exercise of the Licensed Rights is expressly made subject to the + following conditions. + . + a. Attribution. + . + 1. If You Share the Licensed Material (including in modified + form), You must: + . + a. retain the following if it is supplied by the Licensor + with the Licensed Material: + . + i. identification of the creator(s) of the Licensed + Material and any others designated to receive + attribution, in any reasonable manner requested by + the Licensor (including by pseudonym if + designated); + . + ii. a copyright notice; + . + iii. a notice that refers to this Public License; + . + iv. a notice that refers to the disclaimer of + warranties; + . + v. a URI or hyperlink to the Licensed Material to the + extent reasonably practicable; + . + b. indicate if You modified the Licensed Material and + retain an indication of any previous modifications; and + . + c. indicate the Licensed Material is licensed under this + Public License, and include the text of, or the URI or + hyperlink to, this Public License. + . + 2. You may satisfy the conditions in Section 3(a)(1) in any + reasonable manner based on the medium, means, and context in + which You Share the Licensed Material. For example, it may be + reasonable to satisfy the conditions by providing a URI or + hyperlink to a resource that includes the required + information. + . + 3. If requested by the Licensor, You must remove any of the + information required by Section 3(a)(1)(A) to the extent + reasonably practicable. + . + 4. 
If You Share Adapted Material You produce, the Adapter's + License You apply must not prevent recipients of the Adapted + Material from complying with this Public License. + . + Section 4 -- Sui Generis Database Rights. + . + Where the Licensed Rights include Sui Generis Database Rights that + apply to Your use of the Licensed Material: + . + a. for the avoidance of doubt, Section 2(a)(1) grants You the right + to extract, reuse, reproduce, and Share all or a substantial + portion of the contents of the database; + . + b. if You include all or a substantial portion of the database + contents in a database in which You have Sui Generis Database + Rights, then the database in which You have Sui Generis Database + Rights (but not its individual contents) is Adapted Material; and + . + c. You must comply with the conditions in Section 3(a) if You Share + all or a substantial portion of the contents of the database. + . + For the avoidance of doubt, this Section 4 supplements and does not + replace Your obligations under this Public License where the Licensed + Rights include other Copyright and Similar Rights. + . + Section 5 -- Disclaimer of Warranties and Limitation of Liability. + . + a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE + EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS + AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF + ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, + IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, + WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR + PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, + ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT + KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT + ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. + . + b. 
TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE + TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, + NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, + INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, + COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR + USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN + ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR + DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR + IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. + . + c. The disclaimer of warranties and limitation of liability provided + above shall be interpreted in a manner that, to the extent + possible, most closely approximates an absolute disclaimer and + waiver of all liability. + . + Section 6 -- Term and Termination. + . + a. This Public License applies for the term of the Copyright and + Similar Rights licensed here. However, if You fail to comply with + this Public License, then Your rights under this Public License + terminate automatically. + . + b. Where Your right to use the Licensed Material has terminated under + Section 6(a), it reinstates: + . + 1. automatically as of the date the violation is cured, provided + it is cured within 30 days of Your discovery of the + violation; or + . + 2. upon express reinstatement by the Licensor. + . + For the avoidance of doubt, this Section 6(b) does not affect any + right the Licensor may have to seek remedies for Your violations + of this Public License. + . + c. For the avoidance of doubt, the Licensor may also offer the + Licensed Material under separate terms or conditions or stop + distributing the Licensed Material at any time; however, doing so + will not terminate this Public License. + . + d. Sections 1, 5, 6, 7, and 8 survive termination of this Public + License. + . + Section 7 -- Other Terms and Conditions. + . + a. 
The Licensor shall not be bound by any additional or different + terms or conditions communicated by You unless expressly agreed. + . + b. Any arrangements, understandings, or agreements regarding the + Licensed Material not stated herein are separate from and + independent of the terms and conditions of this Public License. + . + Section 8 -- Interpretation. + . + a. For the avoidance of doubt, this Public License does not, and + shall not be interpreted to, reduce, limit, restrict, or impose + conditions on any use of the Licensed Material that could lawfully + be made without permission under this Public License. + . + b. To the extent possible, if any provision of this Public License is + deemed unenforceable, it shall be automatically reformed to the + minimum extent necessary to make it enforceable. If the provision + cannot be reformed, it shall be severed from this Public License + without affecting the enforceability of the remaining terms and + conditions. + . + c. No term or condition of this Public License will be waived and no + failure to comply consented to unless expressly agreed to by the + Licensor. + . + d. Nothing in this Public License constitutes or may be interpreted + as a limitation upon, or waiver of, any privileges and immunities + that apply to the Licensor or You, including from the legal + processes of any jurisdiction or authority. + +License: GFDL-1.2+ + Permission is granted to copy, distribute and/or modify this document + under the terms of the GNU Free Documentation License, Version 1.2 or + any later version published by the Free Software Foundation; with no + Invariant Sections, no Front-Cover Texts, and no Back-Cover Texts. + . + On Debian systems, the complete text of the GNU Free Documentation + License version 1.2 can be found in + "/usr/share/common-licenses/GFDL-1.2". + +License: CC-BY-SA-3.0 + THE WORK (AS DEFINED BELOW) IS PROVIDED UNDER THE TERMS OF THIS + CREATIVE COMMONS PUBLIC LICENSE ("CCPL" OR "LICENSE"). 
THE WORK IS + PROTECTED BY COPYRIGHT AND/OR OTHER APPLICABLE LAW. ANY USE OF THE + WORK OTHER THAN AS AUTHORIZED UNDER THIS LICENSE OR COPYRIGHT LAW IS + PROHIBITED. + . + BY EXERCISING ANY RIGHTS TO THE WORK PROVIDED HERE, YOU ACCEPT AND + AGREE TO BE BOUND BY THE TERMS OF THIS LICENSE. TO THE EXTENT THIS + LICENSE MAY BE CONSIDERED TO BE A CONTRACT, THE LICENSOR GRANTS YOU + THE RIGHTS CONTAINED HERE IN CONSIDERATION OF YOUR ACCEPTANCE OF SUCH + TERMS AND CONDITIONS. + . + 1. Definitions + . + a. "Adaptation" means a work based upon the Work, or upon the Work and + other pre-existing works, such as a translation, adaptation, + derivative work, arrangement of music or other alterations of a + literary or artistic work, or phonogram or performance and includes + cinematographic adaptations or any other form in which the Work may be + recast, transformed, or adapted including in any form recognizably + derived from the original, except that a work that constitutes a + Collection will not be considered an Adaptation for the purpose of + this License. For the avoidance of doubt, where the Work is a musical + work, performance or phonogram, the synchronization of the Work in + timed-relation with a moving image ("synching") will be considered an + Adaptation for the purpose of this License. + . + b. "Collection" means a collection of literary or artistic works, such + as encyclopedias and anthologies, or performances, phonograms or + broadcasts, or other works or subject matter other than works listed + in Section 1(f) below, which, by reason of the selection and + arrangement of their contents, constitute intellectual creations, in + which the Work is included in its entirety in unmodified form along + with one or more other contributions, each constituting separate and + independent works in themselves, which together are assembled into a + collective whole. 
A work that constitutes a Collection will not be + considered an Adaptation (as defined below) for the purposes of this + License. + . + c. "Creative Commons Compatible License" means a license that is + listed at http://creativecommons.org/compatiblelicenses that has been + approved by Creative Commons as being essentially equivalent to this + License, including, at a minimum, because that license: (i) contains + terms that have the same purpose, meaning and effect as the License + Elements of this License; and, (ii) explicitly permits the relicensing + of adaptations of works made available under that license under this + License or a Creative Commons jurisdiction license with the same + License Elements as this License. + . + d. "Distribute" means to make available to the public the original and + copies of the Work or Adaptation, as appropriate, through sale or + other transfer of ownership. + . + e. "License Elements" means the following high-level license + attributes as selected by Licensor and indicated in the title of this + License: Attribution, ShareAlike. + . + f. "Licensor" means the individual, individuals, entity or entities + that offer(s) the Work under the terms of this License. + . + g. "Original Author" means, in the case of a literary or artistic + work, the individual, individuals, entity or entities who created the + Work or if no individual or entity can be identified, the publisher; + and in addition (i) in the case of a performance the actors, singers, + musicians, dancers, and other persons who act, sing, deliver, declaim, + play in, interpret or otherwise perform literary or artistic works or + expressions of folklore; (ii) in the case of a phonogram the producer + being the person or legal entity who first fixes the sounds of a + performance or other sounds; and, (iii) in the case of broadcasts, the + organization that transmits the broadcast. + . + h. 
"Work" means the literary and/or artistic work offered under the + terms of this License including without limitation any production in + the literary, scientific and artistic domain, whatever may be the mode + or form of its expression including digital form, such as a book, + pamphlet and other writing; a lecture, address, sermon or other work + of the same nature; a dramatic or dramatico-musical work; a + choreographic work or entertainment in dumb show; a musical + composition with or without words; a cinematographic work to which are + assimilated works expressed by a process analogous to cinematography; + a work of drawing, painting, architecture, sculpture, engraving or + lithography; a photographic work to which are assimilated works + expressed by a process analogous to photography; a work of applied + art; an illustration, map, plan, sketch or three-dimensional work + relative to geography, topography, architecture or science; a + performance; a broadcast; a phonogram; a compilation of data to the + extent it is protected as a copyrightable work; or a work performed by + a variety or circus performer to the extent it is not otherwise + considered a literary or artistic work. + . + i. "You" means an individual or entity exercising rights under this + License who has not previously violated the terms of this License with + respect to the Work, or who has received express permission from the + Licensor to exercise rights under this License despite a previous + violation. + . + j. 
"Publicly Perform" means to perform public recitations of the Work + and to communicate to the public those public recitations, by any + means or process, including by wire or wireless means or public + digital performances; to make available to the public Works in such a + way that members of the public may access these Works from a place and + at a place individually chosen by them; to perform the Work to the + public by any means or process and the communication to the public of + the performances of the Work, including by public digital performance; + to broadcast and rebroadcast the Work by any means including signs, + sounds or images. + . + k. "Reproduce" means to make copies of the Work by any means including + without limitation by sound or visual recordings and the right of + fixation and reproducing fixations of the Work, including storage of a + protected performance or phonogram in digital form or other electronic + medium. + . + 2. Fair Dealing Rights. Nothing in this License is intended to reduce, + limit, or restrict any uses free from copyright or rights arising from + limitations or exceptions that are provided for in connection with the + copyright protection under copyright law or other applicable laws. + . + 3. License Grant. Subject to the terms and conditions of this License, + Licensor hereby grants You a worldwide, royalty-free, non-exclusive, + perpetual (for the duration of the applicable copyright) license to + exercise the rights in the Work as stated below: + . + a. to Reproduce the Work, to incorporate the Work into one or more + Collections, and to Reproduce the Work as incorporated in the + Collections; + . + b. to create and Reproduce Adaptations provided that any such + Adaptation, including any translation in any medium, takes reasonable + steps to clearly label, demarcate or otherwise identify that changes + were made to the original Work. 
For example, a translation could be + marked "The original work was translated from English to Spanish," or + a modification could indicate "The original work has been modified."; + . + c. to Distribute and Publicly Perform the Work including as + incorporated in Collections; and, + . + d. to Distribute and Publicly Perform Adaptations. + . + e. For the avoidance of doubt: + . + i. Non-waivable Compulsory License Schemes. In those jurisdictions in + which the right to collect royalties through any statutory or + compulsory licensing scheme cannot be waived, the Licensor reserves + the exclusive right to collect such royalties for any exercise by You + of the rights granted under this License; + . + ii. Waivable Compulsory License Schemes. In those jurisdictions in + which the right to collect royalties through any statutory or + compulsory licensing scheme can be waived, the Licensor waives the + exclusive right to collect such royalties for any exercise by You of + the rights granted under this License; and, + . + iii. Voluntary License Schemes. The Licensor waives the right to + collect royalties, whether individually or, in the event that the + Licensor is a member of a collecting society that administers + voluntary licensing schemes, via that society, from any exercise by + You of the rights granted under this License. + . + The above rights may be exercised in all media and formats whether now + known or hereafter devised. The above rights include the right to make + such modifications as are technically necessary to exercise the rights + in other media and formats. Subject to Section 8(f), all rights not + expressly granted by Licensor are hereby reserved. + . + 4. Restrictions. The license granted in Section 3 above is expressly + made subject to and limited by the following restrictions: + . + a. You may Distribute or Publicly Perform the Work only under the + terms of this License. 
You must include a copy of, or the Uniform + Resource Identifier (URI) for, this License with every copy of the + Work You Distribute or Publicly Perform. You may not offer or impose + any terms on the Work that restrict the terms of this License or the + ability of the recipient of the Work to exercise the rights granted to + that recipient under the terms of the License. You may not sublicense + the Work. You must keep intact all notices that refer to this License + and to the disclaimer of warranties with every copy of the Work You + Distribute or Publicly Perform. When You Distribute or Publicly + Perform the Work, You may not impose any effective technological + measures on the Work that restrict the ability of a recipient of the + Work from You to exercise the rights granted to that recipient under + the terms of the License. This Section 4(a) applies to the Work as + incorporated in a Collection, but this does not require the Collection + apart from the Work itself to be made subject to the terms of this + License. If You create a Collection, upon notice from any Licensor You + must, to the extent practicable, remove from the Collection any credit + as required by Section 4(c), as requested. If You create an + Adaptation, upon notice from any Licensor You must, to the extent + practicable, remove from the Adaptation any credit as required by + Section 4(c), as requested. + . + b. You may Distribute or Publicly Perform an Adaptation only under the + terms of: (i) this License; (ii) a later version of this License with + the same License Elements as this License; (iii) a Creative Commons + jurisdiction license (either this or a later license version) that + contains the same License Elements as this License (e.g., + Attribution-ShareAlike 3.0 US)); (iv) a Creative Commons Compatible + License. If you license the Adaptation under one of the licenses + mentioned in (iv), you must comply with the terms of that license. 
If + you license the Adaptation under the terms of any of the licenses + mentioned in (i), (ii) or (iii) (the "Applicable License"), you must + comply with the terms of the Applicable License generally and the + following provisions: (I) You must include a copy of, or the URI for, + the Applicable License with every copy of each Adaptation You + Distribute or Publicly Perform; (II) You may not offer or impose any + terms on the Adaptation that restrict the terms of the Applicable + License or the ability of the recipient of the Adaptation to exercise + the rights granted to that recipient under the terms of the Applicable + License; (III) You must keep intact all notices that refer to the + Applicable License and to the disclaimer of warranties with every copy + of the Work as included in the Adaptation You Distribute or Publicly + Perform; (IV) when You Distribute or Publicly Perform the Adaptation, + You may not impose any effective technological measures on the + Adaptation that restrict the ability of a recipient of the Adaptation + from You to exercise the rights granted to that recipient under the + terms of the Applicable License. This Section 4(b) applies to the + Adaptation as incorporated in a Collection, but this does not require + the Collection apart from the Adaptation itself to be made subject to + the terms of the Applicable License. + . + c. 
If You Distribute, or Publicly Perform the Work or any Adaptations + or Collections, You must, unless a request has been made pursuant to + Section 4(a), keep intact all copyright notices for the Work and + provide, reasonable to the medium or means You are utilizing: (i) the + name of the Original Author (or pseudonym, if applicable) if supplied, + and/or if the Original Author and/or Licensor designate another party + or parties (e.g., a sponsor institute, publishing entity, journal) for + attribution ("Attribution Parties") in Licensor's copyright notice, + terms of service or by other reasonable means, the name of such party + or parties; (ii) the title of the Work if supplied; (iii) to the + extent reasonably practicable, the URI, if any, that Licensor + specifies to be associated with the Work, unless such URI does not + refer to the copyright notice or licensing information for the Work; + and (iv) , consistent with Ssection 3(b), in the case of an + Adaptation, a credit identifying the use of the Work in the Adaptation + (e.g., "French translation of the Work by Original Author," or + "Screenplay based on original Work by Original Author"). The credit + required by this Section 4(c) may be implemented in any reasonable + manner; provided, however, that in the case of a Adaptation or + Collection, at a minimum such credit will appear, if a credit for all + contributing authors of the Adaptation or Collection appears, then as + part of these credits and in a manner at least as prominent as the + credits for the other contributing authors. 
For the avoidance of + doubt, You may only use the credit required by this Section for the + purpose of attribution in the manner set out above and, by exercising + Your rights under this License, You may not implicitly or explicitly + assert or imply any connection with, sponsorship or endorsement by the + Original Author, Licensor and/or Attribution Parties, as appropriate, + of You or Your use of the Work, without the separate, express prior + written permission of the Original Author, Licensor and/or Attribution + Parties. + . + d. Except as otherwise agreed in writing by the Licensor or as may be + otherwise permitted by applicable law, if You Reproduce, Distribute or + Publicly Perform the Work either by itself or as part of any + Adaptations or Collections, You must not distort, mutilate, modify or + take other derogatory action in relation to the Work which would be + prejudicial to the Original Author's honor or reputation. Licensor + agrees that in those jurisdictions (e.g. Japan), in which any exercise + of the right granted in Section 3(b) of this License (the right to + make Adaptations) would be deemed to be a distortion, mutilation, + modification or other derogatory action prejudicial to the Original + Author's honor and reputation, the Licensor will waive or not assert, + as appropriate, this Section, to the fullest extent permitted by the + applicable national law, to enable You to reasonably exercise Your + right under Section 3(b) of this License (right to make Adaptations) + but not otherwise. + . + 5. Representations, Warranties and Disclaimer + . 
+ UNLESS OTHERWISE MUTUALLY AGREED TO BY THE PARTIES IN WRITING, + LICENSOR OFFERS THE WORK AS-IS AND MAKES NO REPRESENTATIONS OR + WARRANTIES OF ANY KIND CONCERNING THE WORK, EXPRESS, IMPLIED, + STATUTORY OR OTHERWISE, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF + TITLE, MERCHANTIBILITY, FITNESS FOR A PARTICULAR PURPOSE, + NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, ACCURACY, + OR THE PRESENCE OF ABSENCE OF ERRORS, WHETHER OR NOT + DISCOVERABLE. SOME JURISDICTIONS DO NOT ALLOW THE EXCLUSION OF IMPLIED + WARRANTIES, SO SUCH EXCLUSION MAY NOT APPLY TO YOU. + . + 6. Limitation on Liability. EXCEPT TO THE EXTENT REQUIRED BY + APPLICABLE LAW, IN NO EVENT WILL LICENSOR BE LIABLE TO YOU ON ANY + LEGAL THEORY FOR ANY SPECIAL, INCIDENTAL, CONSEQUENTIAL, PUNITIVE OR + EXEMPLARY DAMAGES ARISING OUT OF THIS LICENSE OR THE USE OF THE WORK, + EVEN IF LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. + . + 7. Termination + . + a. This License and the rights granted hereunder will terminate + automatically upon any breach by You of the terms of this + License. Individuals or entities who have received Adaptations or + Collections from You under this License, however, will not have their + licenses terminated provided such individuals or entities remain in + full compliance with those licenses. Sections 1, 2, 5, 6, 7, and 8 + will survive any termination of this License. + . + b. Subject to the above terms and conditions, the license granted here + is perpetual (for the duration of the applicable copyright in the + Work). Notwithstanding the above, Licensor reserves the right to + release the Work under different license terms or to stop distributing + the Work at any time; provided, however that any such election will + not serve to withdraw this License (or any other license that has + been, or is required to be, granted under the terms of this License), + and this License will continue in full force and effect unless + terminated as stated above. 
+ . + 8. Miscellaneous + . + a. Each time You Distribute or Publicly Perform the Work or a + Collection, the Licensor offers to the recipient a license to the Work + on the same terms and conditions as the license granted to You under + this License. + . + b. Each time You Distribute or Publicly Perform an Adaptation, + Licensor offers to the recipient a license to the original Work on the + same terms and conditions as the license granted to You under this + License. + . + c. If any provision of this License is invalid or unenforceable under + applicable law, it shall not affect the validity or enforceability of + the remainder of the terms of this License, and without further action + by the parties to this agreement, such provision shall be reformed to + the minimum extent necessary to make such provision valid and + enforceable. + . + d. No term or provision of this License shall be deemed waived and no + breach consented to unless such waiver or consent shall be in writing + and signed by the party to be charged with such waiver or consent. + . + e. This License constitutes the entire agreement between the parties + with respect to the Work licensed here. There are no understandings, + agreements or representations with respect to the Work not specified + here. Licensor shall not be bound by any additional provisions that + may appear in any communication from You. This License may not be + modified without the mutual written agreement of the Licensor and You. + . + f. The rights granted under, and the subject matter referenced, in + this License were drafted utilizing the terminology of the Berne + Convention for the Protection of Literary and Artistic Works (as + amended on September 28, 1979), the Rome Convention of 1961, the WIPO + Copyright Treaty of 1996, the WIPO Performances and Phonograms Treaty + of 1996 and the Universal Copyright Convention (as revised on July 24, + 1971). 
These rights and subject matter take effect in the relevant + jurisdiction in which the License terms are sought to be enforced + according to the corresponding provisions of the implementation of + those treaty provisions in the applicable national law. If the + standard suite of rights granted under applicable copyright law + includes additional rights not granted under this License, such + additional rights are deemed to be included in the License; this + License is not intended to restrict the license of any rights under + applicable law. + +License: CeCILL-2.0 + CeCILL FREE SOFTWARE LICENSE AGREEMENT + . + Notice + . + This Agreement is a Free Software license agreement that is the result + of discussions between its authors in order to ensure compliance with + the two main principles guiding its drafting: + . + firstly, compliance with the principles governing the distribution + of Free Software: access to source code, broad rights granted to + users, secondly, the election of a governing law, French law, with + which it is conformant, both as regards the law of torts and + intellectual property law, and the protection that it offers to + both authors and holders of the economic rights over software. + . + The authors of the CeCILL license are: + . + Commissariat à l'Energie Atomique - CEA, a public scientific, + technical and industrial research establishment, having its principal + place of business at 25 rue Leblanc, immeuble Le Ponant D, 75015 + Paris, France. + . + Centre National de la Recherche Scientifique - CNRS, a public + scientific and technological establishment, having its principal place + of business at 3 rue Michel-Ange, 75794 Paris cedex 16, France. + . + Institut National de Recherche en Informatique et en Automatique - + INRIA, a public scientific and technological establishment, having its + principal place of business at Domaine de Voluceau, Rocquencourt, BP + 105, 78153 Le Chesnay cedex, France. + . + Preamble + . 
+ The purpose of this Free Software license agreement is to grant users + the right to modify and redistribute the software governed by this + license within the framework of an open source distribution model. + . + The exercising of these rights is conditional upon certain obligations + for users so as to preserve this status for all subsequent + redistributions. + . + In consideration of access to the source code and the rights to copy, + modify and redistribute granted by the license, users are provided + only with a limited warranty and the software's author, the holder of + the economic rights, and the successive licensors only have limited + liability. + . + In this respect, the risks associated with loading, using, modifying + and/or developing or reproducing the software by the user are brought + to the user's attention, given its Free Software status, which may + make it complicated to use, with the result that its use is reserved + for developers and experienced professionals having in-depth computer + knowledge. Users are therefore encouraged to load and test the + suitability of the software as regards their requirements in + conditions enabling the security of their systems and/or data to be + ensured and, more generally, to use and operate it in the same + conditions of security. This Agreement may be freely reproduced and + published, provided it is not altered, and that no provisions are + either added or removed herefrom. + . + This Agreement may apply to any or all software for which the holder + of the economic rights decides to submit the use thereof to its + provisions. + . + Article 1 - DEFINITIONS + . + For the purpose of this Agreement, when the following expressions + commence with a capital letter, they shall have the following meaning: + . + Agreement: means this license agreement, and its possible subsequent + versions and annexes. + . 
+ Software: means the software in its Object Code and/or Source Code + form and, where applicable, its documentation, "as is" when the + Licensee accepts the Agreement. + . + Initial Software: means the Software in its Source Code and possibly + its Object Code form and, where applicable, its documentation, "as is" + when it is first distributed under the terms and conditions of the + Agreement. + . + Modified Software: means the Software modified by at least one + Contribution. + . + Source Code: means all the Software's instructions and program lines + to which access is required so as to modify the Software. + . + Object Code: means the binary files originating from the compilation + of the Source Code. + . + Holder: means the holder(s) of the economic rights over the Initial + Software. + . + Licensee: means the Software user(s) having accepted the Agreement. + . + Contributor: means a Licensee having made at least one Contribution. + . + Licensor: means the Holder, or any other individual or legal entity, + who distributes the Software under the Agreement. + . + Contribution: means any or all modifications, corrections, + translations, adaptations and/or new functions integrated into the + Software by any or all Contributors, as well as any or all Internal + Modules. + . + Module: means a set of sources files including their documentation + that enables supplementary functions or services in addition to those + offered by the Software. + . + External Module: means any or all Modules, not derived from the + Software, so that this Module and the Software run in separate address + spaces, with one calling the other when they are run. + . + Internal Module: means any or all Module, connected to the Software so + that they both execute in the same address space. + . + GNU GPL: means the GNU General Public License version 2 or any + subsequent version, as published by the Free Software Foundation Inc. + . + Parties: mean both the Licensee and the Licensor. + . 
+ These expressions may be used both in singular and plural form. + . + Article 2 - PURPOSE + . + The purpose of the Agreement is the grant by the Licensor to the + Licensee of a non-exclusive, transferable and worldwide license for + the Software as set forth in Article 5 hereinafter for the whole term + of the protection granted by the rights over said Software. + . + Article 3 - ACCEPTANCE + . + 3.1 The Licensee shall be deemed as having accepted the terms and + conditions of this Agreement upon the occurrence of the first of the + following events: + . + (i) loading the Software by any or all means, notably, by + downloading from a remote server, or by loading from a physical + medium; (ii) the first time the Licensee exercises any of the + rights granted hereunder. + . + 3.2 One copy of the Agreement, containing a notice relating to the + characteristics of the Software, to the limited warranty, and to the + fact that its use is restricted to experienced users has been provided + to the Licensee prior to its acceptance as set forth in Article 3.1 + hereinabove, and the Licensee hereby acknowledges that it has read and + understood it. + . + Article 4 - EFFECTIVE DATE AND TERM + . + 4.1 EFFECTIVE DATE + . + The Agreement shall become effective on the date when it is accepted + by the Licensee as set forth in Article 3.1. + . + 4.2 TERM + . + The Agreement shall remain in force for the entire legal term of + protection of the economic rights over the Software. + . + Article 5 - SCOPE OF RIGHTS GRANTED + . + The Licensor hereby grants to the Licensee, who accepts, the following + rights over the Software for any or all use, and for the term of the + Agreement, on the basis of the terms and conditions set forth + hereinafter. + . 
+ Besides, if the Licensor owns or comes to own one or more patents + protecting all or part of the functions of the Software or of its + components, the Licensor undertakes not to enforce the rights granted + by these patents against successive Licensees using, exploiting or + modifying the Software. If these patents are transferred, the Licensor + undertakes to have the transferees subscribe to the obligations set + forth in this paragraph. + . + 5.1 RIGHT OF USE + . + The Licensee is authorized to use the Software, without any limitation + as to its fields of application, with it being hereinafter specified + that this comprises: + . + permanent or temporary reproduction of all or part of the Software + by any or all means and in any or all form. + . + loading, displaying, running, or storing the Software on any or + all medium. + . + entitlement to observe, study or test its operation so as to + determine the ideas and principles behind any or all constituent + elements of said Software. This shall apply when the Licensee + carries out any or all loading, displaying, running, transmission + or storage operation as regards the Software, that it is entitled + to carry out hereunder. + . + 5.2 ENTITLEMENT TO MAKE CONTRIBUTIONS + . + The right to make Contributions includes the right to translate, + adapt, arrange, or make any or all modifications to the Software, and + the right to reproduce the resulting software. + . + The Licensee is authorized to make any or all Contributions to the + Software provided that it includes an explicit notice that it is the + author of said Contribution and indicates the date of the creation + thereof. + . + 5.3 RIGHT OF DISTRIBUTION + . 
+ In particular, the right of distribution includes the right to + publish, transmit and communicate the Software to the general public + on any or all medium, and by any or all means, and the right to + market, either in consideration of a fee, or free of charge, one or + more copies of the Software by any means. + . + The Licensee is further authorized to distribute copies of the + modified or unmodified Software to third parties according to the + terms and conditions set forth hereinafter. + . + 5.3.1 DISTRIBUTION OF SOFTWARE WITHOUT MODIFICATION + . + The Licensee is authorized to distribute true copies of the Software + in Source Code or Object Code form, provided that said distribution + complies with all the provisions of the Agreement and is accompanied + by: + . + a copy of the Agreement, + . + a notice relating to the limitation of both the Licensor's + warranty and liability as set forth in Articles 8 and 9, + . + and that, in the event that only the Object Code of the Software is + redistributed, the Licensee allows future Licensees unhindered access + to the full Source Code of the Software by indicating how to access + it, it being understood that the additional cost of acquiring the + Source Code shall not exceed the cost of transferring the data. + . + 5.3.2 DISTRIBUTION OF MODIFIED SOFTWARE + . + When the Licensee makes a Contribution to the Software, the terms and + conditions for the distribution of the resulting Modified Software + become subject to all the provisions of this Agreement. + . + The Licensee is authorized to distribute the Modified Software, in + source code or object code form, provided that said distribution + complies with all the provisions of the Agreement and is accompanied + by: + . + a copy of the Agreement, + . + a notice relating to the limitation of both the Licensor's + warranty and liability as set forth in Articles 8 and 9, + . 
+ and that, in the event that only the object code of the Modified + Software is redistributed, the Licensee allows future Licensees + unhindered access to the full source code of the Modified Software by + indicating how to access it, it being understood that the additional + cost of acquiring the source code shall not exceed the cost of + transferring the data. + . + 5.3.3 DISTRIBUTION OF EXTERNAL MODULES + . + When the Licensee has developed an External Module, the terms and + conditions of this Agreement do not apply to said External Module, + that may be distributed under a separate license agreement. + . + 5.3.4 COMPATIBILITY WITH THE GNU GPL + . + The Licensee can include a code that is subject to the provisions of + one of the versions of the GNU GPL in the Modified or unmodified + Software, and distribute that entire code under the terms of the same + version of the GNU GPL. + . + The Licensee can include the Modified or unmodified Software in a code + that is subject to the provisions of one of the versions of the GNU + GPL, and distribute that entire code under the terms of the same + version of the GNU GPL. + . + Article 6 - INTELLECTUAL PROPERTY + . + 6.1 OVER THE INITIAL SOFTWARE + . + The Holder owns the economic rights over the Initial Software. Any or + all use of the Initial Software is subject to compliance with the + terms and conditions under which the Holder has elected to distribute + its work and no one shall be entitled to modify the terms and + conditions for the distribution of said Initial Software. + . + The Holder undertakes that the Initial Software will remain ruled at + least by this Agreement, for the duration set forth in Article 4.2. + . + 6.2 OVER THE CONTRIBUTIONS + . + The Licensee who develops a Contribution is the owner of the + intellectual property rights over this Contribution as defined by + applicable law. + . + 6.3 OVER THE EXTERNAL MODULES + . 
+ The Licensee who develops an External Module is the owner of the + intellectual property rights over this External Module as defined by + applicable law and is free to choose the type of agreement that shall + govern its distribution. + . + 6.4 JOINT PROVISIONS + . + The Licensee expressly undertakes: + . + not to remove, or modify, in any manner, the intellectual property + notices attached to the Software; + . + to reproduce said notices, in an identical manner, in the copies + of the Software modified or not. + . + The Licensee undertakes not to directly or indirectly infringe the + intellectual property rights of the Holder and/or Contributors on the + Software and to take, where applicable, vis-à-vis its staff, any and + all measures required to ensure respect of said intellectual property + rights of the Holder and/or Contributors. + . + Article 7 - RELATED SERVICES + . + 7.1 Under no circumstances shall the Agreement oblige the Licensor to + provide technical assistance or maintenance services for the Software. + . + However, the Licensor is entitled to offer this type of services. The + terms and conditions of such technical assistance, and/or such + maintenance, shall be set forth in a separate instrument. Only the + Licensor offering said maintenance and/or technical assistance + services shall incur liability therefor. + . + 7.2 Similarly, any Licensor is entitled to offer to its licensees, + under its sole responsibility, a warranty, that shall only be binding + upon itself, for the redistribution of the Software and/or the + Modified Software, under terms and conditions that it is free to + decide. Said warranty, and the financial terms and conditions of its + application, shall be subject of a separate instrument executed + between the Licensor and the Licensee. + . + Article 8 - LIABILITY + . 
+ 8.1 Subject to the provisions of Article 8.2, the Licensee shall be + entitled to claim compensation for any direct loss it may have + suffered from the Software as a result of a fault on the part of the + relevant Licensor, subject to providing evidence thereof. + . + 8.2 The Licensor's liability is limited to the commitments made under + this Agreement and shall not be incurred as a result of in particular: + (i) loss due the Licensee's total or partial failure to fulfill its + obligations, (ii) direct or consequential loss that is suffered by the + Licensee due to the use or performance of the Software, and (iii) more + generally, any consequential loss. In particular the Parties expressly + agree that any or all pecuniary or business loss (i.e. loss of data, + loss of profits, operating loss, loss of customers or orders, + opportunity cost, any disturbance to business activities) or any or + all legal proceedings instituted against the Licensee by a third + party, shall constitute consequential loss and shall not provide + entitlement to any or all compensation from the Licensor. + . + Article 9 - WARRANTY + . + 9.1 The Licensee acknowledges that the scientific and technical + state-of-the-art when the Software was distributed did not enable all + possible uses to be tested and verified, nor for the presence of + possible defects to be detected. In this respect, the Licensee's + attention has been drawn to the risks associated with loading, using, + modifying and/or developing and reproducing the Software which are + reserved for experienced users. + . + The Licensee shall be responsible for verifying, by any or all means, + the suitability of the product for its requirements, its good working + order, and for ensuring that it shall not cause damage to either + persons or properties. + . 
+ 9.2 The Licensor hereby represents, in good faith, that it is entitled + to grant all the rights over the Software (including in particular the + rights set forth in Article 5). + . + 9.3 The Licensee acknowledges that the Software is supplied "as is" by + the Licensor without any other express or tacit warranty, other than + that provided for in Article 9.2 and, in particular, without any + warranty as to its commercial value, its secured, safe, innovative or + relevant nature. + . + Specifically, the Licensor does not warrant that the Software is free + from any error, that it will operate without interruption, that it + will be compatible with the Licensee's own equipment and software + configuration, nor that it will meet the Licensee's requirements. + . + 9.4 The Licensor does not either expressly or tacitly warrant that the + Software does not infringe any third party intellectual property right + relating to a patent, software or any other property right. Therefore, + the Licensor disclaims any and all liability towards the Licensee + arising out of any or all proceedings for infringement that may be + instituted in respect of the use, modification and redistribution of + the Software. Nevertheless, should such proceedings be instituted + against the Licensee, the Licensor shall provide it with technical and + legal assistance for its defense. Such technical and legal assistance + shall be decided on a case-by-case basis between the relevant Licensor + and the Licensee pursuant to a memorandum of understanding. The + Licensor disclaims any and all liability as regards the Licensee's use + of the name of the Software. No warranty is given as regards the + existence of prior rights over the name of the Software or as regards + the existence of a trademark. + . + Article 10 - TERMINATION + . 
+ 10.1 In the event of a breach by the Licensee of its obligations + hereunder, the Licensor may automatically terminate this Agreement + thirty (30) days after notice has been sent to the Licensee and has + remained ineffective. + . + 10.2 A Licensee whose Agreement is terminated shall no longer be + authorized to use, modify or distribute the Software. However, any + licenses that it may have granted prior to termination of the + Agreement shall remain valid subject to their having been granted in + compliance with the terms and conditions hereof. + . + Article 11 - MISCELLANEOUS + . + 11.1 EXCUSABLE EVENTS + . + Neither Party shall be liable for any or all delay, or failure to + perform the Agreement, that may be attributable to an event of force + majeure, an act of God or an outside cause, such as defective + functioning or interruptions of the electricity or telecommunications + networks, network paralysis following a virus attack, intervention by + government authorities, natural disasters, water damage, earthquakes, + fire, explosions, strikes and labor unrest, war, etc. + . + 11.2 Any failure by either Party, on one or more occasions, to invoke + one or more of the provisions hereof, shall under no circumstances be + interpreted as being a waiver by the interested Party of its right to + invoke said provision(s) subsequently. + . + 11.3 The Agreement cancels and replaces any or all previous + agreements, whether written or oral, between the Parties and having + the same purpose, and constitutes the entirety of the agreement + between said Parties concerning said purpose. No supplement or + modification to the terms and conditions hereof shall be effective as + between the Parties unless it is made in writing and signed by their + duly authorized representatives. + . 
+ 11.4 In the event that one or more of the provisions hereof were to + conflict with a current or future applicable act or legislative text, + said act or legislative text shall prevail, and the Parties shall make + the necessary amendments so as to comply with said act or legislative + text. All other provisions shall remain effective. Similarly, + invalidity of a provision of the Agreement, for any reason whatsoever, + shall not cause the Agreement as a whole to be invalid. + . + 11.5 LANGUAGE + . + The Agreement is drafted in both French and English and both versions + are deemed authentic. + . + Article 12 - NEW VERSIONS OF THE AGREEMENT + . + 12.1 Any person is authorized to duplicate and distribute copies of + this Agreement. + . + 12.2 So as to ensure coherence, the wording of this Agreement is + protected and may only be modified by the authors of the License, who + reserve the right to periodically publish updates or new versions of + the Agreement, each with a separate number. These subsequent versions + may address new issues encountered by Free Software. + . + 12.3 Any Software distributed under a given version of the Agreement + may only be subsequently distributed under the same version of the + Agreement or a subsequent version, subject to the provisions of + Article 5.3.4. + . + Article 13 - GOVERNING LAW AND JURISDICTION + . + 13.1 The Agreement is governed by French law. The Parties agree to + endeavor to seek an amicable solution to any disagreements or disputes + that may arise during the performance of the Agreement. + . + 13.2 Failing an amicable solution within two (2) months as from their + occurrence, and unless emergency proceedings are necessary, the + disagreements or disputes shall be referred to the Paris Courts having + jurisdiction, by the more diligent Party. + . + CeCILL stands for Ce(a) C(nrs) I(nria) L(ogiciel) L(ibre) + . + Version 2.0 dated 2006-09-05. 
+ +License: CC-BY-SA-2.0-FR + This file is licensed under the Creative Commons Attribution-Share + Alike 2.0 France license. + . + You are free to: + . + • Share — copy and redistribute the material in any medium or format + • Adapt — remix, transform, and build upon the material for any + purpose, even commercially. + . + Under the following terms: + . + • Attribution — You must give appropriate credit, provide a link to + the license, and indicate if changes were made. You may do so in + any reasonable manner, but not in any way that suggests the + licensor endorses you or your use. + • ShareAlike — If you remix, transform, or build upon the material, + you must distribute your contributions under the same license as + the original. + • No additional restrictions — You may not apply legal terms or + technological measures that legally restrict others from doing + anything the license permits. + . + ---- Full license text follows ---- + . + [Creative Commons Legal Code] + . + Paternité - Partage Des Conditions Initiales A l'Identique 2.0 + . + Creative Commons n'est pas un cabinet d'avocats et ne fournit pas de + services de conseil juridique. La distribution de la présente version + de ce contrat ne crée aucune relation juridique entre les parties au + contrat présenté ci-après et Creative Commons. Creative Commons + fournit cette offre de contrat-type en l'état, à seule fin + d'information. Creative Commons ne saurait être tenu responsable des + éventuels préjudices résultant du contenu ou de l'utilisation de ce + contrat. + . + Contrat + . + L'Oeuvre (telle que définie ci-dessous) est mise à disposition selon + les termes du présent contrat appelé Contrat Public Creative Commons + (dénommé ici « CPCC » ou « Contrat »). L'Oeuvre est protégée par le + droit de la propriété littéraire et artistique (droit d'auteur, droits + voisins, droits des producteurs de bases de données) ou toute autre + loi applicable. 
Toute utilisation de l'Oeuvre autrement + qu'explicitement autorisée selon ce Contrat ou le droit applicable est + interdite. + . + L'exercice sur l'Oeuvre de tout droit proposé par le présent contrat + vaut acceptation de celui-ci. Selon les termes et les obligations du + présent contrat, la partie Offrante propose à la partie Acceptante + l'exercice de certains droits présentés ci-après, et l'Acceptant en + approuve les termes et conditions d'utilisation. + . + 1. Définitions + . + « Oeuvre » : oeuvre de l'esprit protégeable par le droit de la + propriété littéraire et artistique ou toute loi applicable et qui + est mise à disposition selon les termes du présent Contrat. « + Oeuvre dite Collective » : une oeuvre dans laquelle l'oeuvre, dans + sa forme intégrale et non modifiée, est assemblée en un ensemble + collectif avec d'autres contributions qui constituent en + elles-mêmes des oeuvres séparées et indépendantes. Constituent + notamment des Oeuvres dites Collectives les publications + périodiques, les anthologies ou les encyclopédies. Aux termes de + la présente autorisation, une oeuvre qui constitue une Oeuvre dite + Collective ne sera pas considérée comme une Oeuvre dite Dérivée + (telle que définie ci-après). « Oeuvre dite Dérivée » : une + oeuvre créée soit à partir de l'Oeuvre seule, soit à partir de + l'Oeuvre et d'autres oeuvres préexistantes. Constituent notamment + des Oeuvres dites Dérivées les traductions, les arrangements + musicaux, les adaptations théâtrales, littéraires ou + cinématographiques, les enregistrements sonores, les reproductions + par un art ou un procédé quelconque, les résumés, ou toute autre + forme sous laquelle l'Oeuvre puisse être remaniée, modifiée, + transformée ou adaptée, à l'exception d'une oeuvre qui constitue + une Oeuvre dite Collective. Une Oeuvre dite Collective ne sera pas + considérée comme une Oeuvre dite Dérivée aux termes du présent + Contrat. 
Dans le cas où l'Oeuvre serait une composition musicale + ou un enregistrement sonore, la synchronisation de l'oeuvre avec + une image animée sera considérée comme une Oeuvre dite Dérivée + pour les propos de ce Contrat. « Auteur original » : la ou les + personnes physiques qui ont créé l'Oeuvre. « Offrant » : la ou + les personne(s) physique(s) ou morale(s) qui proposent la mise à + disposition de l'Oeuvre selon les termes du présent Contrat. « + Acceptant » : la personne physique ou morale qui accepte le + présent contrat et exerce des droits sans en avoir violé les + termes au préalable ou qui a reçu l'autorisation expresse de + l'Offrant d'exercer des droits dans le cadre du présent contrat + malgré une précédente violation de ce contrat. « Options du + Contrat » : les attributs génériques du Contrat tels qu'ils ont + été choisis par l'Offrant et indiqués dans le titre de ce Contrat + : Paternité - Pas d'Utilisation Commerciale - Partage Des + Conditions Initiales A l'Identique. + . + 2. Exceptions aux droits exclusifs. Aucune disposition de ce contrat + n'a pour intention de réduire, limiter ou restreindre les prérogatives + issues des exceptions aux droits, de l'épuisement des droits ou + d'autres limitations aux droits exclusifs des ayants droit selon le + droit de la propriété littéraire et artistique ou les autres lois + applicables. + . + 3. Autorisation. Soumis aux termes et conditions définis dans cette + autorisation, et ceci pendant toute la durée de protection de l'Oeuvre + par le droit de la propriété littéraire et artistique ou le droit + applicable, l'Offrant accorde à l'Acceptant l'autorisation mondiale + d'exercer à titre gratuit et non exclusif les droits suivants : + . 
+ reproduire l'Oeuvre, incorporer l'Oeuvre dans une ou plusieurs + Oeuvres dites Collectives et reproduire l'Oeuvre telle + qu'incorporée dans lesdites Oeuvres dites Collectives; créer et + reproduire des Oeuvres dites Dérivées; distribuer des exemplaires + ou enregistrements, présenter, représenter ou communiquer l'Oeuvre + au public par tout procédé technique, y compris incorporée dans + des Oeuvres Collectives; distribuer des exemplaires ou + phonogrammes, présenter, représenter ou communiquer au public des + Oeuvres dites Dérivées par tout procédé technique; lorsque + l'Oeuvre est une base de données, extraire et réutiliser des + parties substantielles de l'Oeuvre. + . + Les droits mentionnés ci-dessus peuvent être exercés sur tous les + supports, médias, procédés techniques et formats. Les droits ci-dessus + incluent le droit d'effectuer les modifications nécessaires + techniquement à l'exercice des droits dans d'autres formats et + procédés techniques. L'exercice de tous les droits qui ne sont pas + expressément autorisés par l'Offrant ou dont il n'aurait pas la + gestion demeure réservé, notamment les mécanismes de gestion + collective obligatoire applicables décrits à l'article 4(d). + . + 4. Restrictions. L'autorisation accordée par l'article 3 est + expressément assujettie et limitée par le respect des restrictions + suivantes : + . + L'Acceptant peut reproduire, distribuer, représenter ou + communiquer au public l'Oeuvre y compris par voie numérique + uniquement selon les termes de ce Contrat. L'Acceptant doit + inclure une copie ou l'adresse Internet (Identifiant Uniforme de + Ressource) du présent Contrat à toute reproduction ou + enregistrement de l'Oeuvre que l'Acceptant distribue, représente + ou communique au public y compris par voie numérique. 
L'Acceptant + ne peut pas offrir ou imposer de conditions d'utilisation de + l'Oeuvre qui altèrent ou restreignent les termes du présent + Contrat ou l'exercice des droits qui y sont accordés au + bénéficiaire. L'Acceptant ne peut pas céder de droits sur + l'Oeuvre. L'Acceptant doit conserver intactes toutes les + informations qui renvoient à ce Contrat et à l'exonération de + responsabilité. L'Acceptant ne peut pas reproduire, distribuer, + représenter ou communiquer au public l'Oeuvre, y compris par voie + numérique, en utilisant une mesure technique de contrôle d'accès + ou de contrôle d'utilisation qui serait contradictoire avec les + termes de cet Accord contractuel. Les mentions ci-dessus + s'appliquent à l'Oeuvre telle qu'incorporée dans une Oeuvre dite + Collective, mais, en dehors de l'Oeuvre en elle-même, ne + soumettent pas l'Oeuvre dite Collective, aux termes du présent + Contrat. Si l'Acceptant crée une Oeuvre dite Collective, à la + demande de tout Offrant, il devra, dans la mesure du possible, + retirer de l'Oeuvre dite Collective toute référence au dit + Offrant, comme demandé. Si l'Acceptant crée une Oeuvre dite + Collective, à la demande de tout Auteur, il devra, dans la mesure + du possible, retirer de l'Oeuvre dite Collective toute référence + au dit Auteur, comme demandé. Si l'Acceptant crée une Oeuvre dite + Dérivée, à la demande de tout Offrant, il devra, dans la mesure du + possible, retirer de l'Oeuvre dite Dérivée toute référence au dit + Offrant, comme demandé. Si l'Acceptant crée une Oeuvre dite + Dérivée, à la demande de tout Auteur, il devra, dans la mesure du + possible, retirer de l'Oeuvre dite Dérivée toute référence au dit + Auteur, comme demandé. 
L'Acceptant peut reproduire, distribuer, + représenter ou communiquer au public une Oeuvre dite Dérivée y + compris par voie numérique uniquement sous les termes de ce + Contrat, ou d'une version ultérieure de ce Contrat comprenant les + mêmes Options du Contrat que le présent Contrat, ou un Contrat + Creative Commons iCommons comprenant les mêmes Options du Contrat + que le présent Contrat (par exemple Paternité - Pas d'Utilisation + Commerciale - Partage Des Conditions Initiales A l'Identique 2.0 + Japon). L'Acceptant doit inclure une copie ou l'adresse Internet + (Identifiant Uniforme de Ressource) du présent Contrat, ou d'un + autre Contrat tel que décrit à la phrase précédente, à toute + reproduction ou enregistrement de l'Oeuvre dite Dérivée que + l'Acceptant distribue, représente ou communique au public y + compris par voie numérique. L'Acceptant ne peut pas offrir ou + imposer de conditions d'utilisation sur l'Oeuvre dite Dérivée qui + altèrent ou restreignent les termes du présent Contrat ou + l'exercice des droits qui y sont accordés au bénéficiaire, et doit + conserver intactes toutes les informations qui renvoient à ce + Contrat et à l'avertissement sur les garanties. L'Acceptant ne + peut pas reproduire, distribuer, représenter ou communiquer au + public y compris par voie numérique l'Oeuvre dite Dérivée en + utilisant une mesure technique de contrôle d'accès ou de contrôle + d'utilisation qui serait contradictoire avec les termes de cet + Accord contractuel. Les mentions ci-dessus s'appliquent à l'Oeuvre + dite Dérivée telle qu'incorporée dans une Oeuvre dite Collective, + mais, en dehors de l'Oeuvre dite Dérivée en elle-même, ne + soumettent pas l'Oeuvre Collective, aux termes du présent Contrat. 
+ Si l'Acceptant reproduit, distribue, représente ou communique au + public, y compris par voie numérique, l'Oeuvre ou toute Oeuvre + dite Dérivée ou toute Oeuvre dite Collective, il doit conserver + intactes toutes les informations sur le régime des droits et en + attribuer la paternité à l'Auteur Original, de manière raisonnable + au regard au médium ou au moyen utilisé. Il doit communiquer le + nom de l'Auteur Original ou son éventuel pseudonyme s'il est + indiqué ; le titre de l'Oeuvre Originale s'il est indiqué ; dans + la mesure du possible, l'adresse Internet ou Identifiant Uniforme + de Ressource (URI), s'il existe, spécifié par l'Offrant comme + associé à l'Oeuvre, à moins que cette adresse ne renvoie pas aux + informations légales (paternité et conditions d'utilisation de + l'Oeuvre). Dans le cas d'une Oeuvre dite Dérivée, il doit indiquer + les éléments identifiant l'utilisation l'Oeuvre dans l'Oeuvre dite + Dérivée par exemple « Traduction anglaise de l'Oeuvre par l'Auteur + Original » ou « Scénario basé sur l'Oeuvre par l'Auteur Original + ». Ces obligations d'attribution de paternité doivent être + exécutées de manière raisonnable. Cependant, dans le cas d'une + Oeuvre dite Dérivée ou d'une Oeuvre dite Collective, ces + informations doivent, au minimum, apparaître à la place et de + manière aussi visible que celles à laquelle apparaissent les + informations de même nature. Dans le cas où une utilisation de + l'Oeuvre serait soumise à un régime légal de gestion collective + obligatoire, l'Offrant se réserve le droit exclusif de collecter + ces redevances par l'intermédiaire de la société de perception et + de répartition des droits compétente. Sont notamment concernés la + radiodiffusion et la communication dans un lieu public de + phonogrammes publiés à des fins de commerce, certains cas de + retransmission par câble et satellite, la copie privée d'Oeuvres + fixées sur phonogrammes ou vidéogrammes, la reproduction par + reprographie. + . + 5. 
Garantie et exonération de responsabilité + . + En mettant l'Oeuvre à la disposition du public selon les termes de + ce Contrat, l'Offrant déclare de bonne foi qu'à sa + connaissance et dans les limites d'une enquête raisonnable : + L'Offrant a obtenu tous les droits sur l'Oeuvre nécessaires + pour pouvoir autoriser l'exercice des droits accordés par le + présent Contrat, et permettre la jouissance paisible et + l'exercice licite de ces droits, ceci sans que l'Acceptant + n'ait aucune obligation de verser de rémunération ou tout + autre paiement ou droits, dans la limite des mécanismes de + gestion collective obligatoire applicables décrits à l'article + 4(e); L'Oeuvre n'est constitutive ni d'une violation des + droits de tiers, notamment du droit de la propriété littéraire + et artistique, du droit des marques, du droit de + l'information, du droit civil ou de tout autre droit, ni de + diffamation, de violation de la vie privée ou de tout autre + préjudice délictuel à l'égard de toute tierce partie. A + l'exception des situations expressément mentionnées dans le + présent Contrat ou dans un autre accord écrit, ou exigées par + la loi applicable, l'Oeuvre est mise à disposition en l'état + sans garantie d'aucune sorte, qu'elle soit expresse ou tacite, + y compris à l'égard du contenu ou de l'exactitude de l'Oeuvre. + . + 6. Limitation de responsabilité. A l'exception des garanties d'ordre + public imposées par la loi applicable et des réparations imposées par + le régime de la responsabilité vis-à-vis d'un tiers en raison de la + violation des garanties prévues par l'article 5 du présent contrat, + l'Offrant ne sera en aucun cas tenu responsable vis-à-vis de + l'Acceptant, sur la base d'aucune théorie légale ni en raison d'aucun + préjudice direct, indirect, matériel ou moral, résultant de + l'exécution du présent Contrat ou de l'utilisation de l'Oeuvre, y + compris dans l'hypothèse où l'Offrant avait connaissance de la + possible existence d'un tel préjudice. + . 
+ 7. Résiliation + . + Tout manquement aux termes du contrat par l'Acceptant entraîne la + résiliation automatique du Contrat et la fin des droits qui en + découlent. Cependant, le contrat conserve ses effets envers les + personnes physiques ou morales qui ont reçu de la part de + l'Acceptant, en exécution du présent contrat, la mise à + disposition d'Oeuvres dites Dérivées, ou d'Oeuvres dites + Collectives, ceci tant qu'elles respectent pleinement leurs + obligations. Les sections 1, 2, 5, 6 et 7 du contrat continuent à + s'appliquer après la résiliation de celui-ci. Dans les limites + indiquées ci-dessus, le présent Contrat s'applique pendant toute + la durée de protection de l'Oeuvre selon le droit + applicable. Néanmoins, l'Offrant se réserve à tout moment le droit + d'exploiter l'Oeuvre sous des conditions contractuelles + différentes, ou d'en cesser la diffusion; cependant, le recours à + cette option ne doit pas conduire à retirer les effets du présent + Contrat (ou de tout contrat qui a été ou doit être accordé selon + les termes de ce Contrat), et ce Contrat continuera à s'appliquer + dans tous ses effets jusqu'à ce que sa résiliation intervienne + dans les conditions décrites ci-dessus. + . + 8. Divers + . + A chaque reproduction ou communication au public par voie + numérique de l'Oeuvre ou d'une Oeuvre dite Collective par + l'Acceptant, l'Offrant propose au bénéficiaire une offre de mise à + disposition de l'Oeuvre dans des termes et conditions identiques à + ceux accordés à la partie Acceptante dans le présent Contrat. A + chaque reproduction ou communication au public par voie numérique + d'une Oeuvre dite Dérivée par l'Acceptant, l'Offrant propose au + bénéficiaire une offre de mise à disposition du bénéficiaire de + l'Oeuvre originale dans des termes et conditions identiques à ceux + accordés à la partie Acceptante dans le présent Contrat. 
La + nullité ou l'inapplicabilité d'une quelconque disposition de ce + Contrat au regard de la loi applicable n'affecte pas celle des + autres dispositions qui resteront pleinement valides et + applicables. Sans action additionnelle par les parties à cet + accord, lesdites dispositions devront être interprétées dans la + mesure minimum nécessaire à leur validité et leur applicabilité. + Aucune limite, renonciation ou modification des termes ou + dispositions du présent Contrat ne pourra être acceptée sans le + consentement écrit et signé de la partie compétente. Ce Contrat + constitue le seul accord entre les parties à propos de l'Oeuvre + mise ici à disposition. Il n'existe aucun élément annexe, accord + supplémentaire ou mandat portant sur cette Oeuvre en dehors des + éléments mentionnés ici. L'Offrant ne sera tenu par aucune + disposition supplémentaire qui pourrait apparaître dans une + quelconque communication en provenance de l'Acceptant. Ce Contrat + ne peut être modifié sans l'accord mutuel écrit de l'Offrant et de + l'Acceptant. Le droit applicable est le droit français. + . + Creative Commons n'est pas partie à ce Contrat et n'offre aucune forme + de garantie relative à l'Oeuvre. Creative Commons décline toute + responsabilité à l'égard de l'Acceptant ou de toute autre partie, quel + que soit le fondement légal de cette responsabilité et quel que soit + le préjudice subi, direct, indirect, matériel ou moral, qui + surviendrait en rapport avec le présent Contrat. Cependant, si + Creative Commons s'est expressément identifié comme Offrant pour + mettre une Oeuvre à disposition selon les termes de ce Contrat, + Creative Commons jouira de tous les droits et obligations d'un + Offrant. + . + A l'exception des fins limitées à informer le public que l'Oeuvre est + mise à disposition sous CPCC, aucune des parties n'utilisera la marque + « Creative Commons » ou toute autre indication ou logo afférent sans + le consentement préalable écrit de Creative Commons. 
Toute utilisation + autorisée devra être effectuée en conformité avec les lignes + directrices de Creative Commons à jour au moment de l'utilisation, + telles qu'elles sont disponibles sur son site Internet ou sur simple + demande. + . + Creative Commons peut être contacté à https://creativecommons.org/. diff --git a/missing-sources/Esox_lucius1.jpg b/missing-sources/Esox_lucius1.jpg new file mode 100644 index 0000000..094ac6b Binary files /dev/null and b/missing-sources/Esox_lucius1.jpg differ diff --git a/missing-sources/LinnSequencer_hardware_MIDI_sequencer_brochure_page_2_300dpi.jpg b/missing-sources/LinnSequencer_hardware_MIDI_sequencer_brochure_page_2_300dpi.jpg new file mode 100644 index 0000000..59d6240 Binary files /dev/null and b/missing-sources/LinnSequencer_hardware_MIDI_sequencer_brochure_page_2_300dpi.jpg differ diff --git a/patches/disable-test_docinfo_problems.patch b/patches/disable-test_docinfo_problems.patch new file mode 100644 index 0000000..a69ca9a --- /dev/null +++ b/patches/disable-test_docinfo_problems.patch @@ -0,0 +1,59 @@ +From: Sean Whitton +Date: Thu, 10 Jan 2019 08:32:44 -0700 +Subject: disable test_docinfo_problems + +Needs a test resource whose DFSG status is in doubt. 
+--- + tests/test_metadata.py | 17 ----------------- + 1 file changed, 17 deletions(-) + +--- a/tests/test_metadata.py ++++ b/tests/test_metadata.py +@@ -50,12 +50,6 @@ def trivial(resources): + + + @pytest.fixture +-def enron1(resources): +- # Has nuls in docinfo, old PDF +- return Pdf.open(resources / 'enron1_gs.pdf') +- +- +-@pytest.fixture + def invalid_creationdate(resources): + # Has nuls in docinfo, old PDF + return Pdf.open(resources / 'invalid_creationdate.pdf') +@@ -320,34 +314,6 @@ def test_remove_attribute_metadata(sandw + + # Ensure the whole node was deleted + assert not re.search(r'rdf:Description xmlns:[^\s]+ rdf:about=""/', str(xmp)) +- +- +-def test_docinfo_problems(enron1, invalid_creationdate): +- meta = enron1.open_metadata() +- meta._load() # File has invalid XML sequence � +- with meta: +- with pytest.warns(UserWarning) as warned: +- meta.load_from_docinfo(invalid_creationdate.docinfo) +- assert 'could not be copied' in warned[0].message.args[0] +- with pytest.raises(ValueError): +- meta.load_from_docinfo(invalid_creationdate.docinfo, raise_failure=True) +- +- with pytest.warns(UserWarning) as warned: +- with meta as xmp: +- xmp['xmp:CreateDate'] = 'invalid date' +- assert 'could not be updated' in warned[0].message.args[0] +- +- +-def test_wrong_xml(enron1): +- enron1.Root.Metadata = Stream(enron1, b""" +- This is valid xml but not valid XMP +- """.strip()) +- meta = enron1.open_metadata() +- with pytest.raises(ValueError, message='not XMP'): +- with meta: +- pass +- with pytest.raises(ValueError, message='not XMP'): +- meta['pdfaid:part'] + + + def test_no_x_xmpmeta(trivial): diff --git a/patches/docs-build-use-DEB_VERSION_UPSTREAM.patch b/patches/docs-build-use-DEB_VERSION_UPSTREAM.patch new file mode 100644 index 0000000..e440dca --- /dev/null +++ b/patches/docs-build-use-DEB_VERSION_UPSTREAM.patch @@ -0,0 +1,28 @@ +From: Sean Whitton +Date: Sat, 18 Aug 2018 09:37:58 -0700 +Subject: docs build use DEB_VERSION_UPSTREAM + +--- + 
docs/conf.py | 4 +--- + 1 file changed, 1 insertion(+), 3 deletions(-) + +--- a/docs/conf.py ++++ b/docs/conf.py +@@ -52,8 +52,6 @@ else: + # documentation root, use os.path.abspath to make it absolute, like shown here. + sys.path.insert(0, os.path.join(os.path.abspath('.'), '..')) + +-import pikepdf +- + # -- General configuration ------------------------------------------------ + + # If your documentation needs a minimal Sphinx version, state it here. +@@ -101,7 +99,7 @@ author = u'James R. Barlow' + # |version| and |release|, also used in various other places throughout the + # built documents. + +-release = get_distribution('pikepdf').version ++release = os.environ['DEB_VERSION_UPSTREAM'] + version = '.'.join(release.split('.')[:2]) + + # The language for content autogenerated by Sphinx. Refer to documentation diff --git a/patches/drop-installation-from-docs-contents.patch b/patches/drop-installation-from-docs-contents.patch new file mode 100644 index 0000000..65ca879 --- /dev/null +++ b/patches/drop-installation-from-docs-contents.patch @@ -0,0 +1,18 @@ +From: Sean Whitton +Date: Sat, 18 Aug 2018 10:52:46 -0700 +Subject: drop installation from docs contents + +--- + docs/index.rst | 1 - + 1 file changed, 1 deletion(-) + +--- a/docs/index.rst ++++ b/docs/index.rst +@@ -111,7 +111,6 @@ practical examples, particular in ``pdfi + :caption: Introduction + :name: intro_toc + +- installation + changelog + tutorial + objects diff --git a/patches/drop-setuptools_scm_git_archive-from-setup.py.patch b/patches/drop-setuptools_scm_git_archive-from-setup.py.patch new file mode 100644 index 0000000..9349e43 --- /dev/null +++ b/patches/drop-setuptools_scm_git_archive-from-setup.py.patch @@ -0,0 +1,19 @@ +From: Sean Whitton +Date: Wed, 10 Oct 2018 08:17:05 -0700 +Subject: drop setuptools_scm_git_archive from setup.py + +Pending resolution of #910742. 
+--- + setup.py | 1 - + 1 file changed, 1 deletion(-) + +--- a/setup.py ++++ b/setup.py +@@ -134,7 +134,6 @@ setup( + setup_requires=[ + 'pytest-runner', + 'setuptools_scm', +- 'setuptools_scm_git_archive', + 'pybind11 >= 2.2.4, < 3' + ], + use_scm_version=True, diff --git a/patches/fix_xmp_metadata_without_xmpmeta_wrapper.patch b/patches/fix_xmp_metadata_without_xmpmeta_wrapper.patch new file mode 100644 index 0000000..0254bee --- /dev/null +++ b/patches/fix_xmp_metadata_without_xmpmeta_wrapper.patch @@ -0,0 +1,110 @@ +From d31ea8fed2004345b3c274172ff0c28b7c6aca16 Mon Sep 17 00:00:00 2001 +From: "James R. Barlow" +Date: Wed, 6 Feb 2019 00:36:59 -0800 +Subject: [PATCH] Fix handling of XMP metadata with no wrapper + +--- + src/pikepdf/models/metadata.py | 12 ++++++++-- + tests/test_metadata.py | 41 ++++++++++++++++++++++++++++++++-- + 2 files changed, 49 insertions(+), 4 deletions(-) + +diff --git a/src/pikepdf/models/metadata.py b/src/pikepdf/models/metadata.py +index 58c5a9d..23d246b 100644 +--- a/src/pikepdf/models/metadata.py ++++ b/src/pikepdf/models/metadata.py +@@ -428,6 +428,14 @@ def _get_subelements(self, node): + return result + return '' + ++ def _get_rdf_root(self): ++ rdf = self._xmp.find('.//rdf:RDF', self.NS) ++ if rdf is None: ++ rdf = self._xmp.getroot() ++ if not rdf.tag == '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF': ++ raise ValueError("Metadata seems to be XML but not XMP") ++ return rdf ++ + def _get_elements(self, name=''): + """Get elements from XMP + +@@ -452,7 +460,7 @@ def _get_elements(self, name=''): + + """ + qname = self._qname(name) +- rdf = self._xmp.find('.//rdf:RDF', self.NS) ++ rdf = self._get_rdf_root() + for rdfdesc in rdf.findall('rdf:Description[@rdf:about=""]', self.NS): + if qname and qname in rdfdesc.keys(): + yield (rdfdesc, qname, rdfdesc.get(qname), rdf) +@@ -540,7 +548,7 @@ def add_array(node, items): + raise TypeError(val) + except StopIteration: + # Insert a new node +- rdf = self._xmp.find('.//rdf:RDF', 
self.NS) ++ rdf = self._get_rdf_root() + if str(self._qname(key)) in LANG_ALTS: + val = AltList([clean(val)]) + if isinstance(val, (list, set)): +diff --git a/tests/test_metadata.py b/tests/test_metadata.py +index abe05ff..be654c8 100644 +--- a/tests/test_metadata.py ++++ b/tests/test_metadata.py +@@ -6,7 +6,7 @@ + from hypothesis import given + from hypothesis.strategies import integers + import pikepdf +-from pikepdf import Pdf, Dictionary, Name, PasswordError ++from pikepdf import Pdf, Dictionary, Name, PasswordError, Stream + from pikepdf.models.metadata import ( + decode_pdf_date, encode_pdf_date, + XMP_NS_DC, XMP_NS_PDF, XMP_NS_XMP, +@@ -285,7 +285,7 @@ def test_bad_char_rejection(trivial): + ET.fromstring(str(xmp)) + + +-def test_xpacket(sandwich): ++def test_xpacket_generation(sandwich): + xmpstr1 = sandwich.Root.Metadata.read_bytes() + xpacket_begin = b'This is valid xml but not valid XMP ++ """.strip()) ++ meta = enron1.open_metadata() ++ with pytest.raises(ValueError, message='not XMP'): ++ with meta: ++ pass ++ with pytest.raises(ValueError, message='not XMP'): ++ meta['pdfaid:part'] ++ ++ ++def test_no_x_xmpmeta(trivial): ++ trivial.Root.Metadata = Stream(trivial, b""" ++ ++ ++ ++ 1 ++ A ++ Simple Scan 3.30.2 ++ 2019-02-05T07:08:46+01:00 ++ 2019-02-05T07:08:46+01:00 ++ 2019-02-05T07:08:46+01:00 ++ ++ ++ ++ """.strip()) ++ ++ with trivial.open_metadata() as xmp: ++ assert xmp._get_rdf_root() is not None ++ xmp['pdfaid:part'] = '2' ++ assert xmp['pdfaid:part'] == '2' diff --git a/patches/series b/patches/series new file mode 100644 index 0000000..1b13db8 --- /dev/null +++ b/patches/series @@ -0,0 +1,5 @@ +docs-build-use-DEB_VERSION_UPSTREAM.patch +drop-installation-from-docs-contents.patch +drop-setuptools_scm_git_archive-from-setup.py.patch +fix_xmp_metadata_without_xmpmeta_wrapper.patch +disable-test_docinfo_problems.patch diff --git a/pikepdf-doc.doc-base b/pikepdf-doc.doc-base new file mode 100644 index 0000000..04a2a8d --- /dev/null +++ 
b/pikepdf-doc.doc-base @@ -0,0 +1,9 @@ +Document: pikepdf +Title: pikepdf documentation +Author: James R. Barlow +Abstract: Instructions for using pikepdf +Section: Graphics + +Format: HTML +Index: /usr/share/doc/python3-pikepdf/html/index.html +Files: /usr/share/doc/python3-pikepdf/html/*.html diff --git a/pikepdf-doc.install b/pikepdf-doc.install new file mode 100644 index 0000000..17df768 --- /dev/null +++ b/pikepdf-doc.install @@ -0,0 +1 @@ +docs/_build/html /usr/share/doc/python3-pikepdf/ diff --git a/python3-pikepdf.examples b/python3-pikepdf.examples new file mode 100644 index 0000000..7916f87 --- /dev/null +++ b/python3-pikepdf.examples @@ -0,0 +1 @@ +examples/find_links.py diff --git a/rules b/rules new file mode 100755 index 0000000..232de3a --- /dev/null +++ b/rules @@ -0,0 +1,31 @@ +#!/usr/bin/make -f + +include /usr/share/dpkg/pkg-info.mk +export DEB_VERSION_UPSTREAM + +# deal with unicode encoding/decoding errors in the package build. +# This isn't needed in d/tests/test-suite because autopkgtest sets +# LANG=C.UTF-8 +export LC_ALL=C.UTF-8 + +# needed as we have more than one binary package +export PYBUILD_DESTDIR=debian/python3-pikepdf + +export DEB_BUILD_MAINT_OPTIONS = hardening=+all + +%: + dh $@ --with python3,sphinxdoc --buildsystem=pybuild + +override_dh_auto_build: + @set -ex; cd docs; make html + dh_auto_build -O--buildsystem=pybuild + +# don't install installation instructions +override_dh_install: + dh_install -Xinstallation +override_dh_sphinxdoc: + dh_sphinxdoc -Xinstallation + +# for some reason this gets compressed, breaking links +override_dh_compress: + dh_compress -Xchangelog.html diff --git a/source/format b/source/format new file mode 100644 index 0000000..163aaf8 --- /dev/null +++ b/source/format @@ -0,0 +1 @@ +3.0 (quilt) diff --git a/source/include-binaries b/source/include-binaries new file mode 100644 index 0000000..46b2737 --- /dev/null +++ b/source/include-binaries @@ -0,0 +1,2 @@ 
+debian/missing-sources/LinnSequencer_hardware_MIDI_sequencer_brochure_page_2_300dpi.jpg +debian/missing-sources/Esox_lucius1.jpg diff --git a/source/lintian-overrides b/source/lintian-overrides new file mode 100644 index 0000000..50862e1 --- /dev/null +++ b/source/lintian-overrides @@ -0,0 +1,2 @@ +# false positive +incomplete-creative-commons-license diff --git a/tests/control b/tests/control new file mode 100644 index 0000000..c20747f --- /dev/null +++ b/tests/control @@ -0,0 +1,11 @@ +Tests: test-suite +Restrictions: rw-build-tree +# ^ pytest-runner writes stuff to the source tree +Depends: @, + python3-pytest, + python3-pytest-helpers-namespace, + python3-pytest-timeout, + python3-pytest-xdist, + python3-hypothesis, + python3-pil, + python3-attr, diff --git a/tests/test-suite b/tests/test-suite new file mode 100755 index 0000000..1718b87 --- /dev/null +++ b/tests/test-suite @@ -0,0 +1,12 @@ +#!/bin/bash + +# ensure that a test suite failure is registered +set -e + +# ensure we test installed version +mv src/pikepdf src/disabled + +py.test-3 + +# comply with rw-build-tree +mv src/disabled src/pikepdf diff --git a/watch b/watch new file mode 100644 index 0000000..a0e77c4 --- /dev/null +++ b/watch @@ -0,0 +1,3 @@ +version=4 +opts="mode=git,uversionmangle=s/(rc|a|b|c)/~$1/,dversionmangle=s/\+dfsg\d*$//" \ +https://github.com/pikepdf/pikepdf refs/tags/v([\d\.\d\.]+) debian -- cgit v1.2.3 From c84b83b8572fc9b0b69200930ae4db66c4934a21 Mon Sep 17 00:00:00 2001 From: Sean Whitton Date: Sat, 18 Aug 2018 09:37:58 -0700 Subject: docs build use DEB_VERSION_UPSTREAM Gbp-Pq: Name docs-build-use-DEB_VERSION_UPSTREAM.patch --- docs/conf.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index e0f367e..f0dd83d 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -52,8 +52,6 @@ else: # documentation root, use os.path.abspath to make it absolute, like shown here. 
sys.path.insert(0, os.path.join(os.path.abspath('.'), '..')) -import pikepdf - # -- General configuration ------------------------------------------------ # If your documentation needs a minimal Sphinx version, state it here. @@ -101,7 +99,7 @@ author = u'James R. Barlow' # |version| and |release|, also used in various other places throughout the # built documents. -release = get_distribution('pikepdf').version +release = os.environ['DEB_VERSION_UPSTREAM'] version = '.'.join(release.split('.')[:2]) # The language for content autogenerated by Sphinx. Refer to documentation -- cgit v1.2.3 From baf9a489c6f1623bd0bd710be9fd38bf5e819068 Mon Sep 17 00:00:00 2001 From: Sean Whitton Date: Sat, 18 Aug 2018 10:52:46 -0700 Subject: drop installation from docs contents Gbp-Pq: Name drop-installation-from-docs-contents.patch --- docs/index.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/index.rst b/docs/index.rst index 4ec5ad1..bf31c04 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -111,7 +111,6 @@ practical examples, particular in ``pdfinfo.py``, ``_weave.py``, and :caption: Introduction :name: intro_toc - installation changelog tutorial objects -- cgit v1.2.3 From cf520f14f86d8cef9d6a8f671732760b704baa45 Mon Sep 17 00:00:00 2001 From: Sean Whitton Date: Wed, 10 Oct 2018 08:17:05 -0700 Subject: drop setuptools_scm_git_archive from setup.py Pending resolution of #910742. Gbp-Pq: Name drop-setuptools_scm_git_archive-from-setup.py.patch --- setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.py b/setup.py index ab5c9c2..3793c56 100644 --- a/setup.py +++ b/setup.py @@ -134,7 +134,6 @@ setup( setup_requires=[ 'pytest-runner', 'setuptools_scm', - 'setuptools_scm_git_archive', 'pybind11 >= 2.2.4, < 3' ], use_scm_version=True, -- cgit v1.2.3 From 4647642afa196d9ee07da509e9697f5687e4c8ea Mon Sep 17 00:00:00 2001 From: "James R. 
Barlow" Date: Wed, 6 Feb 2019 00:36:59 -0800 Subject: [PATCH] Fix handling of XMP metadata with no wrapper Gbp-Pq: Name fix_xmp_metadata_without_xmpmeta_wrapper.patch --- src/pikepdf/models/metadata.py | 12 ++++++++++-- tests/test_metadata.py | 41 +++++++++++++++++++++++++++++++++++++++-- 2 files changed, 49 insertions(+), 4 deletions(-) diff --git a/src/pikepdf/models/metadata.py b/src/pikepdf/models/metadata.py index 9f98b8a..1a0eeb2 100644 --- a/src/pikepdf/models/metadata.py +++ b/src/pikepdf/models/metadata.py @@ -434,6 +434,14 @@ class PdfMetadata(MutableMapping): return result return '' + def _get_rdf_root(self): + rdf = self._xmp.find('.//rdf:RDF', self.NS) + if rdf is None: + rdf = self._xmp.getroot() + if not rdf.tag == '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF': + raise ValueError("Metadata seems to be XML but not XMP") + return rdf + def _get_elements(self, name=''): """Get elements from XMP @@ -458,7 +466,7 @@ class PdfMetadata(MutableMapping): """ qname = self._qname(name) - rdf = self._xmp.find('.//rdf:RDF', self.NS) + rdf = self._get_rdf_root() for rdfdesc in rdf.findall('rdf:Description[@rdf:about=""]', self.NS): if qname and qname in rdfdesc.keys(): yield (rdfdesc, qname, rdfdesc.get(qname), rdf) @@ -546,7 +554,7 @@ class PdfMetadata(MutableMapping): raise TypeError(val) except StopIteration: # Insert a new node - rdf = self._xmp.find('.//rdf:RDF', self.NS) + rdf = self._get_rdf_root() if str(self._qname(key)) in LANG_ALTS: val = AltList([clean(val)]) if isinstance(val, (list, set)): diff --git a/tests/test_metadata.py b/tests/test_metadata.py index db54463..12f1c13 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -6,7 +6,7 @@ import pytest from hypothesis import given from hypothesis.strategies import integers import pikepdf -from pikepdf import Pdf, Dictionary, Name, PasswordError +from pikepdf import Pdf, Dictionary, Name, PasswordError, Stream from pikepdf.models.metadata import ( decode_pdf_date, encode_pdf_date, 
XMP_NS_DC, XMP_NS_PDF, XMP_NS_XMP, @@ -285,7 +285,7 @@ def test_bad_char_rejection(trivial): ET.fromstring(str(xmp)) -def test_xpacket(sandwich): +def test_xpacket_generation(sandwich): xmpstr1 = sandwich.Root.Metadata.read_bytes() xpacket_begin = b'This is valid xml but not valid XMP + """.strip()) + meta = enron1.open_metadata() + with pytest.raises(ValueError, message='not XMP'): + with meta: + pass + with pytest.raises(ValueError, message='not XMP'): + meta['pdfaid:part'] + + +def test_no_x_xmpmeta(trivial): + trivial.Root.Metadata = Stream(trivial, b""" + + + + 1 + A + Simple Scan 3.30.2 + 2019-02-05T07:08:46+01:00 + 2019-02-05T07:08:46+01:00 + 2019-02-05T07:08:46+01:00 + + + + """.strip()) + + with trivial.open_metadata() as xmp: + assert xmp._get_rdf_root() is not None + xmp['pdfaid:part'] = '2' + assert xmp['pdfaid:part'] == '2' -- cgit v1.2.3 From 628754b07a24e0333ad3786583cd8582f93d44cd Mon Sep 17 00:00:00 2001 From: Sean Whitton Date: Thu, 10 Jan 2019 08:32:44 -0700 Subject: disable test_docinfo_problems Needs a test resource whose DFSG status is in doubt. 
Gbp-Pq: Name disable-test_docinfo_problems.patch --- tests/test_metadata.py | 34 ---------------------------------- 1 file changed, 34 deletions(-) diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 12f1c13..1d41878 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -49,12 +49,6 @@ def trivial(resources): return Pdf.open(resources / 'pal-1bit-trivial.pdf') -@pytest.fixture -def enron1(resources): - # Has nuls in docinfo, old PDF - return Pdf.open(resources / 'enron1_gs.pdf') - - @pytest.fixture def invalid_creationdate(resources): # Has nuls in docinfo, old PDF @@ -322,34 +316,6 @@ def test_remove_attribute_metadata(sandwich): assert not re.search(r'rdf:Description xmlns:[^\s]+ rdf:about=""/', str(xmp)) -def test_docinfo_problems(enron1, invalid_creationdate): - meta = enron1.open_metadata() - meta._load() # File has invalid XML sequence � - with meta: - with pytest.warns(UserWarning) as warned: - meta.load_from_docinfo(invalid_creationdate.docinfo) - assert 'could not be copied' in warned[0].message.args[0] - with pytest.raises(ValueError): - meta.load_from_docinfo(invalid_creationdate.docinfo, raise_failure=True) - - with pytest.warns(UserWarning) as warned: - with meta as xmp: - xmp['xmp:CreateDate'] = 'invalid date' - assert 'could not be updated' in warned[0].message.args[0] - - -def test_wrong_xml(enron1): - enron1.Root.Metadata = Stream(enron1, b""" - This is valid xml but not valid XMP - """.strip()) - meta = enron1.open_metadata() - with pytest.raises(ValueError, message='not XMP'): - with meta: - pass - with pytest.raises(ValueError, message='not XMP'): - meta['pdfaid:part'] - - def test_no_x_xmpmeta(trivial): trivial.Root.Metadata = Stream(trivial, b""" -- cgit v1.2.3 From b0051c564ba19f7be6ac356910e508f9b2f8a723 Mon Sep 17 00:00:00 2001 From: Sean Whitton Date: Sat, 18 Aug 2018 09:37:58 -0700 Subject: docs build use DEB_VERSION_UPSTREAM Gbp-Pq: Name docs-build-use-DEB_VERSION_UPSTREAM.patch --- docs/conf.py | 4 +--- 1 
file changed, 1 insertion(+), 3 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index e0f367e..f0dd83d 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -52,8 +52,6 @@ else: # documentation root, use os.path.abspath to make it absolute, like shown here. sys.path.insert(0, os.path.join(os.path.abspath('.'), '..')) -import pikepdf - # -- General configuration ------------------------------------------------ # If your documentation needs a minimal Sphinx version, state it here. @@ -101,7 +99,7 @@ author = u'James R. Barlow' # |version| and |release|, also used in various other places throughout the # built documents. -release = get_distribution('pikepdf').version +release = os.environ['DEB_VERSION_UPSTREAM'] version = '.'.join(release.split('.')[:2]) # The language for content autogenerated by Sphinx. Refer to documentation -- cgit v1.2.3 From 9967e4416836f0543299b42c1a323479463230a8 Mon Sep 17 00:00:00 2001 From: Sean Whitton Date: Sat, 18 Aug 2018 10:52:46 -0700 Subject: drop installation from docs contents Gbp-Pq: Name drop-installation-from-docs-contents.patch --- docs/index.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/index.rst b/docs/index.rst index 4ec5ad1..bf31c04 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -111,7 +111,6 @@ practical examples, particular in ``pdfinfo.py``, ``_weave.py``, and :caption: Introduction :name: intro_toc - installation changelog tutorial objects -- cgit v1.2.3 From 349e9d48502fe5a21520535767b0f01a3603c9cf Mon Sep 17 00:00:00 2001 From: Sean Whitton Date: Wed, 10 Oct 2018 08:17:05 -0700 Subject: drop setuptools_scm_git_archive from setup.py Pending resolution of #910742. 
Gbp-Pq: Name drop-setuptools_scm_git_archive-from-setup.py.patch --- setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.py b/setup.py index ab5c9c2..3793c56 100644 --- a/setup.py +++ b/setup.py @@ -134,7 +134,6 @@ setup( setup_requires=[ 'pytest-runner', 'setuptools_scm', - 'setuptools_scm_git_archive', 'pybind11 >= 2.2.4, < 3' ], use_scm_version=True, -- cgit v1.2.3 From 224eb8f49e53f33b0a7ac75cc892ddb4916ed81e Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Wed, 6 Feb 2019 00:36:59 -0800 Subject: [PATCH] Fix handling of XMP metadata with no wrapper Gbp-Pq: Name fix_xmp_metadata_without_xmpmeta_wrapper.patch --- src/pikepdf/models/metadata.py | 12 ++++++++++-- tests/test_metadata.py | 41 +++++++++++++++++++++++++++++++++++++++-- 2 files changed, 49 insertions(+), 4 deletions(-) diff --git a/src/pikepdf/models/metadata.py b/src/pikepdf/models/metadata.py index 9f98b8a..1a0eeb2 100644 --- a/src/pikepdf/models/metadata.py +++ b/src/pikepdf/models/metadata.py @@ -434,6 +434,14 @@ class PdfMetadata(MutableMapping): return result return '' + def _get_rdf_root(self): + rdf = self._xmp.find('.//rdf:RDF', self.NS) + if rdf is None: + rdf = self._xmp.getroot() + if not rdf.tag == '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF': + raise ValueError("Metadata seems to be XML but not XMP") + return rdf + def _get_elements(self, name=''): """Get elements from XMP @@ -458,7 +466,7 @@ class PdfMetadata(MutableMapping): """ qname = self._qname(name) - rdf = self._xmp.find('.//rdf:RDF', self.NS) + rdf = self._get_rdf_root() for rdfdesc in rdf.findall('rdf:Description[@rdf:about=""]', self.NS): if qname and qname in rdfdesc.keys(): yield (rdfdesc, qname, rdfdesc.get(qname), rdf) @@ -546,7 +554,7 @@ class PdfMetadata(MutableMapping): raise TypeError(val) except StopIteration: # Insert a new node - rdf = self._xmp.find('.//rdf:RDF', self.NS) + rdf = self._get_rdf_root() if str(self._qname(key)) in LANG_ALTS: val = AltList([clean(val)]) if isinstance(val, (list, 
set)): diff --git a/tests/test_metadata.py b/tests/test_metadata.py index db54463..12f1c13 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -6,7 +6,7 @@ import pytest from hypothesis import given from hypothesis.strategies import integers import pikepdf -from pikepdf import Pdf, Dictionary, Name, PasswordError +from pikepdf import Pdf, Dictionary, Name, PasswordError, Stream from pikepdf.models.metadata import ( decode_pdf_date, encode_pdf_date, XMP_NS_DC, XMP_NS_PDF, XMP_NS_XMP, @@ -285,7 +285,7 @@ def test_bad_char_rejection(trivial): ET.fromstring(str(xmp)) -def test_xpacket(sandwich): +def test_xpacket_generation(sandwich): xmpstr1 = sandwich.Root.Metadata.read_bytes() xpacket_begin = b'This is valid xml but not valid XMP + """.strip()) + meta = enron1.open_metadata() + with pytest.raises(ValueError, message='not XMP'): + with meta: + pass + with pytest.raises(ValueError, message='not XMP'): + meta['pdfaid:part'] + + +def test_no_x_xmpmeta(trivial): + trivial.Root.Metadata = Stream(trivial, b""" + + + + 1 + A + Simple Scan 3.30.2 + 2019-02-05T07:08:46+01:00 + 2019-02-05T07:08:46+01:00 + 2019-02-05T07:08:46+01:00 + + + + """.strip()) + + with trivial.open_metadata() as xmp: + assert xmp._get_rdf_root() is not None + xmp['pdfaid:part'] = '2' + assert xmp['pdfaid:part'] == '2' -- cgit v1.2.3 From 2a955498988d432462c18462e02da3074e1e0a61 Mon Sep 17 00:00:00 2001 From: Sean Whitton Date: Thu, 10 Jan 2019 08:32:44 -0700 Subject: disable test_docinfo_problems Needs a test resource whose DFSG status is in doubt. 
Gbp-Pq: Name disable-test_docinfo_problems.patch --- tests/test_metadata.py | 34 ---------------------------------- 1 file changed, 34 deletions(-) diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 12f1c13..1d41878 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -49,12 +49,6 @@ def trivial(resources): return Pdf.open(resources / 'pal-1bit-trivial.pdf') -@pytest.fixture -def enron1(resources): - # Has nuls in docinfo, old PDF - return Pdf.open(resources / 'enron1_gs.pdf') - - @pytest.fixture def invalid_creationdate(resources): # Has nuls in docinfo, old PDF @@ -322,34 +316,6 @@ def test_remove_attribute_metadata(sandwich): assert not re.search(r'rdf:Description xmlns:[^\s]+ rdf:about=""/', str(xmp)) -def test_docinfo_problems(enron1, invalid_creationdate): - meta = enron1.open_metadata() - meta._load() # File has invalid XML sequence � - with meta: - with pytest.warns(UserWarning) as warned: - meta.load_from_docinfo(invalid_creationdate.docinfo) - assert 'could not be copied' in warned[0].message.args[0] - with pytest.raises(ValueError): - meta.load_from_docinfo(invalid_creationdate.docinfo, raise_failure=True) - - with pytest.warns(UserWarning) as warned: - with meta as xmp: - xmp['xmp:CreateDate'] = 'invalid date' - assert 'could not be updated' in warned[0].message.args[0] - - -def test_wrong_xml(enron1): - enron1.Root.Metadata = Stream(enron1, b""" - This is valid xml but not valid XMP - """.strip()) - meta = enron1.open_metadata() - with pytest.raises(ValueError, message='not XMP'): - with meta: - pass - with pytest.raises(ValueError, message='not XMP'): - meta['pdfaid:part'] - - def test_no_x_xmpmeta(trivial): trivial.Root.Metadata = Stream(trivial, b""" -- cgit v1.2.3 From cc23069a1ecb2b33bf5123705aa77c02dbebf43c Mon Sep 17 00:00:00 2001 From: "James R. 
Barlow" Date: Tue, 12 Feb 2019 20:42:11 -0800 Subject: Fix issue #25 - year missing leading zero on some platforms Closes #25 (cherry picked from commit 4d22fe47912c518e8b3348aedccdac3f11ed81d7) Gbp-Pq: Name Fix-issue-25-year-missing-leading-zero-on-some-platforms.patch --- src/pikepdf/models/metadata.py | 7 +++++-- tests/test_metadata.py | 3 ++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/pikepdf/models/metadata.py b/src/pikepdf/models/metadata.py index 1a0eeb2..65934cd 100644 --- a/src/pikepdf/models/metadata.py +++ b/src/pikepdf/models/metadata.py @@ -121,8 +121,11 @@ def encode_pdf_date(d: datetime) -> str: the local time. """ - pdfmark_date_fmt = r'%Y%m%d%H%M%S' - s = d.strftime(pdfmark_date_fmt) + # The formatting of %Y is not consistent as described in + # https://bugs.python.org/issue13305 and underspecification in libc. + # So explicitly format the year with leading zeros + s = "{:04d}".format(d.year) + s += d.strftime(r'%m%d%H%M%S') tz = d.strftime('%z') if tz: sign, tz_hours, tz_mins = tz[0], tz[1:3], tz[3:5] diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 1d41878..41a879c 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -3,7 +3,7 @@ from datetime import datetime, timezone, timedelta import re import pytest -from hypothesis import given +from hypothesis import given, example from hypothesis.strategies import integers import pikepdf from pikepdf import Pdf, Dictionary, Name, PasswordError, Stream @@ -252,6 +252,7 @@ def test_date_docinfo_from_xmp(): integers(0, 99), integers(0, 99), ) +@example(1, 1, 1, 0, 0, 0) def test_random_dates(year, month, day, hour, mins, sec): date_args = year, month, day, hour, mins, sec xmp = '{:04d}-{:02d}-{:02d}T{:02d}:{:02d}:{:02d}'.format(*date_args) -- cgit v1.2.3 From d9d556d61194179df883920a1bc8a800d9de730f Mon Sep 17 00:00:00 2001 From: Sean Whitton Date: Sat, 18 Aug 2018 09:37:58 -0700 Subject: docs build use DEB_VERSION_UPSTREAM Gbp-Pq: Name 
docs-build-use-DEB_VERSION_UPSTREAM.patch --- docs/conf.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 5d3c986..f01a2ba 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -76,8 +76,6 @@ else: sys.path.insert(0, os.path.join(os.path.abspath('.'), './_ext')) sys.path.insert(0, os.path.join(os.path.abspath('.'), '..')) -import pikepdf - # -- General configuration ------------------------------------------------ # If your documentation needs a minimal Sphinx version, state it here. @@ -123,7 +121,7 @@ author = u'James R. Barlow' # |version| and |release|, also used in various other places throughout the # built documents. -release = get_distribution('pikepdf').version +release = os.environ['DEB_VERSION_UPSTREAM'] version = '.'.join(release.split('.')[:2]) # The language for content autogenerated by Sphinx. Refer to documentation -- cgit v1.2.3 From e5f9a7c19332821757b76502f5e598264aab0b7a Mon Sep 17 00:00:00 2001 From: Sean Whitton Date: Sat, 18 Aug 2018 10:52:46 -0700 Subject: drop installation from docs contents Gbp-Pq: Name drop-installation-from-docs-contents.patch --- docs/index.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/index.rst b/docs/index.rst index e64e10f..9cad0bb 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -111,7 +111,6 @@ practical examples, particular in ``pdfinfo.py``, ``_weave.py``, and :caption: Introduction :name: intro_toc - installation release_notes tutorial objects -- cgit v1.2.3 From 77a66f0f641875280eafb26446369fbba2c4d7f6 Mon Sep 17 00:00:00 2001 From: Sean Whitton Date: Thu, 10 Jan 2019 08:32:44 -0700 Subject: disable test_docinfo_problems Needs a test resource whose DFSG status is in doubt. 
Gbp-Pq: Name disable-test_docinfo_problems.patch --- tests/test_metadata.py | 37 ------------------------------------- 1 file changed, 37 deletions(-) diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 1a1417f..3de8ccf 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -57,12 +57,6 @@ def trivial(resources): return Pdf.open(resources / 'pal-1bit-trivial.pdf') -@pytest.fixture -def enron1(resources): - # Has nuls in docinfo, old PDF - return Pdf.open(resources / 'enron1_gs.pdf') - - @pytest.fixture def invalid_creationdate(resources): # Has nuls in docinfo, old PDF @@ -337,37 +331,6 @@ def test_remove_attribute_metadata(sandwich): assert not re.search(r'rdf:Description xmlns:[^\s]+ rdf:about=""/', str(xmp)) -def test_docinfo_problems(enron1, invalid_creationdate): - meta = enron1.open_metadata() - meta._load() # File has invalid XML sequence � - with meta: - with pytest.warns(UserWarning) as warned: - meta.load_from_docinfo(invalid_creationdate.docinfo) - assert 'could not be copied' in warned[0].message.args[0] - with pytest.raises(ValueError): - meta.load_from_docinfo(invalid_creationdate.docinfo, raise_failure=True) - - with pytest.warns(UserWarning) as warned: - with meta as xmp: - xmp['xmp:CreateDate'] = 'invalid date' - assert 'could not be updated' in warned[0].message.args[0] - - -def test_wrong_xml(enron1): - enron1.Root.Metadata = Stream( - enron1, - b""" - This is valid xml but not valid XMP - """.strip(), - ) - meta = enron1.open_metadata() - with pytest.raises(ValueError, match='not XMP'): - with meta: - pass - with pytest.raises(ValueError, match='not XMP'): - meta['pdfaid:part'] - - def test_no_x_xmpmeta(trivial): trivial.Root.Metadata = Stream( trivial, -- cgit v1.2.3 From 231d65d6e2827720126c324cb7af247afcb03830 Mon Sep 17 00:00:00 2001 From: Sean Whitton Date: Thu, 15 Aug 2019 18:12:41 +0100 Subject: drop pybind11 from setup.py We are successfully providing pybind11 as a build-dep but ocrmypdf's setup.py fails to 
detect it for some reason. Gbp-Pq: Name drop-pybind11-from-setup.py.patch --- setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.py b/setup.py index 59e9b3b..c1f62b2 100644 --- a/setup.py +++ b/setup.py @@ -138,7 +138,6 @@ setup( setup_requires=[ 'setuptools_scm', 'setuptools_scm_git_archive', - 'pybind11 >= 2.3.0, < 3', ], use_scm_version=True, tests_require=tests_require, -- cgit v1.2.3 From f63baf06228eeca289200fa0b72406a88b3ed10d Mon Sep 17 00:00:00 2001 From: Sean Whitton Date: Thu, 15 Aug 2019 18:42:49 +0100 Subject: disable test_icc_extract Requires a test resource in Files-Excluded. Gbp-Pq: Name disable-test_icc_extract.patch --- tests/test_image_access.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/tests/test_image_access.py b/tests/test_image_access.py index d4625d5..113a5ef 100644 --- a/tests/test_image_access.py +++ b/tests/test_image_access.py @@ -317,13 +317,6 @@ def test_icc_use(resources): assert pim.icc.profile.xcolor_space == 'GRAY' -def test_icc_extract(resources): - xobj, _pdf = first_image_in(resources / 'tree-icc.pdf') - - pim = PdfImage(xobj) - assert pim.as_pil_image().info['icc_profile'] == pim.icc.tobytes() - - def test_stacked_compression(resources): xobj, _pdf = first_image_in(resources / 'pike-flate-jp2.pdf') -- cgit v1.2.3 From a61f101999f7b5d9ff2b3a43afaacf57b1001a5f Mon Sep 17 00:00:00 2001 From: Sean Whitton Date: Sat, 18 Aug 2018 09:37:58 -0700 Subject: docs build use DEB_VERSION_UPSTREAM Gbp-Pq: Name docs-build-use-DEB_VERSION_UPSTREAM.patch --- docs/conf.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 5d3c986..f01a2ba 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -76,8 +76,6 @@ else: sys.path.insert(0, os.path.join(os.path.abspath('.'), './_ext')) sys.path.insert(0, os.path.join(os.path.abspath('.'), '..')) -import pikepdf - # -- General configuration ------------------------------------------------ # If your documentation needs a minimal Sphinx 
version, state it here. @@ -123,7 +121,7 @@ author = u'James R. Barlow' # |version| and |release|, also used in various other places throughout the # built documents. -release = get_distribution('pikepdf').version +release = os.environ['DEB_VERSION_UPSTREAM'] version = '.'.join(release.split('.')[:2]) # The language for content autogenerated by Sphinx. Refer to documentation -- cgit v1.2.3 From 16c947b01f6374943d35324d071dda248ed2376f Mon Sep 17 00:00:00 2001 From: Sean Whitton Date: Sat, 18 Aug 2018 10:52:46 -0700 Subject: drop installation from docs contents Gbp-Pq: Name drop-installation-from-docs-contents.patch --- docs/index.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/index.rst b/docs/index.rst index b078ba0..f0f637e 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -110,7 +110,6 @@ practical examples, particular in ``pdfinfo.py``, ``_weave.py``, and :caption: Introduction :name: intro_toc - installation release_notes tutorial -- cgit v1.2.3 From 4c19ed63d536c6c3bc9dfe5a958c533482a5cbd3 Mon Sep 17 00:00:00 2001 From: Sean Whitton Date: Thu, 10 Jan 2019 08:32:44 -0700 Subject: disable test_docinfo_problems Needs a test resource whose DFSG status is in doubt. 
Gbp-Pq: Name disable-test_docinfo_problems.patch --- tests/test_metadata.py | 37 ------------------------------------- 1 file changed, 37 deletions(-) diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 1a1417f..3de8ccf 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -57,12 +57,6 @@ def trivial(resources): return Pdf.open(resources / 'pal-1bit-trivial.pdf') -@pytest.fixture -def enron1(resources): - # Has nuls in docinfo, old PDF - return Pdf.open(resources / 'enron1_gs.pdf') - - @pytest.fixture def invalid_creationdate(resources): # Has nuls in docinfo, old PDF @@ -337,37 +331,6 @@ def test_remove_attribute_metadata(sandwich): assert not re.search(r'rdf:Description xmlns:[^\s]+ rdf:about=""/', str(xmp)) -def test_docinfo_problems(enron1, invalid_creationdate): - meta = enron1.open_metadata() - meta._load() # File has invalid XML sequence � - with meta: - with pytest.warns(UserWarning) as warned: - meta.load_from_docinfo(invalid_creationdate.docinfo) - assert 'could not be copied' in warned[0].message.args[0] - with pytest.raises(ValueError): - meta.load_from_docinfo(invalid_creationdate.docinfo, raise_failure=True) - - with pytest.warns(UserWarning) as warned: - with meta as xmp: - xmp['xmp:CreateDate'] = 'invalid date' - assert 'could not be updated' in warned[0].message.args[0] - - -def test_wrong_xml(enron1): - enron1.Root.Metadata = Stream( - enron1, - b""" - This is valid xml but not valid XMP - """.strip(), - ) - meta = enron1.open_metadata() - with pytest.raises(ValueError, match='not XMP'): - with meta: - pass - with pytest.raises(ValueError, match='not XMP'): - meta['pdfaid:part'] - - def test_no_x_xmpmeta(trivial): trivial.Root.Metadata = Stream( trivial, -- cgit v1.2.3 From 1a31069f5f76335bcd482d0e1a724c5f432e263a Mon Sep 17 00:00:00 2001 From: Sean Whitton Date: Thu, 15 Aug 2019 18:12:41 +0100 Subject: drop pybind11 from setup.py We are successfully providing pybind11 as a build-dep but ocrmypdf's setup.py fails to 
detect it for some reason. Gbp-Pq: Name drop-pybind11-from-setup.py.patch --- setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.py b/setup.py index 98ccf65..c0bddf8 100644 --- a/setup.py +++ b/setup.py @@ -147,7 +147,6 @@ if __name__ == '__main__': # for mp_compile setup_requires=[ 'setuptools_scm', 'setuptools_scm_git_archive', - 'pybind11 >= 2.3.0, < 3', ], use_scm_version=True, tests_require=tests_require, -- cgit v1.2.3 From d95740e00b6fd03bfd2b8f2ec2c2ae7646be653d Mon Sep 17 00:00:00 2001 From: Sean Whitton Date: Thu, 15 Aug 2019 18:42:49 +0100 Subject: disable test_icc_extract Requires a test resource in Files-Excluded. Gbp-Pq: Name disable-test_icc_extract.patch --- tests/test_image_access.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/tests/test_image_access.py b/tests/test_image_access.py index d4625d5..113a5ef 100644 --- a/tests/test_image_access.py +++ b/tests/test_image_access.py @@ -317,13 +317,6 @@ def test_icc_use(resources): assert pim.icc.profile.xcolor_space == 'GRAY' -def test_icc_extract(resources): - xobj, _pdf = first_image_in(resources / 'tree-icc.pdf') - - pim = PdfImage(xobj) - assert pim.as_pil_image().info['icc_profile'] == pim.icc.tobytes() - - def test_stacked_compression(resources): xobj, _pdf = first_image_in(resources / 'pike-flate-jp2.pdf') -- cgit v1.2.3 From f1441130f616361f2212e3ff0ccc12c38b9d06fa Mon Sep 17 00:00:00 2001 From: Sean Whitton Date: Sat, 18 Aug 2018 09:37:58 -0700 Subject: docs build use DEB_VERSION_UPSTREAM Gbp-Pq: Name docs-build-use-DEB_VERSION_UPSTREAM.patch --- docs/conf.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 5d3c986..f01a2ba 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -76,8 +76,6 @@ else: sys.path.insert(0, os.path.join(os.path.abspath('.'), './_ext')) sys.path.insert(0, os.path.join(os.path.abspath('.'), '..')) -import pikepdf - # -- General configuration ------------------------------------------------ # If your 
documentation needs a minimal Sphinx version, state it here. @@ -123,7 +121,7 @@ author = u'James R. Barlow' # |version| and |release|, also used in various other places throughout the # built documents. -release = get_distribution('pikepdf').version +release = os.environ['DEB_VERSION_UPSTREAM'] version = '.'.join(release.split('.')[:2]) # The language for content autogenerated by Sphinx. Refer to documentation -- cgit v1.2.3 From 360b3878f85749e4cade1622f876e7637d71f44a Mon Sep 17 00:00:00 2001 From: Sean Whitton Date: Sat, 18 Aug 2018 10:52:46 -0700 Subject: drop installation from docs contents Gbp-Pq: Name drop-installation-from-docs-contents.patch --- docs/index.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/index.rst b/docs/index.rst index bd47eac..a932eea 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -110,7 +110,6 @@ practical examples, particular in ``pdfinfo.py``, ``_weave.py``, and :caption: Introduction :name: intro_toc - installation release_notes tutorial -- cgit v1.2.3 From a42abc8e6505816a06fbe650dd08844798b7b42f Mon Sep 17 00:00:00 2001 From: Sean Whitton Date: Thu, 10 Jan 2019 08:32:44 -0700 Subject: disable test_docinfo_problems Needs a test resource whose DFSG status is in doubt. 
Gbp-Pq: Name disable-test_docinfo_problems.patch --- tests/test_metadata.py | 37 ------------------------------------- 1 file changed, 37 deletions(-) diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 1a1417f..3de8ccf 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -57,12 +57,6 @@ def trivial(resources): return Pdf.open(resources / 'pal-1bit-trivial.pdf') -@pytest.fixture -def enron1(resources): - # Has nuls in docinfo, old PDF - return Pdf.open(resources / 'enron1_gs.pdf') - - @pytest.fixture def invalid_creationdate(resources): # Has nuls in docinfo, old PDF @@ -337,37 +331,6 @@ def test_remove_attribute_metadata(sandwich): assert not re.search(r'rdf:Description xmlns:[^\s]+ rdf:about=""/', str(xmp)) -def test_docinfo_problems(enron1, invalid_creationdate): - meta = enron1.open_metadata() - meta._load() # File has invalid XML sequence � - with meta: - with pytest.warns(UserWarning) as warned: - meta.load_from_docinfo(invalid_creationdate.docinfo) - assert 'could not be copied' in warned[0].message.args[0] - with pytest.raises(ValueError): - meta.load_from_docinfo(invalid_creationdate.docinfo, raise_failure=True) - - with pytest.warns(UserWarning) as warned: - with meta as xmp: - xmp['xmp:CreateDate'] = 'invalid date' - assert 'could not be updated' in warned[0].message.args[0] - - -def test_wrong_xml(enron1): - enron1.Root.Metadata = Stream( - enron1, - b""" - This is valid xml but not valid XMP - """.strip(), - ) - meta = enron1.open_metadata() - with pytest.raises(ValueError, match='not XMP'): - with meta: - pass - with pytest.raises(ValueError, match='not XMP'): - meta['pdfaid:part'] - - def test_no_x_xmpmeta(trivial): trivial.Root.Metadata = Stream( trivial, -- cgit v1.2.3 From dba1ebfde52c5f54e1e57de0f010475cdabbb9c4 Mon Sep 17 00:00:00 2001 From: Sean Whitton Date: Thu, 15 Aug 2019 18:12:41 +0100 Subject: drop pybind11 from setup.py We are successfully providing pybind11 as a build-dep but ocrmypdf's setup.py fails to 
detect it for some reason. Gbp-Pq: Name drop-pybind11-from-setup.py.patch --- setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.py b/setup.py index c26840a..ba0c4dc 100644 --- a/setup.py +++ b/setup.py @@ -147,7 +147,6 @@ if __name__ == '__main__': # for mp_compile setup_requires=[ 'setuptools_scm', 'setuptools_scm_git_archive', - 'pybind11 >= 2.3.0, < 3', ], use_scm_version=True, tests_require=tests_require, -- cgit v1.2.3 From 4590da5f889de84b997e37191316d70eb612c686 Mon Sep 17 00:00:00 2001 From: Sean Whitton Date: Thu, 15 Aug 2019 18:42:49 +0100 Subject: disable test_icc_extract Requires a test resource in Files-Excluded. Gbp-Pq: Name disable-test_icc_extract.patch --- tests/test_image_access.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/tests/test_image_access.py b/tests/test_image_access.py index d4625d5..113a5ef 100644 --- a/tests/test_image_access.py +++ b/tests/test_image_access.py @@ -317,13 +317,6 @@ def test_icc_use(resources): assert pim.icc.profile.xcolor_space == 'GRAY' -def test_icc_extract(resources): - xobj, _pdf = first_image_in(resources / 'tree-icc.pdf') - - pim = PdfImage(xobj) - assert pim.as_pil_image().info['icc_profile'] == pim.icc.tobytes() - - def test_stacked_compression(resources): xobj, _pdf = first_image_in(resources / 'pike-flate-jp2.pdf') -- cgit v1.2.3 From 45ace893d36f58caf67a042c235863d935eda72e Mon Sep 17 00:00:00 2001 From: Sean Whitton Date: Sat, 18 Aug 2018 09:37:58 -0700 Subject: docs build use DEB_VERSION_UPSTREAM Gbp-Pq: Name docs-build-use-DEB_VERSION_UPSTREAM.patch --- docs/conf.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 5d3c986..f01a2ba 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -76,8 +76,6 @@ else: sys.path.insert(0, os.path.join(os.path.abspath('.'), './_ext')) sys.path.insert(0, os.path.join(os.path.abspath('.'), '..')) -import pikepdf - # -- General configuration ------------------------------------------------ # If your 
documentation needs a minimal Sphinx version, state it here. @@ -123,7 +121,7 @@ author = u'James R. Barlow' # |version| and |release|, also used in various other places throughout the # built documents. -release = get_distribution('pikepdf').version +release = os.environ['DEB_VERSION_UPSTREAM'] version = '.'.join(release.split('.')[:2]) # The language for content autogenerated by Sphinx. Refer to documentation -- cgit v1.2.3 From b5339567839af1c4f1db6d01cf0024f2cd7b67ce Mon Sep 17 00:00:00 2001 From: Sean Whitton Date: Sat, 18 Aug 2018 10:52:46 -0700 Subject: drop installation from docs contents Gbp-Pq: Name drop-installation-from-docs-contents.patch --- docs/index.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/index.rst b/docs/index.rst index bd47eac..a932eea 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -110,7 +110,6 @@ practical examples, particular in ``pdfinfo.py``, ``_weave.py``, and :caption: Introduction :name: intro_toc - installation release_notes tutorial -- cgit v1.2.3 From e1ab760d837c6f3f77eb9ad07870583fe8569b87 Mon Sep 17 00:00:00 2001 From: Sean Whitton Date: Thu, 10 Jan 2019 08:32:44 -0700 Subject: disable test_docinfo_problems Needs a test resource whose DFSG status is in doubt. 
Gbp-Pq: Name disable-test_docinfo_problems.patch --- tests/test_metadata.py | 37 ------------------------------------- 1 file changed, 37 deletions(-) diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 1a1417f..3de8ccf 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -57,12 +57,6 @@ def trivial(resources): return Pdf.open(resources / 'pal-1bit-trivial.pdf') -@pytest.fixture -def enron1(resources): - # Has nuls in docinfo, old PDF - return Pdf.open(resources / 'enron1_gs.pdf') - - @pytest.fixture def invalid_creationdate(resources): # Has nuls in docinfo, old PDF @@ -337,37 +331,6 @@ def test_remove_attribute_metadata(sandwich): assert not re.search(r'rdf:Description xmlns:[^\s]+ rdf:about=""/', str(xmp)) -def test_docinfo_problems(enron1, invalid_creationdate): - meta = enron1.open_metadata() - meta._load() # File has invalid XML sequence � - with meta: - with pytest.warns(UserWarning) as warned: - meta.load_from_docinfo(invalid_creationdate.docinfo) - assert 'could not be copied' in warned[0].message.args[0] - with pytest.raises(ValueError): - meta.load_from_docinfo(invalid_creationdate.docinfo, raise_failure=True) - - with pytest.warns(UserWarning) as warned: - with meta as xmp: - xmp['xmp:CreateDate'] = 'invalid date' - assert 'could not be updated' in warned[0].message.args[0] - - -def test_wrong_xml(enron1): - enron1.Root.Metadata = Stream( - enron1, - b""" - This is valid xml but not valid XMP - """.strip(), - ) - meta = enron1.open_metadata() - with pytest.raises(ValueError, match='not XMP'): - with meta: - pass - with pytest.raises(ValueError, match='not XMP'): - meta['pdfaid:part'] - - def test_no_x_xmpmeta(trivial): trivial.Root.Metadata = Stream( trivial, -- cgit v1.2.3 From 084eb9d6ddae9eea61c3f5d450429312bc6ebc74 Mon Sep 17 00:00:00 2001 From: Sean Whitton Date: Thu, 15 Aug 2019 18:12:41 +0100 Subject: drop pybind11 from setup.py We are successfully providing pybind11 as a build-dep but ocrmypdf's setup.py fails to 
detect it for some reason. Gbp-Pq: Name drop-pybind11-from-setup.py.patch --- setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.py b/setup.py index c26840a..ba0c4dc 100644 --- a/setup.py +++ b/setup.py @@ -147,7 +147,6 @@ if __name__ == '__main__': # for mp_compile setup_requires=[ 'setuptools_scm', 'setuptools_scm_git_archive', - 'pybind11 >= 2.3.0, < 3', ], use_scm_version=True, tests_require=tests_require, -- cgit v1.2.3 From b3196873b0a05fd59896fa31b85c1e22af240842 Mon Sep 17 00:00:00 2001 From: Sean Whitton Date: Thu, 15 Aug 2019 18:42:49 +0100 Subject: disable test_icc_extract Requires a test resource in Files-Excluded. Gbp-Pq: Name disable-test_icc_extract.patch --- tests/test_image_access.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/tests/test_image_access.py b/tests/test_image_access.py index b88a5ad..9f0426d 100644 --- a/tests/test_image_access.py +++ b/tests/test_image_access.py @@ -321,13 +321,6 @@ def test_icc_use(resources): assert pim.icc.profile.xcolor_space == 'GRAY' -def test_icc_extract(resources): - xobj, _pdf = first_image_in(resources / 'tree-icc.pdf') - - pim = PdfImage(xobj) - assert pim.as_pil_image().info['icc_profile'] == pim.icc.tobytes() - - def test_stacked_compression(resources): xobj, _pdf = first_image_in(resources / 'pike-flate-jp2.pdf') -- cgit v1.2.3 From e1894f101a0903cfa477e7b6db2ebc4e3b54137c Mon Sep 17 00:00:00 2001 From: Sean Whitton Date: Sat, 18 Aug 2018 09:37:58 -0700 Subject: docs build use DEB_VERSION_UPSTREAM Gbp-Pq: Name docs-build-use-DEB_VERSION_UPSTREAM.patch --- docs/conf.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 5d3c986..f01a2ba 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -76,8 +76,6 @@ else: sys.path.insert(0, os.path.join(os.path.abspath('.'), './_ext')) sys.path.insert(0, os.path.join(os.path.abspath('.'), '..')) -import pikepdf - # -- General configuration ------------------------------------------------ # If your 
documentation needs a minimal Sphinx version, state it here. @@ -123,7 +121,7 @@ author = u'James R. Barlow' # |version| and |release|, also used in various other places throughout the # built documents. -release = get_distribution('pikepdf').version +release = os.environ['DEB_VERSION_UPSTREAM'] version = '.'.join(release.split('.')[:2]) # The language for content autogenerated by Sphinx. Refer to documentation -- cgit v1.2.3 From c2bdee11a58aafff4fcdf50d6e80fc1bb3040890 Mon Sep 17 00:00:00 2001 From: Sean Whitton Date: Sat, 18 Aug 2018 10:52:46 -0700 Subject: drop installation from docs contents Gbp-Pq: Name drop-installation-from-docs-contents.patch --- docs/index.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/index.rst b/docs/index.rst index bd47eac..a932eea 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -110,7 +110,6 @@ practical examples, particular in ``pdfinfo.py``, ``_weave.py``, and :caption: Introduction :name: intro_toc - installation release_notes tutorial -- cgit v1.2.3 From 20f2ab5ae19340472d2f33ef7eb9c53bf5ff90fe Mon Sep 17 00:00:00 2001 From: Sean Whitton Date: Thu, 10 Jan 2019 08:32:44 -0700 Subject: disable test_docinfo_problems Needs a test resource whose DFSG status is in doubt. 
Gbp-Pq: Name disable-test_docinfo_problems.patch --- tests/test_metadata.py | 37 ------------------------------------- 1 file changed, 37 deletions(-) diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 1a1417f..3de8ccf 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -57,12 +57,6 @@ def trivial(resources): return Pdf.open(resources / 'pal-1bit-trivial.pdf') -@pytest.fixture -def enron1(resources): - # Has nuls in docinfo, old PDF - return Pdf.open(resources / 'enron1_gs.pdf') - - @pytest.fixture def invalid_creationdate(resources): # Has nuls in docinfo, old PDF @@ -337,37 +331,6 @@ def test_remove_attribute_metadata(sandwich): assert not re.search(r'rdf:Description xmlns:[^\s]+ rdf:about=""/', str(xmp)) -def test_docinfo_problems(enron1, invalid_creationdate): - meta = enron1.open_metadata() - meta._load() # File has invalid XML sequence � - with meta: - with pytest.warns(UserWarning) as warned: - meta.load_from_docinfo(invalid_creationdate.docinfo) - assert 'could not be copied' in warned[0].message.args[0] - with pytest.raises(ValueError): - meta.load_from_docinfo(invalid_creationdate.docinfo, raise_failure=True) - - with pytest.warns(UserWarning) as warned: - with meta as xmp: - xmp['xmp:CreateDate'] = 'invalid date' - assert 'could not be updated' in warned[0].message.args[0] - - -def test_wrong_xml(enron1): - enron1.Root.Metadata = Stream( - enron1, - b""" - This is valid xml but not valid XMP - """.strip(), - ) - meta = enron1.open_metadata() - with pytest.raises(ValueError, match='not XMP'): - with meta: - pass - with pytest.raises(ValueError, match='not XMP'): - meta['pdfaid:part'] - - def test_no_x_xmpmeta(trivial): trivial.Root.Metadata = Stream( trivial, -- cgit v1.2.3 From 5c478fb4edaeea560cf811518ecbee4a67387317 Mon Sep 17 00:00:00 2001 From: Sean Whitton Date: Thu, 15 Aug 2019 18:12:41 +0100 Subject: drop pybind11 from setup.py We are successfully providing pybind11 as a build-dep but ocrmypdf's setup.py fails to 
detect it for some reason. Gbp-Pq: Name drop-pybind11-from-setup.py.patch --- setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.py b/setup.py index 6fd094a..8f55f90 100644 --- a/setup.py +++ b/setup.py @@ -147,7 +147,6 @@ if __name__ == '__main__': # for mp_compile setup_requires=[ 'setuptools_scm', 'setuptools_scm_git_archive', - 'pybind11 >= 2.4.3, < 3', ], use_scm_version=True, tests_require=tests_require, -- cgit v1.2.3 From 7b8b54380a9334086df2f1305757ce90a90f6c70 Mon Sep 17 00:00:00 2001 From: Sean Whitton Date: Thu, 15 Aug 2019 18:42:49 +0100 Subject: disable test_icc_extract Requires a test resource in Files-Excluded. Gbp-Pq: Name disable-test_icc_extract.patch --- tests/test_image_access.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/tests/test_image_access.py b/tests/test_image_access.py index b88a5ad..9f0426d 100644 --- a/tests/test_image_access.py +++ b/tests/test_image_access.py @@ -321,13 +321,6 @@ def test_icc_use(resources): assert pim.icc.profile.xcolor_space == 'GRAY' -def test_icc_extract(resources): - xobj, _pdf = first_image_in(resources / 'tree-icc.pdf') - - pim = PdfImage(xobj) - assert pim.as_pil_image().info['icc_profile'] == pim.icc.tobytes() - - def test_stacked_compression(resources): xobj, _pdf = first_image_in(resources / 'pike-flate-jp2.pdf') -- cgit v1.2.3