diff options
author | James R. Barlow <jim@purplerock.ca> | 2019-07-24 15:37:29 -0700 |
---|---|---|
committer | James R. Barlow <jim@purplerock.ca> | 2019-07-24 15:37:29 -0700 |
commit | 81871d3e099277ff8f47fdb541a9d7e95b9e55ea (patch) | |
tree | 4e3b61308971daad9f3a8b9b8f863f3a055ab37d /src | |
parent | 00688818af464c9d8e685274cdfe2f92f0475a86 (diff) | |
parent | 9a7d8a45feae9020980b1afe4bab9ff7f6e9e0cc (diff) |
Merge branch 'feature/page-object-helper'
Diffstat (limited to 'src')
-rw-r--r-- | src/pikepdf/__init__.py | 12 | ||||
-rw-r--r-- | src/pikepdf/_methods.py | 67 | ||||
-rw-r--r-- | src/pikepdf/_version.py | 2 | ||||
-rw-r--r-- | src/pikepdf/codec.py | 2 | ||||
-rw-r--r-- | src/pikepdf/models/__init__.py | 41 | ||||
-rw-r--r-- | src/pikepdf/models/image.py | 9 | ||||
-rw-r--r-- | src/qpdf/object.cpp | 11 | ||||
-rw-r--r-- | src/qpdf/object_parsers.h | 1 | ||||
-rw-r--r-- | src/qpdf/page.cpp | 281 | ||||
-rw-r--r-- | src/qpdf/pikepdf.cpp | 1 | ||||
-rw-r--r-- | src/qpdf/pikepdf.h | 3 | ||||
-rw-r--r-- | src/qpdf/pipeline.cpp | 60 | ||||
-rw-r--r-- | src/qpdf/pipeline.h (renamed from src/qpdf/qpdf_pipeline.h) | 33 | ||||
-rw-r--r-- | src/qpdf/qpdf.cpp | 2 |
14 files changed, 440 insertions, 85 deletions
diff --git a/src/pikepdf/__init__.py b/src/pikepdf/__init__.py index 8de467a..50e3af0 100644 --- a/src/pikepdf/__init__.py +++ b/src/pikepdf/__init__.py @@ -12,7 +12,17 @@ except ImportError: raise ImportError("pikepdf's extension library failed to import") from ._version import __version__ -from ._qpdf import PdfError, Pdf, PasswordError, ObjectStreamMode, StreamDecodeLevel +from ._qpdf import ( + PdfError, + Pdf, + PasswordError, + ObjectStreamMode, + StreamDecodeLevel, + Page, + Token, + TokenFilter, + TokenType, +) from .objects import ( Object, ObjectType, diff --git a/src/pikepdf/_methods.py b/src/pikepdf/_methods.py index ac6134c..1f44da4 100644 --- a/src/pikepdf/_methods.py +++ b/src/pikepdf/_methods.py @@ -19,12 +19,14 @@ from io import BytesIO from subprocess import PIPE, run from tempfile import NamedTemporaryFile -from . import Array, Dictionary, Name, Object, Pdf, Stream -from ._qpdf import _ObjectMapping +from . import Array, Dictionary, Name, Object, Pdf, Stream, Page +from ._qpdf import _ObjectMapping, Token from .models import PdfMetadata, Permissions, EncryptionInfo # pylint: disable=no-member,unsupported-membership-test,unsubscriptable-object +__all__ = [] + def augments(cls_cpp): """Attach methods of a Python support class to an existing class @@ -316,7 +318,7 @@ class Extend_Pdf: def close(self): """ - Close a Pdf object and release resources acquired by pikepdf + Close a Pdf object and release resources acquired by pikepdf. If pikepdf opened the file handle it will close it (e.g. when opened with a file path). If the caller opened the file for pikepdf, the caller close the file. @@ -380,7 +382,8 @@ class Extend_Pdf: pikepdf has no way of enforcing permissions. - Returns: pikepdf.models.Permissions + Returns: + pikepdf.models.Permissions """ results = {} for field in Permissions.fields(): @@ -489,3 +492,59 @@ class Extend_ObjectMapping: def values(self): return (v for _k, v in self.items()) + + +def check_is_box(obj): + try: + if obj.is_rectangle: + return True + except AttributeError: + pass + + try: + pdfobj = Array(obj) + if pdfobj.is_rectangle: + return True + except Exception: + pass + + raise ValueError("object is not a rectangle") + + +@augments(Page) +class Extend_Page: + @property + def mediabox(self): + return self._get_mediabox(True) + + @mediabox.setter + def mediabox(self, value): + check_is_box(value) + self.obj['/MediaBox'] = value + + @property + def cropbox(self): + return self._get_cropbox(True) + + @cropbox.setter + def cropbox(self, value): + check_is_box(value) + self.obj['/CropBox'] = value + + @property + def trimbox(self): + return self._get_trimbox(True) + + @trimbox.setter + def trimbox(self, value): + check_is_box(value) + self.obj['/TrimBox'] = value + + def __repr__(self): + return repr(self.obj).replace('Dictionary', 'Page') + + +@augments(Token) +class Extend_Token: + def __repr__(self): + return 'pikepdf.Token({}, {})'.format(self.type_, self.raw_value) diff --git a/src/pikepdf/_version.py b/src/pikepdf/_version.py index c9d4b7b..ab4ff9c 100644 --- a/src/pikepdf/_version.py +++ b/src/pikepdf/_version.py @@ -11,3 +11,5 @@ try: __version__ = _get_distribution(__package__).version except DistributionNotFound: __version__ = "Not installed" + +__all__ = ['__version__'] diff --git a/src/pikepdf/codec.py b/src/pikepdf/codec.py index d008fb2..df2f6e3 100644 --- a/src/pikepdf/codec.py +++ b/src/pikepdf/codec.py @@ -46,3 +46,5 @@ def find_pdfdoc(encoding): codecs.register(find_pdfdoc) + +__all__ = [] diff --git a/src/pikepdf/models/__init__.py b/src/pikepdf/models/__init__.py index 023b836..1de973c 100644 --- a/src/pikepdf/models/__init__.py +++ b/src/pikepdf/models/__init__.py @@ -72,44 +72,3 @@ def parse_content_stream(page_or_stream, operators=''): raise e from e return instructions - - -class _Page: - def __init__(self, obj): - self.obj = obj - - def __getattr__(self, item): - return getattr(self.obj, item) - - def __setattr__(self, item, value): - if item == 'obj': - object.__setattr__(self, item, value) - elif hasattr(self.obj, item): - setattr(self.obj, item, value) - else: - raise AttributeError(item) - - def __repr__(self): - return repr(self.obj).replace('pikepdf.Dictionary', 'pikepdf.Page', 1) - - @property - def mediabox(self): - return self.obj.MediaBox - - def has_text(self): - """Check if this page print text - - Search the content stream for any of the four text showing operators. - We ignore text positioning operators because some editors might - generate maintain these even if text is deleted etc. - - This cannot detect raster text (text in a bitmap), text rendered as - curves. It also cannot determine if the text is visible to the user. - - :return: True if there is text - """ - text_showing_operators = """TJ " ' Tj""" - text_showing_insts = parse_content_stream(self.obj, text_showing_operators) - if len(text_showing_insts) > 0: - return True - return False diff --git a/src/pikepdf/models/image.py b/src/pikepdf/models/image.py index 6493d85..a75cbf8 100644 --- a/src/pikepdf/models/image.py +++ b/src/pikepdf/models/image.py @@ -491,9 +491,12 @@ class PdfImage(PdfImageBase): without the file extension. Returns: - str: If *fileprefix* was provided, then the fileprefix with the - appropriate extension. If no *fileprefix*, then an extension - indicating the file type. + If *fileprefix* was provided, then the fileprefix with the + appropriate extension. If no *fileprefix*, then an extension + indicating the file type. + + Return type: + str """ if bool(stream) == bool(fileprefix): diff --git a/src/qpdf/object.cpp b/src/qpdf/object.cpp index cebf690..470bb41 100644 --- a/src/qpdf/object.cpp +++ b/src/qpdf/object.cpp @@ -550,6 +550,9 @@ void init_object(py::module& m) }, "Return the object wrapped in an array if not already an array." ) + .def_property_readonly("is_rectangle", &QPDFObjectHandle::isRectangle, + "Returns True if the object is a rectangle (an array of 4 numbers)" + ) .def("get_stream_buffer", [](QPDFObjectHandle &h) { PointerHolder<Buffer> phbuf = h.getStreamData(); @@ -711,7 +714,7 @@ void init_object(py::module& m) representable and will be serialized as ``null``. Args: - dereference (bool): If True, deference the object is this is an + dereference (bool): If True, dereference the object is this is an indirect object. Returns: @@ -817,11 +820,11 @@ void init_object(py::module& m) "Construct a PDF Null object" ); - py::class_<QPDFObjectHandle::ParserCallbacks, PyParserCallbacks> parsercallbacks(m, "StreamParser"); - parsercallbacks + py::class_<QPDFObjectHandle::ParserCallbacks, PyParserCallbacks>(m, "StreamParser") .def(py::init<>()) .def("handle_object", &QPDFObjectHandle::ParserCallbacks::handleObject) - .def("handle_eof", &QPDFObjectHandle::ParserCallbacks::handleEOF); + .def("handle_eof", &QPDFObjectHandle::ParserCallbacks::handleEOF) + ; m.def("_encode", [](py::handle handle) { diff --git a/src/qpdf/object_parsers.h b/src/qpdf/object_parsers.h index 2d5d986..b8f9f48 100644 --- a/src/qpdf/object_parsers.h +++ b/src/qpdf/object_parsers.h @@ -12,6 +12,7 @@ #include <pybind11/stl.h> #include "pikepdf.h" +#include <qpdf/QPDFTokenizer.hh> class PyParserCallbacks : public QPDFObjectHandle::ParserCallbacks { diff --git a/src/qpdf/page.cpp b/src/qpdf/page.cpp new file mode 100644 index 0000000..80b35c6 --- /dev/null +++ b/src/qpdf/page.cpp @@ -0,0 +1,281 @@ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Copyright (C) 2017, James R. Barlow (https://github.com/jbarlow83/) + */ + +#include <sstream> +#include <iostream> +#include <iomanip> +#include <cctype> + +#include "pikepdf.h" + +#include <qpdf/QPDFPageObjectHelper.hh> +#include <qpdf/Pipeline.hh> +#include <qpdf/Pl_Buffer.hh> + + +class TokenFilter : public QPDFObjectHandle::TokenFilter { +public: + using QPDFObjectHandle::TokenFilter::TokenFilter; + virtual ~TokenFilter() = default; + using Token = QPDFTokenizer::Token; + + void handleToken(Token const& token) override { + py::object result = this->handle_token(token); + if (result.is_none()) + return; + try { + if (py::hasattr(result, "__iter__")) { + for (auto item : result) { + const auto returned_token = item.cast<Token>(); + this->writeToken(returned_token); + } + } else { + const auto returned_token = result.cast<Token>(); + this->writeToken(returned_token); + } + } catch (const py::cast_error &e) { + throw py::type_error("returned object that is not a token"); + } + } + + virtual py::object handle_token(Token const& token) = 0; +}; + + +class TokenFilterTrampoline : public TokenFilter { +public: + using TokenFilter::TokenFilter; + using Token = QPDFTokenizer::Token; + + py::object handle_token(Token const& token) override { + PYBIND11_OVERLOAD_PURE( + py::object, + TokenFilter, + handle_token, + token + ); + } +}; + +void init_page(py::module& m) +{ + py::class_<QPDFPageObjectHelper>(m, "Page") + .def(py::init<QPDFObjectHandle &>()) + .def_property_readonly("obj", + [](QPDFPageObjectHelper &poh) { + return poh.getObjectHandle(); + }, + R"~~~( + Get the underlying :class:`pikepdf.Object`. + )~~~" + ) + .def_property_readonly("_images", &QPDFPageObjectHelper::getPageImages) + .def("_get_mediabox", &QPDFPageObjectHelper::getMediaBox) + .def("_get_cropbox", &QPDFPageObjectHelper::getCropBox) + .def("_get_trimbox", &QPDFPageObjectHelper::getTrimBox) + .def("externalize_inline_images", &QPDFPageObjectHelper::externalizeInlineImages, + py::arg("min_size") = 0, + R"~~~( + Convert inlines image to normal (external) images. + + Args: + min_size (int): minimum size in bytes + )~~~" + ) + .def("rotate", &QPDFPageObjectHelper::rotatePage, + py::arg("angle"), py::arg("relative"), + R"~~~( + Rotate a page. + + If ``relative`` is ``False``, set the rotation of the + page to angle. Otherwise, add angle to the rotation of the + page. ``angle`` must be a multiple of ``90``. Adding ``90`` to + the rotation rotates clockwise by ``90`` degrees. + )~~~" + ) + .def("contents_coalesce", &QPDFPageObjectHelper::coalesceContentStreams, + R"~~~( + Coalesce a page's content streams. + + A page's content may be a + stream or an array of streams. If this page's content is an + array, concatenate the streams into a single stream. This can + be useful when working with files that split content streams in + arbitrary spots, such as in the middle of a token, as that can + confuse some software. + )~~~" + ) + .def("remove_unreferenced_resources", &QPDFPageObjectHelper::removeUnreferencedResources, + R"~~~( + Removes from the resources dictionary any object not referenced in the content stream. + + A page's resources dictionary maps names to objects elsewhere + in the file. This method walks through a page's contents and + keeps tracks of which resources are referenced somewhere in the + contents. Then it removes from the resources dictionary any + object that is not referenced in the contents. This + method is used by page splitting code to avoid copying unused + objects in files that used shared resource dictionaries across + multiple pages. + )~~~" + ) + .def("as_form_xobject", &QPDFPageObjectHelper::getFormXObjectForPage, + py::arg("handle_transformations") = true, + R"~~~( + Return a form XObject that draws this page. + + This is useful for + n-up operations, underlay, overlay, thumbnail generation, or + any other case in which it is useful to replicate the contents + of a page in some other context. The dictionaries are shallow + copies of the original page dictionary, and the contents are + coalesced from the page's contents. The resulting object handle + is not referenced anywhere. + + Args: + handle_transformations (bool): If True, the resulting form + XObject's ``/Matrix`` will be set to replicate rotation + (``/Rotate``) and scaling (``/UserUnit``) in the page's + dictionary. In this way, the page's transformations will + be preserved when placing this object on another page. + )~~~" + ) + .def("get_filtered_contents", + [](QPDFPageObjectHelper &poh, TokenFilter &tf) { + Pl_Buffer pl_buffer("filter_page"); + poh.filterPageContents(&tf, &pl_buffer); + + PointerHolder<Buffer> buf(pl_buffer.getBuffer()); + auto data = reinterpret_cast<const char*>(buf->getBuffer()); + auto size = buf->getSize(); + return py::bytes(data, size); + }, + py::arg("tf"), + R"~~~( + Apply a :class:`pikepdf.TokenFilter` to a content stream, without modifying it. + + This may be used when the results of a token filter do not need + to be applied, such as when filtering is being used to retrieve + information rather than edit the content stream. + + Note that it is possible to create a subclassed ``TokenFilter`` + that saves information of interest to its object attributes; it + is not necessary to return data in the content stream. + + To modify the content stream, use :meth:`pikepdf.Page.add_content_token_filter`. + + Returns: + bytes: the modified content stream + )~~~" + ) + .def("add_content_token_filter", + [](QPDFPageObjectHelper &poh, PointerHolder<QPDFObjectHandle::TokenFilter> tf) { + poh.addContentTokenFilter(tf); + }, + py::keep_alive<1, 2>(), py::arg("tf"), + R"~~~( + Attach a :class:`pikepdf.TokenFilter` to a page's content stream. + + This function applies token filters lazily, if/when the page's + content stream is read for any reason, such as when the PDF is + saved. If never access, the token filter is not applied. + + Multiple token filters may be added to a page/content stream. + + If the page's contents is an array of streams, it is coalesced. + )~~~" + ) + ; + + py::enum_<QPDFTokenizer::token_type_e>(m, "TokenType") + .value("bad", QPDFTokenizer::token_type_e::tt_bad) + .value("array_close", QPDFTokenizer::token_type_e::tt_array_close) + .value("array_open", QPDFTokenizer::token_type_e::tt_array_open) + .value("brace_close", QPDFTokenizer::token_type_e::tt_brace_close) + .value("brace_open", QPDFTokenizer::token_type_e::tt_brace_open) + .value("dict_close", QPDFTokenizer::token_type_e::tt_dict_close) + .value("dict_open", QPDFTokenizer::token_type_e::tt_dict_open) + .value("integer", QPDFTokenizer::token_type_e::tt_integer) + .value("name", QPDFTokenizer::token_type_e::tt_name) + .value("real", QPDFTokenizer::token_type_e::tt_real) + .value("string", QPDFTokenizer::token_type_e::tt_string) + .value("null", QPDFTokenizer::token_type_e::tt_null) + .value("bool", QPDFTokenizer::token_type_e::tt_bool) + .value("word", QPDFTokenizer::token_type_e::tt_word) + .value("eof", QPDFTokenizer::token_type_e::tt_eof) + .value("space", QPDFTokenizer::token_type_e::tt_space) + .value("comment", QPDFTokenizer::token_type_e::tt_comment) + .value("inline_image", QPDFTokenizer::token_type_e::tt_inline_image) + ; + + py::class_<QPDFTokenizer::Token>(m, "Token") + .def(py::init<QPDFTokenizer::token_type_e, py::bytes>()) + .def_property_readonly("type_", &QPDFTokenizer::Token::getType, + R"~~~( + Returns the type of token. + + Return type: + pikepdf.TokenType + )~~~" + ) + .def_property_readonly("value", &QPDFTokenizer::Token::getValue, + R"~~~( + Interprets the token as a string. + + Return type: + str or bytes + )~~~" + ) + .def_property_readonly("raw_value", + [](const QPDFTokenizer::Token& t) -> py::bytes { + return t.getRawValue(); + }, + R"~~~( + The binary representation of a token. + + Return type: + bytes + )~~~" + ) + .def_property_readonly("error_msg", &QPDFTokenizer::Token::getErrorMessage) + .def("__eq__", &QPDFTokenizer::Token::operator==) + ; + + py::class_<QPDFObjectHandle::TokenFilter, + PointerHolder<QPDFObjectHandle::TokenFilter>>qpdftokenfilter (m, "_QPDFTokenFilter"); + + py::class_<TokenFilter, TokenFilterTrampoline, PointerHolder<TokenFilter>>(m, "TokenFilter", qpdftokenfilter) + .def(py::init<>()) + .def("handle_token", &TokenFilter::handle_token, + R"~~~( + Handle a :class:`pikepdf.Token`. + + This is an abstract method that must be defined in a subclass + of ``TokenFilter``. The method will be called for each token. + The implementation may return either ``None`` to discard the + token, the original token to include it, a new token, or an + iterable containing zero or more tokens. An implementation may + also buffer tokens and release them in groups (for example, it + could collect an entire PDF command with all of its operands, + and then return all of it). + + The final token will always be a token of type ``TokenType.eof``, + (unless an exception is raised). + + Any Python exception raised in this function will be trapped by + C++ code and converted to a PdfError. If you need to learn what + Python exception caused a problem, you must store this information + elsewhere. + + Return type: + None or list or pikepdf.Token + )~~~", + py::arg_v("token", QPDFTokenizer::Token(), "pikepdf.Token()") + ) + ; +} diff --git a/src/qpdf/pikepdf.cpp b/src/qpdf/pikepdf.cpp index 2daa69a..e213e5e 100644 --- a/src/qpdf/pikepdf.cpp +++ b/src/qpdf/pikepdf.cpp @@ -54,6 +54,7 @@ PYBIND11_MODULE(_qpdf, m) { init_pagelist(m); init_object(m); init_annotation(m); + init_page(m); m.def("utf8_to_pdf_doc", [](py::str utf8, char unknown) { diff --git a/src/qpdf/pikepdf.h b/src/qpdf/pikepdf.h index d5821e9..5fa9e53 100644 --- a/src/qpdf/pikepdf.h +++ b/src/qpdf/pikepdf.h @@ -168,6 +168,9 @@ std::map<std::string, QPDFObjectHandle> dict_builder(const py::dict dict); // From annotation.cpp void init_annotation(py::module &m); +// From page.cpp +void init_page(py::module &m); + // Support for recursion checks class StackGuard { diff --git a/src/qpdf/pipeline.cpp b/src/qpdf/pipeline.cpp new file mode 100644 index 0000000..1856987 --- /dev/null +++ b/src/qpdf/pipeline.cpp @@ -0,0 +1,60 @@ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Copyright (C) 2017, James R. Barlow (https://github.com/jbarlow83/) + */ + + +#include <qpdf/Constants.h> +#include <qpdf/Types.h> +#include <qpdf/DLL.h> +#include <qpdf/QPDFExc.hh> +#include <qpdf/PointerHolder.hh> +#include <qpdf/Buffer.hh> +#include <qpdf/QPDF.hh> +#include <qpdf/Pipeline.hh> +#include <qpdf/QUtil.hh> + +#include <pybind11/pybind11.h> +#include <pybind11/stl.h> + +#include "pikepdf.h" +#include "pipeline.h" + + +void Pl_PythonOutput::write(unsigned char *buf, size_t len) +{ + py::gil_scoped_acquire gil; + ssize_t so_far = 0; + while (len > 0) { + py::buffer_info buffer(buf, len); + py::memoryview view_buffer(buffer); + py::object result = this->stream.attr("write")(view_buffer); + try { + so_far = result.cast<ssize_t>(); + } catch (const py::cast_error &e) { + throw py::type_error("Unexpected return type of write()"); + } + if (so_far <= 0) { + QUtil::throw_system_error(this->identifier); + } else { + auto diff = len - so_far; + if (diff > len) + throw py::value_error("Wrote more bytes than requested"); + buf += so_far; + len -= so_far; + } + } +} + +void Pl_PythonOutput::finish() +{ + py::gil_scoped_acquire gil; + try { + this->stream.attr("flush")(); + } catch (const py::attr_error &e) { + // Suppress + } +} diff --git a/src/qpdf/qpdf_pipeline.h b/src/qpdf/pipeline.h index f922827..a448d03 100644 --- a/src/qpdf/qpdf_pipeline.h +++ b/src/qpdf/pipeline.h @@ -40,37 +40,8 @@ public: Pl_PythonOutput(Pl_PythonOutput&&) = delete; Pl_PythonOutput& operator= (Pl_PythonOutput&&) = delete; - void write(unsigned char *buf, size_t len) - { - py::gil_scoped_acquire gil; - size_t so_far = 0; - while (len > 0) { - py::buffer_info buffer(buf, len); - py::memoryview view_buffer(buffer); - py::object result = this->stream.attr("write")(view_buffer); - try { - so_far = result.cast<size_t>(); - } catch (const py::cast_error &e) { - throw py::type_error("Unexpected return type of write()"); - } - if (so_far == 0) { - QUtil::throw_system_error(this->identifier); - } else { - buf += so_far; - len -= so_far; - } - } - } - - void finish() - { - py::gil_scoped_acquire gil; - try { - this->stream.attr("flush")(); - } catch (const py::attr_error &e) { - // Suppress - } - } + void write(unsigned char *buf, size_t len) override; + void finish() override; private: py::object stream; diff --git a/src/qpdf/qpdf.cpp b/src/qpdf/qpdf.cpp index 121b1a3..855a411 100644 --- a/src/qpdf/qpdf.cpp +++ b/src/qpdf/qpdf.cpp @@ -28,7 +28,7 @@ #include "qpdf_pagelist.h" #include "qpdf_inputsource.h" -#include "qpdf_pipeline.h" +#include "pipeline.h" #include "utils.h" |