diff options
Diffstat (limited to 'src/qpdf')
-rw-r--r-- | src/qpdf/annotation.cpp | 52 | ||||
-rw-r--r-- | src/qpdf/object.cpp | 102 | ||||
-rw-r--r-- | src/qpdf/pikepdf.cpp | 98 | ||||
-rw-r--r-- | src/qpdf/pikepdf.h | 30 | ||||
-rw-r--r-- | src/qpdf/qpdf.cpp | 585 | ||||
-rw-r--r-- | src/qpdf/qpdf_inputsource.h | 2 | ||||
-rw-r--r-- | src/qpdf/qpdf_pagelist.cpp | 87 | ||||
-rw-r--r-- | src/qpdf/qpdf_pipeline.h | 77 |
8 files changed, 814 insertions, 219 deletions
diff --git a/src/qpdf/annotation.cpp b/src/qpdf/annotation.cpp new file mode 100644 index 0000000..f82ebdf --- /dev/null +++ b/src/qpdf/annotation.cpp @@ -0,0 +1,52 @@ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Copyright (C) 2019, James R. Barlow (https://github.com/jbarlow83/) + */ + + + +#include <qpdf/Constants.h> +#include <qpdf/Types.h> +#include <qpdf/DLL.h> +#include <qpdf/QPDFExc.hh> +#include <qpdf/PointerHolder.hh> +#include <qpdf/QPDFAnnotationObjectHelper.hh> + +#include <pybind11/pybind11.h> +#include <pybind11/stl.h> + +#include "pikepdf.h" + + +void init_annotation(py::module &m) +{ + py::class_<QPDFAnnotationObjectHelper>(m, "Annotation") + .def(py::init<QPDFObjectHandle &>(), py::keep_alive<0, 1>()) + .def_property_readonly("subtype", &QPDFAnnotationObjectHelper::getSubtype) + .def_property_readonly("flags", &QPDFAnnotationObjectHelper::getFlags) + .def_property_readonly("appearance_state", &QPDFAnnotationObjectHelper::getAppearanceState) + .def_property_readonly("appearance_dict", &QPDFAnnotationObjectHelper::getAppearanceDictionary) + .def("get_appearance_stream", + [](QPDFAnnotationObjectHelper& anno, QPDFObjectHandle& which, std::string const& state = "") { + // if (!which.isName()) + // throw py::type_error("which must be pikepdf.Name"); + return anno.getAppearanceStream(which.getName(), state); + }, + py::arg("which"), + py::arg("state") = "" + ) + .def("get_page_content_for_appearance", + [](QPDFAnnotationObjectHelper& anno, QPDFObjectHandle& name, int rotate, int required_flags, int forbidden_flags) { + //auto name = name_.getName(); + return anno.getPageContentForAppearance(name.getName(), rotate, required_flags, forbidden_flags); + }, + py::arg("name"), + py::arg("rotate"), + py::arg("required_flags") = 0, + py::arg("forbidden_flags") = an_invisible | an_hidden + ) + 
; +} diff --git a/src/qpdf/object.cpp b/src/qpdf/object.cpp index 392d9ff..1270961 100644 --- a/src/qpdf/object.cpp +++ b/src/qpdf/object.cpp @@ -363,7 +363,8 @@ void init_object(py::module& m) [](QPDFObjectHandle &h, std::shared_ptr<QPDF> possible_owner) { return (h.getOwningQPDF() == possible_owner.get()); }, - "Test if this object is owned by the indicated *possible_owner*." + "Test if this object is owned by the indicated *possible_owner*.", + py::arg("possible_owner") ) .def_property_readonly("is_indirect", &QPDFObjectHandle::isIndirect) .def("__repr__", &objecthandle_repr) @@ -555,9 +556,9 @@ void init_object(py::module& m) } return py::cast(value); }, - "for dictionary objects, behave as dict.get(key, default=None)", + "For ``pikepdf.Dictionary`` objects, behave as ``dict.get(key, default=None)``", py::arg("key"), - py::arg("default_") = py::none(), + py::arg("default") = py::none(), py::return_value_policy::reference_internal ) .def("get", @@ -570,9 +571,9 @@ void init_object(py::module& m) } return py::cast(value); }, - "for dictionary objects, behave as dict.get(key, default=None)", + "For ``pikepdf.Dictionary`` objects, behave as ``dict.get(key, default=None)``", py::arg("key"), - py::arg("default_") = py::none(), + py::arg("default") = py::none(), py::return_value_policy::reference_internal ) .def("keys", &QPDFObjectHandle::getKeys) @@ -664,6 +665,12 @@ void init_object(py::module& m) h.eraseItem(u_index); } ) + .def("wrap_in_array", + [](QPDFObjectHandle &h) { + return h.wrapInArray(); + }, + "Return the object wrapped in an array if not already an array." 
+ ) .def("get_stream_buffer", [](QPDFObjectHandle &h) { PointerHolder<Buffer> phbuf = h.getStreamData(); @@ -694,37 +701,17 @@ void init_object(py::module& m) }, "Read the content stream associated with this object without decoding" ) - .def("write", - [](QPDFObjectHandle &h, py::bytes data, py::args args, py::kwargs kwargs) { + .def("_write", + [](QPDFObjectHandle &h, py::bytes data, py::object filter, py::object decode_parms) { std::string sdata = data; - QPDFObjectHandle filter = QPDFObjectHandle::newNull(); - QPDFObjectHandle decode_parms = QPDFObjectHandle::newNull(); - if (args.size() != 0) - throw py::value_error("Too many positional arguments"); - if (kwargs.contains("filter")) - filter = objecthandle_encode(kwargs["filter"]); - if (kwargs.contains("decode_parms")) - decode_parms = objecthandle_encode(kwargs["decode_parms"]); - h.replaceStreamData(sdata, filter, decode_parms); + QPDFObjectHandle h_filter = objecthandle_encode(filter); + QPDFObjectHandle h_decode_parms = objecthandle_encode(decode_parms); + h.replaceStreamData(sdata, h_filter, h_decode_parms); }, R"~~~( - Replace the content stream with `data`, compressed according to `filter` and `decode_parms` - - :param data: the new data to use for replacement - :type data: bytes - :param filter: The filter(s) with which the data is (already) encoded - :param decode_parms: Parameters for the filters with which the object is encode - - If only one `filter` is specified, it may be a name such as - `Name('/FlateDecode')`. If there are multiple filters, then array - of names should be given. - - If there is only one filter, `decode_parms` is a Dictionary of - parameters for that filter. If there are multiple filters, then - `decode_parms` is an Array of Dictionary, where each array index - is corresponds to the filter. - - )~~~" + Low level write/replace stream data without argument checking. Use .write(). 
+ )~~~", + py::arg("data"), py::arg("filter"), py::arg("decode_parms") ) .def_property_readonly("images", [](QPDFObjectHandle &h) { @@ -749,7 +736,16 @@ void init_object(py::module& m) py::arg("prepend") = false, py::keep_alive<1, 2>() ) - .def("page_contents_coalesce", &QPDFObjectHandle::coalesceContentStreams) + .def("page_contents_coalesce", &QPDFObjectHandle::coalesceContentStreams, + R"~~~( + Coalesce an array of page content streams into a single content stream. + + The PDF specification allows the ``/Contents`` object to contain either + an array of content streams or a single content stream. However, it + simplifies parsing and editing if there is only a single content stream. + This function merges all content streams. + )~~~" + ) .def_property_readonly("_objgen", &object_get_objgen ) @@ -811,6 +807,41 @@ void init_object(py::module& m) py::arg("resolved") = false, "Convert PDF objects into their binary representation, optionally resolving indirect objects." ) + .def("to_json", + [](QPDFObjectHandle &h, bool dereference = false) -> py::bytes { + return h.getJSON(dereference).unparse(); + }, + py::arg("dereference") = false, + R"~~~( + Convert to a QPDF JSON representation of the object. + + See the QPDF manual for a description of its JSON representation. + http://qpdf.sourceforge.net/files/qpdf-manual.html#ref.json + + Not necessarily compatible with other PDF-JSON representations that + exist in the wild. + + * Names are encoded as UTF-8 strings + * Indirect references are encoded as strings containing ``obj gen R`` + * Strings are encoded as UTF-8 strings with unrepresentable binary + characters encoded as ``\uHHHH`` + * Encoding streams just encodes the stream's dictionary; the stream + data is not represented + * Object types that are only valid in content streams (inline + image, operator) as well as "reserved" objects are not + representable and will be serialized as ``null``. 
+ + Args: + dereference (bool): If True, deference the object is this is an + indirect object. + + Returns: + bytes: JSON bytestring of object. The object is UTF-8 encoded + and may be decoded to a Python str that represents the binary + values ``\x00-\xFF`` as ``U+0000`` to ``U+00FF``; that is, + it may contain mojibake. + )~~~" + ) ; // end of QPDFObjectHandle bindings m.def("_new_boolean", &QPDFObjectHandle::newBool, "Construct a PDF Boolean object"); @@ -900,7 +931,8 @@ void init_object(py::module& m) [](const std::string& op) { return QPDFObjectHandle::newOperator(op); }, - "Construct a PDF Operator object for use in content streams" + "Construct a PDF Operator object for use in content streams.", + py::arg("op") ); m.def("_Null", &QPDFObjectHandle::newNull, "Construct a PDF Null object" diff --git a/src/qpdf/pikepdf.cpp b/src/qpdf/pikepdf.cpp new file mode 100644 index 0000000..2daa69a --- /dev/null +++ b/src/qpdf/pikepdf.cpp @@ -0,0 +1,98 @@ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Copyright (C) 2019, James R. 
Barlow (https://github.com/jbarlow83/) + */ + +#include <sstream> +#include <type_traits> +#include <cerrno> +#include <cstring> + +#include "pikepdf.h" + +#include <qpdf/QPDFExc.hh> +#include <qpdf/QPDFSystemError.hh> +#include <qpdf/QUtil.hh> + +#include <pybind11/stl.h> +#include <pybind11/iostream.h> +#include <pybind11/buffer_info.h> + +#include "qpdf_pagelist.h" +#include "utils.h" + + +extern "C" const char* qpdf_get_qpdf_version(); + + +class TemporaryErrnoChange { +public: + TemporaryErrnoChange(int val) { + stored = errno; + errno = val; + } + ~TemporaryErrnoChange() { + errno = stored; + } +private: + int stored; +}; + + +PYBIND11_MODULE(_qpdf, m) { + //py::options options; + //options.disable_function_signatures(); + + m.doc() = "pikepdf provides a Pythonic interface for QPDF"; + + m.def("qpdf_version", &qpdf_get_qpdf_version, "Get libqpdf version"); + + init_qpdf(m); + init_pagelist(m); + init_object(m); + init_annotation(m); + + m.def("utf8_to_pdf_doc", + [](py::str utf8, char unknown) { + std::string pdfdoc; + bool success = QUtil::utf8_to_pdf_doc(std::string(utf8), pdfdoc, unknown); + return py::make_tuple(success, py::bytes(pdfdoc)); + } + ); + m.def("pdf_doc_to_utf8", + [](py::bytes pdfdoc) -> py::str { + return py::str(QUtil::pdf_doc_to_utf8(pdfdoc)); + } + ); + + static py::exception<QPDFExc> exc_main(m, "PdfError"); + static py::exception<QPDFExc> exc_password(m, "PasswordError"); + py::register_exception_translator([](std::exception_ptr p) { + try { + if (p) std::rethrow_exception(p); + } catch (const QPDFExc &e) { + if (e.getErrorCode() == qpdf_e_password) { + exc_password(e.what()); + } else { + exc_main(e.what()); + } + } catch (const QPDFSystemError &e) { + if (e.getErrno() != 0) { + TemporaryErrnoChange errno_holder(e.getErrno()); + PyErr_SetFromErrnoWithFilename(PyExc_OSError, e.getDescription().c_str()); + } else { + exc_main(e.what()); + } + } + }); + + +#ifdef VERSION_INFO + m.attr("__version__") = VERSION_INFO; +#else + 
m.attr("__version__") = "dev"; +#endif +} diff --git a/src/qpdf/pikepdf.h b/src/qpdf/pikepdf.h index 7fbd6e8..0acd807 100644 --- a/src/qpdf/pikepdf.h +++ b/src/qpdf/pikepdf.h @@ -40,9 +40,6 @@ namespace pybind11 { namespace detail { }; }} -#define CUSTOM_TYPE_CONVERSION 1 -#if CUSTOM_TYPE_CONVERSION - // From object_convert.cpp pybind11::object decimal_from_pdfobject(QPDFObjectHandle h); @@ -57,24 +54,9 @@ namespace pybind11 { namespace detail { * Conversion part 1 (Python->C++): convert a PyObject into a Object */ bool load(handle src, bool convert) { - // if (src.is_none()) { - // if (!convert) return false; - // value = QPDFObjectHandle::newNull(); - // return true; - // } - // Attempting to construct these does not work... - // if (convert) { - // if (PYBIND11_LONG_CHECK(src.ptr())) { - // auto as_int = src.cast<long long>(); - // value = QPDFObjectHandle::newInteger(as_int); - // } /*else if (PyFloat_Check(src.ptr())) { - // auto as_double = src.cast<double>(); - // value = QPDFObjectHandle::newReal(as_double); - // } */ else { - // return base::load(src, convert); - // } - // return true; - // } + // Do whatever our base does + // Potentially we could convert some scalars to QPDFObjectHandle here, + // but most of the interfaces just expect straight C++ types. 
return base::load(src, convert); } @@ -157,7 +139,6 @@ namespace pybind11 { namespace detail { } }; }} // namespace pybind11::detail -#endif namespace py = pybind11; @@ -166,6 +147,8 @@ PYBIND11_MAKE_OPAQUE(std::vector<QPDFObjectHandle>); typedef std::map<std::string, QPDFObjectHandle> ObjectMap; PYBIND11_MAKE_OPAQUE(ObjectMap); +// From qpdf.cpp +void init_qpdf(py::module& m); // From object.cpp size_t list_range_check(QPDFObjectHandle h, int index); @@ -183,6 +166,9 @@ QPDFObjectHandle objecthandle_encode(const py::handle handle); std::vector<QPDFObjectHandle> array_builder(const py::iterable iter); std::map<std::string, QPDFObjectHandle> dict_builder(const py::dict dict); +// From annotation.cpp +void init_annotation(py::module &m); + // Support for recursion checks class StackGuard { diff --git a/src/qpdf/qpdf.cpp b/src/qpdf/qpdf.cpp index 5bb8ea9..0a5fc26 100644 --- a/src/qpdf/qpdf.cpp +++ b/src/qpdf/qpdf.cpp @@ -28,10 +28,9 @@ #include "qpdf_pagelist.h" #include "qpdf_inputsource.h" +#include "qpdf_pipeline.h" #include "utils.h" -extern "C" const char* qpdf_get_qpdf_version(); - void check_stream_is_usable(py::object stream) { @@ -58,6 +57,7 @@ open_pdf( q->setPasswordIsHexKey(hex_password); q->setIgnoreXRefStreams(ignore_xref_streams); q->setAttemptRecovery(attempt_recovery); + q->setImmediateCopyFrom(true); if (py::hasattr(filename_or_stream, "read") && py::hasattr(filename_or_stream, "seek")) { // Python code gave us an object with a stream interface @@ -66,7 +66,7 @@ open_pdf( check_stream_is_usable(stream); // The PythonInputSource object will be owned by q - InputSource* input_source = new PythonInputSource(stream); + auto input_source = PointerHolder<InputSource>(new PythonInputSource(stream)); py::gil_scoped_release release; q->processInputSource(input_source, password.c_str()); } else { @@ -80,7 +80,7 @@ open_pdf( q->processFile( description.c_str(), file, // transferring ownership - true, // QPDF will close the file + true, // QPDF will close the 
file (including if there are exceptions) password.c_str() ); file = nullptr; // QPDF owns the file and will close it @@ -116,122 +116,320 @@ private: }; +void update_xmp_pdfversion(QPDF &q, std::string version) +{ + auto impl = py::module::import("pikepdf._cpphelpers").attr("update_xmp_pdfversion"); + auto pypdf = py::cast(q); + impl(pypdf, version); +} + + +void setup_encryption( + QPDFWriter &w, + py::object encryption, + std::string &owner, + std::string &user +) +{ + bool aes = true; + bool metadata = true; + std::map<std::string, bool> allow; + int encryption_level = 6; + + if (encryption.contains("R")) { + if (!py::isinstance<py::int_>(encryption["R"])) + throw py::type_error("Encryption level 'R' must be an integer"); + encryption_level = py::int_(encryption["R"]); + } + if (encryption_level < 2 || encryption_level > 6) + throw py::value_error("Invalid encryption level: must be 2, 3, 4 or 6"); + + if (encryption_level == 5) { + auto warn = py::module::import("warnings").attr("warn"); + warn("Encryption R=5 is deprecated"); + } + + if (encryption.contains("owner")) { + if (encryption_level <= 4) { + auto success = QUtil::utf8_to_pdf_doc(encryption["owner"].cast<std::string>(), owner); + if (!success) + throw py::value_error("Encryption level is R3/R4 and password is not encodable as PDFDocEncoding"); + } else { + owner = encryption["owner"].cast<std::string>(); + } + } + if (encryption.contains("user")) { + if (encryption_level <= 4) { + auto success = QUtil::utf8_to_pdf_doc(encryption["user"].cast<std::string>(), user); + if (!success) + throw py::value_error("Encryption level is R3/R4 and password is not encodable as PDFDocEncoding"); + } else { + user = encryption["user"].cast<std::string>(); + } + } + if (encryption.contains("allow")) { + auto pyallow = encryption["allow"]; + allow["accessibility"] = pyallow.attr("accessibility").cast<bool>(); + allow["extract"] = pyallow.attr("extract").cast<bool>(); + allow["modify_assembly"] = 
pyallow.attr("modify_assembly").cast<bool>(); + allow["modify_annotation"] = pyallow.attr("modify_annotation").cast<bool>(); + allow["modify_form"] = pyallow.attr("modify_form").cast<bool>(); + allow["modify_other"] = pyallow.attr("modify_other").cast<bool>(); + allow["print_lowres"] = pyallow.attr("print_lowres").cast<bool>(); + allow["print_highres"] = pyallow.attr("print_highres").cast<bool>(); + } + if (encryption.contains("aes")) { + if (py::isinstance<py::bool_>(encryption["aes"])) + aes = py::bool_(encryption["aes"]); + else + throw py::type_error("aes must be bool"); + } else { + aes = (encryption_level >= 4); + } + if (encryption.contains("metadata")) { + if (py::isinstance<py::bool_>(encryption["metadata"])) + metadata = py::bool_(encryption["metadata"]); + else + throw py::type_error("metadata must be bool"); + } else { + metadata = (encryption_level >= 4); + } + + if (metadata && encryption_level < 4) { + throw py::value_error("Cannot encrypt metadata when R < 4"); + } + if (aes && encryption_level < 4) { + throw py::value_error("Cannot encrypt with AES when R < 4"); + } + if (encryption_level == 6 && !aes) { + throw py::value_error("When R = 6, AES encryption must be enabled"); + } + if (metadata && !aes) { + throw py::value_error("Cannot encrypt metadata unless AES encryption is enabled"); + } + + qpdf_r3_print_e print; + if (allow["print_highres"]) + print = qpdf_r3p_full; + else if (allow["print_lowres"]) + print = qpdf_r3p_low; + else + print = qpdf_r3p_none; + + if (encryption_level == 6) { + w.setR6EncryptionParameters( + user.c_str(), owner.c_str(), + allow["accessibility"], + allow["extract"], + allow["modify_assembly"], + allow["modify_annotation"], + allow["modify_form"], + allow["modify_other"], + print, + metadata + ); + } else if (encryption_level == 5) { + // TODO WARNING + w.setR5EncryptionParameters( + user.c_str(), owner.c_str(), + allow["accessibility"], + allow["extract"], + allow["modify_assembly"], + allow["modify_annotation"], + 
allow["modify_form"], + allow["modify_other"], + print, + metadata + ); + } else if (encryption_level == 4) { + w.setR4EncryptionParameters( + user.c_str(), owner.c_str(), + allow["accessibility"], + allow["extract"], + allow["modify_assembly"], + allow["modify_annotation"], + allow["modify_form"], + allow["modify_other"], + print, + metadata, + aes + ); + } else if (encryption_level == 3) { + w.setR3EncryptionParameters( + user.c_str(), owner.c_str(), + allow["accessibility"], + allow["extract"], + allow["modify_assembly"], + allow["modify_annotation"], + allow["modify_form"], + allow["modify_other"], + print + ); + } else if (encryption_level == 2) { + w.setR2EncryptionParameters( + user.c_str(), owner.c_str(), + (print != qpdf_r3p_none), + allow["modify_assembly"], + allow["extract"], + allow["modify_annotation"] + ); + } +} + + +typedef std::pair<std::string, int> pdf_version_extension; + +pdf_version_extension get_version_extension(py::object ver_ext) +{ + std::string version = ""; + int extension = 0; + try { + version = ver_ext.cast<std::string>(); + extension = 0; + } catch (py::cast_error) { + try { + auto version_ext = ver_ext.cast<pdf_version_extension>(); + version = version_ext.first; + extension = version_ext.second; + } catch (py::cast_error) { + throw py::type_error("PDF version must be a tuple: (str, int)"); + } + } + return pdf_version_extension(version, extension); +} + + +/* Helper class to ensure streams we open get closed by destructor */ +class Closer +{ +public: + Closer() : monitored(py::none()) {} + ~Closer() { + if (!this->monitored.is_none()) { + this->monitored.attr("close")(); + } + } + void set(py::object monitored) { + this->monitored = monitored; + } + Closer(const Closer& other) = delete; + Closer(Closer&& other) = delete; + Closer& operator= (const Closer& other) = delete; + Closer& operator= (Closer&& other) = delete; + +private: + py::object monitored; +}; + void save_pdf( QPDF& q, py::object filename_or_stream, bool 
static_id=false, bool preserve_pdfa=true, - std::string min_version="", - std::string force_version="", + py::object min_version=py::none(), + py::object force_version=py::none(), + bool fix_metadata_version=true, bool compress_streams=true, - qpdf_stream_decode_level_e stream_decode_level=qpdf_dl_generalized, + py::object stream_decode_level=py::none(), qpdf_object_stream_e object_stream_mode=qpdf_o_preserve, bool normalize_content=false, bool linearize=false, bool qdf=false, - py::object progress=py::none()) + py::object progress=py::none(), + py::object encryption=py::none()) { + std::string owner; + std::string user; + std::string description; QPDFWriter w(q); - // Parameters if (static_id) { w.setStaticID(true); } w.setNewlineBeforeEndstream(preserve_pdfa); - if (!min_version.empty()) { - w.setMinimumPDFVersion(min_version, 0); - } - if (!force_version.empty()) { - w.forcePDFVersion(force_version, 0); + + if (!min_version.is_none()) { + auto version_ext = get_version_extension(min_version); + w.setMinimumPDFVersion(version_ext.first, version_ext.second); } w.setCompressStreams(compress_streams); - w.setDecodeLevel(stream_decode_level); + if (!stream_decode_level.is_none()) { + // Unconditionally calling setDecodeLevel has side effects, disabling + // preserve encryption in particular + w.setDecodeLevel(stream_decode_level.cast<qpdf_stream_decode_level_e>()); + } w.setObjectStreamMode(object_stream_mode); - if (normalize_content && linearize) { - throw py::value_error("cannot save with both normalize_content and linearize"); - } - w.setContentNormalization(normalize_content); - w.setLinearization(linearize); - w.setQDFMode(qdf); - - if (!progress.is_none()) { - auto reporter = PointerHolder<QPDFWriter::ProgressReporter>(new PikeProgressReporter(progress)); - w.registerProgressReporter(reporter); - } + py::object stream; + Closer stream_closer; if (py::hasattr(filename_or_stream, "write") && py::hasattr(filename_or_stream, "seek")) { // Python code gave us an 
object with a stream interface - py::object stream = filename_or_stream; + stream = filename_or_stream; check_stream_is_usable(stream); + description = py::repr(stream); + } else { + py::object filename = fspath(filename_or_stream); + py::object ospath = py::module::import("os").attr("path"); + py::object samefile = ospath.attr("samefile"); + py::object exists = ospath.attr("exists"); + if (exists(filename).cast<bool>() && samefile(filename, q.getFilename()).cast<bool>()) { + throw py::value_error("Cannot overwrite input file"); + } + stream = py::module::import("io").attr("open")(filename, "wb"); + stream_closer.set(stream); + description = py::str(filename); + } - // TODO might be able to improve this by streaming rather than buffering - // using subclass of Pipeline that routes calls to Python. - w.setOutputMemory(); + // We must set up the output pipeline before we configure encryption + Pl_PythonOutput output_pipe(description.c_str(), stream); + w.setOutputPipeline(&output_pipe); - // It would be kind to release the GIL here, but this is not possible if - // another thread has an object and tries to mess with it. Correctness - // is more important than performance. - w.write(); + if (encryption.is(py::bool_(true)) && !q.isEncrypted()) { + throw py::value_error("can't perserve encryption parameters on a file with no encryption"); + } - // But now that we've held the GIL forever, we can release it and take - // it back again; at least in theory giving other threads a chance to - // to do something. 
- { - py::gil_scoped_release release; - } + if ( + (encryption.is(py::bool_(true)) || py::isinstance<py::dict>(encryption)) + && (normalize_content || !stream_decode_level.is_none()) + ) { + throw py::value_error("cannot save with encryption and normalize_content or stream_decode_level"); + } - // getBuffer returns Buffer* and qpdf says we are responsible for - // deleting it, so capture it in a unique_ptr - std::unique_ptr<Buffer> output_buffer(w.getBuffer()); - - // Create a memoryview of the buffer that libqpdf created - // Awkward API alert: - // QPDFWriter::getBuffer -> Buffer* (caller frees memory) - // and Buffer::getBuffer -> unsigned char* (caller does not own memory) - py::buffer_info output_buffer_info( - output_buffer->getBuffer(), - output_buffer->getSize()); - py::memoryview view_output_buffer(output_buffer_info); - - // Send it to the stream object (probably copying) - stream.attr("write")(view_output_buffer); + if (encryption.is(py::bool_(true))) { + w.setPreserveEncryption(true); // Keep existing encryption + } else if (encryption.is_none() || encryption.is(py::bool_(false))) { + w.setPreserveEncryption(false); // Remove encryption } else { - py::object filename = filename_or_stream; - std::string description = py::str(filename); - // Delete the intended filename, in case it is the same as the input file. - // This ensures that the input file will continue to exist in memory on Linux. 
- portable_unlink(filename); - FILE* file = portable_fopen(filename, "wb"); - w.setOutputFile(description.c_str(), file, true); - w.write(); - file = nullptr; // QPDF will close it + setup_encryption(w, encryption, owner, user); } -} + if (normalize_content && linearize) { + throw py::value_error("cannot save with both normalize_content and linearize"); + } + w.setContentNormalization(normalize_content); + w.setLinearization(linearize); + w.setQDFMode(qdf); -PYBIND11_MODULE(_qpdf, m) { - //py::options options; - //options.disable_function_signatures(); + if (!force_version.is_none()) { + auto version_ext = get_version_extension(force_version); + w.forcePDFVersion(version_ext.first, version_ext.second); + } + if (fix_metadata_version) { + update_xmp_pdfversion(q, w.getFinalVersion()); + } - m.doc() = "pikepdf provides a Pythonic interface for QPDF"; + if (!progress.is_none()) { + auto reporter = PointerHolder<QPDFWriter::ProgressReporter>(new PikeProgressReporter(progress)); + w.registerProgressReporter(reporter); + } - m.def("qpdf_version", &qpdf_get_qpdf_version, "Get libqpdf version"); + w.write(); +} - static py::exception<QPDFExc> exc_main(m, "PdfError"); - static py::exception<QPDFExc> exc_password(m, "PasswordError"); - py::register_exception_translator([](std::exception_ptr p) { - try { - if (p) std::rethrow_exception(p); - } catch (const QPDFExc &e) { - if (e.getErrorCode() == qpdf_e_password) { - exc_password(e.what()); - } else { - exc_main(e.what()); - } - } - }); +void init_qpdf(py::module &m) +{ py::enum_<qpdf_object_stream_e>(m, "ObjectStreamMode") .value("disable", qpdf_object_stream_e::qpdf_o_disable) .value("preserve", qpdf_object_stream_e::qpdf_o_preserve) @@ -243,7 +441,12 @@ PYBIND11_MODULE(_qpdf, m) { .value("specialized", qpdf_stream_decode_level_e::qpdf_dl_specialized) .value("all", qpdf_stream_decode_level_e::qpdf_dl_all); - init_pagelist(m); + py::enum_<QPDF::encryption_method_e>(m, "EncryptionMethod") + .value("none", 
QPDF::encryption_method_e::e_none) + .value("unknown", QPDF::encryption_method_e::e_unknown) + .value("rc4", QPDF::encryption_method_e::e_rc4) + .value("aes", QPDF::encryption_method_e::e_aes) + .value("aesv3", QPDF::encryption_method_e::e_aesv3); py::class_<QPDF, std::shared_ptr<QPDF>>(m, "Pdf", "In-memory representation of a PDF") .def_static("new", @@ -253,20 +456,31 @@ PYBIND11_MODULE(_qpdf, m) { q->setSuppressWarnings(true); return q; }, - "create a new empty PDF from stratch" + "Create a new empty PDF from stratch." ) .def_static("open", open_pdf, R"~~~( - Open an existing file at `filename_or_stream`. + Open an existing file at *filename_or_stream*. - If `filename_or_stream` is path-like, the file will be opened. The - file should not be modified by another process while it is open in - pikepdf. + If *filename_or_stream* is path-like, the file will be opened for reading. + The file should not be modified by another process while it is open in + pikepdf. The file will not be altered when opened in this way. Any changes + to the file must be persisted by using ``.save()``. - If `filename_or_stream` has `.read()` and `.seek()` methods, the file + If *filename_or_stream* has ``.read()`` and ``.seek()`` methods, the file will be accessed as a readable binary stream. pikepdf will read the entire stream into a private buffer. + ``.open()`` may be used in a ``with``-block, ``.close()`` will be called when + the block exists. + + Examples: + + >>> with Pdf.open("test.pdf") as pdf: + ... + + >>> pdf = Pdf.open("test.pdf", password="rosebud") + Args: filename_or_stream (os.PathLike): Filename of PDF to open password (str or bytes): User or owner password to open an @@ -278,7 +492,8 @@ PYBIND11_MODULE(_qpdf, m) { ignore_xref_streams (bool): If True, ignore cross-reference streams. See qpdf documentation. suppress_warnings (bool): If True (default), warnings are not - printed to stderr. Use `get_warnings()` to retrieve warnings. + printed to stderr. 
Use :meth:`pikepdf.Pdf.get_warnings()` to + retrieve warnings. attempt_recovery (bool): If True (default), attempt to recover from PDF parsing errors. inherit_page_attributes (bool): If True (default), push attributes @@ -289,7 +504,7 @@ PYBIND11_MODULE(_qpdf, m) { file. pikepdf.PdfError: If for other reasons we could not open the file. - TypeError: If the type of `filename_or_stream` is not + TypeError: If the type of ``filename_or_stream`` is not usable. FileNotFoundError: If the file was not found. )~~~", @@ -307,15 +522,15 @@ PYBIND11_MODULE(_qpdf, m) { } ) .def_property_readonly("filename", &QPDF::getFilename, - "the source filename of an existing PDF, when available") + "The source filename of an existing PDF, when available.") .def_property_readonly("pdf_version", &QPDF::getPDFVersion, - "the PDF standard version, such as '1.7'") + "The PDF standard version, such as '1.7'.") .def_property_readonly("extension_level", &QPDF::getExtensionLevel) .def_property_readonly("Root", &QPDF::getRoot, - "the /Root object of the PDF" + "The /Root object of the PDF." ) .def_property_readonly("root", &QPDF::getRoot, - "alias for .Root, the /Root object of the PDF" + "Alias for .Root, the /Root object of the PDF." ) .def_property("docinfo", [](QPDF& q) { @@ -330,7 +545,16 @@ PYBIND11_MODULE(_qpdf, m) { throw py::value_error("docinfo must be an indirect object - use Pdf.make_indirect"); q.getTrailer().replaceKey("/Info", replace); }, - "access the document information dictionary" + R"~~~( + Access the (deprecated) document information dictionary. + + The document information dictionary is a brief metadata record + that can store some information about the origin of a PDF. It is + deprecated and removed in the PDF 2.0 specification. Use the + ``.open_metadata()`` API instead, which will edit the modern (and + unfortunately, more complicated) XMP metadata object and synchronize + changes to the document information dictionary. 
+ )~~~" ) .def_property_readonly("trailer", &QPDF::getTrailer, R"~~~( @@ -394,8 +618,9 @@ PYBIND11_MODULE(_qpdf, m) { The page can be either be a newly constructed PDF object or it can be obtained from another PDF. - :param pikepdf.Object page: The page object to attach - :param bool first: If True, prepend this before the first page; if False append after last page + Args: + page (pikepdf.Object): The page object to attach + first (bool): If True, prepend this before the first page; if False append after last page )~~~", py::arg("page"), py::arg("first")=false, @@ -423,11 +648,14 @@ PYBIND11_MODULE(_qpdf, m) { .def("save", save_pdf, R"~~~( - Save all modifications to this :class:`pikepdf.Pdf` + Save all modifications to this :class:`pikepdf.Pdf`. Args: filename (str or stream): Where to write the output. If a file - exists in this location it will be overwritten. + exists in this location it will be overwritten. The file + should not be the same as the input file, because data from + the input file may be lazily loaded; as such overwriting + in place will null-out objects. static_id (bool): Indicates that the ``/ID`` metadata, normally calculated as a hash of certain PDF contents and metadata @@ -437,12 +665,20 @@ PYBIND11_MODULE(_qpdf, m) { manner compliant with PDF/A and other stricter variants. This should be True, the default, in most cases. - min_version (str): Sets the minimum version of PDF + min_version (str or tuple): Sets the minimum version of PDF specification that should be required. If left alone QPDF - will decide. - force_version (str): Override the version recommend by QPDF, + will decide. If a tuple, the second element is an integer, the + extension level. + force_version (str or tuple): Override the version recommend by QPDF, potentially creating an invalid file that does not display - in old versions. See QPDF manual for details. + in old versions. See QPDF manual for details. If a tuple, the + second element is an integer, the extension level. 
+ fix_metadata_version (bool): If True (default) and the XMP metadata + contains the optional PDF version field, ensure the version in + metadata is correct. If the XMP metadata does not contain a PDF + version field, none will be added. To ensure that the field is + added, edit the metadata and insert a placeholder value in + ``pdf:PDFVersion``. object_stream_mode (pikepdf.ObjectStreamMode): ``disable`` prevents the use of object streams. @@ -472,10 +708,24 @@ PYBIND11_MODULE(_qpdf, m) { the program ``fix-qdf`` to fix convert back to a standard PDF. + progress (callable): Specify a callback function that is called + as the PDF is written. The function will be called with an + integer between 0-100 as the sole parameter, the progress + percentage. This function may not access or modify the PDF + while it is being written, or data corruption will almost + certainly occur. + + encryption (pikepdf.models.Encryption or bool): If ``False`` + or omitted, existing encryption will be removed. If ``True`` + encryption settings are copied from the originating PDF. + Alternately, an ``Encryption`` object may be provided that + sets the parameters for new encryption. + You may call ``.save()`` multiple times with different parameters to generate different versions of a file, and you *may* continue to modify the file after saving it. ``.save()`` does not modify - the ``Pdf`` object in memory. + the ``Pdf`` object in memory, except possibly by updating the XMP + metadata version with ``fix_metadata_version``. .. 
note:: @@ -491,13 +741,15 @@ PYBIND11_MODULE(_qpdf, m) { py::arg("preserve_pdfa")=true, py::arg("min_version")="", py::arg("force_version")="", + py::arg("fix_metadata_version")=true, py::arg("compress_streams")=true, - py::arg("stream_decode_level")=qpdf_stream_decode_level_e::qpdf_dl_generalized, + py::arg("stream_decode_level")=py::none(), py::arg("object_stream_mode")=qpdf_object_stream_e::qpdf_o_preserve, py::arg("normalize_content")=false, py::arg("linearize")=false, py::arg("qdf")=false, - py::arg("progress")=py::none() + py::arg("progress")=py::none(), + py::arg("encryption")=py::none() ) .def("_get_object_id", &QPDF::getObjectByID) .def("get_object", @@ -510,7 +762,8 @@ PYBIND11_MODULE(_qpdf, m) { Returns: pikepdf.Object )~~~", - py::return_value_policy::reference_internal + py::return_value_policy::reference_internal, + py::arg("objgen") ) .def("get_object", [](QPDF &q, int objid, int gen) { @@ -522,7 +775,9 @@ PYBIND11_MODULE(_qpdf, m) { Returns: pikepdf.Object )~~~", - py::return_value_policy::reference_internal + py::return_value_policy::reference_internal, + py::arg("objid"), + py::arg("gen") ) .def("make_indirect", &QPDF::makeIndirectObject, R"~~~( @@ -544,7 +799,8 @@ PYBIND11_MODULE(_qpdf, m) { Returns: pikepdf.Object - )~~~" + )~~~", + py::arg("h") ) .def("make_indirect", [](QPDF &q, py::object obj) -> QPDFObjectHandle { @@ -555,7 +811,8 @@ PYBIND11_MODULE(_qpdf, m) { Returns: pikepdf.Object - )~~~" + )~~~", + py::arg("obj") ) .def("copy_foreign", [](QPDF &q, QPDFObjectHandle &h) -> QPDFObjectHandle { @@ -563,20 +820,106 @@ PYBIND11_MODULE(_qpdf, m) { }, "Copy object from foreign PDF to this one.", py::return_value_policy::reference_internal, - py::keep_alive<1, 2>() + py::keep_alive<1, 2>(), + py::arg("h") ) .def("_replace_object", [](QPDF &q, int objid, int gen, QPDFObjectHandle &h) { q.replaceObject(objid, gen, h); } ) - ; // class Pdf - - init_object(m); + .def("_swap_objects", + [](QPDF &q, std::pair<int, int> objgen1, std::pair<int, int> 
objgen2) { + QPDFObjGen o1(objgen1.first, objgen1.second); + QPDFObjGen o2(objgen2.first, objgen2.second); + q.swapObjects(o1, o2); + } + ) + .def("_process", + [](QPDF &q, std::string description, py::bytes data) { + std::string s = data; + q.processMemoryFile( + description.c_str(), + s.data(), + s.size() + ); + }, + R"~~~( + Process a new in-memory PDF, replacing the existing PDF -#ifdef VERSION_INFO - m.attr("__version__") = VERSION_INFO; -#else - m.attr("__version__") = "dev"; -#endif + Used to implement Pdf.close(). + )~~~" + ) + .def_property_readonly("_allow_accessibility", + [](QPDF &q) { + return q.allowAccessibility(); + } + ) + .def_property_readonly("_allow_extract", + [](QPDF &q) { + return q.allowExtractAll(); + } + ) + .def_property_readonly("_allow_print_lowres", + [](QPDF &q) { + return q.allowPrintLowRes(); + } + ) + .def_property_readonly("_allow_print_highres", + [](QPDF &q) { + return q.allowPrintHighRes(); + } + ) + .def_property_readonly("_allow_modify_assembly", + [](QPDF &q) { + return q.allowModifyAssembly(); + } + ) + .def_property_readonly("_allow_modify_form", + [](QPDF &q) { + return q.allowModifyForm(); + } + ) + .def_property_readonly("_allow_modify_annotation", + [](QPDF &q) { + return q.allowModifyAnnotation(); + } + ) + .def_property_readonly("_allow_modify_other", + [](QPDF &q) { + return q.allowModifyOther(); + } + ) + .def_property_readonly("_allow_modify_all", + [](QPDF &q) { + return q.allowModifyAll(); + } + ) + .def_property_readonly("_encryption_data", + [](QPDF &q) { + int R = 0; + int P = 0; + int V = 0; + QPDF::encryption_method_e stream_method = QPDF::e_unknown; + QPDF::encryption_method_e string_method = QPDF::e_unknown; + QPDF::encryption_method_e file_method = QPDF::e_unknown; + if (!q.isEncrypted(R, P, V, stream_method, string_method, file_method)) + return py::dict(); + + auto user_passwd = q.getTrimmedUserPassword(); + auto encryption_key = q.getEncryptionKey(); + + return py::dict( + py::arg("R") = R, + 
py::arg("P") = P, + py::arg("V") = V, + py::arg("stream") = stream_method, + py::arg("string") = string_method, + py::arg("file") = file_method, + py::arg("user_passwd") = py::bytes(user_passwd), + py::arg("encryption_key") = py::bytes(encryption_key) + ); + } + ) + ; // class Pdf } diff --git a/src/qpdf/qpdf_inputsource.h b/src/qpdf/qpdf_inputsource.h index dc26267..b29b309 100644 --- a/src/qpdf/qpdf_inputsource.h +++ b/src/qpdf/qpdf_inputsource.h @@ -17,7 +17,7 @@ #include <qpdf/Buffer.hh> #include <qpdf/QPDF.hh> #include <qpdf/InputSource.hh> - +#include <qpdf/QUtil.hh> #include <pybind11/pybind11.h> #include <pybind11/stl.h> diff --git a/src/qpdf/qpdf_pagelist.cpp b/src/qpdf/qpdf_pagelist.cpp index d8222dd..07c496d 100644 --- a/src/qpdf/qpdf_pagelist.cpp +++ b/src/qpdf/qpdf_pagelist.cpp @@ -121,18 +121,6 @@ void PageList::set_pages_from_iterable(py::slice slice, py::iterable other) void PageList::delete_page(size_t index) { auto page = this->get_page(index); - /* - // Need a dec_ref to match the inc_ref in insert_page, but it's unclear - // how to do that. The item will be set the current QPDF always. - // Accessing data from another PDF seems to involve some pipeline - // magic in QPDF around libqpdf/QPDFWriter.cc:1614 - if (original page owner != &this->getQPDF()) { - // If we are removing a page not originally owned by our QPDF, - // remove the reference count we put it in insert_page() - py::object pyqpdf = py::cast(page_owner); - pyqpdf.dec_ref(); - } - */ this->qpdf->removePage(page); } @@ -175,24 +163,6 @@ void PageList::insert_page(size_t index, QPDFObjectHandle page) // qpdf does not accept duplicating pages within the same file, // so manually create a copy page = this->qpdf->makeIndirectObject(page); - } else { - // libqpdf does not transfer a page's contents to the new QPDF. - // Instead WHEN ASKED TO WRITE it will go back and get the data - // from objecthandle->getOwningQPDF(). Therefore we must ensure - // our previous owner is kept alive. 
-#if 1 - auto tinfo = py::detail::get_type_info(typeid(QPDF)); - py::handle pyqpdf = py::detail::get_object_handle(page_owner, tinfo); - py::handle pypage = py::cast(page); - py::detail::keep_alive_impl(pypage, pyqpdf); -#else - // MSVC++ complains about the symbol - // QPDF::Members::~Members() not being exported when this version - // is used, but it works for GCC and Clang. - py::handle pyqpdf = py::cast(page_owner); - py::handle pypage = py::cast(page); - py::detail::keep_alive_impl(pypage, pyqpdf); -#endif } if (index != this->count()) { QPDFObjectHandle refpage = this->get_page(index); @@ -202,7 +172,6 @@ void PageList::insert_page(size_t index, QPDFObjectHandle page) } } - void init_pagelist(py::module &m) { py::class_<PageList>(m, "PageList") @@ -229,12 +198,13 @@ void init_pagelist(py::module &m) .def("__delitem__", &PageList::delete_pages_from_iterable) .def("__len__", &PageList::count) .def("p", - [](PageList &pl, size_t index) { - if (index == 0) // Indexing past end is checked in .get_page + [](PageList &pl, size_t pnum) { + if (pnum == 0) // Indexing past end is checked in .get_page throw py::index_error("page access out of range in 1-based indexing"); - return pl.get_page(index - 1); + return pl.get_page(pnum - 1); }, - "convenience - look up page number in ordinal numbering, .p(1) is first page" + "Convenience - look up page number in ordinal numbering, ``.p(1)`` is first page", + py::arg("pnum") ) .def("__iter__", [](PageList &pl) { @@ -252,7 +222,16 @@ void init_pagelist(py::module &m) [](PageList &pl, ssize_t index, py::object obj) { size_t uindex = uindex_from_index(pl, index); pl.insert_page(uindex, obj); - }, py::keep_alive<1, 3>() + }, py::keep_alive<1, 3>(), + R"~~~( + Insert a page at the specified location. 
+ + Args: + index (int): location at which to insert page, 0-based indexing + obj (pikepdf.Object): page object to insert + )~~~", + py::arg("index"), + py::arg("obj") ) .def("reverse", [](PageList &pl) { @@ -262,13 +241,16 @@ void init_pagelist(py::module &m) PySlice_New(Py_None, Py_None, step.ptr())); py::list reversed_pages = pl.get_pages(reversed); pl.set_pages_from_iterable(ordinary_indices, reversed_pages); - } + }, + "Reverse the order of pages." ) .def("append", [](PageList &pl, py::object page) { pl.insert_page(pl.count(), page); }, - py::keep_alive<1, 2>() + py::keep_alive<1, 2>(), + "Add another page to the end.", + py::arg("page") ) .def("extend", [](PageList &pl, PageList &other) { @@ -279,7 +261,9 @@ void init_pagelist(py::module &m) pl.insert_page(pl.count(), other.get_page(i)); } }, - py::keep_alive<1, 2>() + py::keep_alive<1, 2>(), + "Extend the ``Pdf`` by adding pages from another ``Pdf.pages``.", + py::arg("other") ) .def("extend", [](PageList &pl, py::iterable iterable) { @@ -290,6 +274,29 @@ void init_pagelist(py::module &m) ++it; } }, - py::keep_alive<1, 2>() + py::keep_alive<1, 2>(), + "Extend the ``Pdf`` by adding pages from an iterable of pages.", + py::arg("iterable") + ) + .def("remove", + [](PageList &pl, py::kwargs kwargs) { + auto pnum = kwargs["p"].cast<size_t>(); + if (pnum == 0) // Indexing past end is checked in .get_page + throw py::index_error("page access out of range in 1-based indexing"); + pl.delete_page(pnum - 1); + }, + R"~~~( + Remove a page (using 1-based numbering) + + Args: + p (int): 1-based page number + )~~~" + ) + .def("__repr__", + [](PageList &pl) { + return std::string("<pikepdf._qpdf.PageList len=") + + std::to_string(pl.count()) + + std::string(">"); + } ); } diff --git a/src/qpdf/qpdf_pipeline.h b/src/qpdf/qpdf_pipeline.h new file mode 100644 index 0000000..f922827 --- /dev/null +++ b/src/qpdf/qpdf_pipeline.h @@ -0,0 +1,77 @@ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * 
License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Copyright (C) 2017, James R. Barlow (https://github.com/jbarlow83/) + */ + +#include <cstdio> +#include <cstring> + +#include <qpdf/Constants.h> +#include <qpdf/Types.h> +#include <qpdf/DLL.h> +#include <qpdf/QPDFExc.hh> +#include <qpdf/PointerHolder.hh> +#include <qpdf/Buffer.hh> +#include <qpdf/QPDF.hh> +#include <qpdf/Pipeline.hh> +#include <qpdf/QUtil.hh> + +#include <pybind11/pybind11.h> +#include <pybind11/stl.h> + +#include "pikepdf.h" + + +class Pl_PythonOutput : public Pipeline +{ +public: + Pl_PythonOutput(const char *identifier, py::object stream) : + Pipeline(identifier, nullptr), + stream(stream) + { + } + + virtual ~Pl_PythonOutput() = default; + Pl_PythonOutput(const Pl_PythonOutput&) = delete; + Pl_PythonOutput& operator= (const Pl_PythonOutput&) = delete; + Pl_PythonOutput(Pl_PythonOutput&&) = delete; + Pl_PythonOutput& operator= (Pl_PythonOutput&&) = delete; + + void write(unsigned char *buf, size_t len) + { + py::gil_scoped_acquire gil; + size_t so_far = 0; + while (len > 0) { + py::buffer_info buffer(buf, len); + py::memoryview view_buffer(buffer); + py::object result = this->stream.attr("write")(view_buffer); + try { + so_far = result.cast<size_t>(); + } catch (const py::cast_error &e) { + throw py::type_error("Unexpected return type of write()"); + } + if (so_far == 0) { + QUtil::throw_system_error(this->identifier); + } else { + buf += so_far; + len -= so_far; + } + } + } + + void finish() + { + py::gil_scoped_acquire gil; + try { + this->stream.attr("flush")(); + } catch (const py::attr_error &e) { + // Suppress + } + } + +private: + py::object stream; +}; |