diff options
author | James R. Barlow <james@purplerock.ca> | 2022-02-05 00:02:30 -0800 |
---|---|---|
committer | James R. Barlow <james@purplerock.ca> | 2022-02-05 00:02:30 -0800 |
commit | 8a47d2b0c2c8539245d5a81e324a94c51418c763 (patch) | |
tree | ad82c3b68446f3189ea13278885c9677f899a05e /src | |
parent | 280dd44a44ce1f725eddbffad937d0c485efa0f8 (diff) |
Register JBIG2Decode using QPDF StreamFilter
More flexible than previous approach.
Diffstat (limited to 'src')
-rw-r--r-- | src/pikepdf/jbig2.py | 38 | ||||
-rw-r--r-- | src/qpdf/jbig2-inl.h | 120 | ||||
-rw-r--r-- | src/qpdf/qpdf.cpp | 3 |
3 files changed, 160 insertions, 1 deletions
diff --git a/src/pikepdf/jbig2.py b/src/pikepdf/jbig2.py
index 026b2ab..87f8550 100644
--- a/src/pikepdf/jbig2.py
+++ b/src/pikepdf/jbig2.py
@@ -25,7 +25,14 @@ def extract_jbig2(
         global_path = Path(tmpdir) / "global"
         output_path = Path(tmpdir) / "outfile"
 
-        args = ["jbig2dec", "-e", "-o", os.fspath(output_path)]
+        args = [
+            "jbig2dec",
+            "--embedded",
+            "--format",
+            "png",
+            "--output",
+            os.fspath(output_path),
+        ]
 
         # Get the raw stream, because we can't decode im_obj - that is why we are here
         # (Strictly speaking we should remove any non-JBIG2 filters if double encoded)
@@ -45,6 +52,35 @@ def extract_jbig2(
         return im
 
 
+def extract_jbig2_bytes(jbig2: bytes, jbig2_globals: bytes) -> bytes:
+    """Decode JBIG2 data by running the external ``jbig2dec`` program.
+
+    Args:
+        jbig2: Encoded JBIG2 data; written unchanged to a temporary file.
+        jbig2_globals: JBIG2 globals data; when non-empty it is written to
+            its own temporary file and passed to ``jbig2dec`` ahead of the
+            image file.
+
+    Returns:
+        The bytes of the file ``jbig2dec`` wrote via ``--output``. Because
+        ``--format png`` is requested, this is presumably a PNG file rather
+        than raw samples - TODO confirm against callers.
+
+    Raises:
+        Whatever ``run(..., check=True)`` raises when ``jbig2dec`` fails
+        (presumably ``subprocess.CalledProcessError``; the import of ``run``
+        is not visible in this hunk).
+    """
+    # All scratch files live in one temporary directory that is removed when
+    # the ``with`` block exits, so the output must be read before returning.
+    with TemporaryDirectory(prefix='pikepdf', suffix='.jbig2') as tmpdir:
+        image_path = Path(tmpdir) / "image"
+        global_path = Path(tmpdir) / "global"
+        output_path = Path(tmpdir) / "outfile"
+
+        args = [
+            "jbig2dec",
+            "--embedded",
+            "--format",
+            "png",
+            "--output",
+            os.fspath(output_path),
+        ]
+
+        # jbig2dec reads its input from files, so put the still-encoded data
+        # on disk first.
+        image_path.write_bytes(jbig2)
+
+        # The globals file is optional and, when present, is listed on the
+        # command line before the image file.
+        if len(jbig2_globals) > 0:
+            global_path.write_bytes(jbig2_globals)
+            args.append(os.fspath(global_path))
+
+        args.append(os.fspath(image_path))
+
+        # stdout is discarded; check=True turns a failing exit status into an
+        # exception instead of returning a missing/partial output file.
+        run(args, stdout=DEVNULL, check=True)
+        return output_path.read_bytes()
+
+
 def jbig2dec_available() -> bool:
     try:
         proc = run(['jbig2dec', '--version'], stdout=PIPE, check=True, encoding='ascii')
diff --git a/src/qpdf/jbig2-inl.h b/src/qpdf/jbig2-inl.h
new file mode 100644
index 0000000..1904f46
--- /dev/null
+++ b/src/qpdf/jbig2-inl.h
@@ -0,0 +1,120 @@
+/*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Copyright (C) 2022, James R. 
Barlow (https://github.com/jbarlow83/)
+ */
+
+#include "pikepdf.h"
+
+#include <cstdio>
+#include <cstring>
+#include <memory>
+#include <sstream>
+#include <string>
+
+#include <qpdf/Constants.h>
+#include <qpdf/Types.h>
+#include <qpdf/DLL.h>
+#include <qpdf/QPDFExc.hh>
+#include <qpdf/PointerHolder.hh>
+#include <qpdf/Buffer.hh>
+#include <qpdf/QPDF.hh>
+#include <qpdf/QPDFStreamFilter.hh>
+#include <qpdf/QUtil.hh>
+#include <qpdf/Pipeline.hh>
+
+// Reinterpret character data as the `unsigned char *` that Pipeline::write()
+// takes. Declared inline because it is defined in a header.
+inline unsigned char *pipeline_caster(const char *s)
+{
+    // QPDF indicates Pipeline::write(unsigned char*) is effectively const
+    // but not actually const for historical reasons, so we can discard the const.
+    // unsigned char* to char* should be safe.
+    return const_cast<unsigned char *>(reinterpret_cast<const unsigned char *>(s));
+}
+
+// Pipeline stage that buffers everything written to it and, on finish(),
+// passes the buffered bytes (plus any JBIG2 globals) to the Python helper
+// pikepdf.jbig2.extract_jbig2_bytes, forwarding the helper's return value to
+// the next pipeline.
+//
+// NOTE(review): that helper runs jbig2dec with "--format png" and returns the
+// output file's bytes, so what is forwarded downstream looks like a PNG file
+// rather than raw image samples - confirm this is what consumers expect.
+class Pl_JBIG2 : public Pipeline {
+public:
+    // identifier / next are forwarded to Pipeline; jbig2globals is the raw
+    // content of the /JBIG2Globals stream, or empty when there is none.
+    Pl_JBIG2(
+        const char *identifier, Pipeline *next, const std::string &jbig2globals = "")
+        : Pipeline(identifier, next), jbig2globals(jbig2globals)
+    {
+    }
+    virtual ~Pl_JBIG2() = default;
+
+    // Accumulate input only; nothing is decoded until finish().
+    virtual void write(unsigned char *data, size_t len) override
+    {
+        this->ss.write(reinterpret_cast<const char *>(data), len);
+    }
+
+    // Decode the buffered data (if any), write the result to the next
+    // pipeline, then finish the next pipeline.
+    virtual void finish() override
+    {
+        std::string data = this->ss.str();
+
+        // stringstream::clear() only resets the stream's state flags; str()
+        // is what actually discards the buffered bytes. Reset up front so the
+        // buffer is emptied even if decoding throws.
+        this->ss.str(std::string());
+        this->ss.clear();
+
+        if (!data.empty()) {
+            std::string extracted = this->decode_jbig2(data);
+            this->getNext()->write(
+                pipeline_caster(extracted.data()), extracted.length());
+        }
+
+        if (this->getNext(true)) {
+            this->getNext()->finish();
+        }
+    }
+
+private:
+    // Call pikepdf.jbig2.extract_jbig2_bytes(data, globals) and return its
+    // result as a std::string. Every Python object is created and destroyed
+    // inside this function so that it happens while the GIL is held; any
+    // Python exception propagates as py::error_already_set.
+    std::string decode_jbig2(const std::string &data)
+    {
+        // The GIL state of whoever drives the pipeline is not visible from
+        // here, so acquire it explicitly before touching Python.
+        py::gil_scoped_acquire gil;
+
+        py::function extract_jbig2 =
+            py::module_::import("pikepdf.jbig2").attr("extract_jbig2_bytes");
+        py::bytes extracted =
+            extract_jbig2(py::bytes(data), py::bytes(this->jbig2globals));
+        return std::string(extracted);
+    }
+
+    // Kept as std::string (not py::bytes) so that constructing and destroying
+    // this object does not touch Python reference counts.
+    std::string jbig2globals;
+    std::stringstream ss;
+};
+
+// QPDFStreamFilter implementation registered for /JBIG2Decode. It captures
+// the optional /JBIG2Globals stream from the decode parameters and creates a
+// Pl_JBIG2 pipeline to do the decoding.
+class JBIG2StreamFilter : public QPDFStreamFilter {
+public:
+    JBIG2StreamFilter() = default;
+    virtual ~JBIG2StreamFilter() = default;
+
+    // Returns true when this filter is prepared to decode the stream: the
+    // Python side reports jbig2dec as available and any /JBIG2Globals stream
+    // could be read. Returns false otherwise, including on any exception.
+    virtual bool setDecodeParms(QPDFObjectHandle decode_parms) override
+    {
+        // Held for the whole function so that Python objects, and any caught
+        // Python exception, are released while the GIL is held.
+        py::gil_scoped_acquire gil;
+        try {
+            auto jbig2dec_available =
+                py::module_::import("pikepdf.jbig2").attr("jbig2dec_available");
+            // A py::object converts to bool as "handle is non-null", not as
+            // Python truthiness, so the result must be cast explicitly.
+            if (!jbig2dec_available().cast<bool>())
+                return false;
+
+            // No decode parameters means no globals; do not call getKey() on
+            // a null object.
+            // NOTE(review): presumably QPDF passes a null object when the
+            // stream has no /DecodeParms - confirm in QPDFStreamFilter.hh.
+            if (decode_parms.isNull())
+                return true;
+
+            auto jbig2globals_obj = decode_parms.getKey("/JBIG2Globals");
+            if (jbig2globals_obj.isNull())
+                return true;
+
+            auto buf = jbig2globals_obj.getStreamData();
+            this->jbig2globals =
+                std::string(reinterpret_cast<char *>(buf->getBuffer()), buf->getSize());
+            return true;
+        } catch (const std::exception &) {
+            // Deliberately best-effort: any failure above simply means this
+            // filter declines the stream by returning false below.
+        }
+        return false;
+    }
+
+    // Creates the decoding pipeline feeding `next`. The pipeline is owned by
+    // this filter object; the returned pointer is non-owning.
+    virtual Pipeline *getDecodePipeline(Pipeline *next) override
+    {
+        this->pipeline =
+            std::make_shared<Pl_JBIG2>("JBIG2 decode", next, this->jbig2globals);
+        return this->pipeline.get();
+    }
+
+    // Factory in the form expected by QPDF::registerStreamFilter.
+    static std::shared_ptr<JBIG2StreamFilter> factory()
+    {
+        return std::make_shared<JBIG2StreamFilter>();
+    }
+
+    virtual bool isSpecializedCompression() override { return true; }
+    virtual bool isLossyCompression() override { return false; }
+
+private:
+    std::string jbig2globals;           // raw /JBIG2Globals data, empty if none
+    std::shared_ptr<Pipeline> pipeline; // keeps the Pl_JBIG2 alive
+};
\ No newline at end of file diff --git a/src/qpdf/qpdf.cpp b/src/qpdf/qpdf.cpp index bf54bdd..f54aa33 100644 --- a/src/qpdf/qpdf.cpp +++ b/src/qpdf/qpdf.cpp @@ -32,6 +32,7 @@ #include "qpdf_pagelist.h" #include "qpdf_inputsource-inl.h" #include "mmap_inputsource-inl.h" +#include "jbig2-inl.h" #include "pipeline.h" #include "utils.h" #include "gsl.h" @@ -473,6 +474,8 @@ void save_pdf(QPDF &q, void init_qpdf(py::module_ &m) { + QPDF::registerStreamFilter("/JBIG2Decode", &JBIG2StreamFilter::factory); + py::enum_<qpdf_object_stream_e>(m, "ObjectStreamMode") .value("disable", qpdf_object_stream_e::qpdf_o_disable) .value("preserve", qpdf_object_stream_e::qpdf_o_preserve) |