diff options
author | James R. Barlow <jim@purplerock.ca> | 2018-05-24 15:59:47 -0700 |
---|---|---|
committer | James R. Barlow <jim@purplerock.ca> | 2018-05-24 15:59:47 -0700 |
commit | 5acb9dcf13ab04dce7a1c2e76de3658d5fe49fcc (patch) | |
tree | da2b50c454e28f67c03cd02b91096264969290e2 | |
parent | a7dd5030463e9d4ed592361ac8eac7cef8a325f3 (diff) |
Add support for accessing inline images
-rw-r--r-- | src/pikepdf/__init__.py | 28 | ||||
-rw-r--r-- | src/pikepdf/_objects.py | 2 | ||||
-rw-r--r-- | src/pikepdf/_pdfimage.py | 82 | ||||
-rw-r--r-- | tests/resources/image-mono-inline.pdf | 74 | ||||
-rw-r--r-- | tests/test_image_access.py | 18 |
5 files changed, 194 insertions, 10 deletions
diff --git a/src/pikepdf/__init__.py b/src/pikepdf/__init__.py index e3f8046..2549d45 100644 --- a/src/pikepdf/__init__.py +++ b/src/pikepdf/__init__.py @@ -6,6 +6,7 @@ from collections import namedtuple +from enum import Enum from pkg_resources import get_distribution, DistributionNotFound import os @@ -26,7 +27,7 @@ from ._qpdf import (Object, ObjectType, PdfError, Pdf, PasswordError, from ._objects import (Boolean, Integer, Real, Name, String, Array, Dictionary, Stream, Operator, Null) -from ._pdfimage import PdfImage, UnsupportedImageTypeError +from ._pdfimage import PdfImage, PdfInlineImage, UnsupportedImageTypeError __libqpdf_version__ = _qpdf.qpdf_version() @@ -46,13 +47,36 @@ class _OperandGrouper(_qpdf.StreamParser): super().__init__() self.instructions = [] self._tokens = [] + self._inline_image = False + self._inline_image_metadata = [] def handle_object(self, obj): if obj.type_code == ObjectType.operator: instruction = self.PdfInstruction( operands=self._tokens, operator=obj) - self.instructions.append(instruction) self._tokens = [] + + if obj == Operator('BI'): + self._inline_image= True + elif self._inline_image: + if obj == Operator('ID'): + self._inline_image_metadata = instruction.operands + elif obj == Operator('EI'): + inline_image_data = instruction.operands[0] + iimage = PdfInlineImage( + image_data=inline_image_data, + image_object=self._inline_image_metadata + ) + instruction = self.PdfInstruction( + operands=[iimage], + operator=Operator('INLINE IMAGE') + ) + self.instructions.append(instruction) + self._inline_image = False + self._inline_image_metadata = [] + else: + self.instructions.append(instruction) + else: self._tokens.append(obj) diff --git a/src/pikepdf/_objects.py b/src/pikepdf/_objects.py index f4a45df..a02e110 100644 --- a/src/pikepdf/_objects.py +++ b/src/pikepdf/_objects.py @@ -47,6 +47,8 @@ class Integer(metaclass=_ObjectMeta): def __new__(cls, n): if n.bit_length() >= 64: raise ValueError('Value is too large for 64-bit integer') + if not isinstance(n, int) and n == int(n): + n = int(n) return _qpdf._new_integer(n) diff --git a/src/pikepdf/_pdfimage.py b/src/pikepdf/_pdfimage.py index 1163781..43bd1d7 100644 --- a/src/pikepdf/_pdfimage.py +++ b/src/pikepdf/_pdfimage.py @@ -21,18 +21,27 @@ class UnsupportedImageTypeError(Exception): class _PdfImageDescriptor: - def __init__(self, name, type_, default): + def __init__(self, name, type_, default, inline_name=None, inline_map=None): self.name = name self.type = type_ self.default = default + self.inline_name = inline_name + self.inline_map = inline_map def __get__(self, wrapper, wrapperclass): - val = getattr(wrapper.obj, self.name, self.default) + sentinel = object() + val = sentinel + if self.inline_name: + val = getattr(wrapper.obj, self.inline_name, sentinel) + if val is sentinel: + val = getattr(wrapper.obj, self.name, self.default) if self.type == bool: return val.as_bool() if isinstance(val, Object) else bool(val) return self.type(val) def __set__(self, wrapper, val): + if self.inline_name: + raise NotImplementedError("editing inline images") setattr(wrapper.obj, self.name, val) @@ -57,12 +66,9 @@ class PdfImage: SIMPLE_COLORSPACES = ('/DeviceRGB', '/DeviceGray', '/CalRGB', '/CalGray') def __init__(self, obj): - if obj.type_code not in (ObjectType.stream, ObjectType.inlineimage): - raise TypeError("can't construct PdfImage from non-image") if obj.type_code == ObjectType.stream and \ obj.stream_dict.get("/Subtype") != "/Image": raise TypeError("can't construct PdfImage from non-image") - self.obj = obj width = _PdfImageDescriptor('Width', int, None) @@ -96,6 +102,10 @@ class PdfImage: raise NotImplementedError("not sure how to get colorspace") @property + def is_inline(self): + return False + + @property def indexed(self): return self._colorspaces[0] == '/Indexed' @@ -180,6 +190,11 @@ class PdfImage: raise UnsupportedImageTypeError() + def read_bytes(self): + return self.obj.read_bytes() + + def get_stream_buffer(self): + return self.obj.get_stream_buffer() def as_pil_image(self): """ @@ -199,10 +214,10 @@ class PdfImage: if self.mode == 'RGB': # No point in accessing the buffer here, size qpdf decodes to 3-byte # RGB and Pillow needs RGBX for raw access - data = self.obj.read_bytes() + data = self.read_bytes() im = Image.frombytes('RGB', self.size, data) elif self.mode in ('L', 'P'): - buffer = self.obj.get_stream_buffer() + buffer = self.get_stream_buffer() stride = 0 # tell Pillow to calculate stride from line width ystep = 1 # image is top to bottom in memory im = Image.frombuffer('L', self.size, buffer, "raw", 'L', stride, @@ -268,6 +283,59 @@ class PdfImage: return b.getvalue() +def inline_remove_abbrevs(value): + abbrevs = { + '/G': '/DeviceGray', + '/RGB': '/DeviceRGB', + '/CMYK': '/DeviceCMYK', + '/I': '/Indexed', + '/AHx': '/ASCIIHexDecode', + '/A85': '/ASCII85Decode', + '/LZW': '/LZWDecode', + '/RL': '/RunLengthDecode', + '/CCF': '/CCITTFaxDecode', + '/DCT': '/DCTDecode' + } + return [abbrevs.get(value, value) for value in array_str(value)] + + +class PdfInlineImage(PdfImage): + + def __init__(self, *, image_data, image_object: tuple): + self._data = image_data + self._image_object = image_object + + reparse = ' '.join([obj.unparse_resolved() for obj in image_object]) + self.obj = Object.parse(('<< ' + reparse + ' >>').encode('ascii')) + + width = _PdfImageDescriptor('Width', int, None, 'W') + height = _PdfImageDescriptor('Height', int, None, 'H') + image_mask = _PdfImageDescriptor('ImageMask', bool, False, 'IM') + _bpc = _PdfImageDescriptor('BitsPerComponent', int, None, 'BPC') + _colorspaces = _PdfImageDescriptor('ColorSpace', inline_remove_abbrevs, [], 'CS') + filters = _PdfImageDescriptor('Filter', inline_remove_abbrevs, [], 'F') + decode_parms = _PdfImageDescriptor('DecodeParms', dict_or_array_dict, [], 'DP') + + @property + def is_inline(self): + return True + + def __repr__(self): + return '<pikepdf.PdfInlineImage image mode={} size={}x{} at {}>'.format( + self.mode, self.width, self.height, hex(id(self))) + + def extract_to(self, *, stream): + raise UnsupportedImageTypeError("inline images don't support extract") + + def read_bytes(self): + raise NotImplementedError("qpdf returns compressed") + #return self._data._inline_image_bytes() + + def get_stream_buffer(self): + raise NotImplementedError("qpdf returns compressed") + #return memoryview(self._data.inline_image_bytes()) + + def page_to_svg(page): pdf = Pdf.new() pdf.pages.append(page) diff --git a/tests/resources/image-mono-inline.pdf b/tests/resources/image-mono-inline.pdf new file mode 100644 index 0000000..6099821 --- /dev/null +++ b/tests/resources/image-mono-inline.pdf @@ -0,0 +1,74 @@ +%PDF-1.3 +%“Œ‹ž ReportLab Generated PDF document http://www.reportlab.com +1 0 obj +<< +/F1 2 0 R +>> +endobj +2 0 obj +<< +/BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font +>> +endobj +3 0 obj +<< +/Contents 7 0 R /MediaBox [ 0 0 576 432 ] /Parent 6 0 R /Resources << +/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] +>> /Rotate 0 /Trans << + +>> + /Type /Page +>> +endobj +4 0 obj +<< +/Outlines 8 0 R /PageMode /UseNone /Pages 6 0 R /Type /Catalog +>> +endobj +5 0 obj +<< +/Author (anonymous) /CreationDate (D:20180523163359+08'00') /Creator (ReportLab PDF Library - www.reportlab.com) /Keywords () /ModDate (D:20180523163359+08'00') /Producer (ReportLab PDF Library - www.reportlab.com) + /Subject (unspecified) /Title (untitled) /Trapped /False +>> +endobj +6 0 obj +<< +/Count 1 /Kids [ 3 0 R ] /Type /Pages +>> +endobj +7 0 obj +<< +/Filter [ /ASCII85Decode /FlateDecode ] /Length 160 +>> +stream +GapQh0E=F,0U\H3T\pNYT^QKk?tc>IP,;W#U1^23ihPEM_H!OBD?NuCV35o1S34@S_\F^:oK>Hm;h-Z[EXl-s%rujq)PReFmq?,:AE:[LH+It*-RhJ''s"89(Znk7AqhhifU*t"6lX_1EOgC9"`c&2!<C(o*SU~>endstream +endobj +8 0 obj +<< +/Count 0 /Type /Outlines +>> +endobj +xref +0 9 +0000000000 65535 f +0000000073 00000 n +0000000104 00000 n +0000000211 00000 n +0000000404 00000 n +0000000488 00000 n +0000000784 00000 n +0000000843 00000 n +0000001093 00000 n +trailer +<< +/ID +[<d320ef377e84a0f70d9470e6d94ea664><d320ef377e84a0f70d9470e6d94ea664>] +% ReportLab generated PDF document -- digest (http://www.reportlab.com) + +/Info 5 0 R +/Root 4 0 R +/Size 9 +>> +startxref +1139 +%%EOF diff --git a/tests/test_image_access.py b/tests/test_image_access.py index 56ca82c..9124414 100644 --- a/tests/test_image_access.py +++ b/tests/test_image_access.py @@ -5,7 +5,8 @@ from PIL import Image import zlib import sys -from pikepdf import Pdf, Object, PdfImage, PdfError, Name, Null +from pikepdf import (Pdf, Object, PdfImage, PdfError, Name, Null, + parse_content_stream, ObjectType, PdfInlineImage) @pytest.fixture @@ -74,3 +75,18 @@ def test_lowlevel_replace_jpeg(congress, outdir): pdf = congress[1] pdf.save(outdir / 'congress_gray.pdf') + + +@pytest.fixture +def inline(resources): + pdf = Pdf.open(resources / 'image-mono-inline.pdf') + for operands, command in parse_content_stream(pdf.pages[0]): + if operands and isinstance(operands[0], PdfInlineImage): + return operands[0], pdf + + +def test_inline(inline): + iimage, pdf = inline + assert iimage.width == 8 + assert iimage.image_mask == False + assert iimage.mode == 'RGB' |