Add support for accessing inline images

author: James R. Barlow <jim@purplerock.ca> 2018-05-24 15:59:47 -0700
committer: James R. Barlow <jim@purplerock.ca> 2018-05-24 15:59:47 -0700
commit: 5acb9dcf13ab04dce7a1c2e76de3658d5fe49fcc (patch)
tree: da2b50c454e28f67c03cd02b91096264969290e2
parent: a7dd5030463e9d4ed592361ac8eac7cef8a325f3 (diff)
5 files changed, 194 insertions, 10 deletions
diff --git a/src/pikepdf/__init__.py b/src/pikepdf/__init__.py
index e3f8046..2549d45 100644
--- a/src/pikepdf/__init__.py
+++ b/src/pikepdf/__init__.py
@@ -6,6 +6,7 @@
 
 
 from collections import namedtuple
+from enum import Enum
 from pkg_resources import get_distribution, DistributionNotFound
 
 import os
@@ -26,7 +27,7 @@ from ._qpdf import (Object, ObjectType, PdfError, Pdf, PasswordError,
 from ._objects import (Boolean, Integer, Real, Name, String, Array, Dictionary,
         Stream, Operator, Null)
 
-from ._pdfimage import PdfImage, UnsupportedImageTypeError
+from ._pdfimage import PdfImage, PdfInlineImage, UnsupportedImageTypeError
 
 __libqpdf_version__ = _qpdf.qpdf_version()
 
@@ -46,13 +47,36 @@ class _OperandGrouper(_qpdf.StreamParser):
         super().__init__()
         self.instructions = []
         self._tokens = []
+        self._inline_image = False
+        self._inline_image_metadata = []
 
     def handle_object(self, obj):
         if obj.type_code == ObjectType.operator:
             instruction = self.PdfInstruction(
                 operands=self._tokens, operator=obj)
-            self.instructions.append(instruction)
             self._tokens = []
+
+            if obj == Operator('BI'):  # start of inline image: emit nothing yet
+                self._inline_image = True
+            elif self._inline_image:
+                if obj == Operator('ID'):  # operands gathered since BI
+                    self._inline_image_metadata = instruction.operands
+                elif obj == Operator('EI'):  # end: emit one combined instruction
+                    inline_image_data = instruction.operands[0]
+                    iimage = PdfInlineImage(
+                        image_data=inline_image_data,
+                        image_object=self._inline_image_metadata
+                    )
+                    instruction = self.PdfInstruction(
+                        operands=[iimage],
+                        operator=Operator('INLINE IMAGE')
+                    )
+                    self.instructions.append(instruction)
+                    self._inline_image = False
+                    self._inline_image_metadata = []
+            else:
+                self.instructions.append(instruction)
+
         else:
             self._tokens.append(obj)
 
diff --git a/src/pikepdf/_objects.py b/src/pikepdf/_objects.py
index f4a45df..a02e110 100644
--- a/src/pikepdf/_objects.py
+++ b/src/pikepdf/_objects.py
@@ -47,6 +47,8 @@ class Integer(metaclass=_ObjectMeta):
     def __new__(cls, n):
+        if not isinstance(n, int) and n == int(n):  # coerce before the size check
+            n = int(n)
         if n.bit_length() >= 64:
             raise ValueError('Value is too large for 64-bit integer')
         return _qpdf._new_integer(n)
 
 
diff --git a/src/pikepdf/_pdfimage.py b/src/pikepdf/_pdfimage.py
index 1163781..43bd1d7 100644
--- a/src/pikepdf/_pdfimage.py
+++ b/src/pikepdf/_pdfimage.py
@@ -21,18 +21,27 @@ class UnsupportedImageTypeError(Exception):
 
 
 class _PdfImageDescriptor:
-    def __init__(self, name, type_, default):
+    def __init__(self, name, type_, default, inline_name=None, inline_map=None):
         self.name = name
         self.type = type_
         self.default = default
+        self.inline_name = inline_name
+        self.inline_map = inline_map
 
     def __get__(self, wrapper, wrapperclass):
-        val = getattr(wrapper.obj, self.name, self.default)
+        sentinel = object()
+        val = sentinel
+        if self.inline_name:
+            val = getattr(wrapper.obj, self.inline_name, sentinel)
+        if val is sentinel:
+            val = getattr(wrapper.obj, self.name, self.default)
         if self.type == bool:
             return val.as_bool() if isinstance(val, Object) else bool(val)
         return self.type(val)
 
     def __set__(self, wrapper, val):
+        if self.inline_name:
+            raise NotImplementedError("editing inline images")
         setattr(wrapper.obj, self.name, val)
 
 
@@ -57,12 +66,9 @@ class PdfImage:
     SIMPLE_COLORSPACES = ('/DeviceRGB', '/DeviceGray', '/CalRGB', '/CalGray')
 
     def __init__(self, obj):
-        if obj.type_code not in (ObjectType.stream, ObjectType.inlineimage):
-            raise TypeError("can't construct PdfImage from non-image")
         if obj.type_code == ObjectType.stream and \
                 obj.stream_dict.get("/Subtype") != "/Image":
             raise TypeError("can't construct PdfImage from non-image")
-
         self.obj = obj
 
     width = _PdfImageDescriptor('Width', int, None)
@@ -96,6 +102,10 @@ class PdfImage:
         raise NotImplementedError("not sure how to get colorspace")
 
     @property
+    def is_inline(self):
+        return False
+
+    @property
     def indexed(self):
         return self._colorspaces[0] == '/Indexed'
 
@@ -180,6 +190,11 @@ class PdfImage:
 
         raise UnsupportedImageTypeError()
 
+    def read_bytes(self):
+        return self.obj.read_bytes()
+
+    def get_stream_buffer(self):
+        return self.obj.get_stream_buffer()
 
     def as_pil_image(self):
         """
@@ -199,10 +214,10 @@ class PdfImage:
         if self.mode == 'RGB':
             # No point in accessing the buffer here, size qpdf decodes to 3-byte
             # RGB and Pillow needs RGBX for raw access
-            data = self.obj.read_bytes()
+            data = self.read_bytes()
             im = Image.frombytes('RGB', self.size, data)
         elif self.mode in ('L', 'P'):
-            buffer = self.obj.get_stream_buffer()
+            buffer = self.get_stream_buffer()
             stride = 0  # tell Pillow to calculate stride from line width
             ystep = 1  # image is top to bottom in memory
             im = Image.frombuffer('L', self.size, buffer, "raw", 'L', stride,
@@ -268,6 +283,59 @@ class PdfImage:
         return b.getvalue()
 
 
+def inline_remove_abbrevs(value):  # expand abbreviated inline-image names
+    abbrevs = {
+        '/G': '/DeviceGray',
+        '/RGB': '/DeviceRGB',
+        '/CMYK': '/DeviceCMYK',
+        '/I': '/Indexed',
+        '/AHx': '/ASCIIHexDecode',
+        '/A85': '/ASCII85Decode',
+        '/LZW': '/LZWDecode',
+        '/RL': '/RunLengthDecode',
+        '/CCF': '/CCITTFaxDecode',
+        '/DCT': '/DCTDecode'
+    }
+    return [abbrevs.get(item, item) for item in array_str(value)]  # unknown names pass through
+
+
+class PdfInlineImage(PdfImage):  # wraps an inline image's data and operand list
+
+    def __init__(self, *, image_data, image_object: list):
+        self._data = image_data
+        self._image_object = image_object
+
+        reparse = ' '.join([obj.unparse_resolved() for obj in image_object])
+        self.obj = Object.parse(('<< ' + reparse + ' >>').encode('ascii'))  # operands re-parsed as one dictionary
+
+    width = _PdfImageDescriptor('Width', int, None, 'W')  # 4th arg: abbreviated key, tried first
+    height = _PdfImageDescriptor('Height', int, None, 'H')
+    image_mask = _PdfImageDescriptor('ImageMask', bool, False, 'IM')
+    _bpc = _PdfImageDescriptor('BitsPerComponent', int, None, 'BPC')
+    _colorspaces = _PdfImageDescriptor('ColorSpace', inline_remove_abbrevs, [], 'CS')
+    filters = _PdfImageDescriptor('Filter', inline_remove_abbrevs, [], 'F')
+    decode_parms = _PdfImageDescriptor('DecodeParms', dict_or_array_dict, [], 'DP')
+
+    @property
+    def is_inline(self):
+        return True
+
+    def __repr__(self):
+        return '<pikepdf.PdfInlineImage image mode={} size={}x{} at {}>'.format(
+            self.mode, self.width, self.height, hex(id(self)))
+
+    def extract_to(self, *, stream):
+        raise UnsupportedImageTypeError("inline images don't support extract")
+
+    def read_bytes(self):
+        raise NotImplementedError("qpdf returns compressed")
+        #return self._data._inline_image_bytes()
+
+    def get_stream_buffer(self):
+        raise NotImplementedError("qpdf returns compressed")
+        #return memoryview(self._data.inline_image_bytes())
+
+
 def page_to_svg(page):
     pdf = Pdf.new()
     pdf.pages.append(page)
diff --git a/tests/resources/image-mono-inline.pdf b/tests/resources/image-mono-inline.pdf
new file mode 100644
index 0000000..6099821
--- /dev/null
+++ b/tests/resources/image-mono-inline.pdf
@@ -0,0 +1,74 @@
+%PDF-1.3
+%���� ReportLab Generated PDF document http://www.reportlab.com
+1 0 obj
+<<
+/F1 2 0 R
+>>
+endobj
+2 0 obj
+<<
+/BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font
+>>
+endobj
+3 0 obj
+<<
+/Contents 7 0 R /MediaBox [ 0 0 576 432 ] /Parent 6 0 R /Resources <<
+/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
+>> /Rotate 0 /Trans <<
+
+>> 
+  /Type /Page
+>>
+endobj
+4 0 obj
+<<
+/Outlines 8 0 R /PageMode /UseNone /Pages 6 0 R /Type /Catalog
+>>
+endobj
+5 0 obj
+<<
+/Author (anonymous) /CreationDate (D:20180523163359+08'00') /Creator (ReportLab PDF Library - www.reportlab.com) /Keywords () /ModDate (D:20180523163359+08'00') /Producer (ReportLab PDF Library - www.reportlab.com) 
+  /Subject (unspecified) /Title (untitled) /Trapped /False
+>>
+endobj
+6 0 obj
+<<
+/Count 1 /Kids [ 3 0 R ] /Type /Pages
+>>
+endobj
+7 0 obj
+<<
+/Filter [ /ASCII85Decode /FlateDecode ] /Length 160
+>>
+stream
+GapQh0E=F,0U\H3T\pNYT^QKk?tc>IP,;W#U1^23ihPEM_H!OBD?NuCV35o1S34@S_\F^:oK>Hm;h-Z[EXl-s%rujq)PReFmq?,:AE:[LH+It*-RhJ''s"89(Znk7AqhhifU*t"6lX_1EOgC9"`c&2!<C(o*SU~>endstream
+endobj
+8 0 obj
+<<
+/Count 0 /Type /Outlines
+>>
+endobj
+xref
+0 9
+0000000000 65535 f 
+0000000073 00000 n 
+0000000104 00000 n 
+0000000211 00000 n 
+0000000404 00000 n 
+0000000488 00000 n 
+0000000784 00000 n 
+0000000843 00000 n 
+0000001093 00000 n 
+trailer
+<<
+/ID 
+[<d320ef377e84a0f70d9470e6d94ea664><d320ef377e84a0f70d9470e6d94ea664>]
+% ReportLab generated PDF document -- digest (http://www.reportlab.com)
+
+/Info 5 0 R
+/Root 4 0 R
+/Size 9
+>>
+startxref
+1139
+%%EOF
diff --git a/tests/test_image_access.py b/tests/test_image_access.py
index 56ca82c..9124414 100644
--- a/tests/test_image_access.py
+++ b/tests/test_image_access.py
@@ -5,7 +5,8 @@ from PIL import Image
 import zlib
 import sys
 
-from pikepdf import Pdf, Object, PdfImage, PdfError, Name, Null
+from pikepdf import (Pdf, Object, PdfImage, PdfError, Name, Null,
+        parse_content_stream, ObjectType, PdfInlineImage)
 
 
 @pytest.fixture
@@ -74,3 +75,18 @@ def test_lowlevel_replace_jpeg(congress, outdir):
 
     pdf = congress[1]
     pdf.save(outdir / 'congress_gray.pdf')
+
+
+@pytest.fixture
+def inline(resources):
+    pdf = Pdf.open(resources / 'image-mono-inline.pdf')
+    for operands, command in parse_content_stream(pdf.pages[0]):
+        if operands and isinstance(operands[0], PdfInlineImage):
+            return operands[0], pdf
+
+
+def test_inline(inline):
+    iimage, pdf = inline
+    assert iimage.width == 8
+    assert iimage.image_mask == False
+    assert iimage.mode == 'RGB'
author	James R. Barlow <jim@purplerock.ca>	2018-05-24 15:59:47 -0700
committer	James R. Barlow <jim@purplerock.ca>	2018-05-24 15:59:47 -0700
commit	5acb9dcf13ab04dce7a1c2e76de3658d5fe49fcc (patch)
tree	da2b50c454e28f67c03cd02b91096264969290e2
parent	a7dd5030463e9d4ed592361ac8eac7cef8a325f3 (diff)