summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJames R. Barlow <jim@purplerock.ca>2018-05-24 15:59:47 -0700
committerJames R. Barlow <jim@purplerock.ca>2018-05-24 15:59:47 -0700
commit5acb9dcf13ab04dce7a1c2e76de3658d5fe49fcc (patch)
treeda2b50c454e28f67c03cd02b91096264969290e2
parenta7dd5030463e9d4ed592361ac8eac7cef8a325f3 (diff)
Add support for accessing inline images
-rw-r--r--src/pikepdf/__init__.py28
-rw-r--r--src/pikepdf/_objects.py2
-rw-r--r--src/pikepdf/_pdfimage.py82
-rw-r--r--tests/resources/image-mono-inline.pdf74
-rw-r--r--tests/test_image_access.py18
5 files changed, 194 insertions, 10 deletions
diff --git a/src/pikepdf/__init__.py b/src/pikepdf/__init__.py
index e3f8046..2549d45 100644
--- a/src/pikepdf/__init__.py
+++ b/src/pikepdf/__init__.py
@@ -6,6 +6,7 @@
from collections import namedtuple
+from enum import Enum
from pkg_resources import get_distribution, DistributionNotFound
import os
@@ -26,7 +27,7 @@ from ._qpdf import (Object, ObjectType, PdfError, Pdf, PasswordError,
from ._objects import (Boolean, Integer, Real, Name, String, Array, Dictionary,
Stream, Operator, Null)
-from ._pdfimage import PdfImage, UnsupportedImageTypeError
+from ._pdfimage import PdfImage, PdfInlineImage, UnsupportedImageTypeError
__libqpdf_version__ = _qpdf.qpdf_version()
@@ -46,13 +47,36 @@ class _OperandGrouper(_qpdf.StreamParser):
super().__init__()
self.instructions = []
self._tokens = []
+ self._inline_image = False
+ self._inline_image_metadata = []
def handle_object(self, obj):
if obj.type_code == ObjectType.operator:
instruction = self.PdfInstruction(
operands=self._tokens, operator=obj)
- self.instructions.append(instruction)
self._tokens = []
+
+ if obj == Operator('BI'):
+ self._inline_image= True
+ elif self._inline_image:
+ if obj == Operator('ID'):
+ self._inline_image_metadata = instruction.operands
+ elif obj == Operator('EI'):
+ inline_image_data = instruction.operands[0]
+ iimage = PdfInlineImage(
+ image_data=inline_image_data,
+ image_object=self._inline_image_metadata
+ )
+ instruction = self.PdfInstruction(
+ operands=[iimage],
+ operator=Operator('INLINE IMAGE')
+ )
+ self.instructions.append(instruction)
+ self._inline_image = False
+ self._inline_image_metadata = []
+ else:
+ self.instructions.append(instruction)
+
else:
self._tokens.append(obj)
diff --git a/src/pikepdf/_objects.py b/src/pikepdf/_objects.py
index f4a45df..a02e110 100644
--- a/src/pikepdf/_objects.py
+++ b/src/pikepdf/_objects.py
@@ -47,6 +47,8 @@ class Integer(metaclass=_ObjectMeta):
def __new__(cls, n):
+        """Return the object built by ``_qpdf._new_integer`` for ``n``.
+
+        A non-``int`` value that compares equal to its ``int()`` conversion
+        (for example ``2.0``) is converted to ``int`` first.
+
+        Raises:
+            ValueError: if the value has a bit length of 64 or more.
+        """
+        # Coerce before the size check: bit_length() is an ``int`` method, so
+        # checking first raised AttributeError for floats and left this
+        # conversion unreachable.
+        if not isinstance(n, int) and n == int(n):
+            n = int(n)
if n.bit_length() >= 64:
raise ValueError('Value is too large for 64-bit integer')
return _qpdf._new_integer(n)
diff --git a/src/pikepdf/_pdfimage.py b/src/pikepdf/_pdfimage.py
index 1163781..43bd1d7 100644
--- a/src/pikepdf/_pdfimage.py
+++ b/src/pikepdf/_pdfimage.py
@@ -21,18 +21,27 @@ class UnsupportedImageTypeError(Exception):
class _PdfImageDescriptor:
- def __init__(self, name, type_, default):
+ def __init__(self, name, type_, default, inline_name=None, inline_map=None):
self.name = name
self.type = type_
self.default = default
+ self.inline_name = inline_name
+ self.inline_map = inline_map
def __get__(self, wrapper, wrapperclass):
- val = getattr(wrapper.obj, self.name, self.default)
+ sentinel = object()
+ val = sentinel
+ if self.inline_name:
+ val = getattr(wrapper.obj, self.inline_name, sentinel)
+ if val is sentinel:
+ val = getattr(wrapper.obj, self.name, self.default)
if self.type == bool:
return val.as_bool() if isinstance(val, Object) else bool(val)
return self.type(val)
def __set__(self, wrapper, val):
+ if self.inline_name:
+ raise NotImplementedError("editing inline images")
setattr(wrapper.obj, self.name, val)
@@ -57,12 +66,9 @@ class PdfImage:
SIMPLE_COLORSPACES = ('/DeviceRGB', '/DeviceGray', '/CalRGB', '/CalGray')
def __init__(self, obj):
- if obj.type_code not in (ObjectType.stream, ObjectType.inlineimage):
- raise TypeError("can't construct PdfImage from non-image")
if obj.type_code == ObjectType.stream and \
obj.stream_dict.get("/Subtype") != "/Image":
raise TypeError("can't construct PdfImage from non-image")
-
self.obj = obj
width = _PdfImageDescriptor('Width', int, None)
@@ -96,6 +102,10 @@ class PdfImage:
raise NotImplementedError("not sure how to get colorspace")
@property
+ def is_inline(self):
+ return False
+
+ @property
def indexed(self):
return self._colorspaces[0] == '/Indexed'
@@ -180,6 +190,11 @@ class PdfImage:
raise UnsupportedImageTypeError()
+ def read_bytes(self):
+ return self.obj.read_bytes()
+
+ def get_stream_buffer(self):
+ return self.obj.get_stream_buffer()
def as_pil_image(self):
"""
@@ -199,10 +214,10 @@ class PdfImage:
if self.mode == 'RGB':
# No point in accessing the buffer here, since qpdf decodes to 3-byte
# RGB and Pillow needs RGBX for raw access
- data = self.obj.read_bytes()
+ data = self.read_bytes()
im = Image.frombytes('RGB', self.size, data)
elif self.mode in ('L', 'P'):
- buffer = self.obj.get_stream_buffer()
+ buffer = self.get_stream_buffer()
stride = 0 # tell Pillow to calculate stride from line width
ystep = 1 # image is top to bottom in memory
im = Image.frombuffer('L', self.size, buffer, "raw", 'L', stride,
@@ -268,6 +283,59 @@ class PdfImage:
return b.getvalue()
+def inline_remove_abbrevs(value):
+    """Replace abbreviated inline-image names with their full-length names.
+
+    ``value`` is first passed through ``array_str``; every item it yields
+    that is a known abbreviation (such as ``/G`` or ``/DCT``) is swapped
+    for the corresponding long name, and any other item is kept as is.
+
+    Returns:
+        list: the expanded items, in the order ``array_str`` produced them.
+    """
+    full_names = {
+        '/G': '/DeviceGray',
+        '/RGB': '/DeviceRGB',
+        '/CMYK': '/DeviceCMYK',
+        '/I': '/Indexed',
+        '/AHx': '/ASCIIHexDecode',
+        '/A85': '/ASCII85Decode',
+        '/LZW': '/LZWDecode',
+        '/RL': '/RunLengthDecode',
+        '/CCF': '/CCITTFaxDecode',
+        '/DCT': '/DCTDecode'
+    }
+    expanded = []
+    for item in array_str(value):
+        # Unknown names fall through unchanged.
+        expanded.append(full_names.get(item, item))
+    return expanded
+
+
+
+class PdfInlineImage(PdfImage):
+    """Read-only wrapper around an inline image from a content stream.
+
+    The image's key/value operands are re-parsed into a single object so
+    the usual image properties can be read from it, using the abbreviated
+    inline key (``W``, ``H``, ...) when present and the full key otherwise.
+    """
+
+    def __init__(self, *, image_data, image_object: tuple):
+        """Build the wrapper from the image data and its metadata operands.
+
+        Args:
+            image_data: object holding the inline image data.
+            image_object: sequence of objects, each providing
+                ``unparse_resolved()``, that describe the image.
+        """
+        self._data = image_data
+        self._image_object = image_object
+
+        # Join the unparsed operands and wrap them in << >> so they parse
+        # as one dictionary-like object.
+        entries = ' '.join(item.unparse_resolved() for item in image_object)
+        dict_source = '<< {} >>'.format(entries)
+        self.obj = Object.parse(dict_source.encode('ascii'))
+
+    width = _PdfImageDescriptor('Width', int, None, inline_name='W')
+    height = _PdfImageDescriptor('Height', int, None, inline_name='H')
+    image_mask = _PdfImageDescriptor(
+        'ImageMask', bool, False, inline_name='IM')
+    _bpc = _PdfImageDescriptor(
+        'BitsPerComponent', int, None, inline_name='BPC')
+    _colorspaces = _PdfImageDescriptor(
+        'ColorSpace', inline_remove_abbrevs, [], inline_name='CS')
+    filters = _PdfImageDescriptor(
+        'Filter', inline_remove_abbrevs, [], inline_name='F')
+    decode_parms = _PdfImageDescriptor(
+        'DecodeParms', dict_or_array_dict, [], inline_name='DP')
+
+    @property
+    def is_inline(self):
+        """Always ``True`` for this class."""
+        return True
+
+    def __repr__(self):
+        template = '<pikepdf.PdfInlineImage image mode={} size={}x{} at {}>'
+        return template.format(
+            self.mode, self.width, self.height, hex(id(self)))
+
+    def extract_to(self, *, stream):
+        """Not supported for inline images.
+
+        Raises:
+            UnsupportedImageTypeError: always.
+        """
+        raise UnsupportedImageTypeError("inline images don't support extract")
+
+    def read_bytes(self):
+        """Not implemented for inline images.
+
+        Raises:
+            NotImplementedError: always.
+        """
+        # NOTE(review): a disabled draft returned
+        # self._data._inline_image_bytes(); per the message below, qpdf's
+        # result is still compressed.
+        raise NotImplementedError("qpdf returns compressed")
+
+    def get_stream_buffer(self):
+        """Not implemented for inline images.
+
+        Raises:
+            NotImplementedError: always.
+        """
+        # NOTE(review): a disabled draft returned
+        # memoryview(self._data.inline_image_bytes()) -- note the helper
+        # name differs from the one drafted in read_bytes(); confirm which
+        # is correct before enabling either.
+        raise NotImplementedError("qpdf returns compressed")
+
+
+
def page_to_svg(page):
pdf = Pdf.new()
pdf.pages.append(page)
diff --git a/tests/resources/image-mono-inline.pdf b/tests/resources/image-mono-inline.pdf
new file mode 100644
index 0000000..6099821
--- /dev/null
+++ b/tests/resources/image-mono-inline.pdf
@@ -0,0 +1,74 @@
+%PDF-1.3
+%“Œ‹ž ReportLab Generated PDF document http://www.reportlab.com
+1 0 obj
+<<
+/F1 2 0 R
+>>
+endobj
+2 0 obj
+<<
+/BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font
+>>
+endobj
+3 0 obj
+<<
+/Contents 7 0 R /MediaBox [ 0 0 576 432 ] /Parent 6 0 R /Resources <<
+/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
+>> /Rotate 0 /Trans <<
+
+>>
+ /Type /Page
+>>
+endobj
+4 0 obj
+<<
+/Outlines 8 0 R /PageMode /UseNone /Pages 6 0 R /Type /Catalog
+>>
+endobj
+5 0 obj
+<<
+/Author (anonymous) /CreationDate (D:20180523163359+08'00') /Creator (ReportLab PDF Library - www.reportlab.com) /Keywords () /ModDate (D:20180523163359+08'00') /Producer (ReportLab PDF Library - www.reportlab.com)
+ /Subject (unspecified) /Title (untitled) /Trapped /False
+>>
+endobj
+6 0 obj
+<<
+/Count 1 /Kids [ 3 0 R ] /Type /Pages
+>>
+endobj
+7 0 obj
+<<
+/Filter [ /ASCII85Decode /FlateDecode ] /Length 160
+>>
+stream
+GapQh0E=F,0U\H3T\pNYT^QKk?tc>IP,;W#U1^23ihPEM_H!OBD?NuCV35o1S34@S_\F^:oK>Hm;h-Z[EXl-s%rujq)PReFmq?,:AE:[LH+It*-RhJ''s"89(Znk7AqhhifU*t"6lX_1EOgC9"`c&2!<C(o*SU~>endstream
+endobj
+8 0 obj
+<<
+/Count 0 /Type /Outlines
+>>
+endobj
+xref
+0 9
+0000000000 65535 f
+0000000073 00000 n
+0000000104 00000 n
+0000000211 00000 n
+0000000404 00000 n
+0000000488 00000 n
+0000000784 00000 n
+0000000843 00000 n
+0000001093 00000 n
+trailer
+<<
+/ID
+[<d320ef377e84a0f70d9470e6d94ea664><d320ef377e84a0f70d9470e6d94ea664>]
+% ReportLab generated PDF document -- digest (http://www.reportlab.com)
+
+/Info 5 0 R
+/Root 4 0 R
+/Size 9
+>>
+startxref
+1139
+%%EOF
diff --git a/tests/test_image_access.py b/tests/test_image_access.py
index 56ca82c..9124414 100644
--- a/tests/test_image_access.py
+++ b/tests/test_image_access.py
@@ -5,7 +5,8 @@ from PIL import Image
import zlib
import sys
-from pikepdf import Pdf, Object, PdfImage, PdfError, Name, Null
+from pikepdf import (Pdf, Object, PdfImage, PdfError, Name, Null,
+ parse_content_stream, ObjectType, PdfInlineImage)
@pytest.fixture
@@ -74,3 +75,18 @@ def test_lowlevel_replace_jpeg(congress, outdir):
pdf = congress[1]
pdf.save(outdir / 'congress_gray.pdf')
+
+
+@pytest.fixture
+def inline(resources):
+    """Return ``(inline_image, pdf)`` for the first inline image on page 0.
+
+    Opens ``image-mono-inline.pdf`` from the ``resources`` directory and
+    scans the parsed content stream of its first page for an instruction
+    whose first operand is a ``PdfInlineImage``.  The ``Pdf`` is returned
+    alongside the image so the test holds a reference to it.
+    """
+    pdf = Pdf.open(resources / 'image-mono-inline.pdf')
+    for operands, _command in parse_content_stream(pdf.pages[0]):
+        if operands and isinstance(operands[0], PdfInlineImage):
+            return operands[0], pdf
+    # Without this the fixture would return None and the test would fail
+    # with an unrelated "cannot unpack" TypeError.
+    pytest.fail('no inline image found in image-mono-inline.pdf')
+
+
+def test_inline(inline):
+    """Check the properties read from the inline image in the fixture PDF."""
+    # The Pdf is unpacked only so it stays referenced during the test.
+    iimage, _pdf = inline
+    assert iimage.width == 8
+    # Truthiness check instead of ``== False`` (PEP 8).
+    assert not iimage.image_mask
+    assert iimage.mode == 'RGB'