tests/test_image_access.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358

import zlib
from io import BytesIO
from pathlib import Path

import pytest
from PIL import Image
from PIL import features as PIL_features

from pikepdf import (
    Array,
    Dictionary,
    Name,
    Pdf,
    PdfError,
    PdfImage,
    PdfInlineImage,
    Stream,
    StreamDecodeLevel,
    parse_content_stream,
)
from pikepdf._cpphelpers import fspath
from pikepdf.models.image import UnsupportedImageTypeError, NotExtractableError


# pylint: disable=redefined-outer-name


def first_image_in(filename):
    pdf = Pdf.open(filename)
    pdfimagexobj = next(iter(pdf.pages[0].images.values()))
    return pdfimagexobj, pdf


@pytest.fixture
def congress(resources):
    return first_image_in(resources / 'congress.pdf')


@pytest.fixture
def sandwich(resources):
    return first_image_in(resources / 'sandwich.pdf')


def test_image_from_nonimage(resources):
    pdf = Pdf.open(resources / 'congress.pdf')
    resources = pdf.pages[0].Contents
    with pytest.raises(TypeError):
        PdfImage(resources)


def test_image(congress):
    pdfimage = PdfImage(congress[0])
    pillowimage = pdfimage.as_pil_image()

    assert pillowimage.mode == pdfimage.mode
    assert pillowimage.size == pdfimage.size


def test_imagemask(congress):
    assert PdfImage(congress[0]).image_mask == False


def test_image_replace(congress, outdir):
    pdfimage = PdfImage(congress[0])
    pillowimage = pdfimage.as_pil_image()

    grayscale = pillowimage.convert('L')
    grayscale = grayscale.resize((4, 4))  # So it is not obnoxious on error

    congress[0].write(zlib.compress(grayscale.tobytes()), filter=Name("/FlateDecode"))
    congress[0].ColorSpace = Name("/DeviceGray")
    pdf = congress[1]
    pdf.save(outdir / 'congress_gray.pdf')


def test_lowlevel_jpeg(congress):
    raw_bytes = congress[0].read_raw_bytes()
    with pytest.raises(PdfError):
        congress[0].read_bytes()

    im = Image.open(BytesIO(raw_bytes))
    assert im.format == 'JPEG'

    pim = PdfImage(congress[0])
    b = BytesIO()
    pim.extract_to(stream=b)
    b.seek(0)
    im = Image.open(b)
    assert im.size == (congress[0].Width, congress[0].Height)
    assert im.mode == 'RGB'


def test_lowlevel_replace_jpeg(congress, outdir):
    # This test will modify the PDF so needs its own image
    raw_bytes = congress[0].read_raw_bytes()

    im = Image.open(BytesIO(raw_bytes))
    grayscale = im.convert('L')
    grayscale = grayscale.resize((4, 4))  # So it is not obnoxious on error

    congress[0].write(
        zlib.compress(grayscale.tobytes()[:10]), filter=Name("/FlateDecode")
    )
    congress[0].ColorSpace = Name('/DeviceGray')

    pdf = congress[1]
    pdf.save(outdir / 'congress_gray.pdf')


@pytest.fixture
def inline(resources):
    pdf = Pdf.open(resources / 'image-mono-inline.pdf')
    for operands, _command in parse_content_stream(pdf.pages[0]):
        if operands and isinstance(operands[0], PdfInlineImage):
            return operands[0], pdf


def test_inline(inline):
    iimage, _pdf = inline
    assert iimage.width == 8
    assert iimage.image_mask == False
    assert iimage.mode == 'RGB'
    assert iimage.is_inline
    assert iimage.colorspace == '/DeviceRGB'


def test_bits_per_component_missing(congress):
    cong_im = congress[0]
    del cong_im.stream_dict['/BitsPerComponent']
    assert PdfImage(congress[0]).bits_per_component == 8


@pytest.mark.parametrize(
    'w,h,pixeldata,cs,bpc',
    [
        (1, 1, b'\xff', '/DeviceGray', 1),
        (1, 1, b'\xf0', '/DeviceGray', 8),
        (1, 1, b'\xff\x00\xff', '/DeviceRGB', 8),
    ],
)
def test_image_roundtrip(outdir, w, h, pixeldata, cs, bpc):
    pdf = Pdf.new()

    image_data = pixeldata * (w * h)

    image = Stream(pdf, image_data)
    image.Type = Name('/XObject')
    image.Subtype = Name('/Image')
    image.ColorSpace = Name(cs)
    image.BitsPerComponent = bpc
    image.Width = w
    image.Height = h

    xobj = {'/Im1': image}
    resources = {'/XObject': xobj}
    mediabox = [0, 0, 100, 100]
    stream = b'q 100 0 0 100 0 0 cm /Im1 Do Q'
    contents = Stream(pdf, stream)

    page_dict = {
        '/Type': Name('/Page'),
        '/MediaBox': mediabox,
        '/Contents': contents,
        '/Resources': resources,
    }
    page = pdf.make_indirect(page_dict)

    pdf.pages.append(page)
    outfile = outdir / 'test{w}{h}{cs}{bpc}.pdf'.format(w=w, h=h, cs=cs[1:], bpc=bpc)
    pdf.save(
        outfile, compress_streams=False, stream_decode_level=StreamDecodeLevel.none
    )

    p2 = pdf.open(outfile)
    pim = PdfImage(p2.pages[0].Resources.XObject['/Im1'])

    assert pim.bits_per_component == bpc
    assert pim.colorspace == cs
    assert pim.width == w
    assert pim.height == h
    if cs == '/DeviceRGB':
        assert pim.mode == 'RGB'
    elif cs == '/DeviceGray' and bpc == 8:
        assert pim.mode == 'L'
    elif bpc == 1:
        assert pim.mode == '1'
    assert not pim.palette

    assert pim.filters == []
    assert pim.read_bytes() == pixeldata

    outstream = BytesIO()
    pim.extract_to(stream=outstream)
    outstream.seek(0)
    im = Image.open(outstream)
    assert pim.mode == im.mode


@pytest.mark.parametrize(
    'filename,bpc,filters,ext,mode,format_',
    [
        ('sandwich.pdf', 1, ['/CCITTFaxDecode'], '.tif', '1', 'TIFF'),
        ('congress-gray.pdf', 8, ['/DCTDecode'], '.jpg', 'L', 'JPEG'),
        ('congress.pdf', 8, ['/DCTDecode'], '.jpg', 'RGB', 'JPEG'),
        ('cmyk-jpeg.pdf', 8, ['/DCTDecode'], '.jpg', 'CMYK', 'JPEG'),
    ],
)
def test_direct_extract(resources, filename, bpc, filters, ext, mode, format_):
    xobj, _pdf = first_image_in(resources / filename)
    pim = PdfImage(xobj)

    assert pim.bits_per_component == bpc
    assert pim.filters == filters

    outstream = BytesIO()
    outext = pim.extract_to(stream=outstream)
    assert outext == ext, 'unexpected output file'
    outstream.seek(0)

    im = Image.open(outstream)
    assert im.mode == mode
    assert im.format == format_


@pytest.mark.parametrize(
    'filename,bpc',
    [
        ('pal.pdf', 8),
        ('pal-1bit-trivial.pdf', 1),
        pytest.param(
            'pal-1bit-rgb.pdf', 1, marks=pytest.mark.xfail(raises=NotImplementedError)
        ),
    ],
)
def test_image_palette(resources, filename, bpc):
    pdf = Pdf.open(resources / filename)
    pim = PdfImage(next(iter(pdf.pages[0].images.values())))

    assert pim.palette[0] == 'RGB'
    assert pim.colorspace == '/DeviceRGB'
    assert not pim.is_inline
    assert pim.mode == 'P'
    assert pim.bits_per_component == bpc

    outstream = BytesIO()
    pim.extract_to(stream=outstream)


def test_bool_in_inline_image():
    piim = PdfInlineImage(image_data=b'', image_object=(Name.IM, True))
    assert piim.image_mask


@pytest.mark.skipif(
    not PIL_features.check_codec('jpg_2000'), reason='no JPEG2000 codec'
)
def test_jp2(resources):
    pdf = Pdf.open(resources / 'pike-jp2.pdf')
    xobj = next(iter(pdf.pages[0].images.values()))
    pim = PdfImage(xobj)

    assert '/JPXDecode' in pim.filters
    assert pim.colorspace == '/DeviceRGB'
    assert not pim.is_inline
    assert not pim.indexed
    assert pim.mode == 'RGB'
    assert pim.bits_per_component == 8

    outstream = BytesIO()
    pim.extract_to(stream=outstream)
    del pim
    del xobj.ColorSpace

    # If there is no explicit ColorSpace metadata we should get it from the
    # compressed data stream
    pim = PdfImage(xobj)
    assert pim.colorspace == '/DeviceRGB'
    assert pim.bits_per_component == 8


def test_extract_filepath(congress, outdir):
    xobj, _pdf = congress
    pim = PdfImage(xobj)

    # fspath is for Python 3.5
    result = pim.extract_to(fileprefix=fspath(outdir / 'image'))
    assert Path(result).exists()
    assert (outdir / 'image.jpg').exists()


def test_extract_direct_fails_nondefault_colortransform(congress):
    xobj, _pdf = congress

    xobj.DecodeParms = Dictionary(
        ColorTransform=42  # Non standard (or allowed in the spec)
    )
    pim = PdfImage(xobj)

    bio = BytesIO()
    with pytest.raises(NotExtractableError):
        pim._extract_direct(stream=bio)
    with pytest.raises(UnsupportedImageTypeError):
        pim.extract_to(stream=bio)

    xobj.ColorSpace = Name.DeviceCMYK
    pim = PdfImage(xobj)
    with pytest.raises(NotExtractableError):
        pim._extract_direct(stream=bio)
    with pytest.raises(UnsupportedImageTypeError):
        pim.extract_to(stream=bio)


def test_icc_use(resources):
    xobj, _pdf = first_image_in(resources / '1biticc.pdf')

    pim = PdfImage(xobj)
    assert pim.mode == '1'
    assert pim.colorspace == '/ICCBased'
    assert pim.bits_per_component == 1

    assert pim.icc.profile.xcolor_space == 'GRAY'


def test_icc_extract(resources):
    xobj, _pdf = first_image_in(resources / 'tree-icc.pdf')

    pim = PdfImage(xobj)
    assert pim.as_pil_image().info['icc_profile'] == pim.icc.tobytes()


def test_stacked_compression(resources):
    xobj, _pdf = first_image_in(resources / 'pike-flate-jp2.pdf')

    pim = PdfImage(xobj)
    assert pim.mode == 'RGB'
    assert pim.colorspace == '/DeviceRGB'
    assert pim.bits_per_component == 8
    assert pim.filters == ['/FlateDecode', '/JPXDecode']


def test_ccitt_photometry(sandwich):
    xobj, _pdf = sandwich

    pim = PdfImage(xobj)
    im = pim.as_pil_image()
    im = im.convert('L')
    assert im.getpixel((0, 0)) == 255, "Expected white background"


def test_ccitt_encodedbytealign(sandwich):
    xobj, _pdf = sandwich

    # Pretend this is image is "EncodedByteAlign". We don't have a FOSS
    # example of such an image.
    xobj.DecodeParms.EncodedByteAlign = True
    pim = PdfImage(xobj)
    with pytest.raises(UnsupportedImageTypeError):
        pim.as_pil_image()