tests/test_metadata.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342

from pathlib import Path
from datetime import datetime, timezone, timedelta
import re

import pytest
from hypothesis import given, example
from hypothesis.strategies import integers
import pikepdf
from pikepdf import Pdf, Dictionary, Name, PasswordError, Stream
from pikepdf.models.metadata import (
    decode_pdf_date, encode_pdf_date,
    XMP_NS_DC, XMP_NS_PDF, XMP_NS_XMP,
    DateConverter
)

import defusedxml.ElementTree as ET

try:
    from libxmp import XMPMeta
except ImportError:
    XMPMeta = None

pytestmark = pytest.mark.filterwarnings('ignore:.*XMLParser.*:DeprecationWarning')

# pylint: disable=w0621


@pytest.fixture
def vera(resources):
    # Has XMP but no docinfo
    return Pdf.open(resources / 'veraPDF test suite 6-2-10-t02-pass-a.pdf')


@pytest.fixture
def graph(resources):
    # Has XMP and docinfo, all standard format XMP
    return Pdf.open(resources / 'graph.pdf')


@pytest.fixture
def sandwich(resources):
    # Has XMP, docinfo, <?adobe-xap-filters esc="CRLF"?>, shorthand attribute XMP
    return Pdf.open(resources / 'sandwich.pdf')


@pytest.fixture
def trivial(resources):
    # Has no XMP or docinfo
    return Pdf.open(resources / 'pal-1bit-trivial.pdf')


@pytest.fixture
def invalid_creationdate(resources):
    # Has nuls in docinfo, old PDF
    return Pdf.open(resources / 'invalid_creationdate.pdf')


def test_lowlevel(sandwich):
    meta = sandwich.open_metadata()
    assert meta._qname('pdf:Producer') == '{http://ns.adobe.com/pdf/1.3/}Producer'
    assert meta._prefix_from_uri('{http://ns.adobe.com/pdf/1.3/}Producer') == 'pdf:Producer'
    assert 'pdf:Producer' in meta
    assert '{http://ns.adobe.com/pdf/1.3/}Producer' in meta
    assert 'xmp:CreateDate' in meta
    assert meta['xmp:ModifyDate'].startswith('2017')
    assert len(meta) > 0
    assert meta['dc:title'] == 'Untitled'

    assert 'pdf:invalid' not in meta
    assert '{http://ns.adobe.com/pdf/1.3/}invalid' not in meta
    with pytest.raises(TypeError):
        assert ['hi'] in meta

    with pytest.raises(KeyError):
        meta['dc:invalid']
    with pytest.raises(KeyError):
        meta['{http://ns.adobe.com/pdf/1.3/}invalid']
    with pytest.raises(KeyError):
        meta['{http://invalid.com/ns/}doublyinvalid']


def test_no_info(vera, outdir):
    assert vera.trailer.get('/Info') is None, 'need a test file with no /Info'

    assert len(vera.docinfo) == 0
    creator = 'pikepdf test suite'
    vera.docinfo['/Creator'] = creator
    assert vera.docinfo.is_indirect, "/Info must be an indirect object"
    vera.save(outdir / 'out.pdf')

    new = Pdf.open(outdir / 'out.pdf')
    assert new.docinfo['/Creator'] == creator


def test_update_info(graph, outdir):
    new_title = '我敢打赌，你只是想看看这意味着什么'
    graph.docinfo['/Title'] = new_title
    graph.save(outdir / 'out.pdf')

    new = Pdf.open(outdir / 'out.pdf')
    assert new.docinfo['/Title'] == new_title
    assert graph.docinfo['/Author'] == new.docinfo['/Author']

    with pytest.raises(ValueError):
        new.docinfo = Dictionary({'/Keywords': 'bob'})

    new.docinfo = graph.make_indirect(Dictionary({'/Keywords': 'bob'}))
    assert new.docinfo.is_indirect, "/Info must be an indirect object"


def test_copy_info(vera, graph, outdir):
    vera.docinfo = vera.copy_foreign(graph.docinfo)
    assert vera.docinfo.is_indirect, "/Info must be an indirect object"
    vera.save(outdir / 'out.pdf')


def test_add_new_xmp_and_mark(trivial):
    with trivial.open_metadata(
        set_pikepdf_as_editor=False, update_docinfo=False
    ) as xmp_view:
        assert not xmp_view

    with trivial.open_metadata(update_docinfo=False
    ) as xmp:
        assert not xmp  # No changes at this point
    del xmp

    print(trivial.Root.Metadata.read_bytes())

    with trivial.open_metadata(update_docinfo=False
    ) as xmp:
        assert 'pikepdf' in xmp['pdf:Producer']
        assert 'xmp:MetadataDate' in xmp


def test_update_docinfo(vera):
    with vera.open_metadata(set_pikepdf_as_editor=False, update_docinfo=True) as xmp:
        pass
    assert xmp['pdf:Producer'] == vera.docinfo[Name.Producer]
    assert xmp['xmp:CreatorTool'] == vera.docinfo[Name.Creator]
    assert xmp['dc:creator'][0] == vera.docinfo[Name.Author]

    # Test delete propagation
    with vera.open_metadata(set_pikepdf_as_editor=False, update_docinfo=True) as xmp:
        del xmp['dc:creator']
    assert 'dc:creator' not in xmp
    assert Name.Author not in vera.docinfo


@pytest.mark.parametrize('filename', list((Path(__file__).parent / 'resources').glob('*.pdf')))
def test_roundtrip(filename):
    try:
        pdf = Pdf.open(filename)
    except PasswordError:
        return
    with pdf.open_metadata() as xmp:
        for k in xmp.keys():
            if not 'Date' in k:
                xmp[k] = 'A'
    assert '<?xpacket' not in str(xmp)


def test_build_metadata(trivial, graph, outdir):
    with trivial.open_metadata(set_pikepdf_as_editor=False) as xmp:
        xmp.load_from_docinfo(graph.docinfo)
    trivial.save(outdir / 'tmp.pdf')

    pdf = pikepdf.open(outdir / 'tmp.pdf')
    assert pdf.Root.Metadata.Type == Name.Metadata
    assert pdf.Root.Metadata.Subtype == Name.XML
    with pdf.open_metadata(set_pikepdf_as_editor=False) as xmp:
        assert 'pdf:Producer' not in xmp
        xmp_date = xmp['xmp:CreateDate']
        docinfo_date = decode_pdf_date(trivial.docinfo[Name.CreationDate])
        assert xmp_date == docinfo_date.isoformat()


def test_python_xmp_validate_add(trivial):
    with trivial.open_metadata() as xmp:
        xmp['dc:creator'] = ['Bob', 'Doug']
        xmp['dc:title'] = 'Title'
        xmp['dc:publisher'] = {'Mackenzie'}

    xmp_str = str(xmp).replace('\n', '')
    assert '<rdf:Seq><rdf:li>Bob</rdf:li><rdf:li>Doug</rdf:li>' in xmp_str
    assert '<rdf:Bag><rdf:li>Mackenzie</rdf:li>' in xmp_str

    if not XMPMeta:
        pytest.skip(msg='needs libxmp')

    xmpmeta = XMPMeta(xmp_str=str(xmp))
    DC = XMP_NS_DC
    assert xmpmeta.does_array_item_exist(DC, 'creator', 'Bob')
    assert xmpmeta.does_array_item_exist(DC, 'creator', 'Doug')
    assert xmpmeta.get_localized_text(DC, 'title', None, 'x-default') == 'Title'
    assert xmpmeta.does_array_item_exist(DC, 'publisher', 'Mackenzie')


def test_python_xmp_validate_change_list(graph):
    with graph.open_metadata() as xmp:
        assert 'dc:creator' in xmp
        xmp['dc:creator'] = ['Dobby', 'Kreacher']
    assert str(xmp)
    if not XMPMeta:
        pytest.skip(msg='needs libxmp')
    xmpmeta = XMPMeta(xmp_str=str(xmp))
    DC = XMP_NS_DC
    assert xmpmeta.does_array_item_exist(DC, 'creator', 'Dobby')
    assert xmpmeta.does_array_item_exist(DC, 'creator', 'Kreacher')


def test_python_xmp_validate_change(sandwich):
    with sandwich.open_metadata() as xmp:
        assert 'xmp:CreatorTool' in xmp
        xmp['xmp:CreatorTool'] = 'Creator'  # Exists as a xml tag text
        xmp['pdf:Producer'] = 'Producer'  # Exists as a tag node
    assert str(xmp)
    if not XMPMeta:
        pytest.skip(msg='needs libxmp')
    xmpmeta = XMPMeta(xmp_str=str(xmp))
    assert xmpmeta.does_property_exist(XMP_NS_XMP, 'CreatorTool')
    assert xmpmeta.does_property_exist(XMP_NS_PDF, 'Producer')


def test_decode_pdf_date():
    VALS = [
        ('20160220040559', datetime(2016, 2, 20, 4, 5, 59)),
        ("20180101010101Z00'00'", datetime(2018, 1, 1, 1, 1, 1, tzinfo=timezone.utc)),
        ("20180101010101Z", datetime(2018, 1, 1, 1, 1, 1, tzinfo=timezone.utc)),
        ("20180101010101+0000", datetime(2018, 1, 1, 1, 1, 1, tzinfo=timezone.utc)),
        ("20180101010101+0100", datetime(2018, 1, 1, 1, 1, 1, tzinfo=timezone(timedelta(hours=1)))),
    ]
    for s, d in VALS:
        assert decode_pdf_date(s) == d


def test_date_docinfo_from_xmp():
    VALS = [
        ('2018-12-04T03:02:01', "20181204030201"),
        ('2018-12-15T07:36:43Z', "20181215073643+00'00'"),
        ('2018-12-04T03:02:01-01:00', "20181204030201-01'00'"),
    ]
    for xmp_val, docinfo_val in VALS:
        assert DateConverter.docinfo_from_xmp(xmp_val) == docinfo_val


@given(
    integers(-9999, 9999),
    integers(0, 99),
    integers(0, 99),
    integers(0, 99),
    integers(0, 99),
    integers(0, 99),
)
@example(1, 1, 1, 0, 0, 0)
def test_random_dates(year, month, day, hour, mins, sec):
    date_args = year, month, day, hour, mins, sec
    xmp = '{:04d}-{:02d}-{:02d}T{:02d}:{:02d}:{:02d}'.format(*date_args)
    docinfo = '{:04d}{:02d}{:02d}{:02d}{:02d}{:02d}'.format(*date_args)

    try:
        converted = DateConverter.docinfo_from_xmp(xmp)
    except ValueError:
        pass
    else:
        assert converted == docinfo

    try:
        converted = DateConverter.xmp_from_docinfo(docinfo)
    except ValueError:
        pass
    else:
        assert converted == xmp


def test_bad_char_rejection(trivial):
    with trivial.open_metadata() as xmp:
        xmp['dc:description'] = 'Bad characters \x00 \x01 \x02'
        xmp['dc:creator'] = ['\ue001bad', '\ufff0bad']
    ET.fromstring(str(xmp))


def test_xpacket_generation(sandwich):
    xmpstr1 = sandwich.Root.Metadata.read_bytes()
    xpacket_begin = b'<?xpacket begin='
    xpacket_end = b'<?xpacket end='
    assert xmpstr1.startswith(xpacket_begin)

    with sandwich.open_metadata() as xmp:
        xmp['dc:creator'] = 'Foo'

    xmpstr2 = sandwich.Root.Metadata.read_bytes()
    assert xmpstr2.startswith(xpacket_begin)
    def only_one_substring(s, subs):
        return s.find(subs) == s.rfind(subs)
    assert only_one_substring(xmpstr2, xpacket_begin)
    assert only_one_substring(xmpstr2, xpacket_end)


def test_no_rdf_subtags(graph):
    xmp = graph.open_metadata()
    assert '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}Alt' not in xmp.keys()
    assert '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}Bag' not in xmp.keys()
    assert '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}li' not in xmp.keys()


def test_remove_attribute_metadata(sandwich):
    with sandwich.open_metadata() as xmp:
        del xmp['pdfaid:part']
    assert 'pdfaid:part' not in xmp
    assert 'pdfaid:conformance' in xmp

    with sandwich.open_metadata() as xmp:
        del xmp['pdfaid:conformance']

    # Ensure the whole node was deleted
    assert not re.search(r'rdf:Description xmlns:[^\s]+ rdf:about=""/', str(xmp))


def test_no_x_xmpmeta(trivial):
    trivial.Root.Metadata = Stream(trivial, b"""
        <?xpacket begin="\xef\xbb\xbf" id="W5M0MpCehiHzreSzNTczkc9d"?>
        <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
                xmlns:xmp="http://ns.adobe.com/xap/1.0/">
        <rdf:Description rdf:about=""
                        xmlns:pdfaid="http://www.aiim.org/pdfa/ns/id/"
                        xmlns:xmp="http://ns.adobe.com/xap/1.0/">
            <pdfaid:part>1</pdfaid:part>
            <pdfaid:conformance>A</pdfaid:conformance>
            <xmp:CreatorTool>Simple Scan 3.30.2</xmp:CreatorTool>
            <xmp:CreateDate>2019-02-05T07:08:46+01:00</xmp:CreateDate>
            <xmp:ModifyDate>2019-02-05T07:08:46+01:00</xmp:ModifyDate>
            <xmp:MetadataDate>2019-02-05T07:08:46+01:00</xmp:MetadataDate>
        </rdf:Description>
        </rdf:RDF>
        <?xpacket end="w"?>
    """.strip())

    with trivial.open_metadata() as xmp:
        assert xmp._get_rdf_root() is not None
        xmp['pdfaid:part'] = '2'
    assert xmp['pdfaid:part'] == '2'