summaryrefslogtreecommitdiff
path: root/lib/Image/ExifTool/iWork.pm
blob: 5b7045de310ea14187a691cb6a08afe592f220f1 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
#------------------------------------------------------------------------------
# File:         iWork.pm
#
# Description:  Read Apple iWork '09 XML+ZIP files
#
# Revisions:    2009/11/11 - P. Harvey Created
#------------------------------------------------------------------------------

package Image::ExifTool::iWork;

use strict;
use vars qw($VERSION);
use Image::ExifTool qw(:DataAccess :Utils);
use Image::ExifTool::XMP;
use Image::ExifTool::ZIP;

$VERSION = '1.05';

# test for recognized iWork document extensions and outer XML elements
my %iWorkType = (
    # file extensions
    NUMBERS => 'NUMBERS',
    PAGES   => 'PAGES',
    KEY     => 'KEY',
    KTH     => 'KTH',
    NMBTEMPLATE => 'NMBTEMPLATE',
    # we don't support double extensions --
    # "PAGES.TEMPLATE" => 'Apple Pages Template',
    # outer XML elements
    'ls:document' => 'NUMBERS',
    'sl:document' => 'PAGES',
    'key:presentation' => 'KEY',
);

# MIME types for iWork files (Apple has not registered these yet, but these
# are my best guess after doing some googling.  I'm not 100% sure what "sff"
# indicates, but I think it refers to the new "flattened" package format)
my %mimeType = (
    'NUMBERS'       => 'application/x-iwork-numbers-sffnumbers',
    'PAGES'         => 'application/x-iwork-pages-sffpages',
    'KEY'           => 'application/x-iWork-keynote-sffkey',
    'NMBTEMPLATE'   => 'application/x-iwork-numbers-sfftemplate',
    'PAGES.TEMPLATE'=> 'application/x-iwork-pages-sfftemplate',
    'KTH'           => 'application/x-iWork-keynote-sffkth',
);

# iWork tags
%Image::ExifTool::iWork::Main = (
    GROUPS => { 0 => 'XML', 1 => 'XML', 2 => 'Document' },
    PROCESS_PROC => \&Image::ExifTool::XMP::ProcessXMP,
    VARS => { NO_ID => 1 },
    NOTES => q{
        The Apple iWork '09 file format is a ZIP archive containing XML files
        similar to the Office Open XML (OOXML) format.  Metadata tags in iWork
        files are extracted even if they don't appear below.
    },
    authors   => { Name => 'Author', Groups => { 2 => 'Author' } },
    comment   => { },
    copyright => { Groups => { 2 => 'Author' } },
    keywords  => { },
    projects  => { List => 1 },
    title     => { },
);

#------------------------------------------------------------------------------
# Generate a tag ID for this XML tag
# Inputs: 0) tag property name list ref
# Returns: tagID
sub GetTagID($)
{
    my $props = shift;
    return 0 if $$props[-1] =~ /^\w+:ID$/;  # ignore ID tags
    return ($$props[0] =~ /^.*?:(.*)/) ? $1 : $$props[0];
}

#------------------------------------------------------------------------------
# We found an XMP property name/value
# Inputs: 0) ExifTool object ref, 1) tag table ref
#         2) reference to array of XMP property names (last is current property)
#         3) property value, 4) attribute hash ref (not used here)
# Returns: 1 if valid tag was found
sub FoundTag($$$$;$)
{
    my ($et, $tagTablePtr, $props, $val, $attrs) = @_;
    return 0 unless @$props;
    my $verbose = $et->Options('Verbose');

    $et->VPrint(0, "  | - Tag '", join('/',@$props), "'\n") if $verbose > 1;

    # un-escape XML character entities
    $val = Image::ExifTool::XMP::UnescapeXML($val);
    # convert from UTF8 to ExifTool Charset
    $val = $et->Decode($val, 'UTF8');
    my $tag = GetTagID($props) or return 0;

    # add any unknown tags to table
    unless ($$tagTablePtr{$tag}) {
        $et->VPrint(0, "  [adding $tag]\n") if $verbose;
        AddTagToTable($tagTablePtr, $tag, { Name => ucfirst $tag });
    }
    # save the tag
    $et->HandleTag($tagTablePtr, $tag, $val);

    return 1;
}

#------------------------------------------------------------------------------
# Extract information from an iWork file
# Inputs: 0) ExifTool object reference, 1) dirInfo reference
# Returns: 1
# Notes: Upon entry to this routine, the file type has already been verified
# as ZIP and the dirInfo hash contains a 'ZIP' Archive::Zip object reference
sub Process_iWork($$)
{
    my ($et, $dirInfo) = @_;
    my $zip = $$dirInfo{ZIP};
    my ($type, $index, $indexFile, $status);

    # try to determine the file type
    local $SIG{'__WARN__'} = \&Image::ExifTool::ZIP::WarnProc;
    # trust type given by file extension if available
    $type = $iWorkType{$$et{FILE_EXT}} if $$et{FILE_EXT};
    unless ($type) {
        # read the index file
        my @members = $zip->membersMatching('^index\.(xml|apxl)$');
        if (@members) {
            ($index, $status) = $zip->contents($members[0]);
            unless ($status) {
                $indexFile = $members[0]->fileName();
                if ($index =~ /^\s*<\?xml version=[^<]+<(\w+:\w+)/s) {
                    $type = $iWorkType{$1} if $iWorkType{$1};
                }
            }
        } else {
            @members = $zip->membersMatching('(?i)^.*\.(pages|numbers|key)/Index.*');
            if (@members) {
                my $tmp = $members[0]->fileName();
                $type = $iWorkType{uc $1} if $tmp =~ /\.(pages|numbers|key)/i;
            }
        }
        $type or $type = 'ZIP';     # assume ZIP by default
    }
    $et->SetFileType($type, $mimeType{$type});

    my @members = $zip->members();
    my $docNum = 0;
    my $member;
    foreach $member (@members) {
        # get filename of this ZIP member
        my $file = $member->fileName();
        next unless defined $file;
        $et->VPrint(0, "File: $file\n");
        # set the document number and extract ZIP tags
        $$et{DOC_NUM} = ++$docNum;
        Image::ExifTool::ZIP::HandleMember($et, $member);

        # process only the index XML and JPEG thumbnail/preview files
        next unless $file =~ m{^(index\.(xml|apxl)|QuickLook/Thumbnail\.jpg|[^/]+/preview.jpg)$}i;
        # get the file contents if necessary
        # (CAREFUL! $buff MUST be local since we hand off a value ref to PreviewImage)
        my ($buff, $buffPt);
        if ($indexFile and $indexFile eq $file) {
            # use the index file we already loaded
            $buffPt = \$index;
        } else {
            ($buff, $status) = $zip->contents($member);
            $status and $et->Warn("Error extracting $file"), next;
            $buffPt = \$buff;
        }
        # extract JPEG as PreviewImage (should only be QuickLook/Thumbnail.jpg)
        if ($file =~ /\.jpg$/) {
            $et->FoundTag('PreviewImage', $buffPt);
            next;
        }
        # process "metadata" section of XML index file
        next unless $$buffPt =~ /<(\w+):metadata>/g;
        my $ns = $1;
        my $p1 = pos $$buffPt;
        next unless $$buffPt =~ m{</${ns}:metadata>}g;
        # construct XML data from "metadata" section only
        $$buffPt = '<?xml version="1.0"?>' . substr($$buffPt, $p1, pos($$buffPt)-$p1);
        my %dirInfo = (
            DataPt => $buffPt,
            DirLen => length $$buffPt,
            DataLen => length $$buffPt,
            XMPParseOpts => {
                FoundProc => \&FoundTag,
            },
        );
        my $tagTablePtr = GetTagTable('Image::ExifTool::iWork::Main');
        $et->ProcessDirectory(\%dirInfo, $tagTablePtr);
        undef $$buffPt; # (free memory now)
    }
    delete $$et{DOC_NUM};
    return 1;
}

1;  # end

__END__

=head1 NAME

Image::ExifTool::iWork - Read Apple iWork '09 XML+ZIP files

=head1 SYNOPSIS

This module is used by Image::ExifTool

=head1 DESCRIPTION

This module contains definitions required by Image::ExifTool to extract meta
information from Apple iWork '09 XML+ZIP files.

=head1 AUTHOR

Copyright 2003-2019, Phil Harvey (phil at owl.phy.queensu.ca)

This library is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

=head1 SEE ALSO

L<Image::ExifTool::TagNames/iWork Tags>,
L<Image::ExifTool::TagNames/OOXML Tags>,
L<Image::ExifTool(3pm)|Image::ExifTool>

=cut