diff options
Diffstat (limited to 'pdfrw/pdfreader.py')
-rw-r--r-- | pdfrw/pdfreader.py | 388 |
1 files changed, 273 insertions, 115 deletions
diff --git a/pdfrw/pdfreader.py b/pdfrw/pdfreader.py index ffbc237..0baf0eb 100644 --- a/pdfrw/pdfreader.py +++ b/pdfrw/pdfreader.py @@ -1,5 +1,6 @@ -# A part of pdfrw (pdfrw.googlecode.com) -# Copyright (C) 2006-2009 Patrick Maupin, Austin, Texas +# A part of pdfrw (https://github.com/pmaupin/pdfrw) +# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas +# Copyright (C) 2012-2015 Nerijus Mika # MIT license -- See LICENSE.txt for details ''' @@ -10,16 +11,18 @@ document pages are stored in a list in the pages attribute of the object. ''' import gc +import binascii +import collections +import itertools -from pdfrw.errors import PdfParseError, log -from pdfrw.tokens import PdfTokens -from pdfrw.objects import PdfDict, PdfArray, PdfName, PdfObject, PdfIndirect -from pdfrw.uncompress import uncompress +from .errors import PdfParseError, log +from .tokens import PdfTokens +from .objects import PdfDict, PdfArray, PdfName, PdfObject, PdfIndirect +from .uncompress import uncompress +from .py23_diffs import convert_load, iteritems -class PdfReader(PdfDict): - warned_bad_stream_start = False # Use to keep from spewing warnings - warned_bad_stream_end = False # Use to keep from spewing warnings +class PdfReader(PdfDict): def findindirect(self, objnum, gennum, PdfIndirect=PdfIndirect, int=int): ''' Return a previously loaded indirect object, or create @@ -64,7 +67,9 @@ class PdfReader(PdfDict): tok = next() while tok != '>>': if not tok.startswith('/'): - source.exception('Expected PDF /name object') + source.error('Expected PDF /name object') + tok = next() + continue key = tok value = next() func = specialget(value) @@ -74,8 +79,11 @@ class PdfReader(PdfDict): else: tok = next() if value.isdigit() and tok.isdigit(): - if next() != 'R': - source.exception('Expected "R" following two integers') + tok2 = next() + if tok2 != 'R': + source.error('Expected "R" following two integers') + tok = tok2 + continue value = self.findindirect(value, tok) tok = next() result[key] = value @@ -92,7 +100,7 @@ class PdfReader(PdfDict): ''' source.exception('Unexpected delimiter') - def findstream(self, obj, tok, source, PdfDict=PdfDict, isinstance=isinstance, len=len): + def findstream(self, obj, tok, source, len=len): ''' Figure out if there is a content stream following an object, and return the start pointer to the content stream if so. @@ -102,9 +110,6 @@ class PdfReader(PdfDict): be an indirect object.) ''' - isdict = isinstance(obj, PdfDict) - if not isdict or tok != 'stream': - source.exception("Expected 'endobj'%s token", isdict and " or 'stream'" or '') fdata = source.fdata startstream = source.tokstart + len(tok) gotcr = fdata[startstream] == '\r' @@ -113,28 +118,29 @@ class PdfReader(PdfDict): startstream += gotlf if not gotlf: if not gotcr: - source.exception(r'stream keyword not followed by \n') - if not self.warned_bad_stream_start: - source.warning(r"stream keyword terminated by \r without \n") - self.private.warned_bad_stream_start = True + source.error(r'stream keyword not followed by \n') + else: + source.warning(r"stream keyword terminated " + r"by \r without \n") return startstream - def readstream(self, obj, startstream, source, - streamending = 'endstream endobj'.split(), int=int): + def readstream(self, obj, startstream, source, exact_required=False, + streamending='endstream endobj'.split(), int=int): fdata = source.fdata - length = int(obj.Length) + length = int(obj.Length) source.floc = target_endstream = startstream + length endit = source.multiple(2) obj._stream = fdata[startstream:target_endstream] if endit == streamending: return + if exact_required: + source.exception('Expected endstream endobj') + # The length attribute does not match the distance between the # stream and endstream keywords. - do_warn, self.warned_bad_stream_end = self.warned_bad_stream_end, False - - #TODO: Extract maxstream from dictionary of object offsets + # TODO: Extract maxstream from dictionary of object offsets # and use rfind instead of find. maxstream = len(fdata) - 20 endstream = fdata.find('endstream', startstream, maxstream) @@ -143,19 +149,23 @@ class PdfReader(PdfDict): if endstream < 0: source.error('Could not find endstream') return - if length == room + 1 and fdata[startstream-2:startstream] == '\r\n': + if (length == room + 1 and + fdata[startstream - 2:startstream] == '\r\n'): source.warning(r"stream keyword terminated by \r without \n") - obj._stream = fdata[startstream-1:target_endstream-1] + obj._stream = fdata[startstream - 1:target_endstream - 1] return source.floc = endstream if length > room: - source.error('stream /Length attribute (%d) appears to be too big (size %d) -- adjusting', - length, room) + source.error('stream /Length attribute (%d) appears to ' + 'be too big (size %d) -- adjusting', + length, room) obj.stream = fdata[startstream:endstream] return if fdata[target_endstream:endstream].rstrip(): - source.error('stream /Length attribute (%d) might be smaller than data size (%d)', - length, room) + source.error('stream /Length attribute (%d) appears to ' + 'be too small (size %d) -- adjusting', + length, room) + obj.stream = fdata[startstream:endstream] return endobj = fdata.find('endobj', endstream, maxstream) if endobj < 0: @@ -166,14 +176,15 @@ class PdfReader(PdfDict): return source.error('Illegal endstream/endobj combination') - def loadindirect(self, key): + def loadindirect(self, key, PdfDict=PdfDict, + isinstance=isinstance): result = self.indirect_objects.get(key) if not isinstance(result, PdfIndirect): return result source = self.source offset = int(self.source.obj_offsets.get(key, '0')) if not offset: - log.warning("Did not find PDF object %s" % (key,)) + source.warning("Did not find PDF object %s", key) return None # Read the object header and validate it @@ -189,12 +200,15 @@ class PdfReader(PdfDict): source.next() objheader = '%d %d obj' % (objnum, gennum) fdata = source.fdata - offset2 = fdata.find('\n' + objheader) + 1 or fdata.find('\r' + objheader) + 1 - if not offset2 or fdata.find(fdata[offset2-1] + objheader, offset2) > 0: - source.warning("Expected indirect object '%s'" % objheader) + offset2 = (fdata.find('\n' + objheader) + 1 or + fdata.find('\r' + objheader) + 1) + if (not offset2 or + fdata.find(fdata[offset2 - 1] + objheader, offset2) > 0): + source.warning("Expected indirect object '%s'", objheader) return None - source.warning("Indirect object %s found at incorrect offset %d (expected offset %d)" % - (objheader, offset2, offset)) + source.warning("Indirect object %s found at incorrect " + "offset %d (expected offset %d)", + objheader, offset2, offset) source.floc = offset2 + len(objheader) # Read the object, and call special code if it starts @@ -208,20 +222,99 @@ class PdfReader(PdfDict): self.deferred_objects.remove(key) # Mark the object as indirect, and - # add it to the list of streams if it starts a stream + # just return it if it is a simple object. obj.indirect = key tok = source.next() - if tok != 'endobj': + if tok == 'endobj': + return obj + + # Should be a stream. Either that or it's broken. + isdict = isinstance(obj, PdfDict) + if isdict and tok == 'stream': self.readstream(obj, self.findstream(obj, tok, source), source) + return obj + + # Houston, we have a problem, but let's see if it + # is easily fixable. Leaving out a space before endobj + # is apparently an easy mistake to make on generation + # (Because it won't be noticed unless you are specifically + # generating an indirect object that doesn't end with any + # sort of delimiter.) It is so common that things like + # okular just handle it. + + if isinstance(obj, PdfObject) and obj.endswith('endobj'): + source.error('No space or delimiter before endobj') + obj = PdfObject(obj[:-6]) + else: + source.error("Expected 'endobj'%s token", + isdict and " or 'stream'" or '') + obj = PdfObject('') + + obj.indirect = key + self.indirect_objects[key] = obj return obj - def findxref(fdata): + def read_all(self): + deferred = self.deferred_objects + prev = set() + while 1: + new = deferred - prev + if not new: + break + prev |= deferred + for key in new: + self.loadindirect(key) + + def uncompress(self): + self.read_all() + uncompress(self.indirect_objects.values()) + + def load_stream_objects(self, object_streams): + # read object streams + objs = [] + for num in object_streams: + obj = self.findindirect(num, 0).real_value() + assert obj.Type == '/ObjStm' + objs.append(obj) + + # read objects from stream + if objs: + uncompress(objs) + for obj in objs: + objsource = PdfTokens(obj.stream, 0, False) + snext = objsource.next + offsets = {} + firstoffset = int(obj.First) + num = snext() + while num.isdigit(): + offset = int(snext()) + offsets[int(num)] = firstoffset + offset + num = snext() + for num, offset in iteritems(offsets): + # Read the object, and call special code if it starts + # an array or dictionary + objsource.floc = offset + sobj = snext() + func = self.special.get(sobj) + if func is not None: + sobj = func(objsource) + + key = (num, 0) + self.indirect_objects[key] = sobj + if key in self.deferred_objects: + self.deferred_objects.remove(key) + + # Mark the object as indirect, and + # add it to the list of streams if it starts a stream + sobj.indirect = key + + def findxref(self, fdata): ''' Find the cross reference section at the end of a file ''' startloc = fdata.rfind('startxref') if startloc < 0: raise PdfParseError('Did not find "startxref" at end of file') - source = PdfTokens(fdata, startloc, False) + source = PdfTokens(fdata, startloc, False, self.verbose) tok = source.next() assert tok == 'startxref' # (We just checked this...) tableloc = source.next_default() @@ -229,19 +322,67 @@ class PdfReader(PdfDict): source.exception('Expected table location') if source.next_default().rstrip().lstrip('%') != 'EOF': source.exception('Expected %%EOF') - return startloc, PdfTokens(fdata, int(tableloc), True) - findxref = staticmethod(findxref) + return startloc, PdfTokens(fdata, int(tableloc), True, self.verbose) - def parsexref(self, source, int=int, range=range): + def parse_xref_stream(self, source, int=int, range=range, + enumerate=enumerate, islice=itertools.islice, + defaultdict=collections.defaultdict, + hexlify=binascii.hexlify): ''' Parse (one of) the cross-reference file section(s) ''' - fdata = source.fdata + + def readint(s, lengths): + lengths = itertools.cycle(lengths) + offset = 0 + for length in itertools.cycle(lengths): + next = offset + length + yield int(hexlify(s[offset:next]), 16) if length else None + offset = next + setdefault = source.obj_offsets.setdefault - add_offset = source.all_offsets.append next = source.next + # check for xref stream object + objid = source.multiple(3) + ok = len(objid) == 3 + ok = ok and objid[0].isdigit() + ok = ok and objid[1] == 'obj' + ok = ok and objid[2] == '<<' + if not ok: + source.exception('Expected xref stream start') + obj = self.readdict(source) + if obj.Type != PdfName.XRef: + source.exception('Expected dict type of /XRef') tok = next() - if tok != 'xref': - source.exception('Expected "xref" keyword') + self.readstream(obj, self.findstream(obj, tok, source), source, True) + if not uncompress([obj], True): + source.exception('Could not decompress Xref stream') + num_pairs = obj.Index or PdfArray(['0', obj.Size]) + num_pairs = [int(x) for x in num_pairs] + num_pairs = zip(num_pairs[0::2], num_pairs[1::2]) + entry_sizes = [int(x) for x in obj.W] + if len(entry_sizes) != 3: + source.exception('Invalid entry size') + object_streams = defaultdict(list) + get = readint(obj.stream, entry_sizes) + for objnum, size in num_pairs: + for cnt in range(size): + xtype, p1, p2 = islice(get, 3) + if xtype in (1, None): + if p1: + setdefault((objnum, p2 or 0), p1) + elif xtype == 2: + object_streams[p1].append((objnum, p2)) + objnum += 1 + + obj.private.object_streams = object_streams + return obj + + def parse_xref_table(self, source, int=int, range=range): + ''' Parse (one of) the cross-reference file section(s) + ''' + setdefault = source.obj_offsets.setdefault + next = source.next + # plain xref table start = source.floc try: while 1: @@ -256,13 +397,13 @@ class PdfReader(PdfDict): if inuse == 'n': if offset != 0: setdefault((objnum, generation), offset) - add_offset(offset) elif inuse != 'f': raise ValueError except: pass try: - # Table formatted incorrectly. See if we can figure it out anyway. + # Table formatted incorrectly. + # See if we can figure it out anyway. end = source.fdata.rindex('trailer', start) table = source.fdata[start:end].splitlines() for line in table: @@ -270,24 +411,41 @@ class PdfReader(PdfDict): if len(tokens) == 2: objnum = int(tokens[0]) elif len(tokens) == 3: - offset, generation, inuse = int(tokens[0]), int(tokens[1]), tokens[2] + offset, generation, inuse = (int(tokens[0]), + int(tokens[1]), tokens[2]) if offset != 0 and inuse == 'n': setdefault((objnum, generation), offset) - add_offset(offset) objnum += 1 elif tokens: - log.error('Invalid line in xref table: %s' % repr(line)) + log.error('Invalid line in xref table: %s' % + repr(line)) raise ValueError log.warning('Badly formatted xref table') source.floc = end - source.next() + next() except: source.floc = start source.exception('Invalid table format') + def parsexref(self, source): + ''' Parse (one of) the cross-reference file section(s) + ''' + next = source.next + tok = next() + if tok.isdigit(): + return self.parse_xref_stream(source), True + elif tok == 'xref': + self.parse_xref_table(source) + tok = next() + if tok != '<<': + source.exception('Expected "<<" starting catalog') + return self.readdict(source), False + else: + source.exception('Expected "xref" keyword or xref stream object') + def readpages(self, node): - pagename=PdfName.Page - pagesname=PdfName.Pages + pagename = PdfName.Page + pagesname = PdfName.Pages catalogname = PdfName.Catalog typename = PdfName.Type kidname = PdfName.Kids @@ -306,20 +464,23 @@ class PdfReader(PdfDict): for node in readnode(node[pagesname]): yield node else: - log.error('Expected /Page or /Pages dictionary, got %s' % repr(node)) + log.error('Expected /Page or /Pages dictionary, got %s' % + repr(node)) try: return list(readnode(node)) - except (AttributeError, TypeError), s: + except (AttributeError, TypeError) as s: log.error('Invalid page tree: %s' % s) return [] - def __init__(self, fname=None, fdata=None, decompress=False, disable_gc=True): + def __init__(self, fname=None, fdata=None, decompress=False, + disable_gc=True, verbose=True): + self.private.verbose = verbose # Runs a lot faster with GC off. disable_gc = disable_gc and gc.isenabled() + if disable_gc: + gc.disable() try: - if disable_gc: - gc.disable() if fname is not None: assert fdata is None # Allow reading preexisting streams like pyPdf @@ -331,8 +492,9 @@ class PdfReader(PdfDict): fdata = f.read() f.close() except IOError: - raise PdfParseError('Could not read PDF file %s' % fname) - + raise PdfParseError('Could not read PDF file %s' % + fname) + fdata = convert_load(fdata) assert fdata is not None if not fdata.startswith('%PDF-'): startloc = fdata.find('%PDF-') @@ -342,11 +504,15 @@ class PdfReader(PdfDict): lines = fdata.lstrip().splitlines() if not lines: raise PdfParseError('Empty PDF file!') - raise PdfParseError('Invalid PDF header: %s' % repr(lines[0])) + raise PdfParseError('Invalid PDF header: %s' % + repr(lines[0])) + + self.private.version = fdata[5:8] endloc = fdata.rfind('%EOF') if endloc < 0: - raise PdfParseError('EOF mark not found: %s' % repr(fdata[-20:])) + raise PdfParseError('EOF mark not found: %s' % + repr(fdata[-20:])) endloc += 6 junk = fdata[endloc:] fdata = fdata[:endloc] @@ -363,46 +529,53 @@ class PdfReader(PdfDict): for tok in r'\ ( ) < > { } ] >> %'.split(): self.special[tok] = self.badtoken - startloc, source = self.findxref(fdata) private.source = source - xref_table_list = [] - source.all_offsets = [] + + # Find all the xref tables/streams, and + # then deal with them backwards. + xref_list = [] while 1: source.obj_offsets = {} - # Loop through all the cross-reference tables - self.parsexref(source) - tok = source.next() - if tok != '<<': - source.exception('Expected "<<" starting catalog') - - newdict = self.readdict(source) - - token = source.next() - if token != 'startxref' and not xref_table_list: - source.warning('Expected "startxref" at end of xref table') - - # Loop if any previously-written tables. - prev = newdict.Prev + trailer, is_stream = self.parsexref(source) + prev = trailer.Prev if prev is None: + token = source.next() + if token != 'startxref' and not xref_list: + source.warning('Expected "startxref" ' + 'at end of xref table') break - if not xref_table_list: - newdict.Prev = None - original_indirect = self.indirect_objects.copy() - original_newdict = newdict + xref_list.append((source.obj_offsets, trailer, is_stream)) source.floc = int(prev) - xref_table_list.append(source.obj_offsets) - self.indirect_objects.clear() - - if xref_table_list: - for update in reversed(xref_table_list): - source.obj_offsets.update(update) - self.indirect_objects.clear() - self.indirect_objects.update(original_indirect) - newdict = original_newdict - self.update(newdict) - - #self.read_all_indirect(source) + + if is_stream: + self.load_stream_objects(trailer.object_streams) + + while xref_list: + later_offsets, later_trailer, is_stream = xref_list.pop() + source.obj_offsets.update(later_offsets) + if is_stream: + trailer.update(later_trailer) + self.load_stream_objects(later_trailer.object_streams) + else: + trailer = later_trailer + + trailer.Prev = None + + if (trailer.Version and + float(trailer.Version) > float(self.version)): + self.private.version = trailer.Version + + if is_stream: + self.Root = trailer.Root + self.Info = trailer.Info + self.ID = trailer.ID + self.Size = trailer.Size + self.Encrypt = trailer.Encrypt + else: + self.update(trailer) + + # self.read_all_indirect(source) private.pages = self.readpages(self.Root) if decompress: self.uncompress() @@ -416,18 +589,3 @@ class PdfReader(PdfDict): # For compatibility with pyPdf def getPage(self, pagenum): return self.pages[pagenum] - - def read_all(self): - deferred = self.deferred_objects - prev = set() - while 1: - new = deferred - prev - if not new: - break - prev |= deferred - for key in new: - self.loadindirect(key) - - def uncompress(self): - self.read_all() - uncompress(self.indirect_objects.itervalues()) |