summary | refs | log | tree | commit | diff
path: root/pdfrw/pdfreader.py
diff options
context:
space:
mode:
Diffstat (limited to 'pdfrw/pdfreader.py')
-rw-r--r-- pdfrw/pdfreader.py 388
1 file changed, 273 insertions, 115 deletions
diff --git a/pdfrw/pdfreader.py b/pdfrw/pdfreader.py
index ffbc237..0baf0eb 100644
--- a/pdfrw/pdfreader.py
+++ b/pdfrw/pdfreader.py
@@ -1,5 +1,6 @@
-# A part of pdfrw (pdfrw.googlecode.com)
-# Copyright (C) 2006-2009 Patrick Maupin, Austin, Texas
+# A part of pdfrw (https://github.com/pmaupin/pdfrw)
+# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas
+# Copyright (C) 2012-2015 Nerijus Mika
# MIT license -- See LICENSE.txt for details
'''
@@ -10,16 +11,18 @@ document pages are stored in a list in the pages attribute
of the object.
'''
import gc
+import binascii
+import collections
+import itertools
-from pdfrw.errors import PdfParseError, log
-from pdfrw.tokens import PdfTokens
-from pdfrw.objects import PdfDict, PdfArray, PdfName, PdfObject, PdfIndirect
-from pdfrw.uncompress import uncompress
+from .errors import PdfParseError, log
+from .tokens import PdfTokens
+from .objects import PdfDict, PdfArray, PdfName, PdfObject, PdfIndirect
+from .uncompress import uncompress
+from .py23_diffs import convert_load, iteritems
-class PdfReader(PdfDict):
- warned_bad_stream_start = False # Use to keep from spewing warnings
- warned_bad_stream_end = False # Use to keep from spewing warnings
+class PdfReader(PdfDict):
def findindirect(self, objnum, gennum, PdfIndirect=PdfIndirect, int=int):
''' Return a previously loaded indirect object, or create
@@ -64,7 +67,9 @@ class PdfReader(PdfDict):
tok = next()
while tok != '>>':
if not tok.startswith('/'):
- source.exception('Expected PDF /name object')
+ source.error('Expected PDF /name object')
+ tok = next()
+ continue
key = tok
value = next()
func = specialget(value)
@@ -74,8 +79,11 @@ class PdfReader(PdfDict):
else:
tok = next()
if value.isdigit() and tok.isdigit():
- if next() != 'R':
- source.exception('Expected "R" following two integers')
+ tok2 = next()
+ if tok2 != 'R':
+ source.error('Expected "R" following two integers')
+ tok = tok2
+ continue
value = self.findindirect(value, tok)
tok = next()
result[key] = value
@@ -92,7 +100,7 @@ class PdfReader(PdfDict):
'''
source.exception('Unexpected delimiter')
- def findstream(self, obj, tok, source, PdfDict=PdfDict, isinstance=isinstance, len=len):
+ def findstream(self, obj, tok, source, len=len):
''' Figure out if there is a content stream
following an object, and return the start
pointer to the content stream if so.
@@ -102,9 +110,6 @@ class PdfReader(PdfDict):
be an indirect object.)
'''
- isdict = isinstance(obj, PdfDict)
- if not isdict or tok != 'stream':
- source.exception("Expected 'endobj'%s token", isdict and " or 'stream'" or '')
fdata = source.fdata
startstream = source.tokstart + len(tok)
gotcr = fdata[startstream] == '\r'
@@ -113,28 +118,29 @@ class PdfReader(PdfDict):
startstream += gotlf
if not gotlf:
if not gotcr:
- source.exception(r'stream keyword not followed by \n')
- if not self.warned_bad_stream_start:
- source.warning(r"stream keyword terminated by \r without \n")
- self.private.warned_bad_stream_start = True
+ source.error(r'stream keyword not followed by \n')
+ else:
+ source.warning(r"stream keyword terminated "
+ r"by \r without \n")
return startstream
- def readstream(self, obj, startstream, source,
- streamending = 'endstream endobj'.split(), int=int):
+ def readstream(self, obj, startstream, source, exact_required=False,
+ streamending='endstream endobj'.split(), int=int):
fdata = source.fdata
- length = int(obj.Length)
+ length = int(obj.Length)
source.floc = target_endstream = startstream + length
endit = source.multiple(2)
obj._stream = fdata[startstream:target_endstream]
if endit == streamending:
return
+ if exact_required:
+ source.exception('Expected endstream endobj')
+
# The length attribute does not match the distance between the
# stream and endstream keywords.
- do_warn, self.warned_bad_stream_end = self.warned_bad_stream_end, False
-
- #TODO: Extract maxstream from dictionary of object offsets
+ # TODO: Extract maxstream from dictionary of object offsets
# and use rfind instead of find.
maxstream = len(fdata) - 20
endstream = fdata.find('endstream', startstream, maxstream)
@@ -143,19 +149,23 @@ class PdfReader(PdfDict):
if endstream < 0:
source.error('Could not find endstream')
return
- if length == room + 1 and fdata[startstream-2:startstream] == '\r\n':
+ if (length == room + 1 and
+ fdata[startstream - 2:startstream] == '\r\n'):
source.warning(r"stream keyword terminated by \r without \n")
- obj._stream = fdata[startstream-1:target_endstream-1]
+ obj._stream = fdata[startstream - 1:target_endstream - 1]
return
source.floc = endstream
if length > room:
- source.error('stream /Length attribute (%d) appears to be too big (size %d) -- adjusting',
- length, room)
+ source.error('stream /Length attribute (%d) appears to '
+ 'be too big (size %d) -- adjusting',
+ length, room)
obj.stream = fdata[startstream:endstream]
return
if fdata[target_endstream:endstream].rstrip():
- source.error('stream /Length attribute (%d) might be smaller than data size (%d)',
- length, room)
+ source.error('stream /Length attribute (%d) appears to '
+ 'be too small (size %d) -- adjusting',
+ length, room)
+ obj.stream = fdata[startstream:endstream]
return
endobj = fdata.find('endobj', endstream, maxstream)
if endobj < 0:
@@ -166,14 +176,15 @@ class PdfReader(PdfDict):
return
source.error('Illegal endstream/endobj combination')
- def loadindirect(self, key):
+ def loadindirect(self, key, PdfDict=PdfDict,
+ isinstance=isinstance):
result = self.indirect_objects.get(key)
if not isinstance(result, PdfIndirect):
return result
source = self.source
offset = int(self.source.obj_offsets.get(key, '0'))
if not offset:
- log.warning("Did not find PDF object %s" % (key,))
+ source.warning("Did not find PDF object %s", key)
return None
# Read the object header and validate it
@@ -189,12 +200,15 @@ class PdfReader(PdfDict):
source.next()
objheader = '%d %d obj' % (objnum, gennum)
fdata = source.fdata
- offset2 = fdata.find('\n' + objheader) + 1 or fdata.find('\r' + objheader) + 1
- if not offset2 or fdata.find(fdata[offset2-1] + objheader, offset2) > 0:
- source.warning("Expected indirect object '%s'" % objheader)
+ offset2 = (fdata.find('\n' + objheader) + 1 or
+ fdata.find('\r' + objheader) + 1)
+ if (not offset2 or
+ fdata.find(fdata[offset2 - 1] + objheader, offset2) > 0):
+ source.warning("Expected indirect object '%s'", objheader)
return None
- source.warning("Indirect object %s found at incorrect offset %d (expected offset %d)" %
- (objheader, offset2, offset))
+ source.warning("Indirect object %s found at incorrect "
+ "offset %d (expected offset %d)",
+ objheader, offset2, offset)
source.floc = offset2 + len(objheader)
# Read the object, and call special code if it starts
@@ -208,20 +222,99 @@ class PdfReader(PdfDict):
self.deferred_objects.remove(key)
# Mark the object as indirect, and
- # add it to the list of streams if it starts a stream
+ # just return it if it is a simple object.
obj.indirect = key
tok = source.next()
- if tok != 'endobj':
+ if tok == 'endobj':
+ return obj
+
+ # Should be a stream. Either that or it's broken.
+ isdict = isinstance(obj, PdfDict)
+ if isdict and tok == 'stream':
self.readstream(obj, self.findstream(obj, tok, source), source)
+ return obj
+
+ # Houston, we have a problem, but let's see if it
+ # is easily fixable. Leaving out a space before endobj
+ # is apparently an easy mistake to make on generation
+ # (Because it won't be noticed unless you are specifically
+ # generating an indirect object that doesn't end with any
+ # sort of delimiter.) It is so common that things like
+ # okular just handle it.
+
+ if isinstance(obj, PdfObject) and obj.endswith('endobj'):
+ source.error('No space or delimiter before endobj')
+ obj = PdfObject(obj[:-6])
+ else:
+ source.error("Expected 'endobj'%s token",
+ isdict and " or 'stream'" or '')
+ obj = PdfObject('')
+
+ obj.indirect = key
+ self.indirect_objects[key] = obj
return obj
- def findxref(fdata):
+ def read_all(self):
+ deferred = self.deferred_objects
+ prev = set()
+ while 1:
+ new = deferred - prev
+ if not new:
+ break
+ prev |= deferred
+ for key in new:
+ self.loadindirect(key)
+
+ def uncompress(self):
+ self.read_all()
+ uncompress(self.indirect_objects.values())
+
+ def load_stream_objects(self, object_streams):
+ # read object streams
+ objs = []
+ for num in object_streams:
+ obj = self.findindirect(num, 0).real_value()
+ assert obj.Type == '/ObjStm'
+ objs.append(obj)
+
+ # read objects from stream
+ if objs:
+ uncompress(objs)
+ for obj in objs:
+ objsource = PdfTokens(obj.stream, 0, False)
+ snext = objsource.next
+ offsets = {}
+ firstoffset = int(obj.First)
+ num = snext()
+ while num.isdigit():
+ offset = int(snext())
+ offsets[int(num)] = firstoffset + offset
+ num = snext()
+ for num, offset in iteritems(offsets):
+ # Read the object, and call special code if it starts
+ # an array or dictionary
+ objsource.floc = offset
+ sobj = snext()
+ func = self.special.get(sobj)
+ if func is not None:
+ sobj = func(objsource)
+
+ key = (num, 0)
+ self.indirect_objects[key] = sobj
+ if key in self.deferred_objects:
+ self.deferred_objects.remove(key)
+
+ # Mark the object as indirect, and
+ # add it to the list of streams if it starts a stream
+ sobj.indirect = key
+
+ def findxref(self, fdata):
''' Find the cross reference section at the end of a file
'''
startloc = fdata.rfind('startxref')
if startloc < 0:
raise PdfParseError('Did not find "startxref" at end of file')
- source = PdfTokens(fdata, startloc, False)
+ source = PdfTokens(fdata, startloc, False, self.verbose)
tok = source.next()
assert tok == 'startxref' # (We just checked this...)
tableloc = source.next_default()
@@ -229,19 +322,67 @@ class PdfReader(PdfDict):
source.exception('Expected table location')
if source.next_default().rstrip().lstrip('%') != 'EOF':
source.exception('Expected %%EOF')
- return startloc, PdfTokens(fdata, int(tableloc), True)
- findxref = staticmethod(findxref)
+ return startloc, PdfTokens(fdata, int(tableloc), True, self.verbose)
- def parsexref(self, source, int=int, range=range):
+ def parse_xref_stream(self, source, int=int, range=range,
+ enumerate=enumerate, islice=itertools.islice,
+ defaultdict=collections.defaultdict,
+ hexlify=binascii.hexlify):
''' Parse (one of) the cross-reference file section(s)
'''
- fdata = source.fdata
+
+ def readint(s, lengths):
+ lengths = itertools.cycle(lengths)
+ offset = 0
+ for length in itertools.cycle(lengths):
+ next = offset + length
+ yield int(hexlify(s[offset:next]), 16) if length else None
+ offset = next
+
setdefault = source.obj_offsets.setdefault
- add_offset = source.all_offsets.append
next = source.next
+ # check for xref stream object
+ objid = source.multiple(3)
+ ok = len(objid) == 3
+ ok = ok and objid[0].isdigit()
+ ok = ok and objid[1] == 'obj'
+ ok = ok and objid[2] == '<<'
+ if not ok:
+ source.exception('Expected xref stream start')
+ obj = self.readdict(source)
+ if obj.Type != PdfName.XRef:
+ source.exception('Expected dict type of /XRef')
tok = next()
- if tok != 'xref':
- source.exception('Expected "xref" keyword')
+ self.readstream(obj, self.findstream(obj, tok, source), source, True)
+ if not uncompress([obj], True):
+ source.exception('Could not decompress Xref stream')
+ num_pairs = obj.Index or PdfArray(['0', obj.Size])
+ num_pairs = [int(x) for x in num_pairs]
+ num_pairs = zip(num_pairs[0::2], num_pairs[1::2])
+ entry_sizes = [int(x) for x in obj.W]
+ if len(entry_sizes) != 3:
+ source.exception('Invalid entry size')
+ object_streams = defaultdict(list)
+ get = readint(obj.stream, entry_sizes)
+ for objnum, size in num_pairs:
+ for cnt in range(size):
+ xtype, p1, p2 = islice(get, 3)
+ if xtype in (1, None):
+ if p1:
+ setdefault((objnum, p2 or 0), p1)
+ elif xtype == 2:
+ object_streams[p1].append((objnum, p2))
+ objnum += 1
+
+ obj.private.object_streams = object_streams
+ return obj
+
+ def parse_xref_table(self, source, int=int, range=range):
+ ''' Parse (one of) the cross-reference file section(s)
+ '''
+ setdefault = source.obj_offsets.setdefault
+ next = source.next
+ # plain xref table
start = source.floc
try:
while 1:
@@ -256,13 +397,13 @@ class PdfReader(PdfDict):
if inuse == 'n':
if offset != 0:
setdefault((objnum, generation), offset)
- add_offset(offset)
elif inuse != 'f':
raise ValueError
except:
pass
try:
- # Table formatted incorrectly. See if we can figure it out anyway.
+ # Table formatted incorrectly.
+ # See if we can figure it out anyway.
end = source.fdata.rindex('trailer', start)
table = source.fdata[start:end].splitlines()
for line in table:
@@ -270,24 +411,41 @@ class PdfReader(PdfDict):
if len(tokens) == 2:
objnum = int(tokens[0])
elif len(tokens) == 3:
- offset, generation, inuse = int(tokens[0]), int(tokens[1]), tokens[2]
+ offset, generation, inuse = (int(tokens[0]),
+ int(tokens[1]), tokens[2])
if offset != 0 and inuse == 'n':
setdefault((objnum, generation), offset)
- add_offset(offset)
objnum += 1
elif tokens:
- log.error('Invalid line in xref table: %s' % repr(line))
+ log.error('Invalid line in xref table: %s' %
+ repr(line))
raise ValueError
log.warning('Badly formatted xref table')
source.floc = end
- source.next()
+ next()
except:
source.floc = start
source.exception('Invalid table format')
+ def parsexref(self, source):
+ ''' Parse (one of) the cross-reference file section(s)
+ '''
+ next = source.next
+ tok = next()
+ if tok.isdigit():
+ return self.parse_xref_stream(source), True
+ elif tok == 'xref':
+ self.parse_xref_table(source)
+ tok = next()
+ if tok != '<<':
+ source.exception('Expected "<<" starting catalog')
+ return self.readdict(source), False
+ else:
+ source.exception('Expected "xref" keyword or xref stream object')
+
def readpages(self, node):
- pagename=PdfName.Page
- pagesname=PdfName.Pages
+ pagename = PdfName.Page
+ pagesname = PdfName.Pages
catalogname = PdfName.Catalog
typename = PdfName.Type
kidname = PdfName.Kids
@@ -306,20 +464,23 @@ class PdfReader(PdfDict):
for node in readnode(node[pagesname]):
yield node
else:
- log.error('Expected /Page or /Pages dictionary, got %s' % repr(node))
+ log.error('Expected /Page or /Pages dictionary, got %s' %
+ repr(node))
try:
return list(readnode(node))
- except (AttributeError, TypeError), s:
+ except (AttributeError, TypeError) as s:
log.error('Invalid page tree: %s' % s)
return []
- def __init__(self, fname=None, fdata=None, decompress=False, disable_gc=True):
+ def __init__(self, fname=None, fdata=None, decompress=False,
+ disable_gc=True, verbose=True):
+ self.private.verbose = verbose
# Runs a lot faster with GC off.
disable_gc = disable_gc and gc.isenabled()
+ if disable_gc:
+ gc.disable()
try:
- if disable_gc:
- gc.disable()
if fname is not None:
assert fdata is None
# Allow reading preexisting streams like pyPdf
@@ -331,8 +492,9 @@ class PdfReader(PdfDict):
fdata = f.read()
f.close()
except IOError:
- raise PdfParseError('Could not read PDF file %s' % fname)
-
+ raise PdfParseError('Could not read PDF file %s' %
+ fname)
+ fdata = convert_load(fdata)
assert fdata is not None
if not fdata.startswith('%PDF-'):
startloc = fdata.find('%PDF-')
@@ -342,11 +504,15 @@ class PdfReader(PdfDict):
lines = fdata.lstrip().splitlines()
if not lines:
raise PdfParseError('Empty PDF file!')
- raise PdfParseError('Invalid PDF header: %s' % repr(lines[0]))
+ raise PdfParseError('Invalid PDF header: %s' %
+ repr(lines[0]))
+
+ self.private.version = fdata[5:8]
endloc = fdata.rfind('%EOF')
if endloc < 0:
- raise PdfParseError('EOF mark not found: %s' % repr(fdata[-20:]))
+ raise PdfParseError('EOF mark not found: %s' %
+ repr(fdata[-20:]))
endloc += 6
junk = fdata[endloc:]
fdata = fdata[:endloc]
@@ -363,46 +529,53 @@ class PdfReader(PdfDict):
for tok in r'\ ( ) < > { } ] >> %'.split():
self.special[tok] = self.badtoken
-
startloc, source = self.findxref(fdata)
private.source = source
- xref_table_list = []
- source.all_offsets = []
+
+ # Find all the xref tables/streams, and
+ # then deal with them backwards.
+ xref_list = []
while 1:
source.obj_offsets = {}
- # Loop through all the cross-reference tables
- self.parsexref(source)
- tok = source.next()
- if tok != '<<':
- source.exception('Expected "<<" starting catalog')
-
- newdict = self.readdict(source)
-
- token = source.next()
- if token != 'startxref' and not xref_table_list:
- source.warning('Expected "startxref" at end of xref table')
-
- # Loop if any previously-written tables.
- prev = newdict.Prev
+ trailer, is_stream = self.parsexref(source)
+ prev = trailer.Prev
if prev is None:
+ token = source.next()
+ if token != 'startxref' and not xref_list:
+ source.warning('Expected "startxref" '
+ 'at end of xref table')
break
- if not xref_table_list:
- newdict.Prev = None
- original_indirect = self.indirect_objects.copy()
- original_newdict = newdict
+ xref_list.append((source.obj_offsets, trailer, is_stream))
source.floc = int(prev)
- xref_table_list.append(source.obj_offsets)
- self.indirect_objects.clear()
-
- if xref_table_list:
- for update in reversed(xref_table_list):
- source.obj_offsets.update(update)
- self.indirect_objects.clear()
- self.indirect_objects.update(original_indirect)
- newdict = original_newdict
- self.update(newdict)
-
- #self.read_all_indirect(source)
+
+ if is_stream:
+ self.load_stream_objects(trailer.object_streams)
+
+ while xref_list:
+ later_offsets, later_trailer, is_stream = xref_list.pop()
+ source.obj_offsets.update(later_offsets)
+ if is_stream:
+ trailer.update(later_trailer)
+ self.load_stream_objects(later_trailer.object_streams)
+ else:
+ trailer = later_trailer
+
+ trailer.Prev = None
+
+ if (trailer.Version and
+ float(trailer.Version) > float(self.version)):
+ self.private.version = trailer.Version
+
+ if is_stream:
+ self.Root = trailer.Root
+ self.Info = trailer.Info
+ self.ID = trailer.ID
+ self.Size = trailer.Size
+ self.Encrypt = trailer.Encrypt
+ else:
+ self.update(trailer)
+
+ # self.read_all_indirect(source)
private.pages = self.readpages(self.Root)
if decompress:
self.uncompress()
@@ -416,18 +589,3 @@ class PdfReader(PdfDict):
# For compatibility with pyPdf
def getPage(self, pagenum):
return self.pages[pagenum]
-
- def read_all(self):
- deferred = self.deferred_objects
- prev = set()
- while 1:
- new = deferred - prev
- if not new:
- break
- prev |= deferred
- for key in new:
- self.loadindirect(key)
-
- def uncompress(self):
- self.read_all()
- uncompress(self.indirect_objects.itervalues())