diff options
Diffstat (limited to 'pdfrw/pdfwriter.py')
-rwxr-xr-x | pdfrw/pdfwriter.py | 176 |
1 files changed, 111 insertions, 65 deletions
diff --git a/pdfrw/pdfwriter.py b/pdfrw/pdfwriter.py index 42740dc..644bb30 100755 --- a/pdfrw/pdfwriter.py +++ b/pdfrw/pdfwriter.py @@ -1,7 +1,5 @@ -#!/usr/bin/env python - -# A part of pdfrw (pdfrw.googlecode.com) -# Copyright (C) 2006-2009 Patrick Maupin, Austin, Texas +# A part of pdfrw (https://github.com/pmaupin/pdfrw) +# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas # MIT license -- See LICENSE.txt for details ''' @@ -17,32 +15,50 @@ and addpage() assumes that the pages are part of a valid tree/forest of PDF objects. ''' +import gc -try: - set -except NameError: - from sets import Set as set - -from pdfrw.objects import PdfName, PdfArray, PdfDict, IndirectPdfDict, PdfObject, PdfString -from pdfrw.compress import compress as do_compress -from pdfrw.errors import PdfOutputError, log +from .objects import (PdfName, PdfArray, PdfDict, IndirectPdfDict, + PdfObject, PdfString) +from .compress import compress as do_compress +from .errors import PdfOutputError, log +from .py23_diffs import iteritems, convert_store NullObject = PdfObject('null') NullObject.indirect = True NullObject.Type = 'Null object' + +def user_fmt(obj, isinstance=isinstance, float=float, str=str, + basestring=str, encode=PdfString.encode): + ''' This function may be replaced by the user for + specialized formatting requirements. + ''' + + if isinstance(obj, basestring): + return encode(obj) + + # PDFs don't handle exponent notation + if isinstance(obj, float): + return ('%.9f' % obj).rstrip('0').rstrip('.') + + return str(obj) + + def FormatObjects(f, trailer, version='1.3', compress=True, killobj=(), - id=id, isinstance=isinstance, getattr=getattr,len=len, - sum=sum, set=set, str=str, basestring=basestring, - hasattr=hasattr, repr=repr, enumerate=enumerate, - list=list, dict=dict, tuple=tuple, - do_compress=do_compress, PdfArray=PdfArray, - PdfDict=PdfDict, PdfObject=PdfObject, encode=PdfString.encode): + user_fmt=user_fmt, do_compress=do_compress, + convert_store=convert_store, iteritems=iteritems, + id=id, isinstance=isinstance, getattr=getattr, len=len, + sum=sum, set=set, str=str, hasattr=hasattr, repr=repr, + enumerate=enumerate, list=list, dict=dict, tuple=tuple, + PdfArray=PdfArray, PdfDict=PdfDict, PdfObject=PdfObject): ''' FormatObjects performs the actual formatting and disk write. Should be a class, was a class, turned into nested functions for performace (to reduce attribute lookups). ''' + def f_write(s): + f.write(convert_store(s)) + def add(obj): ''' Add an object to our list, if it's an indirect object. Just format it if not. @@ -58,7 +74,9 @@ def FormatObjects(f, trailer, version='1.3', compress=True, killobj=(), if not indirect: if objid in visited: - log.warning('Replicating direct %s object, should be indirect for optimal file size' % type(obj)) + log.warning('Replicating direct %s object, ' + 'should be indirect for optimal file size' % + type(obj)) obj = type(obj)(obj) objid = id(obj) visiting(objid) @@ -83,7 +101,7 @@ def FormatObjects(f, trailer, version='1.3', compress=True, killobj=(), objnum = len(objlist) + 1 objlist_append(None) indirect_dict[objid] = objnum - deferred.append((objnum-1, obj)) + deferred.append((objnum - 1, obj)) return '%s 0 R' % objnum def format_array(myarray, formatter): @@ -119,30 +137,32 @@ def FormatObjects(f, trailer, version='1.3', compress=True, killobj=(), elif isinstance(obj, PdfDict): if compress and obj.stream: do_compress([obj]) + pairs = sorted((x, y, getattr(x, 'encoded', x)) + for (x, y) in obj.iteritems()) myarray = [] - dictkeys = [str(x) for x in obj.keys()] - dictkeys.sort() - for key in dictkeys: - myarray.append(key) - myarray.append(add(obj[key])) + for key, value, encoding in pairs: + myarray.append(encoding) + myarray.append(add(value)) result = format_array(myarray, '<<%s>>') stream = obj.stream if stream is not None: - result = '%s\nstream\n%s\nendstream' % (result, stream) + result = ('%s\nstream\n%s\nendstream' % + (result, stream)) return result obj = (PdfArray, PdfDict)[isinstance(obj, dict)](obj) continue - if not hasattr(obj, 'indirect') and isinstance(obj, basestring): - return encode(obj) - return str(getattr(obj, 'encoded', obj)) + # We assume that an object with an indirect + # attribute knows how to represent itself to us. + if hasattr(obj, 'indirect'): + return str(getattr(obj, 'encoded', obj)) + return user_fmt(obj) def format_deferred(): while deferred: index, obj = deferred.pop() objlist[index] = format_obj(obj) - indirect_dict = {} indirect_dict_get = indirect_dict.get objlist = [] @@ -152,14 +172,17 @@ def FormatObjects(f, trailer, version='1.3', compress=True, killobj=(), leaving = visited.remove space_join = ' '.join lf_join = '\n '.join - f_write = f.write deferred = [] - # Don't reference old catalog or pages objects -- swap references to new ones. - swapobj = {PdfName.Catalog:trailer.Root, PdfName.Pages:trailer.Root.Pages, None:trailer}.get - swapobj = [(objid, swapobj(obj.Type)) for objid, obj in killobj.iteritems()] - swapobj = dict((objid, obj is None and NullObject or obj) for objid, obj in swapobj).get + # Don't reference old catalog or pages objects -- + # swap references to new ones. + swapobj = {PdfName.Catalog: trailer.Root, + PdfName.Pages: trailer.Root.Pages, None: trailer}.get + swapobj = [(objid, swapobj(obj.Type)) + for objid, obj in iteritems(killobj)] + swapobj = dict((objid, obj is None and NullObject or obj) + for objid, obj in swapobj).get for objid in killobj: assert swapobj(objid) is not None @@ -197,9 +220,11 @@ def FormatObjects(f, trailer, version='1.3', compress=True, killobj=(), f_write('%010d %05d %s\r\n' % x) f_write('trailer\n\n%s\nstartxref\n%s\n%%%%EOF\n' % (trailer, offset)) + class PdfWriter(object): _trailer = None + canonicalize = False def __init__(self, version='1.3', compress=False): self.pagearray = PdfArray() @@ -211,15 +236,15 @@ class PdfWriter(object): self._trailer = None if page.Type != PdfName.Page: raise PdfOutputError('Bad /Type: Expected %s, found %s' - % (PdfName.Page, page.Type)) - inheritable = page.inheritable # searches for resources + % (PdfName.Page, page.Type)) + inheritable = page.inheritable # searches for resources self.pagearray.append( IndirectPdfDict( page, - Resources = inheritable.Resources, - MediaBox = inheritable.MediaBox, - CropBox = inheritable.CropBox, - Rotate = inheritable.Rotate, + Resources=inheritable.Resources, + MediaBox=inheritable.MediaBox, + CropBox=inheritable.CropBox, + Rotate=inheritable.Rotate, ) ) @@ -247,21 +272,26 @@ class PdfWriter(object): if trailer is not None: return trailer + if self.canonicalize: + self.make_canonical() + # Create the basic object structure of the PDF file trailer = PdfDict( - Root = IndirectPdfDict( - Type = PdfName.Catalog, - Pages = IndirectPdfDict( - Type = PdfName.Pages, - Count = PdfObject(len(self.pagearray)), - Kids = self.pagearray + Root=IndirectPdfDict( + Type=PdfName.Catalog, + Pages=IndirectPdfDict( + Type=PdfName.Pages, + Count=PdfObject(len(self.pagearray)), + Kids=self.pagearray ) ) ) - # Make all the pages point back to the page dictionary + # Make all the pages point back to the page dictionary and + # ensure they are indirect references pagedict = trailer.Root.Pages for page in pagedict.Kids: page.Parent = pagedict + page.indirect = True self._trailer = trailer return trailer @@ -270,26 +300,42 @@ class PdfWriter(object): trailer = property(_get_trailer, _set_trailer) - def write(self, fname, trailer=None): + def write(self, fname, trailer=None, user_fmt=user_fmt, + disable_gc=True): trailer = trailer or self.trailer # Dump the data. We either have a filename or a preexisting # file object. preexisting = hasattr(fname, 'write') f = preexisting and fname or open(fname, 'wb') - FormatObjects(f, trailer, self.version, self.compress, self.killobj) - if not preexisting: - f.close() - -if __name__ == '__main__': - import logging - log.setLevel(logging.DEBUG) - import pdfreader - x = pdfreader.PdfReader('source.pdf') - y = PdfWriter() - for i, page in enumerate(x.pages): - print ' Adding page', i+1, '\r', - y.addpage(page) - print - y.write('result.pdf') - print + if disable_gc: + gc.disable() + + try: + FormatObjects(f, trailer, self.version, self.compress, + self.killobj, user_fmt=user_fmt) + finally: + if not preexisting: + f.close() + if disable_gc: + gc.enable() + + def make_canonical(self): + ''' Canonicalizes a PDF. Assumes everything + is a Pdf object already. + ''' + visited = set() + workitems = list(self.pagearray) + while workitems: + obj = workitems.pop() + objid = id(obj) + if objid in visited: + continue + visited.add(objid) + obj.indirect = False + if isinstance(obj, (PdfArray, PdfDict)): + obj.indirect = True + if isinstance(obj, PdfArray): + workitems += obj + else: + workitems += obj.values() |