1 files changed, 111 insertions, 65 deletions
diff --git a/pdfrw/pdfwriter.py b/pdfrw/pdfwriter.py
index 42740dc..644bb30 100755
--- a/pdfrw/pdfwriter.py
+++ b/pdfrw/pdfwriter.py
@@ -1,7 +1,5 @@
-#!/usr/bin/env python
-
-# A part of pdfrw (pdfrw.googlecode.com)
-# Copyright (C) 2006-2009 Patrick Maupin, Austin, Texas
+# A part of pdfrw (https://github.com/pmaupin/pdfrw)
+# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas
 # MIT license -- See LICENSE.txt for details
 
 '''
@@ -17,32 +15,50 @@ and
 addpage() assumes that the pages are part of a valid
 tree/forest of PDF objects.
 '''
+import gc
 
-try:
-    set
-except NameError:
-    from sets import Set as set
-
-from pdfrw.objects import PdfName, PdfArray, PdfDict, IndirectPdfDict, PdfObject, PdfString
-from pdfrw.compress import compress as do_compress
-from pdfrw.errors import PdfOutputError, log
+from .objects import (PdfName, PdfArray, PdfDict, IndirectPdfDict,
+                      PdfObject, PdfString)
+from .compress import compress as do_compress
+from .errors import PdfOutputError, log
+from .py23_diffs import iteritems, convert_store
 
 NullObject = PdfObject('null')
 NullObject.indirect = True
 NullObject.Type = 'Null object'
 
+
+def user_fmt(obj, isinstance=isinstance, float=float, str=str,
+             basestring=str, encode=PdfString.encode):
+    ''' Format obj as PDF text: strings go through encode(),
+        floats get at most nine decimals, the rest use str().
+        This function may be replaced by the user for
+        specialized formatting requirements.
+    '''
+    if isinstance(obj, basestring):
+        return encode(obj)
+
+    # PDFs don't handle exponent notation
+    if isinstance(obj, float):
+        return ('%.9f' % obj).rstrip('0').rstrip('.')
+    return str(obj)
+
+
 def FormatObjects(f, trailer, version='1.3', compress=True, killobj=(),
-        id=id, isinstance=isinstance, getattr=getattr,len=len,
-        sum=sum, set=set, str=str, basestring=basestring,
-        hasattr=hasattr, repr=repr, enumerate=enumerate,
-        list=list, dict=dict, tuple=tuple,
-        do_compress=do_compress, PdfArray=PdfArray,
-        PdfDict=PdfDict, PdfObject=PdfObject, encode=PdfString.encode):
+                  user_fmt=user_fmt, do_compress=do_compress,
+                  convert_store=convert_store, iteritems=iteritems,
+                  id=id, isinstance=isinstance, getattr=getattr, len=len,
+                  sum=sum, set=set, str=str, hasattr=hasattr, repr=repr,
+                  enumerate=enumerate, list=list, dict=dict, tuple=tuple,
+                  PdfArray=PdfArray, PdfDict=PdfDict, PdfObject=PdfObject):
     ''' FormatObjects performs the actual formatting and disk write.
         Should be a class, was a class, turned into nested functions
         for performace (to reduce attribute lookups).
     '''
 
+    def f_write(s):
+        f.write(convert_store(s))
+
     def add(obj):
         ''' Add an object to our list, if it's an indirect
             object.  Just format it if not.
@@ -58,7 +74,9 @@ def FormatObjects(f, trailer, version='1.3', compress=True, killobj=(),
 
         if not indirect:
             if objid in visited:
-                log.warning('Replicating direct %s object, should be indirect for optimal file size' % type(obj))
+                log.warning('Replicating direct %s object, '
+                            'should be indirect for optimal file size' %
+                            type(obj))
                 obj = type(obj)(obj)
                 objid = id(obj)
             visiting(objid)
@@ -83,7 +101,7 @@ def FormatObjects(f, trailer, version='1.3', compress=True, killobj=(),
             objnum = len(objlist) + 1
             objlist_append(None)
             indirect_dict[objid] = objnum
-            deferred.append((objnum-1, obj))
+            deferred.append((objnum - 1, obj))
         return '%s 0 R' % objnum
 
     def format_array(myarray, formatter):
@@ -119,30 +137,32 @@ def FormatObjects(f, trailer, version='1.3', compress=True, killobj=(),
                 elif isinstance(obj, PdfDict):
                     if compress and obj.stream:
                         do_compress([obj])
+                    pairs = sorted((x, y, getattr(x, 'encoded', x))
+                                   for (x, y) in obj.iteritems())
                     myarray = []
-                    dictkeys = [str(x) for x in obj.keys()]
-                    dictkeys.sort()
-                    for key in dictkeys:
-                        myarray.append(key)
-                        myarray.append(add(obj[key]))
+                    for key, value, encoding in pairs:
+                        myarray.append(encoding)
+                        myarray.append(add(value))
                     result = format_array(myarray, '<<%s>>')
                     stream = obj.stream
                     if stream is not None:
-                        result = '%s\nstream\n%s\nendstream' % (result, stream)
+                        result = ('%s\nstream\n%s\nendstream' %
+                                  (result, stream))
                     return result
                 obj = (PdfArray, PdfDict)[isinstance(obj, dict)](obj)
                 continue
 
-            if not hasattr(obj, 'indirect') and isinstance(obj, basestring):
-                return encode(obj)
-            return str(getattr(obj, 'encoded', obj))
+            # We assume that an object with an indirect
+            # attribute knows how to represent itself to us.
+            if hasattr(obj, 'indirect'):
+                return str(getattr(obj, 'encoded', obj))
+            return user_fmt(obj)
 
     def format_deferred():
         while deferred:
             index, obj = deferred.pop()
             objlist[index] = format_obj(obj)
 
-
     indirect_dict = {}
     indirect_dict_get = indirect_dict.get
     objlist = []
@@ -152,14 +172,17 @@ def FormatObjects(f, trailer, version='1.3', compress=True, killobj=(),
     leaving = visited.remove
     space_join = ' '.join
     lf_join = '\n  '.join
-    f_write = f.write
 
     deferred = []
 
-    # Don't reference old catalog or pages objects -- swap references to new ones.
-    swapobj = {PdfName.Catalog:trailer.Root, PdfName.Pages:trailer.Root.Pages, None:trailer}.get
-    swapobj = [(objid, swapobj(obj.Type)) for objid, obj in killobj.iteritems()]
-    swapobj = dict((objid, obj is None and NullObject or obj) for objid, obj in swapobj).get
+    # Don't reference old catalog or pages objects --
+    # swap references to new ones.
+    swapobj = {PdfName.Catalog: trailer.Root,
+               PdfName.Pages: trailer.Root.Pages, None: trailer}.get
+    swapobj = [(objid, swapobj(obj.Type))
+               for objid, obj in iteritems(killobj)]
+    swapobj = dict((objid, obj is None and NullObject or obj)
+                   for objid, obj in swapobj).get
 
     for objid in killobj:
         assert swapobj(objid) is not None
@@ -197,9 +220,11 @@ def FormatObjects(f, trailer, version='1.3', compress=True, killobj=(),
         f_write('%010d %05d %s\r\n' % x)
     f_write('trailer\n\n%s\nstartxref\n%s\n%%%%EOF\n' % (trailer, offset))
 
+
 class PdfWriter(object):
 
     _trailer = None
+    canonicalize = False
 
     def __init__(self, version='1.3', compress=False):
         self.pagearray = PdfArray()
@@ -211,15 +236,15 @@ class PdfWriter(object):
         self._trailer = None
         if page.Type != PdfName.Page:
             raise PdfOutputError('Bad /Type:  Expected %s, found %s'
-                                  % (PdfName.Page, page.Type))
-        inheritable = page.inheritable # searches for resources
+                                 % (PdfName.Page, page.Type))
+        inheritable = page.inheritable  # searches for resources
         self.pagearray.append(
             IndirectPdfDict(
                 page,
-                Resources = inheritable.Resources,
-                MediaBox = inheritable.MediaBox,
-                CropBox = inheritable.CropBox,
-                Rotate = inheritable.Rotate,
+                Resources=inheritable.Resources,
+                MediaBox=inheritable.MediaBox,
+                CropBox=inheritable.CropBox,
+                Rotate=inheritable.Rotate,
             )
         )
 
@@ -247,21 +272,26 @@ class PdfWriter(object):
         if trailer is not None:
             return trailer
 
+        if self.canonicalize:
+            self.make_canonical()
+
         # Create the basic object structure of the PDF file
         trailer = PdfDict(
-            Root = IndirectPdfDict(
-                Type = PdfName.Catalog,
-                Pages = IndirectPdfDict(
-                    Type = PdfName.Pages,
-                    Count = PdfObject(len(self.pagearray)),
-                    Kids = self.pagearray
+            Root=IndirectPdfDict(
+                Type=PdfName.Catalog,
+                Pages=IndirectPdfDict(
+                    Type=PdfName.Pages,
+                    Count=PdfObject(len(self.pagearray)),
+                    Kids=self.pagearray
                 )
             )
         )
-        # Make all the pages point back to the page dictionary
+        # Make all the pages point back to the page dictionary and
+        # ensure they are indirect references
         pagedict = trailer.Root.Pages
         for page in pagedict.Kids:
             page.Parent = pagedict
+            page.indirect = True
         self._trailer = trailer
         return trailer
 
@@ -270,26 +300,42 @@ class PdfWriter(object):
 
     trailer = property(_get_trailer, _set_trailer)
 
-    def write(self, fname, trailer=None):
+    def write(self, fname, trailer=None, user_fmt=user_fmt, disable_gc=True):
+        ''' Write the PDF to fname (a filename or an open file object). '''
         trailer = trailer or self.trailer
 
         # Dump the data.  We either have a filename or a preexisting
         # file object.
         preexisting = hasattr(fname, 'write')
         f = preexisting and fname or open(fname, 'wb')
-        FormatObjects(f, trailer, self.version, self.compress, self.killobj)
-        if not preexisting:
-            f.close()
-
-if __name__ == '__main__':
-    import logging
-    log.setLevel(logging.DEBUG)
-    import pdfreader
-    x = pdfreader.PdfReader('source.pdf')
-    y = PdfWriter()
-    for i, page in enumerate(x.pages):
-        print '  Adding page', i+1, '\r',
-        y.addpage(page)
-    print
-    y.write('result.pdf')
-    print
+        disable_gc = disable_gc and gc.isenabled()
+        if disable_gc:
+            gc.disable()
+        try:
+            FormatObjects(f, trailer, self.version, self.compress,
+                          self.killobj, user_fmt=user_fmt)
+        finally:
+            if not preexisting:
+                f.close()
+            if disable_gc:
+                gc.enable()  # only re-enable what we disabled
+
+    def make_canonical(self):
+        ''' Canonicalizes a PDF: walks every object reachable from
+            the page array, marking arrays and dictionaries as
+            indirect and everything else as direct.  Assumes
+            everything is a Pdf object already.
+        '''
+        seen = set()
+        pending = list(self.pagearray)
+        while pending:
+            item = pending.pop()
+            key = id(item)
+            if key in seen:
+                continue
+            seen.add(key)
+            item.indirect = False
+            if not isinstance(item, (PdfArray, PdfDict)):
+                continue
+            item.indirect = True
+            pending += item if isinstance(item, PdfArray) else item.values()