summary refs log tree commit diff
path: root/pdfrw/pdfwriter.py
diff options
context:
space:
mode:
author Johannes Schauer <josch@debian.org> 2015-10-10 00:55:44 +0200
committer Johannes Schauer <josch@debian.org> 2015-10-10 00:55:44 +0200
commit a4f43aea0f0af03190e95a431a96722325d8ee9e (patch)
tree a780de0e239b4645c0aee1c9d85be7f01f885e73 /pdfrw/pdfwriter.py
parent a1959ba9c0c9f3881c3e593e5aef1046750880f2 (diff)
import new upstream version 0.2
Diffstat (limited to 'pdfrw/pdfwriter.py')
-rwxr-xr-x pdfrw/pdfwriter.py 176
1 files changed, 111 insertions, 65 deletions
diff --git a/pdfrw/pdfwriter.py b/pdfrw/pdfwriter.py
index 42740dc..644bb30 100755
--- a/pdfrw/pdfwriter.py
+++ b/pdfrw/pdfwriter.py
@@ -1,7 +1,5 @@
-#!/usr/bin/env python
-
-# A part of pdfrw (pdfrw.googlecode.com)
-# Copyright (C) 2006-2009 Patrick Maupin, Austin, Texas
+# A part of pdfrw (https://github.com/pmaupin/pdfrw)
+# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas
# MIT license -- See LICENSE.txt for details
'''
@@ -17,32 +15,50 @@ and
addpage() assumes that the pages are part of a valid
tree/forest of PDF objects.
'''
+import gc
-try:
- set
-except NameError:
- from sets import Set as set
-
-from pdfrw.objects import PdfName, PdfArray, PdfDict, IndirectPdfDict, PdfObject, PdfString
-from pdfrw.compress import compress as do_compress
-from pdfrw.errors import PdfOutputError, log
+from .objects import (PdfName, PdfArray, PdfDict, IndirectPdfDict,
+ PdfObject, PdfString)
+from .compress import compress as do_compress
+from .errors import PdfOutputError, log
+from .py23_diffs import iteritems, convert_store
NullObject = PdfObject('null')
NullObject.indirect = True
NullObject.Type = 'Null object'
+
+def user_fmt(obj, isinstance=isinstance, float=float, str=str,
+ basestring=str, encode=PdfString.encode):
+ ''' This function may be replaced by the user for
+ specialized formatting requirements.
+ '''
+
+ if isinstance(obj, basestring):
+ return encode(obj)
+
+ # PDFs don't handle exponent notation
+ if isinstance(obj, float):
+ return ('%.9f' % obj).rstrip('0').rstrip('.')
+
+ return str(obj)
+
+
def FormatObjects(f, trailer, version='1.3', compress=True, killobj=(),
- id=id, isinstance=isinstance, getattr=getattr,len=len,
- sum=sum, set=set, str=str, basestring=basestring,
- hasattr=hasattr, repr=repr, enumerate=enumerate,
- list=list, dict=dict, tuple=tuple,
- do_compress=do_compress, PdfArray=PdfArray,
- PdfDict=PdfDict, PdfObject=PdfObject, encode=PdfString.encode):
+ user_fmt=user_fmt, do_compress=do_compress,
+ convert_store=convert_store, iteritems=iteritems,
+ id=id, isinstance=isinstance, getattr=getattr, len=len,
+ sum=sum, set=set, str=str, hasattr=hasattr, repr=repr,
+ enumerate=enumerate, list=list, dict=dict, tuple=tuple,
+ PdfArray=PdfArray, PdfDict=PdfDict, PdfObject=PdfObject):
''' FormatObjects performs the actual formatting and disk write.
Should be a class, was a class, turned into nested functions
 for performance (to reduce attribute lookups).
'''
+ def f_write(s):
+ f.write(convert_store(s))
+
def add(obj):
''' Add an object to our list, if it's an indirect
object. Just format it if not.
@@ -58,7 +74,9 @@ def FormatObjects(f, trailer, version='1.3', compress=True, killobj=(),
if not indirect:
if objid in visited:
- log.warning('Replicating direct %s object, should be indirect for optimal file size' % type(obj))
+ log.warning('Replicating direct %s object, '
+ 'should be indirect for optimal file size' %
+ type(obj))
obj = type(obj)(obj)
objid = id(obj)
visiting(objid)
@@ -83,7 +101,7 @@ def FormatObjects(f, trailer, version='1.3', compress=True, killobj=(),
objnum = len(objlist) + 1
objlist_append(None)
indirect_dict[objid] = objnum
- deferred.append((objnum-1, obj))
+ deferred.append((objnum - 1, obj))
return '%s 0 R' % objnum
def format_array(myarray, formatter):
@@ -119,30 +137,32 @@ def FormatObjects(f, trailer, version='1.3', compress=True, killobj=(),
elif isinstance(obj, PdfDict):
if compress and obj.stream:
do_compress([obj])
+ pairs = sorted((x, y, getattr(x, 'encoded', x))
+ for (x, y) in obj.iteritems())
myarray = []
- dictkeys = [str(x) for x in obj.keys()]
- dictkeys.sort()
- for key in dictkeys:
- myarray.append(key)
- myarray.append(add(obj[key]))
+ for key, value, encoding in pairs:
+ myarray.append(encoding)
+ myarray.append(add(value))
result = format_array(myarray, '<<%s>>')
stream = obj.stream
if stream is not None:
- result = '%s\nstream\n%s\nendstream' % (result, stream)
+ result = ('%s\nstream\n%s\nendstream' %
+ (result, stream))
return result
obj = (PdfArray, PdfDict)[isinstance(obj, dict)](obj)
continue
- if not hasattr(obj, 'indirect') and isinstance(obj, basestring):
- return encode(obj)
- return str(getattr(obj, 'encoded', obj))
+ # We assume that an object with an indirect
+ # attribute knows how to represent itself to us.
+ if hasattr(obj, 'indirect'):
+ return str(getattr(obj, 'encoded', obj))
+ return user_fmt(obj)
def format_deferred():
while deferred:
index, obj = deferred.pop()
objlist[index] = format_obj(obj)
-
indirect_dict = {}
indirect_dict_get = indirect_dict.get
objlist = []
@@ -152,14 +172,17 @@ def FormatObjects(f, trailer, version='1.3', compress=True, killobj=(),
leaving = visited.remove
space_join = ' '.join
lf_join = '\n '.join
- f_write = f.write
deferred = []
- # Don't reference old catalog or pages objects -- swap references to new ones.
- swapobj = {PdfName.Catalog:trailer.Root, PdfName.Pages:trailer.Root.Pages, None:trailer}.get
- swapobj = [(objid, swapobj(obj.Type)) for objid, obj in killobj.iteritems()]
- swapobj = dict((objid, obj is None and NullObject or obj) for objid, obj in swapobj).get
+ # Don't reference old catalog or pages objects --
+ # swap references to new ones.
+ swapobj = {PdfName.Catalog: trailer.Root,
+ PdfName.Pages: trailer.Root.Pages, None: trailer}.get
+ swapobj = [(objid, swapobj(obj.Type))
+ for objid, obj in iteritems(killobj)]
+ swapobj = dict((objid, obj is None and NullObject or obj)
+ for objid, obj in swapobj).get
for objid in killobj:
assert swapobj(objid) is not None
@@ -197,9 +220,11 @@ def FormatObjects(f, trailer, version='1.3', compress=True, killobj=(),
f_write('%010d %05d %s\r\n' % x)
f_write('trailer\n\n%s\nstartxref\n%s\n%%%%EOF\n' % (trailer, offset))
+
class PdfWriter(object):
_trailer = None
+ canonicalize = False
def __init__(self, version='1.3', compress=False):
self.pagearray = PdfArray()
@@ -211,15 +236,15 @@ class PdfWriter(object):
self._trailer = None
if page.Type != PdfName.Page:
raise PdfOutputError('Bad /Type: Expected %s, found %s'
- % (PdfName.Page, page.Type))
- inheritable = page.inheritable # searches for resources
+ % (PdfName.Page, page.Type))
+ inheritable = page.inheritable # searches for resources
self.pagearray.append(
IndirectPdfDict(
page,
- Resources = inheritable.Resources,
- MediaBox = inheritable.MediaBox,
- CropBox = inheritable.CropBox,
- Rotate = inheritable.Rotate,
+ Resources=inheritable.Resources,
+ MediaBox=inheritable.MediaBox,
+ CropBox=inheritable.CropBox,
+ Rotate=inheritable.Rotate,
)
)
@@ -247,21 +272,26 @@ class PdfWriter(object):
if trailer is not None:
return trailer
+ if self.canonicalize:
+ self.make_canonical()
+
# Create the basic object structure of the PDF file
trailer = PdfDict(
- Root = IndirectPdfDict(
- Type = PdfName.Catalog,
- Pages = IndirectPdfDict(
- Type = PdfName.Pages,
- Count = PdfObject(len(self.pagearray)),
- Kids = self.pagearray
+ Root=IndirectPdfDict(
+ Type=PdfName.Catalog,
+ Pages=IndirectPdfDict(
+ Type=PdfName.Pages,
+ Count=PdfObject(len(self.pagearray)),
+ Kids=self.pagearray
)
)
)
- # Make all the pages point back to the page dictionary
+ # Make all the pages point back to the page dictionary and
+ # ensure they are indirect references
pagedict = trailer.Root.Pages
for page in pagedict.Kids:
page.Parent = pagedict
+ page.indirect = True
self._trailer = trailer
return trailer
@@ -270,26 +300,42 @@ class PdfWriter(object):
trailer = property(_get_trailer, _set_trailer)
- def write(self, fname, trailer=None):
+ def write(self, fname, trailer=None, user_fmt=user_fmt,
+ disable_gc=True):
trailer = trailer or self.trailer
# Dump the data. We either have a filename or a preexisting
# file object.
preexisting = hasattr(fname, 'write')
f = preexisting and fname or open(fname, 'wb')
- FormatObjects(f, trailer, self.version, self.compress, self.killobj)
- if not preexisting:
- f.close()
-
-if __name__ == '__main__':
- import logging
- log.setLevel(logging.DEBUG)
- import pdfreader
- x = pdfreader.PdfReader('source.pdf')
- y = PdfWriter()
- for i, page in enumerate(x.pages):
- print ' Adding page', i+1, '\r',
- y.addpage(page)
- print
- y.write('result.pdf')
- print
+ if disable_gc:
+ gc.disable()
+
+ try:
+ FormatObjects(f, trailer, self.version, self.compress,
+ self.killobj, user_fmt=user_fmt)
+ finally:
+ if not preexisting:
+ f.close()
+ if disable_gc:
+ gc.enable()
+
+ def make_canonical(self):
+ ''' Canonicalizes a PDF. Assumes everything
+ is a Pdf object already.
+ '''
+ visited = set()
+ workitems = list(self.pagearray)
+ while workitems:
+ obj = workitems.pop()
+ objid = id(obj)
+ if objid in visited:
+ continue
+ visited.add(objid)
+ obj.indirect = False
+ if isinstance(obj, (PdfArray, PdfDict)):
+ obj.indirect = True
+ if isinstance(obj, PdfArray):
+ workitems += obj
+ else:
+ workitems += obj.values()