summary | refs | log | tree | commit | diff
path: root/pdfrw/buildxobj.py
diff options
context:
space:
mode:
Diffstat (limited to 'pdfrw/buildxobj.py')
-rw-r--r-- pdfrw/buildxobj.py 201
1 files changed, 154 insertions, 47 deletions
diff --git a/pdfrw/buildxobj.py b/pdfrw/buildxobj.py
index ba34f61..d210c67 100644
--- a/pdfrw/buildxobj.py
+++ b/pdfrw/buildxobj.py
@@ -1,5 +1,5 @@
-# A part of pdfrw (pdfrw.googlecode.com)
-# Copyright (C) 2006-2012 Patrick Maupin, Austin, Texas
+# A part of pdfrw (https://github.com/pmaupin/pdfrw)
+# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas
# MIT license -- See LICENSE.txt for details
'''
@@ -28,26 +28,52 @@ Reference for content: Adobe PDF reference, sixth edition, version 1.7
Form xobjects discussed chapter 4.9, page 355
'''
-from pdfrw.objects import PdfDict, PdfArray, PdfName
-from pdfrw.pdfreader import PdfReader
-from pdfrw.errors import log
+from .objects import PdfDict, PdfArray, PdfName
+from .pdfreader import PdfReader
+from .errors import log, PdfNotImplementedError
+from .py23_diffs import iteritems
+
class ViewInfo(object):
''' Instantiate ViewInfo with a uri, and it will parse out
the filename, page, and viewrect into object attributes.
+
+ Note 1:
+ Viewrects follow the Adobe definition. (See reference
+ above). They are arrays of 4 numbers:
+
+ - Distance from left of document in points
+ - Distance from top (NOT bottom) of document in points
+ - Width of rectangle in points
+ - Height of rectangle in points
+
+ Note 2:
+ For simplicity, Viewrects can also be specified
+ in fractions of the document. If every number in
+ the viewrect is between 0 and 1 inclusive (and the
+ numbers are not all equal), then viewrect elements
+ 0 and 2 are multiplied by the page width before use,
+ and viewrect elements 1 and 3 are multiplied by the
+ page height before use. The page width and height
+ are taken from the crop box, or from the mediabox
+ if the page has no crop box.
+
+ Note 3:
+ By default, an XObject based on the view will be
+ cacheable. It should not be cacheable if the XObject
+ will be subsequently modified.
'''
doc = None
docname = None
page = None
viewrect = None
rotate = None
+ cacheable = True
def __init__(self, pageinfo='', **kw):
- pageinfo=pageinfo.split('#',1)
+ pageinfo = pageinfo.split('#', 1)
if len(pageinfo) == 2:
pageinfo[1:] = pageinfo[1].replace('&', '#').split('#')
for key in 'page viewrect'.split():
- if pageinfo[0].startswith(key+'='):
+ if pageinfo[0].startswith(key + '='):
break
else:
self.docname = pageinfo.pop(0)
@@ -63,10 +89,11 @@ class ViewInfo(object):
setattr(self, key, [float(x) for x in value])
else:
log.error('Unknown option: %s', key)
- for key, value in kw.iteritems():
+ for key, value in iteritems(kw):
assert hasattr(self, key), key
setattr(self, key, value)
+
def get_rotation(rotate):
''' Return clockwise rotation code:
0 = unrotated
@@ -80,10 +107,11 @@ def get_rotation(rotate):
return 0
if rotate % 90 != 0:
return 0
- return rotate / 90
+ return rotate // 90
+
def rotate_point(point, rotation):
- ''' Rotate an (x,y) coordinate clockwise by a
+ ''' Rotate an (x,y) coordinate clockwise by a
rotation code specifying a multiple of 90 degrees.
'''
if rotation & 1:
@@ -92,6 +120,7 @@ def rotate_point(point, rotation):
point = -point[0], -point[1]
return point
+
def rotate_rect(rect, rotation):
''' Rotate both points within the rectangle, then normalize
the rectangle by returning the new lower left, then new
@@ -101,68 +130,133 @@ def rotate_rect(rect, rotation):
return (min(rect[0], rect[2]), min(rect[1], rect[3]),
max(rect[0], rect[2]), max(rect[1], rect[3]))
+
def getrects(inheritable, pageinfo, rotation):
''' Given the inheritable attributes of a page and
the desired pageinfo rectangle, return the page's
media box and the calculated boundary (clip) box.
'''
mbox = tuple([float(x) for x in inheritable.MediaBox])
+ cbox = tuple([float(x) for x in (inheritable.CropBox or mbox)])
vrect = pageinfo.viewrect
- if vrect is None:
- cbox = tuple([float(x) for x in (inheritable.CropBox or mbox)])
- else:
+ if vrect is not None:
# Rotate the media box to match what the user sees,
# figure out the clipping box, then rotate back
- mleft, mbot, mright, mtop = rotate_rect(mbox, rotation)
+ mleft, mbot, mright, mtop = rotate_rect(cbox, rotation)
x, y, w, h = vrect
+
+ # Support operations in fractions of a page
+ if 0 <= min(vrect) < max(vrect) <= 1:
+ mw = mright - mleft
+ mh = mtop - mbot
+ x *= mw
+ w *= mw
+ y *= mh
+ h *= mh
+
cleft = mleft + x
ctop = mtop - y
cright = cleft + w
cbot = ctop - h
- cbox = max(mleft, cleft), max(mbot, cbot), min(mright, cright), min(mtop, ctop)
+ cbox = (max(mleft, cleft), max(mbot, cbot),
+ min(mright, cright), min(mtop, ctop))
cbox = rotate_rect(cbox, -rotation)
return mbox, cbox
-def _cache_xobj(contents, resources, mbox, bbox, rotation):
+def _build_cache(contents, allow_compressed):
+ ''' Build a new dictionary holding the stream,
+ and save it along with private cache info.
+ Assumes validity has been pre-checked if
+ we have a non-None xobj_copy.
+ '''
+ try:
+ xobj_copy = contents.xobj_copy
+ except AttributeError:
+ # Should have a PdfArray here...
+ array = contents
+ private = contents
+ else:
+ # Should have a PdfDict here -- might or might not have cache copy
+ if xobj_copy is not None:
+ return xobj_copy
+ array = [contents]
+ private = contents.private
+
+ # The spec says nothing about nested arrays. Will
+ # assume that's not a problem until we encounter them...
+
+ xobj_copy = PdfDict(array[0])
+ xobj_copy.private.xobj_cachedict = {}
+ private.xobj_copy = xobj_copy
+
+ if len(array) > 1:
+ newstream = '\n'.join(x.stream for x in array)
+ newlength = sum(int(x.Length) for x in array) + len(array) - 1
+ assert newlength == len(newstream)
+ xobj_copy.stream = newstream
+
+ # Cannot currently cope with different kinds of
+ # compression in the array, so just disallow it.
+ allow_compressed = False
+
+ if not allow_compressed:
+ # Make sure there are no compression parameters
+ for cdict in array:
+ keys = [x[0] for x in iteritems(cdict)]
+ if len(keys) != 1:
+ raise PdfNotImplementedError(
+ 'Xobjects with compression parameters not supported: %s' %
+ keys)
+ return xobj_copy
+
+
+def _cache_xobj(contents, resources, mbox, bbox, rotation, cacheable=True):
''' Return a cached Form XObject, or create a new one and cache it.
Adds private members x, y, w, h
'''
cachedict = contents.xobj_cachedict
- if cachedict is None:
- cachedict = contents.private.xobj_cachedict = {}
cachekey = mbox, bbox, rotation
- result = cachedict.get(cachekey)
+ result = cachedict.get(cachekey) if cacheable else None
if result is None:
- func = (_get_fullpage, _get_subpage)[mbox != bbox]
+ # If we are not getting a full page, or if we are going to
+ # modify the results, first retrieve an underlying Form XObject
+ # that represents the entire page, so that we are not copying
+ # the full page data into the new file multiple times
+ func = (_get_fullpage, _get_subpage)[mbox != bbox or not cacheable]
result = PdfDict(
- func(contents, resources, mbox, bbox, rotation),
- Type = PdfName.XObject,
- Subtype = PdfName.Form,
- FormType = 1,
- BBox = PdfArray(bbox),
+ func(contents, resources, mbox),
+ Type=PdfName.XObject,
+ Subtype=PdfName.Form,
+ FormType=1,
+ BBox=PdfArray(bbox),
)
rect = bbox
if rotation:
- matrix = rotate_point((1, 0), rotation) + rotate_point((0, 1), rotation)
+ matrix = (rotate_point((1, 0), rotation) +
+ rotate_point((0, 1), rotation))
result.Matrix = PdfArray(matrix + (0, 0))
rect = rotate_rect(rect, rotation)
- result.private.x = rect[0]
- result.private.y = rect[1]
- result.private.w = rect[2] - rect[0]
- result.private.h = rect[3] - rect[1]
- cachedict[cachekey] = result
+ private = result.private
+ private.x = rect[0]
+ private.y = rect[1]
+ private.w = rect[2] - rect[0]
+ private.h = rect[3] - rect[1]
+ if cacheable:
+ cachedict[cachekey] = result
return result
-def _get_fullpage(contents, resources, mbox, bbox, rotation):
+
+def _get_fullpage(contents, resources, mbox):
''' fullpage is easy. Just copy the contents,
set up the resources, and let _cache_xobj handle the
rest.
'''
return PdfDict(contents, Resources=resources)
-def _get_subpage(contents, resources, mbox, bbox, rotation):
+
+def _get_subpage(contents, resources, mbox):
''' subpages *could* be as easy as full pages, but we
choose to complicate life by creating a Form XObject
for the page, and then one that references it for
@@ -170,37 +264,44 @@ def _get_subpage(contents, resources, mbox, bbox, rotation):
items from the page.
'''
return PdfDict(
- stream = '/FullPage Do\n',
- Resources = PdfDict(
- XObject = PdfDict(
- FullPage = _cache_xobj(contents, resources, mbox, mbox, 0)
+ stream='/FullPage Do\n',
+ Resources=PdfDict(
+ XObject=PdfDict(
+ FullPage=_cache_xobj(contents, resources, mbox, mbox, 0)
)
)
)
+
def pagexobj(page, viewinfo=ViewInfo(), allow_compressed=True):
''' pagexobj creates and returns a Form XObject for
a given view within a page (Defaults to entire page.)
+
+ pagexobj is passed a page and a ViewInfo describing the view.
'''
inheritable = page.inheritable
resources = inheritable.Resources
rotation = get_rotation(inheritable.Rotate)
mbox, bbox = getrects(inheritable, viewinfo, rotation)
rotation += get_rotation(viewinfo.rotate)
- contents = page.Contents
- # Make sure the only attribute is length
- # All the filters must have been executed
- assert int(contents.Length) == len(contents.stream)
- if not allow_compressed:
- assert len([x for x in contents.iteritems()]) == 1
- return _cache_xobj(contents, resources, mbox, bbox, rotation)
-
+ contents = _build_cache(page.Contents, allow_compressed)
+ return _cache_xobj(contents, resources, mbox, bbox, rotation,
+ viewinfo.cacheable)
def docxobj(pageinfo, doc=None, allow_compressed=True):
- ''' docxobj creates and returns an actual Form XObject.
+ ''' docxobj reads a page out of a document and uses
+ pagexobj to create the Form XObject based on
+ the page.
+
+ This is a convenience function for things like
+ rst2pdf that want to be able to pass in textual
+ filename/location descriptors and don't want to
+ know about using PdfReader.
+
Can work standalone, or in conjunction with
the CacheXObj class (below).
+
'''
if not isinstance(pageinfo, ViewInfo):
pageinfo = ViewInfo(pageinfo)
@@ -215,7 +316,8 @@ def docxobj(pageinfo, doc=None, allow_compressed=True):
elif pageinfo.doc is not None:
doc = pageinfo.doc
else:
- doc = pageinfo.doc = PdfReader(pageinfo.docname, decompress = not allow_compressed)
+ doc = pageinfo.doc = PdfReader(pageinfo.docname,
+ decompress=not allow_compressed)
assert isinstance(doc, PdfReader)
sourcepage = doc.pages[(pageinfo.page or 1) - 1]
@@ -227,6 +329,11 @@ class CacheXObj(object):
and to keep from making the output too much
bigger than it ought to be by replicating
unnecessary object copies.
+
+ This is a convenience class for things like
+ rst2pdf that want to be able to pass in textual
+ filename/location descriptors and don't want to
+ know about using PdfReader.
'''
def __init__(self, decompress=False):
''' Set decompress true if you need