1 files changed, 154 insertions, 47 deletions
diff --git a/pdfrw/buildxobj.py b/pdfrw/buildxobj.py
index ba34f61..d210c67 100644
--- a/pdfrw/buildxobj.py
+++ b/pdfrw/buildxobj.py
@@ -1,5 +1,5 @@
-# A part of pdfrw (pdfrw.googlecode.com)
-# Copyright (C) 2006-2012 Patrick Maupin, Austin, Texas
+# A part of pdfrw (https://github.com/pmaupin/pdfrw)
+# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas
 # MIT license -- See LICENSE.txt for details
 
 '''
@@ -28,26 +28,52 @@ Reference for content:   Adobe PDF reference, sixth edition, version 1.7
         Form xobjects discussed chapter 4.9, page 355
 '''
 
-from pdfrw.objects import PdfDict, PdfArray, PdfName
-from pdfrw.pdfreader import PdfReader
-from pdfrw.errors import log
+from .objects import PdfDict, PdfArray, PdfName
+from .pdfreader import PdfReader
+from .errors import log, PdfNotImplementedError
+from .py23_diffs import iteritems
+
 
 class ViewInfo(object):
     ''' Instantiate ViewInfo with a uri, and it will parse out
         the filename, page, and viewrect into object attributes.
+
+        Note 1:
+            Viewrects follow the Adobe definition.  (See reference
+            above). They are arrays of 4 numbers:
+
+            - Distance from left of document in points
+            - Distance from top (NOT bottom) of document in points
+            - Width of rectangle in points
+            - Height of rectangle in points
+
+        Note 2:
+            For simplicity, Viewrects can also be specified
+            in fractions of the document.  If every number in the
+            viewrect is between 0 and 1 inclusive (and they are
+            not all equal), elements 0 and 2 are multiplied by
+            the width, and elements 1 and 3 by the height, of
+            the rotated crop box (or of the media box if the
+            page has no crop box) before use.
+
+        Note 3:
+            By default, an XObject based on the view will be
+            cacheable.  It should not be cacheable if the XObject
+            will be subsequently modified.
     '''
     doc = None
     docname = None
     page = None
     viewrect = None
     rotate = None
+    cacheable = True
 
     def __init__(self, pageinfo='', **kw):
-        pageinfo=pageinfo.split('#',1)
+        pageinfo = pageinfo.split('#', 1)
         if len(pageinfo) == 2:
             pageinfo[1:] = pageinfo[1].replace('&', '#').split('#')
         for key in 'page viewrect'.split():
-            if pageinfo[0].startswith(key+'='):
+            if pageinfo[0].startswith(key + '='):
                 break
         else:
             self.docname = pageinfo.pop(0)
@@ -63,10 +89,11 @@ class ViewInfo(object):
                 setattr(self, key, [float(x) for x in value])
             else:
                 log.error('Unknown option: %s', key)
-        for key, value in kw.iteritems():
+        for key, value in iteritems(kw):
             assert hasattr(self, key), key
             setattr(self, key, value)
 
+
 def get_rotation(rotate):
     ''' Return clockwise rotation code:
           0 = unrotated
@@ -80,10 +107,11 @@ def get_rotation(rotate):
         return 0
     if rotate % 90 != 0:
         return 0
-    return rotate / 90
+    return rotate // 90
+
 
 def rotate_point(point, rotation):
-    ''' Rotate an (x,y) coordinate clockwise by a 
+    ''' Rotate an (x,y) coordinate clockwise by a
         rotation code specifying a multiple of 90 degrees.
     '''
     if rotation & 1:
@@ -92,6 +120,7 @@ def rotate_point(point, rotation):
         point = -point[0], -point[1]
     return point
 
+
 def rotate_rect(rect, rotation):
     ''' Rotate both points within the rectangle, then normalize
         the rectangle by returning the new lower left, then new
@@ -101,68 +130,133 @@ def rotate_rect(rect, rotation):
     return (min(rect[0], rect[2]), min(rect[1], rect[3]),
             max(rect[0], rect[2]), max(rect[1], rect[3]))
 
+
 def getrects(inheritable, pageinfo, rotation):
     ''' Given the inheritable attributes of a page and
         the desired pageinfo rectangle, return the page's
         media box and the calculated boundary (clip) box.
     '''
     mbox = tuple([float(x) for x in inheritable.MediaBox])
+    cbox = tuple([float(x) for x in (inheritable.CropBox or mbox)])
     vrect = pageinfo.viewrect
-    if vrect is None:
-        cbox = tuple([float(x) for x in (inheritable.CropBox or mbox)])
-    else:
+    if vrect is not None:
         # Rotate the media box to match what the user sees,
         # figure out the clipping box, then rotate back
-        mleft, mbot, mright, mtop = rotate_rect(mbox, rotation)
+        mleft, mbot, mright, mtop = rotate_rect(cbox, rotation)
         x, y, w, h = vrect
+
+        # Support operations in fractions of a page
+        if 0 <= min(vrect) < max(vrect) <= 1:
+            mw = mright - mleft
+            mh = mtop - mbot
+            x *= mw
+            w *= mw
+            y *= mh
+            h *= mh
+
         cleft = mleft + x
         ctop = mtop - y
         cright = cleft + w
         cbot = ctop - h
-        cbox = max(mleft, cleft), max(mbot, cbot), min(mright, cright), min(mtop, ctop)
+        cbox = (max(mleft, cleft), max(mbot, cbot),
+                min(mright, cright), min(mtop, ctop))
         cbox = rotate_rect(cbox, -rotation)
     return mbox, cbox
 
 
-def _cache_xobj(contents, resources, mbox, bbox, rotation):
+def _build_cache(contents, allow_compressed):
+    ''' Build a new dictionary holding the stream,
+        and save it along with private cache info.
+        Assumes validity has been pre-checked if
+        we have a non-None xobj_copy.
+    '''
+    try:
+        xobj_copy = contents.xobj_copy
+    except AttributeError:
+        # Should have a PdfArray here...
+        array = contents
+        private = contents
+    else:
+        # Should have a PdfDict here -- might or might not have cache copy
+        if xobj_copy is not None:
+            return xobj_copy
+        array = [contents]
+        private = contents.private
+
+    # The spec says nothing about nested arrays.  Will
+    # assume that's not a problem until we encounter them...
+
+    xobj_copy = PdfDict(array[0])
+    xobj_copy.private.xobj_cachedict = {}
+    private.xobj_copy = xobj_copy
+
+    if len(array) > 1:
+        newstream = '\n'.join(x.stream for x in array)
+        newlength = sum(int(x.Length) for x in array) + len(array) - 1
+        assert newlength == len(newstream)
+        xobj_copy.stream = newstream
+
+        # Cannot currently cope with different kinds of
+        # compression in the array, so just disallow it.
+        allow_compressed = False
+
+    if not allow_compressed:
+        # Make sure there are no compression parameters
+        for cdict in array:
+            keys = [x[0] for x in iteritems(cdict)]
+            if len(keys) != 1:
+                raise PdfNotImplementedError(
+                    'Xobjects with compression parameters not supported: %s' %
+                    keys)
+    return xobj_copy
+
+
+def _cache_xobj(contents, resources, mbox, bbox, rotation, cacheable=True):
     ''' Return a cached Form XObject, or create a new one and cache it.
         Adds private members x, y, w, h
     '''
     cachedict = contents.xobj_cachedict
-    if cachedict is None:
-        cachedict = contents.private.xobj_cachedict = {}
     cachekey = mbox, bbox, rotation
-    result = cachedict.get(cachekey)
+    result = cachedict.get(cachekey) if cacheable else None
     if result is None:
-        func = (_get_fullpage, _get_subpage)[mbox != bbox]
+        # If we are not getting a full page, or if we are going to
+        # modify the results, first retrieve an underlying Form XObject
+        # that represents the entire page, so that we are not copying
+        # the full page data into the new file multiple times
+        func = (_get_fullpage, _get_subpage)[mbox != bbox or not cacheable]
         result = PdfDict(
-            func(contents, resources, mbox, bbox, rotation),
-            Type = PdfName.XObject,
-            Subtype = PdfName.Form,
-            FormType = 1,
-            BBox = PdfArray(bbox),
+            func(contents, resources, mbox),
+            Type=PdfName.XObject,
+            Subtype=PdfName.Form,
+            FormType=1,
+            BBox=PdfArray(bbox),
         )
         rect = bbox
         if rotation:
-            matrix = rotate_point((1, 0), rotation) + rotate_point((0, 1), rotation)
+            matrix = (rotate_point((1, 0), rotation) +
+                      rotate_point((0, 1), rotation))
             result.Matrix = PdfArray(matrix + (0, 0))
             rect = rotate_rect(rect, rotation)
 
-        result.private.x = rect[0]
-        result.private.y = rect[1]
-        result.private.w = rect[2] - rect[0]
-        result.private.h = rect[3] - rect[1]
-        cachedict[cachekey] = result
+        private = result.private
+        private.x = rect[0]
+        private.y = rect[1]
+        private.w = rect[2] - rect[0]
+        private.h = rect[3] - rect[1]
+        if cacheable:
+            cachedict[cachekey] = result
     return result
 
-def _get_fullpage(contents, resources, mbox, bbox, rotation):
+
+def _get_fullpage(contents, resources, mbox):
     ''' fullpage is easy.  Just copy the contents,
         set up the resources, and let _cache_xobj handle the
         rest.
     '''
     return PdfDict(contents, Resources=resources)
 
-def _get_subpage(contents, resources, mbox, bbox, rotation):
+
+def _get_subpage(contents, resources, mbox):
     ''' subpages *could* be as easy as full pages, but we
         choose to complicate life by creating a Form XObject
         for the page, and then one that references it for
@@ -170,37 +264,44 @@ def _get_subpage(contents, resources, mbox, bbox, rotation):
         items from the page.
     '''
     return PdfDict(
-        stream = '/FullPage Do\n',
-        Resources = PdfDict(
-            XObject = PdfDict(
-                FullPage = _cache_xobj(contents, resources, mbox, mbox, 0)
+        stream='/FullPage Do\n',
+        Resources=PdfDict(
+            XObject=PdfDict(
+                FullPage=_cache_xobj(contents, resources, mbox, mbox, 0)
             )
         )
     )
 
+
 def pagexobj(page, viewinfo=ViewInfo(), allow_compressed=True):
     ''' pagexobj creates and returns a Form XObject for
         a given view within a page (Defaults to entire page.)
+
+        pagexobj is passed a page and a ViewInfo describing the view.
     '''
     inheritable = page.inheritable
     resources = inheritable.Resources
     rotation = get_rotation(inheritable.Rotate)
     mbox, bbox = getrects(inheritable, viewinfo, rotation)
     rotation += get_rotation(viewinfo.rotate)
-    contents = page.Contents
-    # Make sure the only attribute is length
-    # All the filters must have been executed
-    assert int(contents.Length) == len(contents.stream)
-    if not allow_compressed:
-        assert len([x for x in contents.iteritems()]) == 1
-    return _cache_xobj(contents, resources, mbox, bbox, rotation)
-
+    contents = _build_cache(page.Contents, allow_compressed)
+    return _cache_xobj(contents, resources, mbox, bbox, rotation,
+                       viewinfo.cacheable)
 
 
 def docxobj(pageinfo, doc=None, allow_compressed=True):
-    ''' docxobj creates and returns an actual Form XObject.
+    ''' docxobj reads a page out of a document and uses
+        pagexobj to create the Form XObject based on
+        the page.
+
+        This is a convenience function for things like
+        rst2pdf that want to be able to pass in textual
+        filename/location descriptors and don't want to
+        know about using PdfReader.
+
         Can work standalone, or in conjunction with
         the CacheXObj class (below).
+
     '''
     if not isinstance(pageinfo, ViewInfo):
         pageinfo = ViewInfo(pageinfo)
@@ -215,7 +316,8 @@ def docxobj(pageinfo, doc=None, allow_compressed=True):
     elif pageinfo.doc is not None:
         doc = pageinfo.doc
     else:
-        doc = pageinfo.doc = PdfReader(pageinfo.docname, decompress = not allow_compressed)
+        doc = pageinfo.doc = PdfReader(pageinfo.docname,
+                                       decompress=not allow_compressed)
     assert isinstance(doc, PdfReader)
 
     sourcepage = doc.pages[(pageinfo.page or 1) - 1]
@@ -227,6 +329,11 @@ class CacheXObj(object):
         and to keep from making the output too much
         bigger than it ought to be by replicating
         unnecessary object copies.
+
+        This is a convenience class for things like
+        rst2pdf that want to be able to pass in textual
+        filename/location descriptors and don't want to
+        know about using PdfReader.
     '''
     def __init__(self, decompress=False):
         ''' Set decompress true if you need