diff options
Diffstat (limited to 'pdfrw/buildxobj.py')
-rw-r--r-- | pdfrw/buildxobj.py | 201 |
1 files changed, 154 insertions, 47 deletions
diff --git a/pdfrw/buildxobj.py b/pdfrw/buildxobj.py index ba34f61..d210c67 100644 --- a/pdfrw/buildxobj.py +++ b/pdfrw/buildxobj.py @@ -1,5 +1,5 @@ -# A part of pdfrw (pdfrw.googlecode.com) -# Copyright (C) 2006-2012 Patrick Maupin, Austin, Texas +# A part of pdfrw (https://github.com/pmaupin/pdfrw) +# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas # MIT license -- See LICENSE.txt for details ''' @@ -28,26 +28,52 @@ Reference for content: Adobe PDF reference, sixth edition, version 1.7 Form xobjects discussed chapter 4.9, page 355 ''' -from pdfrw.objects import PdfDict, PdfArray, PdfName -from pdfrw.pdfreader import PdfReader -from pdfrw.errors import log +from .objects import PdfDict, PdfArray, PdfName +from .pdfreader import PdfReader +from .errors import log, PdfNotImplementedError +from .py23_diffs import iteritems + class ViewInfo(object): ''' Instantiate ViewInfo with a uri, and it will parse out the filename, page, and viewrect into object attributes. + + Note 1: + Viewrects follow the adobe definition. (See reference + above). They are arrays of 4 numbers: + + - Distance from left of document in points + - Distance from top (NOT bottom) of document in points + - Width of rectangle in points + - Height of rectangle in points + + Note 2: + For simplicity, Viewrects can also be specified + in fractions of the document. If every number in + the viewrect is between 0 and 1 inclusive, then + viewrect elements 0 and 2 are multiplied by the + mediabox width before use, and viewrect elements + 1 and 3 are multiplied by the mediabox height before + use. + + Note 3: + By default, an XObject based on the view will be + cacheable. It should not be cacheable if the XObject + will be subsequently modified. ''' doc = None docname = None page = None viewrect = None rotate = None + cacheable = True def __init__(self, pageinfo='', **kw): - pageinfo=pageinfo.split('#',1) + pageinfo = pageinfo.split('#', 1) if len(pageinfo) == 2: pageinfo[1:] = pageinfo[1].replace('&', '#').split('#') for key in 'page viewrect'.split(): - if pageinfo[0].startswith(key+'='): + if pageinfo[0].startswith(key + '='): break else: self.docname = pageinfo.pop(0) @@ -63,10 +89,11 @@ class ViewInfo(object): setattr(self, key, [float(x) for x in value]) else: log.error('Unknown option: %s', key) - for key, value in kw.iteritems(): + for key, value in iteritems(kw): assert hasattr(self, key), key setattr(self, key, value) + def get_rotation(rotate): ''' Return clockwise rotation code: 0 = unrotated @@ -80,10 +107,11 @@ def get_rotation(rotate): return 0 if rotate % 90 != 0: return 0 - return rotate / 90 + return rotate // 90 + def rotate_point(point, rotation): - ''' Rotate an (x,y) coordinate clockwise by a + ''' Rotate an (x,y) coordinate clockwise by a rotation code specifying a multiple of 90 degrees. ''' if rotation & 1: @@ -92,6 +120,7 @@ def rotate_point(point, rotation): point = -point[0], -point[1] return point + def rotate_rect(rect, rotation): ''' Rotate both points within the rectangle, then normalize the rectangle by returning the new lower left, then new @@ -101,68 +130,133 @@ def rotate_rect(rect, rotation): return (min(rect[0], rect[2]), min(rect[1], rect[3]), max(rect[0], rect[2]), max(rect[1], rect[3])) + def getrects(inheritable, pageinfo, rotation): ''' Given the inheritable attributes of a page and the desired pageinfo rectangle, return the page's media box and the calculated boundary (clip) box. ''' mbox = tuple([float(x) for x in inheritable.MediaBox]) + cbox = tuple([float(x) for x in (inheritable.CropBox or mbox)]) vrect = pageinfo.viewrect - if vrect is None: - cbox = tuple([float(x) for x in (inheritable.CropBox or mbox)]) - else: + if vrect is not None: # Rotate the media box to match what the user sees, # figure out the clipping box, then rotate back - mleft, mbot, mright, mtop = rotate_rect(mbox, rotation) + mleft, mbot, mright, mtop = rotate_rect(cbox, rotation) x, y, w, h = vrect + + # Support operations in fractions of a page + if 0 <= min(vrect) < max(vrect) <= 1: + mw = mright - mleft + mh = mtop - mbot + x *= mw + w *= mw + y *= mh + h *= mh + cleft = mleft + x ctop = mtop - y cright = cleft + w cbot = ctop - h - cbox = max(mleft, cleft), max(mbot, cbot), min(mright, cright), min(mtop, ctop) + cbox = (max(mleft, cleft), max(mbot, cbot), + min(mright, cright), min(mtop, ctop)) cbox = rotate_rect(cbox, -rotation) return mbox, cbox -def _cache_xobj(contents, resources, mbox, bbox, rotation): +def _build_cache(contents, allow_compressed): + ''' Build a new dictionary holding the stream, + and save it along with private cache info. + Assumes validity has been pre-checked if + we have a non-None xobj_copy. + ''' + try: + xobj_copy = contents.xobj_copy + except AttributeError: + # Should have a PdfArray here... + array = contents + private = contents + else: + # Should have a PdfDict here -- might or might not have cache copy + if xobj_copy is not None: + return xobj_copy + array = [contents] + private = contents.private + + # The spec says nothing about nested arrays. Will + # assume that's not a problem until we encounter them... + + xobj_copy = PdfDict(array[0]) + xobj_copy.private.xobj_cachedict = {} + private.xobj_copy = xobj_copy + + if len(array) > 1: + newstream = '\n'.join(x.stream for x in array) + newlength = sum(int(x.Length) for x in array) + len(array) - 1 + assert newlength == len(newstream) + xobj_copy.stream = newstream + + # Cannot currently cope with different kinds of + # compression in the array, so just disallow it. + allow_compressed = False + + if not allow_compressed: + # Make sure there are no compression parameters + for cdict in array: + keys = [x[0] for x in iteritems(cdict)] + if len(keys) != 1: + raise PdfNotImplementedError( + 'Xobjects with compression parameters not supported: %s' % + keys) + return xobj_copy + + +def _cache_xobj(contents, resources, mbox, bbox, rotation, cacheable=True): ''' Return a cached Form XObject, or create a new one and cache it. Adds private members x, y, w, h ''' cachedict = contents.xobj_cachedict - if cachedict is None: - cachedict = contents.private.xobj_cachedict = {} cachekey = mbox, bbox, rotation - result = cachedict.get(cachekey) + result = cachedict.get(cachekey) if cacheable else None if result is None: - func = (_get_fullpage, _get_subpage)[mbox != bbox] + # If we are not getting a full page, or if we are going to + # modify the results, first retrieve an underlying Form XObject + # that represents the entire page, so that we are not copying + # the full page data into the new file multiple times + func = (_get_fullpage, _get_subpage)[mbox != bbox or not cacheable] result = PdfDict( - func(contents, resources, mbox, bbox, rotation), - Type = PdfName.XObject, - Subtype = PdfName.Form, - FormType = 1, - BBox = PdfArray(bbox), + func(contents, resources, mbox), + Type=PdfName.XObject, + Subtype=PdfName.Form, + FormType=1, + BBox=PdfArray(bbox), ) rect = bbox if rotation: - matrix = rotate_point((1, 0), rotation) + rotate_point((0, 1), rotation) + matrix = (rotate_point((1, 0), rotation) + + rotate_point((0, 1), rotation)) result.Matrix = PdfArray(matrix + (0, 0)) rect = rotate_rect(rect, rotation) - result.private.x = rect[0] - result.private.y = rect[1] - result.private.w = rect[2] - rect[0] - result.private.h = rect[3] - rect[1] - cachedict[cachekey] = result + private = result.private + private.x = rect[0] + private.y = rect[1] + private.w = rect[2] - rect[0] + private.h = rect[3] - rect[1] + if cacheable: + cachedict[cachekey] = result return result -def _get_fullpage(contents, resources, mbox, bbox, rotation): + +def _get_fullpage(contents, resources, mbox): ''' fullpage is easy. Just copy the contents, set up the resources, and let _cache_xobj handle the rest. ''' return PdfDict(contents, Resources=resources) -def _get_subpage(contents, resources, mbox, bbox, rotation): + +def _get_subpage(contents, resources, mbox): ''' subpages *could* be as easy as full pages, but we choose to complicate life by creating a Form XObject for the page, and then one that references it for @@ -170,37 +264,44 @@ def _get_subpage(contents, resources, mbox, bbox, rotation): items from the page. ''' return PdfDict( - stream = '/FullPage Do\n', - Resources = PdfDict( - XObject = PdfDict( - FullPage = _cache_xobj(contents, resources, mbox, mbox, 0) + stream='/FullPage Do\n', + Resources=PdfDict( + XObject=PdfDict( + FullPage=_cache_xobj(contents, resources, mbox, mbox, 0) ) ) ) + def pagexobj(page, viewinfo=ViewInfo(), allow_compressed=True): ''' pagexobj creates and returns a Form XObject for a given view within a page (Defaults to entire page.) + + pagexobj is passed a page and a viewrect. ''' inheritable = page.inheritable resources = inheritable.Resources rotation = get_rotation(inheritable.Rotate) mbox, bbox = getrects(inheritable, viewinfo, rotation) rotation += get_rotation(viewinfo.rotate) - contents = page.Contents - # Make sure the only attribute is length - # All the filters must have been executed - assert int(contents.Length) == len(contents.stream) - if not allow_compressed: - assert len([x for x in contents.iteritems()]) == 1 - return _cache_xobj(contents, resources, mbox, bbox, rotation) - + contents = _build_cache(page.Contents, allow_compressed) + return _cache_xobj(contents, resources, mbox, bbox, rotation, + viewinfo.cacheable) def docxobj(pageinfo, doc=None, allow_compressed=True): - ''' docxobj creates and returns an actual Form XObject. + ''' docinfo reads a page out of a document and uses + pagexobj to create the Form XObject based on + the page. + + This is a convenience function for things like + rst2pdf that want to be able to pass in textual + filename/location descriptors and don't want to + know about using PdfReader. + Can work standalone, or in conjunction with the CacheXObj class (below). + ''' if not isinstance(pageinfo, ViewInfo): pageinfo = ViewInfo(pageinfo) @@ -215,7 +316,8 @@ def docxobj(pageinfo, doc=None, allow_compressed=True): elif pageinfo.doc is not None: doc = pageinfo.doc else: - doc = pageinfo.doc = PdfReader(pageinfo.docname, decompress = not allow_compressed) + doc = pageinfo.doc = PdfReader(pageinfo.docname, + decompress=not allow_compressed) assert isinstance(doc, PdfReader) sourcepage = doc.pages[(pageinfo.page or 1) - 1] @@ -227,6 +329,11 @@ class CacheXObj(object): and to keep from making the output too much bigger than it ought to be by replicating unnecessary object copies. + + This is a convenience function for things like + rst2pdf that want to be able to pass in textual + filename/location descriptors and don't want to + know about using PdfReader. ''' def __init__(self, decompress=False): ''' Set decompress true if you need |