diff options
Diffstat (limited to 'pdfrw/buildxobj.py')
-rw-r--r-- | pdfrw/buildxobj.py | 249 |
1 files changed, 249 insertions, 0 deletions
# A part of pdfrw (pdfrw.googlecode.com)
# Copyright (C) 2006-2012 Patrick Maupin, Austin, Texas
# MIT license -- See LICENSE.txt for details

'''

This module contains code to build PDF "Form XObjects".

A Form XObject allows a fragment from one PDF file to be cleanly
included in another PDF file.

Reference for syntax: "Parameters for opening PDF files" from SDK 8.1

    http://www.adobe.com/devnet/acrobat/pdfs/pdf_open_parameters.pdf

    supported 'page=xxx', 'viewrect=<left>,<top>,<width>,<height>'

    Also supported by this, but not by Adobe:
        'rotate=xxx' where xxx in [0, 90, 180, 270]

    Units are in points


Reference for content: Adobe PDF reference, sixth edition, version 1.7

    http://www.adobe.com/devnet/acrobat/pdfs/pdf_reference_1-7.pdf

    Form xobjects discussed chapter 4.9, page 355
'''

from pdfrw.objects import PdfDict, PdfArray, PdfName
from pdfrw.pdfreader import PdfReader
from pdfrw.errors import log


class ViewInfo(object):
    ''' Instantiate ViewInfo with a uri, and it will parse out
        the filename, page, and viewrect into object attributes.

        The uri has the form '<docname>#<opt>=<val>&<opt>=<val>...'.
        Options after the first '#' may be separated by either
        '&' or '#'.  Recognized options are 'page', 'rotate' and
        'viewrect'; anything else is reported through log.error.
    '''
    # Class-level defaults; __init__ only overrides what the uri
    # or the keyword arguments actually supply.
    doc = None       # document object; docxobj() fills this in
    docname = None   # leading part of the uri (the file name)
    page = None      # int from 'page=N'; docxobj() treats None as 1
    viewrect = None  # four floats from 'viewrect=left,top,width,height'
    rotate = None    # int from 'rotate=N'

    def __init__(self, pageinfo='', **kw):
        ''' Parse the uri string pageinfo, then apply the keyword
            overrides in kw.  Every keyword must name an existing
            attribute (doc, docname, page, viewrect, rotate).

            Raises AssertionError for an option with the wrong number
            of values or for an unknown keyword, and ValueError for
            a value that is not numeric or an option that does not
            contain exactly one '='.
        '''
        pageinfo = pageinfo.split('#', 1)
        if len(pageinfo) == 2:
            # Everything after the first '#' is an option list in which
            # '&' and '#' are interchangeable separators.
            pageinfo[1:] = pageinfo[1].replace('&', '#').split('#')
        # When the first element already looks like a page/viewrect
        # option, the uri carries no document name.  Otherwise the
        # first element is the document name.
        for key in 'page viewrect'.split():
            if pageinfo[0].startswith(key + '='):
                break
        else:
            self.docname = pageinfo.pop(0)
        for item in pageinfo:
            key, value = item.split('=')
            key = key.strip()
            # Values may be separated by commas and/or whitespace.
            value = value.replace(',', ' ').split()
            if key in ('page', 'rotate'):
                assert len(value) == 1
                setattr(self, key, int(value[0]))
            elif key == 'viewrect':
                assert len(value) == 4
                setattr(self, key, [float(x) for x in value])
            else:
                log.error('Unknown option: %s', key)
        # kw is a plain dict, so items() behaves the same on Python 2
        # and Python 3 (iteritems() exists only on Python 2).
        for key, value in kw.items():
            assert hasattr(self, key), key
            setattr(self, key, value)


def get_rotation(rotate):
    ''' Return clockwise rotation code:
          0 = unrotated
          1 = 90 degrees
          2 = 180 degrees
          3 = 270 degrees

        rotate is anything int() accepts.  A value that cannot be
        converted (including None), or that is not a multiple of 90,
        yields 0.  The result is not reduced modulo 4; callers in
        this module only look at its two low bits.
    '''
    try:
        rotate = int(rotate)
    except (ValueError, TypeError):
        return 0
    if rotate % 90 != 0:
        return 0
    # Floor division keeps the code an int even under true division;
    # rotate_point() applies bitwise '&' to it, which fails on a float.
    return rotate // 90


def rotate_point(point, rotation):
    ''' Rotate an (x,y) coordinate clockwise by a
        rotation code specifying a multiple of 90 degrees.

        Only the two low bits of rotation are examined, so negative
        codes and codes above 3 are accepted.  Returns a new 2-tuple
        when any rotation is applied, otherwise point itself.
    '''
    if rotation & 1:
        # Quarter turn clockwise: (x, y) -> (y, -x)
        point = point[1], -point[0]
    if rotation & 2:
        # Half turn: (x, y) -> (-x, -y)
        point = -point[0], -point[1]
    return point


def rotate_rect(rect, rotation):
    ''' Rotate both points within the rectangle, then normalize
        the rectangle by returning the new lower left, then new
        upper right.

        rect is a 4-item sequence (x0, y0, x1, y1); the result is a
        4-tuple (min x, min y, max x, max y).
    '''
    rect = rotate_point(rect[:2], rotation) + rotate_point(rect[2:], rotation)
    return (min(rect[0], rect[2]), min(rect[1], rect[3]),
            max(rect[0], rect[2]), max(rect[1], rect[3]))


def getrects(inheritable, pageinfo, rotation):
    ''' Given the inheritable attributes of a page and
        the desired pageinfo rectangle, return the page's
        media box and the calculated boundary (clip) box.

        inheritable must provide MediaBox and (optionally) CropBox;
        pageinfo must provide viewrect; rotation is a rotation code
        as returned by get_rotation().  Returns (mbox, cbox), both
        4-tuples of floats.
    '''
    mbox = tuple([float(x) for x in inheritable.MediaBox])
    vrect = pageinfo.viewrect
    if vrect is None:
        # No explicit view: use the crop box, or the media box when
        # the page has no crop box.
        cbox = tuple([float(x) for x in (inheritable.CropBox or mbox)])
    else:
        # Rotate the media box to match what the user sees,
        # figure out the clipping box, then rotate back
        mleft, mbot, mright, mtop = rotate_rect(mbox, rotation)
        x, y, w, h = vrect
        # viewrect is measured from the top left corner of the
        # (rotated) media box, with y growing downwards.
        cleft = mleft + x
        ctop = mtop - y
        cright = cleft + w
        cbot = ctop - h
        # Clamp the requested view to the media box.
        cbox = (max(mleft, cleft), max(mbot, cbot),
                min(mright, cright), min(mtop, ctop))
        cbox = rotate_rect(cbox, -rotation)
    return mbox, cbox


def _cache_xobj(contents, resources, mbox, bbox, rotation):
    ''' Return a cached Form XObject, or create a new one and cache it.
        Adds private members x, y, w, h

        The cache is a dict stored on the contents object as the
        private attribute xobj_cachedict, keyed by
        (mbox, bbox, rotation).  x, y, w, h describe bbox after it
        has been rotated by rotation.
    '''
    # NOTE(review): this relies on PdfDict returning None for a missing
    # attribute and exposing private attributes for direct reading --
    # confirm against pdfrw.objects.
    cachedict = contents.xobj_cachedict
    if cachedict is None:
        cachedict = contents.private.xobj_cachedict = {}
    cachekey = mbox, bbox, rotation
    result = cachedict.get(cachekey)
    if result is None:
        # A bounding box equal to the media box means the whole page;
        # the bool indexes the tuple (False -> 0, True -> 1).
        func = (_get_fullpage, _get_subpage)[mbox != bbox]
        result = PdfDict(
            func(contents, resources, mbox, bbox, rotation),
            Type=PdfName.XObject,
            Subtype=PdfName.Form,
            FormType=1,
            BBox=PdfArray(bbox),
        )
        rect = bbox
        if rotation:
            # The first four matrix entries are the rotated unit
            # vectors; the trailing (0, 0) is the translation.
            matrix = (rotate_point((1, 0), rotation) +
                      rotate_point((0, 1), rotation))
            result.Matrix = PdfArray(matrix + (0, 0))
            rect = rotate_rect(rect, rotation)

        result.private.x = rect[0]
        result.private.y = rect[1]
        result.private.w = rect[2] - rect[0]
        result.private.h = rect[3] - rect[1]
        cachedict[cachekey] = result
    return result


def _get_fullpage(contents, resources, mbox, bbox, rotation):
    ''' fullpage is easy.  Just copy the contents,
        set up the resources, and let _cache_xobj handle the
        rest.

        mbox, bbox and rotation are unused here; they are accepted
        so that the signature matches _get_subpage.
    '''
    return PdfDict(contents, Resources=resources)


def _get_subpage(contents, resources, mbox, bbox, rotation):
    ''' subpages *could* be as easy as full pages, but we
        choose to complicate life by creating a Form XObject
        for the page, and then one that references it for
        the subpage, on the off-chance that we want multiple
        items from the page.

        The returned dictionary's stream just draws the (cached)
        unrotated full-page XObject; _cache_xobj supplies the
        clipping BBox and any rotation matrix.
    '''
    return PdfDict(
        stream='/FullPage Do\n',
        Resources=PdfDict(
            XObject=PdfDict(
                FullPage=_cache_xobj(contents, resources, mbox, mbox, 0)
            )
        )
    )


def pagexobj(page, viewinfo=ViewInfo(), allow_compressed=True):
    ''' pagexobj creates and returns a Form XObject for
        a given view within a page (Defaults to entire page.)

        page must provide inheritable (with Resources, Rotate,
        MediaBox and optionally CropBox) and Contents.  viewinfo
        supplies viewrect and rotate.  When allow_compressed is
        false, the contents dictionary must have exactly one entry.

        Raises AssertionError when the declared stream length does
        not match the actual stream, or when allow_compressed is
        false and the contents dictionary has extra entries.

        The default viewinfo instance is shared between calls; that
        is safe because nothing in this module mutates it here.
    '''
    inheritable = page.inheritable
    resources = inheritable.Resources
    rotation = get_rotation(inheritable.Rotate)
    # The view rectangle is interpreted relative to the page as
    # displayed, so only the page's own rotation is used here; the
    # extra rotation requested by the caller is added afterwards.
    mbox, bbox = getrects(inheritable, viewinfo, rotation)
    rotation += get_rotation(viewinfo.rotate)
    contents = page.Contents
    # Make sure the only attribute is length
    # All the filters must have been executed
    assert int(contents.Length) == len(contents.stream)
    if not allow_compressed:
        assert len([x for x in contents.iteritems()]) == 1
    return _cache_xobj(contents, resources, mbox, bbox, rotation)


def docxobj(pageinfo, doc=None, allow_compressed=True):
    ''' docxobj creates and returns an actual Form XObject.
        Can work standalone, or in conjunction with
        the CacheXObj class (below).

        pageinfo is a ViewInfo instance or a uri string from which
        one is built.  doc is an optional, already loaded PdfReader.
        Page numbers in pageinfo are 1-based; a missing page number
        selects the first page.

        Raises AssertionError when both doc and pageinfo.doc are
        given, or when the document is not a PdfReader.
    '''
    if not isinstance(pageinfo, ViewInfo):
        pageinfo = ViewInfo(pageinfo)

    # If we're explicitly passed a document,
    # make sure we don't have one implicitly as well.
    # If no implicit or explicit doc, then read one in
    # from the filename.
    if doc is not None:
        assert pageinfo.doc is None
        pageinfo.doc = doc
    elif pageinfo.doc is not None:
        doc = pageinfo.doc
    else:
        doc = pageinfo.doc = PdfReader(pageinfo.docname,
                                       decompress=not allow_compressed)
    assert isinstance(doc, PdfReader)

    sourcepage = doc.pages[(pageinfo.page or 1) - 1]
    return pagexobj(sourcepage, pageinfo, allow_compressed)


class CacheXObj(object):
    ''' Use to keep from reparsing files over and over,
        and to keep from making the output too much
        bigger than it ought to be by replicating
        unnecessary object copies.
    '''
    def __init__(self, decompress=False):
        ''' Set decompress true if you need
            the Form XObjects to be decompressed.
            Will decompress what it can and scream
            about the rest.
        '''
        # Maps document name -> PdfReader, so that each file is
        # parsed only once per CacheXObj instance.
        self.cached_pdfs = {}
        self.decompress = decompress

    def load(self, sourcename):
        ''' Load a Form XObject from a uri

            sourcename uses the syntax accepted by ViewInfo.  The
            document part of the uri is read at most once; later
            loads naming the same document reuse the cached reader.
        '''
        info = ViewInfo(sourcename)
        fname = info.docname
        pcache = self.cached_pdfs
        doc = pcache.get(fname)
        if doc is None:
            doc = pcache[fname] = PdfReader(fname, decompress=self.decompress)
        return docxobj(info, doc, allow_compressed=not self.decompress)