diff options
Diffstat (limited to 'pdfrw/findobjs.py')
-rw-r--r-- | pdfrw/findobjs.py | 138 |
1 files changed, 138 insertions, 0 deletions
# Source: pdfrw/findobjs.py (added as a new file by this commit).

# A part of pdfrw (https://github.com/pmaupin/pdfrw)
# Copyright (C) 2015 Patrick Maupin, Austin, Texas
# MIT license -- See LICENSE.txt for details

''' This module contains a function to find all the XObjects
    in a document, and another function that will wrap them
    in page objects.
'''

from .objects import PdfDict, PdfArray, PdfName
from .pdfwriter import user_fmt


def find_objects(source, valid_types=(PdfName.XObject, None),
                 valid_subtypes=(PdfName.Form, PdfName.Image),
                 no_follow=(PdfName.Parent,),
                 isinstance=isinstance, id=id, sorted=sorted,
                 reversed=reversed, PdfDict=PdfDict):
    '''
        Find all the objects of a particular kind in a document
        or array.  Defaults to looking for Form and Image XObjects.

        This could be done recursively, but some PDFs
        are quite deeply nested, so we do it without
        recursion.

        Note that we don't know exactly where things appear on pages,
        but we aim for a sort order that is (a) mostly in document order,
        and (b) reproducible.  For arrays, objects are processed in
        array order, and for dicts, they are processed in key order.

        Parameters:
            source          A single PdfDict, or any iterable of objects
                            (e.g. a list of pages) to start the search from.
            valid_types     Accepted values of a dict's Type entry.
            valid_subtypes  Accepted values of a dict's Subtype entry.
            no_follow       Dict keys whose values are not descended into.

        The remaining parameters only bind builtins and PdfDict as
        defaults; they are not meant to be overridden by callers
        (presumably a local-name lookup optimisation).

        Yields every PdfDict whose Type is in valid_types and whose
        Subtype is in valid_subtypes.  Each container is visited at
        most once, keyed on its id().
    '''
    container = (PdfDict, PdfArray)

    # Allow passing a list of pages, or a dict
    if isinstance(source, PdfDict):
        source = [source]
    else:
        source = list(source)

    visited = set()
    # The work list is used as a stack; reversing it makes pop()
    # return the items in their original order.
    source.reverse()
    while source:
        obj = source.pop()
        if not isinstance(obj, container):
            continue
        myid = id(obj)
        if myid in visited:
            continue
        visited.add(myid)
        if isinstance(obj, PdfDict):
            if obj.Type in valid_types and obj.Subtype in valid_subtypes:
                yield obj
            # Children of a dict are followed in sorted key order,
            # skipping the keys listed in no_follow.
            obj = [y for (x, y) in sorted(obj.iteritems())
                   if x not in no_follow]
        else:
            # TODO: This forces resolution of any indirect objects in
            # the array.  It may not be necessary.  Don't know if
            # reversed() does any voodoo underneath the hood.
            # It's cheap enough for now, but might be removeable.
            obj and obj[0]
        # Push children reversed so they pop off in forward order.
        source.extend(reversed(obj))


def wrap_object(obj, width, margin):
    ''' Wrap an xobj in its own page object.

        Parameters:
            obj     The XObject to wrap.  Its Subtype must be Form
                    or Image.
            width   Page width; only used for Image XObjects.
            margin  Sequence of at least four numbers; only used for
                    Image XObjects.  margin[0] and margin[2] are
                    subtracted from the width (margin[0] is also the
                    x offset of the image); margin[1] and margin[3]
                    are added to the height (margin[1] is also the
                    y offset of the image).

        Returns a new indirect PdfDict of Type Page.

        Raises TypeError if obj is neither a Form nor an Image.
    '''
    fmt = 'q %s 0 0 %s %s %s cm /MyImage Do Q'
    contents = PdfDict(indirect=True)
    subtype = obj.Subtype
    if subtype == PdfName.Form:
        # A Form becomes the page's content stream directly: its stream
        # and the entries describing it are copied over unchanged.
        # NOTE(review): _stream (not stream) is assigned here, presumably
        # to keep the copied Length/Filter untouched -- confirm against
        # PdfDict.
        contents._stream = obj.stream
        contents.Length = obj.Length
        contents.Filter = obj.Filter
        contents.DecodeParms = obj.DecodeParms
        resources = obj.Resources
        mbox = obj.BBox
    elif subtype == PdfName.Image:  # Image
        xoffset = margin[0]
        yoffset = margin[1]
        # Content width is what is left of the page between the
        # horizontal margins; content height keeps the image's
        # aspect ratio.
        cw = width - margin[0] - margin[2]
        iw, ih = float(obj.Width), float(obj.Height)
        ch = 1.0 * cw / iw * ih
        height = ch + margin[1] + margin[3]
        p = tuple(user_fmt(x) for x in (cw, ch, xoffset, yoffset))
        # The generated content stream scales and positions the image,
        # which is registered in the page resources as /MyImage.
        contents.stream = fmt % p
        resources = PdfDict(XObject=PdfDict(MyImage=obj))
        mbox = PdfArray((0, 0, width, height))
    else:
        raise TypeError("Expected Form or Image XObject")

    return PdfDict(
        indirect=True,
        Type=PdfName.Page,
        MediaBox=mbox,
        Resources=resources,
        Contents=contents,
    )


def trivial_xobjs(maxignore=300):
    ''' Ignore XObjects that trivially contain other XObjects.

        Returns a predicate check(obj).  The predicate returns:

          - False for Image XObjects;
          - True when the object's stream is shorter than maxignore
            and consists only of names (tokens starting with '/'),
            the operators q, Q, cm and Do, and numbers written with
            digits, '.' and '-';
          - None (falsy) otherwise.
    '''
    ignore = set('q Q cm Do'.split())
    Image = PdfName.Image

    def check(obj):
        if obj.Subtype == Image:
            return False
        s = obj.stream
        if len(s) < maxignore:
            # Drop names and the ignorable operators ...
            s = (x for x in s.split() if not x.startswith('/') and
                 x not in ignore)
            # ... then strip number punctuation, so that anything
            # left which is not purely digits is a real operator.
            s = (x.replace('.', '').replace('-', '') for x in s)
            if all(x.isdigit() for x in s):
                return True
    return check


def page_per_xobj(xobj_iter, width=8.5 * 72, margin=0.0 * 72,
                  image_only=False, ignore=trivial_xobjs(),
                  wrap_object=wrap_object):
    ''' page_per_xobj wraps every XObj found
        in its own page object.
        width and margin are used to set image sizes.

        Parameters:
            xobj_iter    Either an iterable of XObjects, or a list or
                         dict which is first searched with
                         find_objects().
            width        Page width passed to wrap_object.
            margin       A single number, or a sequence of numbers.
                         A sequence shorter than four items is
                         repeated until it has at least four.
            image_only   When true, only Image XObjects are wrapped.
            ignore       Predicate; objects for which it returns a
                         true value are skipped.
            wrap_object  Callable(obj, width, margin) building the page.

        Yields one page object per accepted XObject.

        Raises ValueError (on first iteration, since this is a
        generator) if margin is an empty sequence.
    '''
    # Work on a private list: this accepts any iterable and keeps the
    # caller's sequence from being extended in place by the padding
    # below.  A non-iterable margin is a single number for all sides.
    try:
        margin = list(margin)
    except TypeError:
        margin = [margin]
    if not margin:
        # Doubling an empty list never reaches four items.
        raise ValueError("margin must not be empty")
    while len(margin) < 4:
        margin *= 2

    if isinstance(xobj_iter, (list, dict)):
        xobj_iter = find_objects(xobj_iter)
    for obj in xobj_iter:
        if not ignore(obj):
            # Was PdfName.IMage, a name no Subtype ever equals, so
            # image_only=True used to yield nothing at all.
            if not image_only or obj.Subtype == PdfName.Image:
                yield wrap_object(obj, width, margin)