1 files changed, 138 insertions, 0 deletions
diff --git a/pdfrw/findobjs.py b/pdfrw/findobjs.py
new file mode 100644
index 0000000..f19ebdf
--- /dev/null
+++ b/pdfrw/findobjs.py
@@ -0,0 +1,138 @@
+# A part of pdfrw (https://github.com/pmaupin/pdfrw)
+# Copyright (C) 2015 Patrick Maupin, Austin, Texas
+# MIT license -- See LICENSE.txt for details
+
+''' This module contains a function to find all the XObjects
+    in a document, and another function that will wrap them
+    in page objects.
+'''
+
+from .objects import PdfDict, PdfArray, PdfName
+from .pdfwriter import user_fmt
+
+
+def find_objects(source, valid_types=(PdfName.XObject, None),
+                 valid_subtypes=(PdfName.Form, PdfName.Image),
+                 no_follow=(PdfName.Parent,),
+                 isinstance=isinstance, id=id, sorted=sorted,
+                 reversed=reversed, PdfDict=PdfDict):
+    '''
+        Find all the objects of a particular kind in a document
+        or array.  Defaults to looking for Form and Image XObjects.
+
+        This could be done recursively, but some PDFs
+        are quite deeply nested, so we do it without
+        recursion.
+
+        Note that we don't know exactly where things appear on pages,
+        but we aim for a sort order that is (a) mostly in document order,
+        and (b) reproducible.  For arrays, objects are processed in
+        array order, and for dicts, they are processed in key order.
+    '''
+    container = (PdfDict, PdfArray)
+
+    # Allow passing a list of pages, or a dict
+    if isinstance(source, PdfDict):
+        source = [source]
+    else:
+        source = list(source)
+
+    visited = set()
+    source.reverse()
+    while source:
+        obj = source.pop()
+        if not isinstance(obj, container):
+            continue
+        myid = id(obj)
+        if myid in visited:
+            continue
+        visited.add(myid)
+        if isinstance(obj, PdfDict):
+            if obj.Type in valid_types and obj.Subtype in valid_subtypes:
+                yield obj
+            obj = [y for (x, y) in sorted(obj.iteritems())
+                   if x not in no_follow]
+        else:
+            # TODO: This forces resolution of any indirect objects in
+            # the array.  It may not be necessary.  Don't know if
+            # reversed() does any voodoo underneath the hood.
+            # It's cheap enough for now, but might be removeable.
+            obj and obj[0]
+        source.extend(reversed(obj))
+
+
+def wrap_object(obj, width, margin):
+    ''' Wrap an xobj in its own page object.
+    '''
+    fmt = 'q %s 0 0 %s %s %s cm /MyImage Do Q'
+    contents = PdfDict(indirect=True)
+    subtype = obj.Subtype
+    if subtype == PdfName.Form:
+        contents._stream = obj.stream
+        contents.Length = obj.Length
+        contents.Filter = obj.Filter
+        contents.DecodeParms = obj.DecodeParms
+        resources = obj.Resources
+        mbox = obj.BBox
+    elif subtype == PdfName.Image:  # Image
+        xoffset = margin[0]
+        yoffset = margin[1]
+        cw = width - margin[0] - margin[2]
+        iw, ih = float(obj.Width), float(obj.Height)
+        ch = 1.0 * cw / iw * ih
+        height = ch + margin[1] + margin[3]
+        p = tuple(user_fmt(x) for x in (cw, ch, xoffset, yoffset))
+        contents.stream = fmt % p
+        resources = PdfDict(XObject=PdfDict(MyImage=obj))
+        mbox = PdfArray((0, 0, width, height))
+    else:
+        raise TypeError("Expected Form or Image XObject")
+
+    return PdfDict(
+        indirect=True,
+        Type=PdfName.Page,
+        MediaBox=mbox,
+        Resources=resources,
+        Contents=contents,
+        )
+
+
+def trivial_xobjs(maxignore=300):
+    ''' Ignore XObjects that trivially contain other XObjects.
+    '''
+    ignore = set('q Q cm Do'.split())
+    Image = PdfName.Image
+
+    def check(obj):
+        if obj.Subtype == Image:
+            return False
+        s = obj.stream
+        if len(s) < maxignore:
+            s = (x for x in s.split() if not x.startswith('/') and
+                 x not in ignore)
+            s = (x.replace('.', '').replace('-', '') for x in s)
+            if not [x for x in s if not x.isdigit()]:
+                return True
+    return check
+
+
+def page_per_xobj(xobj_iter, width=8.5 * 72, margin=0.0 * 72,
+                  image_only=False, ignore=trivial_xobjs(),
+                  wrap_object=wrap_object):
+    ''' page_per_xobj wraps every XObj found
+        in its own page object.
+        width and margin are used to set image sizes.
+    '''
+    try:
+        iter(margin)
+    except:
+        margin = [margin]
+    while len(margin) < 4:
+        margin *= 2
+
+    if isinstance(xobj_iter, (list, dict)):
+        xobj_iter = find_objects(xobj_iter)
+    for obj in xobj_iter:
+        if not ignore(obj):
+            if not image_only or obj.Subtype == PdfName.IMage:
+                yield wrap_object(obj, width, margin)