Diffstat (limited to 'pdfrw/tokens.py')
-rw-r--r-- | pdfrw/tokens.py | 228
1 file changed, 228 insertions, 0 deletions
diff --git a/pdfrw/tokens.py b/pdfrw/tokens.py
new file mode 100644
index 0000000..8067490
--- /dev/null
+++ b/pdfrw/tokens.py
@@ -0,0 +1,228 @@
+# A part of pdfrw (pdfrw.googlecode.com)
+# Copyright (C) 2006-2012 Patrick Maupin, Austin, Texas
+# MIT license -- See LICENSE.txt for details
+
+'''
+A tokenizer for PDF streams.
+
+In general, documentation used was "PDF reference",
+sixth edition, for PDF version 1.7, dated November 2006.
+
+'''
+
+from __future__ import generators
+
+import re
+import itertools
+from pdfrw.objects import PdfString, PdfObject
+from pdfrw.errors import log, PdfParseError
+
+def linepos(fdata, loc):
+    line = fdata.count('\n', 0, loc) + 1
+    line += fdata.count('\r', 0, loc) - fdata.count('\r\n', 0, loc)
+    col = loc - max(fdata.rfind('\n', 0, loc), fdata.rfind('\r', 0, loc))
+    return line, col
+
+class PdfTokens(object):
+
+    # Table 3.1, page 50 of reference, defines whitespace
+    eol = '\n\r'
+    whitespace = '\x00 \t\f' + eol
+
+    # Text on page 50 defines delimiter characters
+    # Escape the ]
+    delimiters = r'()<>{}[\]/%'
+
+    # "normal" stuff is all but delimiters or whitespace.
+
+    p_normal = r'(?:[^\\%s%s]+|\\[^%s])+' % (whitespace, delimiters, whitespace)
+
+    p_comment = r'\%%[^%s]*' % eol
+
+    # This will get the bulk of literal strings.
+    p_literal_string = r'\((?:[^\\()]+|\\.)*[()]?'
+
+    # This will get more pieces of literal strings
+    # (Don't ask me why, but it hangs without the trailing ?.)
+    p_literal_string_extend = r'(?:[^\\()]+|\\.)*[()]?'
+
+    # A hex string. This one's easy.
+    p_hex_string = r'\<[%s0-9A-Fa-f]*\>' % whitespace
+
+    p_dictdelim = r'\<\<|\>\>'
+    p_name = r'/[^%s%s]*' % (delimiters, whitespace)
+
+    p_catchall = '[^%s]' % whitespace
+
+    pattern = '|'.join([p_normal, p_name, p_hex_string, p_dictdelim, p_literal_string, p_comment, p_catchall])
+    findtok = re.compile('(%s)[%s]*' % (pattern, whitespace), re.DOTALL).finditer
+    findparen = re.compile('(%s)[%s]*' % (p_literal_string_extend, whitespace), re.DOTALL).finditer
+    splitname = re.compile(r'\#([0-9A-Fa-f]{2})').split
+
+    def _cacheobj(cache, obj, constructor):
+        ''' This caching relies on the constructors
+            returning something that will compare as
+            equal to the original obj. This works
+            fine with our PDF objects.
+        '''
+        result = cache.get(obj)
+        if result is None:
+            result = constructor(obj)
+            cache[result] = result
+        return result
+
+    def fixname(self, cache, token, constructor, splitname=splitname, join=''.join, cacheobj=_cacheobj):
+        ''' Inside name tokens, a '#' character indicates that
+            the next two bytes are hex characters to be used
+            to form the 'real' character.
+        '''
+        substrs = splitname(token)
+        if '#' in join(substrs[::2]):
+            self.warning('Invalid /Name token')
+            return token
+        substrs[1::2] = (chr(int(x, 16)) for x in substrs[1::2])
+        result = cacheobj(cache, join(substrs), constructor)
+        result.encoded = token
+        return result
+
+    def _gettoks(self, startloc, cacheobj=_cacheobj,
+                 delimiters=delimiters, findtok=findtok, findparen=findparen,
+                 PdfString=PdfString, PdfObject=PdfObject):
+        ''' Given a source data string and a location inside it,
+            gettoks generates tokens. Each token is a tuple of the form:
+            <starting file loc>, <ending file loc>, <token string>
+            The ending file loc is past any trailing whitespace.
+
+            The main complication here is the literal strings, which
+            can contain nested parentheses. In order to cope with these
+            we can discard the current iterator and loop back to the
+            top to get a fresh one.
+
+            We could use re.search instead of re.finditer, but that's slower.
+        '''
+        fdata = self.fdata
+        current = self.current = [(startloc, startloc)]
+        namehandler = (cacheobj, self.fixname)
+        cache = {}
+        while 1:
+            for match in findtok(fdata, current[0][1]):
+                current[0] = tokspan = match.span()
+                token = match.group(1)
+                firstch = token[0]
+                if firstch not in delimiters:
+                    token = cacheobj(cache, token, PdfObject)
+                elif firstch in '/<(%':
+                    if firstch == '/':
+                        # PDF Name
+                        token = namehandler['#' in token](cache, token, PdfObject)
+                    elif firstch == '<':
+                        # << dict delim, or < hex string >
+                        if token[1:2] != '<':
+                            token = cacheobj(cache, token, PdfString)
+                    elif firstch == '(':
+                        # Literal string
+                        # It's probably simple, but maybe not
+                        # Nested parentheses are a bear, and if
+                        # they are present, we exit the for loop
+                        # and get back in with a new starting location.
+                        ends = None  # For broken strings
+                        if fdata[match.end(1)-1] != ')':
+                            nest = 2
+                            m_start, loc = tokspan
+                            for match in findparen(fdata, loc):
+                                loc = match.end(1)
+                                ending = fdata[loc-1] == ')'
+                                nest += 1 - ending * 2
+                                if not nest:
+                                    break
+                                if ending and ends is None:
+                                    ends = loc, match.end(), nest
+                            token = fdata[m_start:loc]
+                            current[0] = m_start, match.end()
+                            if nest:
+                                # There is one possible recoverable error seen in
+                                # the wild -- some stupid generators don't escape (.
+                                # If this happens, just terminate on first unescaped ).
+                                # The string won't be quite right, but that's a science
+                                # fair project for another time.
+                                (self.error, self.exception)[not ends]('Unterminated literal string')
+                                loc, ends, nest = ends
+                                token = fdata[m_start:loc] + ')' * nest
+                                current[0] = m_start, ends
+                        token = cacheobj(cache, token, PdfString)
+                    elif firstch == '%':
+                        # Comment
+                        if self.strip_comments:
+                            continue
+                    else:
+                        self.exception('Tokenizer logic incorrect -- should never get here')
+
+                yield token
+                if current[0] is not tokspan:
+                    break
+            else:
+                if self.strip_comments:
+                    break
+                raise StopIteration
+
+    def __init__(self, fdata, startloc=0, strip_comments=True):
+        self.fdata = fdata
+        self.strip_comments = strip_comments
+        self.iterator = iterator = self._gettoks(startloc)
+        self.next = iterator.next
+
+    def setstart(self, startloc):
+        ''' Change the starting location.
+        '''
+        current = self.current
+        if startloc != current[0][1]:
+            current[0] = startloc, startloc
+
+    def floc(self):
+        ''' Return the current file position
+            (where the next token will be retrieved)
+        '''
+        return self.current[0][1]
+    floc = property(floc, setstart)
+
+    def tokstart(self):
+        ''' Return the file position of the most
+            recently retrieved token.
+        '''
+        return self.current[0][0]
+    tokstart = property(tokstart, setstart)
+
+    def __iter__(self):
+        return self.iterator
+
+    def multiple(self, count, islice=itertools.islice, list=list):
+        ''' Retrieve multiple tokens
+        '''
+        return list(islice(self, count))
+
+    def next_default(self, default='nope'):
+        for result in self:
+            return result
+        return default
+
+    def msg(self, msg, *arg):
+        if arg:
+            msg %= arg
+        fdata = self.fdata
+        begin, end = self.current[0]
+        line, col = linepos(fdata, begin)
+        if end > begin:
+            tok = fdata[begin:end].rstrip()
+            if len(tok) > 30:
+                tok = tok[:26] + ' ...'
+            return '%s (line=%d, col=%d, token=%s)' % (msg, line, col, repr(tok))
+        return '%s (line=%d, col=%d)' % (msg, line, col)
+
+    def warning(self, *arg):
+        log.warning(self.msg(*arg))
+
+    def error(self, *arg):
+        log.error(self.msg(*arg))
+
+    def exception(self, *arg):
+        raise PdfParseError(self.msg(*arg))
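A few usage sketches follow; they are illustrations against this commit, not part of it. First, a minimal sketch of driving the tokenizer, assuming Python 2 (which the module targets: __init__ binds iterator.next) and that the pdfrw package from this repository is importable. Comments are stripped by default, and a literal string with nested parentheses comes back as a single token:

from pdfrw.tokens import PdfTokens

fdata = '<< /Type /Page % a comment\n/Contents (Hello (nested) world) >>'
for token in PdfTokens(fdata):
    print(token)

# Output, one token per line -- the comment is consumed and the
# nested literal string arrives as one PdfString token:
#
# <<
# /Type
# /Page
# /Contents
# (Hello (nested) world)
# >>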
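Under the same assumptions, a sketch of the '#' escape handling in /Name tokens: fixname decodes each '#xx' hex pair to a single character and keeps the raw spelling on the token's encoded attribute:

from pdfrw.tokens import PdfTokens

name = PdfTokens('/Adobe#20Green').next()
print(name)          # /Adobe Green   -- decoded value
print(name.encoded)  # /Adobe#20Green -- raw spelling from the file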
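A sketch of the position machinery a higher-level parser needs for stream objects, whose raw bytes must be sliced out of fdata rather than tokenized. Reading floc gives the offset of the next token; assigning to it re-seeks, because setstart replaces current[0], and the generator restarts its finditer at the new offset when the 'current[0] is not tokspan' check fires:

from pdfrw.tokens import PdfTokens

fdata = '10 0 obj << /Length 2 >> stream\nXY\nendstream endobj'
tokens = PdfTokens(fdata)
print(tokens.multiple(3))               # the tokens '10', '0', 'obj'
tokens.floc = fdata.index('endstream')  # skip over the raw stream bytes
print(tokens.next())                    # endstream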
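Finally, a sketch of the module-level linepos helper that msg() uses for diagnostics: it converts a byte offset into 1-based (line, column) coordinates, counting \n, \r, and \r\n each as a single line ending:

from pdfrw.tokens import linepos

fdata = 'line one\rline two\nline three'
print(linepos(fdata, fdata.index('two')))  # (2, 6)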