1 files changed, 50 insertions, 42 deletions
diff --git a/pdfrw/tokens.py b/pdfrw/tokens.py
index 8067490..5b061d5 100644
--- a/pdfrw/tokens.py
+++ b/pdfrw/tokens.py
@@ -1,5 +1,5 @@
-# A part of pdfrw (pdfrw.googlecode.com)
-# Copyright (C) 2006-2012 Patrick Maupin, Austin, Texas
+# A part of pdfrw (https://github.com/pmaupin/pdfrw)
+# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas
 # MIT license -- See LICENSE.txt for details
 
 '''
@@ -10,12 +10,13 @@ sixth edition, for PDF version 1.7, dated November 2006.
 
 '''
 
-from __future__ import generators
-
 import re
 import itertools
-from pdfrw.objects import PdfString, PdfObject
-from pdfrw.errors import log, PdfParseError
+from .objects import PdfString, PdfObject
+from .objects.pdfname import BasePdfName
+from .errors import log, PdfParseError
+from .py23_diffs import nextattr
+
 
 def linepos(fdata, loc):
     line = fdata.count('\n', 0, loc) + 1
@@ -23,6 +24,7 @@ def linepos(fdata, loc):
     col = loc - max(fdata.rfind('\n', 0, loc), fdata.rfind('\r', 0, loc))
     return line, col
 
+
 class PdfTokens(object):
 
     # Table 3.1, page 50 of reference, defines whitespace
@@ -35,7 +37,8 @@ class PdfTokens(object):
 
     # "normal" stuff is all but delimiters or whitespace.
 
-    p_normal = r'(?:[^\\%s%s]+|\\[^%s])+' % (whitespace, delimiters, whitespace)
+    p_normal = r'(?:[^\\%s%s]+|\\[^%s])+' % (whitespace, delimiters,
+                                             whitespace)
 
     p_comment = r'\%%[^%s]*' % eol
 
@@ -54,10 +57,12 @@ class PdfTokens(object):
 
     p_catchall = '[^%s]' % whitespace
 
-    pattern = '|'.join([p_normal, p_name, p_hex_string, p_dictdelim, p_literal_string, p_comment, p_catchall])
-    findtok = re.compile('(%s)[%s]*' % (pattern, whitespace), re.DOTALL).finditer
-    findparen = re.compile('(%s)[%s]*' % (p_literal_string_extend, whitespace), re.DOTALL).finditer
-    splitname = re.compile(r'\#([0-9A-Fa-f]{2})').split
+    pattern = '|'.join([p_normal, p_name, p_hex_string, p_dictdelim,
+                        p_literal_string, p_comment, p_catchall])
+    findtok = re.compile('(%s)[%s]*' % (pattern, whitespace),
+                         re.DOTALL).finditer
+    findparen = re.compile('(%s)[%s]*' % (p_literal_string_extend,
+                                          whitespace), re.DOTALL).finditer
 
     def _cacheobj(cache, obj, constructor):
         ''' This caching relies on the constructors
@@ -71,23 +76,10 @@ class PdfTokens(object):
             cache[result] = result
         return result
 
-    def fixname(self, cache, token, constructor, splitname=splitname, join=''.join, cacheobj=_cacheobj):
-        ''' Inside name tokens, a '#' character indicates that
-            the next two bytes are hex characters to be used
-            to form the 'real' character.
-        '''
-        substrs = splitname(token)
-        if '#' in join(substrs[::2]):
-            self.warning('Invalid /Name token')
-            return token
-        substrs[1::2] = (chr(int(x, 16)) for x in substrs[1::2])
-        result = cacheobj(cache, join(substrs), constructor)
-        result.encoded = token
-        return result
-
     def _gettoks(self, startloc, cacheobj=_cacheobj,
-                       delimiters=delimiters, findtok=findtok, findparen=findparen,
-                       PdfString=PdfString, PdfObject=PdfObject):
+                 delimiters=delimiters, findtok=findtok,
+                 findparen=findparen, PdfString=PdfString,
+                 PdfObject=PdfObject, BasePdfName=BasePdfName):
         ''' Given a source data string and a location inside it,
             gettoks generates tokens.  Each token is a tuple of the form:
              <starting file loc>, <ending file loc>, <token string>
@@ -102,7 +94,6 @@ class PdfTokens(object):
         '''
         fdata = self.fdata
         current = self.current = [(startloc, startloc)]
-        namehandler = (cacheobj, self.fixname)
         cache = {}
         while 1:
             for match in findtok(fdata, current[0][1]):
@@ -114,7 +105,10 @@ class PdfTokens(object):
                 elif firstch in '/<(%':
                     if firstch == '/':
                         # PDF Name
-                        token = namehandler['#' in token](cache, token, PdfObject)
+                        encoded = token
+                        token = cache.get(encoded)
+                        if token is None:
+                            token = cache[token] = BasePdfName(encoded)
                     elif firstch == '<':
                         # << dict delim, or < hex string >
                         if token[1:2] != '<':
@@ -126,12 +120,12 @@ class PdfTokens(object):
                         # they are present, we exit the for loop
                         # and get back in with a new starting location.
                         ends = None  # For broken strings
-                        if fdata[match.end(1)-1] != ')':
+                        if fdata[match.end(1) - 1] != ')':
                             nest = 2
                             m_start, loc = tokspan
                             for match in findparen(fdata, loc):
                                 loc = match.end(1)
-                                ending = fdata[loc-1] == ')'
+                                ending = fdata[loc - 1] == ')'
                                 nest += 1 - ending * 2
                                 if not nest:
                                     break
@@ -140,12 +134,14 @@ class PdfTokens(object):
                             token = fdata[m_start:loc]
                             current[0] = m_start, match.end()
                             if nest:
-                                # There is one possible recoverable error seen in
-                                # the wild -- some stupid generators don't escape (.
-                                # If this happens, just terminate on first unescaped ).
-                                # The string won't be quite right, but that's a science
+                                # There is one possible recoverable error
+                                # seen in the wild -- some stupid generators
+                                # don't escape (.  If this happens, just
+                                # terminate on first unescaped ). The string
+                                # won't be quite right, but that's a science
                                 # fair project for another time.
-                                (self.error, self.exception)[not ends]('Unterminated literal string')
+                                (self.error, self.exception)[not ends](
+                                    'Unterminated literal string')
                                 loc, ends, nest = ends
                                 token = fdata[m_start:loc] + ')' * nest
                                 current[0] = m_start, ends
@@ -155,7 +151,8 @@ class PdfTokens(object):
                         if self.strip_comments:
                             continue
                     else:
-                        self.exception('Tokenizer logic incorrect -- should never get here')
+                        self.exception(('Tokenizer logic incorrect -- '
+                                        'should never get here'))
 
                 yield token
                 if current[0] is not tokspan:
@@ -165,11 +162,12 @@ class PdfTokens(object):
                     break
                 raise StopIteration
 
-    def __init__(self, fdata, startloc=0, strip_comments=True):
+    def __init__(self, fdata, startloc=0, strip_comments=True, verbose=True):
         self.fdata = fdata
         self.strip_comments = strip_comments
         self.iterator = iterator = self._gettoks(startloc)
-        self.next = iterator.next
+        self.msgs_dumped = None if verbose else set()  # set: dedupe messages
+        self.next = getattr(iterator, nextattr)
 
     def setstart(self, startloc):
         ''' Change the starting location.
@@ -206,6 +204,11 @@ class PdfTokens(object):
         return default
 
     def msg(self, msg, *arg):
+        dumped = self.msgs_dumped
+        if dumped is not None:
+            if msg in dumped:
+                return
+            dumped.add(msg)
         if arg:
             msg %= arg
         fdata = self.fdata
@@ -215,14 +218,19 @@ class PdfTokens(object):
             tok = fdata[begin:end].rstrip()
             if len(tok) > 30:
                 tok = tok[:26] + ' ...'
-            return '%s (line=%d, col=%d, token=%s)' % (msg, line, col, repr(tok))
+            return ('%s (line=%d, col=%d, token=%s)' %
+                    (msg, line, col, repr(tok)))
         return '%s (line=%d, col=%d)' % (msg, line, col)
 
     def warning(self, *arg):
-        log.warning(self.msg(*arg))
+        s = self.msg(*arg)
+        if s:  # None when msg() suppressed a repeat
+            log.warning(s)
 
     def error(self, *arg):
-        log.error(self.msg(*arg))
+        s = self.msg(*arg)
+        if s:  # None when msg() suppressed a repeat
+            log.error(s)
 
     def exception(self, *arg):
         raise PdfParseError(self.msg(*arg))