diff options
Diffstat (limited to 'pdfrw/tokens.py')
-rw-r--r-- | pdfrw/tokens.py | 92 |
1 files changed, 50 insertions, 42 deletions
diff --git a/pdfrw/tokens.py b/pdfrw/tokens.py index 8067490..5b061d5 100644 --- a/pdfrw/tokens.py +++ b/pdfrw/tokens.py @@ -1,5 +1,5 @@ -# A part of pdfrw (pdfrw.googlecode.com) -# Copyright (C) 2006-2012 Patrick Maupin, Austin, Texas +# A part of pdfrw (https://github.com/pmaupin/pdfrw) +# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas # MIT license -- See LICENSE.txt for details ''' @@ -10,12 +10,13 @@ sixth edition, for PDF version 1.7, dated November 2006. ''' -from __future__ import generators - import re import itertools -from pdfrw.objects import PdfString, PdfObject -from pdfrw.errors import log, PdfParseError +from .objects import PdfString, PdfObject +from .objects.pdfname import BasePdfName +from .errors import log, PdfParseError +from .py23_diffs import nextattr + def linepos(fdata, loc): line = fdata.count('\n', 0, loc) + 1 @@ -23,6 +24,7 @@ def linepos(fdata, loc): col = loc - max(fdata.rfind('\n', 0, loc), fdata.rfind('\r', 0, loc)) return line, col + class PdfTokens(object): # Table 3.1, page 50 of reference, defines whitespace @@ -35,7 +37,8 @@ class PdfTokens(object): # "normal" stuff is all but delimiters or whitespace. - p_normal = r'(?:[^\\%s%s]+|\\[^%s])+' % (whitespace, delimiters, whitespace) + p_normal = r'(?:[^\\%s%s]+|\\[^%s])+' % (whitespace, delimiters, + whitespace) p_comment = r'\%%[^%s]*' % eol @@ -54,10 +57,12 @@ class PdfTokens(object): p_catchall = '[^%s]' % whitespace - pattern = '|'.join([p_normal, p_name, p_hex_string, p_dictdelim, p_literal_string, p_comment, p_catchall]) - findtok = re.compile('(%s)[%s]*' % (pattern, whitespace), re.DOTALL).finditer - findparen = re.compile('(%s)[%s]*' % (p_literal_string_extend, whitespace), re.DOTALL).finditer - splitname = re.compile(r'\#([0-9A-Fa-f]{2})').split + pattern = '|'.join([p_normal, p_name, p_hex_string, p_dictdelim, + p_literal_string, p_comment, p_catchall]) + findtok = re.compile('(%s)[%s]*' % (pattern, whitespace), + re.DOTALL).finditer + findparen = re.compile('(%s)[%s]*' % (p_literal_string_extend, + whitespace), re.DOTALL).finditer def _cacheobj(cache, obj, constructor): ''' This caching relies on the constructors @@ -71,23 +76,10 @@ class PdfTokens(object): cache[result] = result return result - def fixname(self, cache, token, constructor, splitname=splitname, join=''.join, cacheobj=_cacheobj): - ''' Inside name tokens, a '#' character indicates that - the next two bytes are hex characters to be used - to form the 'real' character. - ''' - substrs = splitname(token) - if '#' in join(substrs[::2]): - self.warning('Invalid /Name token') - return token - substrs[1::2] = (chr(int(x, 16)) for x in substrs[1::2]) - result = cacheobj(cache, join(substrs), constructor) - result.encoded = token - return result - def _gettoks(self, startloc, cacheobj=_cacheobj, - delimiters=delimiters, findtok=findtok, findparen=findparen, - PdfString=PdfString, PdfObject=PdfObject): + delimiters=delimiters, findtok=findtok, + findparen=findparen, PdfString=PdfString, + PdfObject=PdfObject, BasePdfName=BasePdfName): ''' Given a source data string and a location inside it, gettoks generates tokens. Each token is a tuple of the form: <starting file loc>, <ending file loc>, <token string> @@ -102,7 +94,6 @@ class PdfTokens(object): ''' fdata = self.fdata current = self.current = [(startloc, startloc)] - namehandler = (cacheobj, self.fixname) cache = {} while 1: for match in findtok(fdata, current[0][1]): @@ -114,7 +105,10 @@ class PdfTokens(object): elif firstch in '/<(%': if firstch == '/': # PDF Name - token = namehandler['#' in token](cache, token, PdfObject) + encoded = token + token = cache.get(encoded) + if token is None: + token = cache[token] = BasePdfName(encoded) elif firstch == '<': # << dict delim, or < hex string > if token[1:2] != '<': @@ -126,12 +120,12 @@ class PdfTokens(object): # they are present, we exit the for loop # and get back in with a new starting location. ends = None # For broken strings - if fdata[match.end(1)-1] != ')': + if fdata[match.end(1) - 1] != ')': nest = 2 m_start, loc = tokspan for match in findparen(fdata, loc): loc = match.end(1) - ending = fdata[loc-1] == ')' + ending = fdata[loc - 1] == ')' nest += 1 - ending * 2 if not nest: break @@ -140,12 +134,14 @@ class PdfTokens(object): token = fdata[m_start:loc] current[0] = m_start, match.end() if nest: - # There is one possible recoverable error seen in - # the wild -- some stupid generators don't escape (. - # If this happens, just terminate on first unescaped ). - # The string won't be quite right, but that's a science + # There is one possible recoverable error + # seen in the wild -- some stupid generators + # don't escape (. If this happens, just + # terminate on first unescaped ). The string + # won't be quite right, but that's a science # fair project for another time. - (self.error, self.exception)[not ends]('Unterminated literal string') + (self.error, self.exception)[not ends]( + 'Unterminated literal string') loc, ends, nest = ends token = fdata[m_start:loc] + ')' * nest current[0] = m_start, ends @@ -155,7 +151,8 @@ class PdfTokens(object): if self.strip_comments: continue else: - self.exception('Tokenizer logic incorrect -- should never get here') + self.exception(('Tokenizer logic incorrect -- ' + 'should never get here')) yield token if current[0] is not tokspan: @@ -165,11 +162,12 @@ class PdfTokens(object): break raise StopIteration - def __init__(self, fdata, startloc=0, strip_comments=True): + def __init__(self, fdata, startloc=0, strip_comments=True, verbose=True): self.fdata = fdata self.strip_comments = strip_comments self.iterator = iterator = self._gettoks(startloc) - self.next = iterator.next + self.msgs_dumped = None if verbose else set() + self.next = getattr(iterator, nextattr) def setstart(self, startloc): ''' Change the starting location. @@ -206,6 +204,11 @@ class PdfTokens(object): return default def msg(self, msg, *arg): + dumped = self.msgs_dumped + if dumped is not None: + if msg in dumped: + return + dumped.add(msg) if arg: msg %= arg fdata = self.fdata @@ -215,14 +218,19 @@ class PdfTokens(object): tok = fdata[begin:end].rstrip() if len(tok) > 30: tok = tok[:26] + ' ...' - return '%s (line=%d, col=%d, token=%s)' % (msg, line, col, repr(tok)) + return ('%s (line=%d, col=%d, token=%s)' % + (msg, line, col, repr(tok))) return '%s (line=%d, col=%d)' % (msg, line, col) def warning(self, *arg): - log.warning(self.msg(*arg)) + s = self.msg(*arg) + if s: + log.warning(s) def error(self, *arg): - log.error(self.msg(*arg)) + s = self.msg(*arg) + if s: + log.error(s) def exception(self, *arg): raise PdfParseError(self.msg(*arg)) |