summaryrefslogtreecommitdiff
path: root/pdfrw/tokens.py
diff options
context:
space:
mode:
Diffstat (limited to 'pdfrw/tokens.py')
-rw-r--r--  pdfrw/tokens.py  92
1 file changed, 50 insertions, 42 deletions
diff --git a/pdfrw/tokens.py b/pdfrw/tokens.py
index 8067490..5b061d5 100644
--- a/pdfrw/tokens.py
+++ b/pdfrw/tokens.py
@@ -1,5 +1,5 @@
-# A part of pdfrw (pdfrw.googlecode.com)
-# Copyright (C) 2006-2012 Patrick Maupin, Austin, Texas
+# A part of pdfrw (https://github.com/pmaupin/pdfrw)
+# Copyright (C) 2006-2015 Patrick Maupin, Austin, Texas
# MIT license -- See LICENSE.txt for details
'''
@@ -10,12 +10,13 @@ sixth edition, for PDF version 1.7, dated November 2006.
'''
-from __future__ import generators
-
import re
import itertools
-from pdfrw.objects import PdfString, PdfObject
-from pdfrw.errors import log, PdfParseError
+from .objects import PdfString, PdfObject
+from .objects.pdfname import BasePdfName
+from .errors import log, PdfParseError
+from .py23_diffs import nextattr
+
def linepos(fdata, loc):
line = fdata.count('\n', 0, loc) + 1
@@ -23,6 +24,7 @@ def linepos(fdata, loc):
col = loc - max(fdata.rfind('\n', 0, loc), fdata.rfind('\r', 0, loc))
return line, col
+
class PdfTokens(object):
# Table 3.1, page 50 of reference, defines whitespace
@@ -35,7 +37,8 @@ class PdfTokens(object):
# "normal" stuff is all but delimiters or whitespace.
- p_normal = r'(?:[^\\%s%s]+|\\[^%s])+' % (whitespace, delimiters, whitespace)
+ p_normal = r'(?:[^\\%s%s]+|\\[^%s])+' % (whitespace, delimiters,
+ whitespace)
p_comment = r'\%%[^%s]*' % eol
@@ -54,10 +57,12 @@ class PdfTokens(object):
p_catchall = '[^%s]' % whitespace
- pattern = '|'.join([p_normal, p_name, p_hex_string, p_dictdelim, p_literal_string, p_comment, p_catchall])
- findtok = re.compile('(%s)[%s]*' % (pattern, whitespace), re.DOTALL).finditer
- findparen = re.compile('(%s)[%s]*' % (p_literal_string_extend, whitespace), re.DOTALL).finditer
- splitname = re.compile(r'\#([0-9A-Fa-f]{2})').split
+ pattern = '|'.join([p_normal, p_name, p_hex_string, p_dictdelim,
+ p_literal_string, p_comment, p_catchall])
+ findtok = re.compile('(%s)[%s]*' % (pattern, whitespace),
+ re.DOTALL).finditer
+ findparen = re.compile('(%s)[%s]*' % (p_literal_string_extend,
+ whitespace), re.DOTALL).finditer
def _cacheobj(cache, obj, constructor):
''' This caching relies on the constructors
@@ -71,23 +76,10 @@ class PdfTokens(object):
cache[result] = result
return result
- def fixname(self, cache, token, constructor, splitname=splitname, join=''.join, cacheobj=_cacheobj):
- ''' Inside name tokens, a '#' character indicates that
- the next two bytes are hex characters to be used
- to form the 'real' character.
- '''
- substrs = splitname(token)
- if '#' in join(substrs[::2]):
- self.warning('Invalid /Name token')
- return token
- substrs[1::2] = (chr(int(x, 16)) for x in substrs[1::2])
- result = cacheobj(cache, join(substrs), constructor)
- result.encoded = token
- return result
-
def _gettoks(self, startloc, cacheobj=_cacheobj,
- delimiters=delimiters, findtok=findtok, findparen=findparen,
- PdfString=PdfString, PdfObject=PdfObject):
+ delimiters=delimiters, findtok=findtok,
+ findparen=findparen, PdfString=PdfString,
+ PdfObject=PdfObject, BasePdfName=BasePdfName):
''' Given a source data string and a location inside it,
gettoks generates tokens. Each token is a tuple of the form:
<starting file loc>, <ending file loc>, <token string>
@@ -102,7 +94,6 @@ class PdfTokens(object):
'''
fdata = self.fdata
current = self.current = [(startloc, startloc)]
- namehandler = (cacheobj, self.fixname)
cache = {}
while 1:
for match in findtok(fdata, current[0][1]):
@@ -114,7 +105,10 @@ class PdfTokens(object):
elif firstch in '/<(%':
if firstch == '/':
# PDF Name
- token = namehandler['#' in token](cache, token, PdfObject)
+ encoded = token
+ token = cache.get(encoded)
+ if token is None:
+ token = cache[token] = BasePdfName(encoded)
elif firstch == '<':
# << dict delim, or < hex string >
if token[1:2] != '<':
@@ -126,12 +120,12 @@ class PdfTokens(object):
# they are present, we exit the for loop
# and get back in with a new starting location.
ends = None # For broken strings
- if fdata[match.end(1)-1] != ')':
+ if fdata[match.end(1) - 1] != ')':
nest = 2
m_start, loc = tokspan
for match in findparen(fdata, loc):
loc = match.end(1)
- ending = fdata[loc-1] == ')'
+ ending = fdata[loc - 1] == ')'
nest += 1 - ending * 2
if not nest:
break
@@ -140,12 +134,14 @@ class PdfTokens(object):
token = fdata[m_start:loc]
current[0] = m_start, match.end()
if nest:
- # There is one possible recoverable error seen in
- # the wild -- some stupid generators don't escape (.
- # If this happens, just terminate on first unescaped ).
- # The string won't be quite right, but that's a science
+ # There is one possible recoverable error
+ # seen in the wild -- some stupid generators
+ # don't escape (. If this happens, just
+ # terminate on first unescaped ). The string
+ # won't be quite right, but that's a science
# fair project for another time.
- (self.error, self.exception)[not ends]('Unterminated literal string')
+ (self.error, self.exception)[not ends](
+ 'Unterminated literal string')
loc, ends, nest = ends
token = fdata[m_start:loc] + ')' * nest
current[0] = m_start, ends
@@ -155,7 +151,8 @@ class PdfTokens(object):
if self.strip_comments:
continue
else:
- self.exception('Tokenizer logic incorrect -- should never get here')
+ self.exception(('Tokenizer logic incorrect -- '
+ 'should never get here'))
yield token
if current[0] is not tokspan:
@@ -165,11 +162,12 @@ class PdfTokens(object):
break
raise StopIteration
- def __init__(self, fdata, startloc=0, strip_comments=True):
+ def __init__(self, fdata, startloc=0, strip_comments=True, verbose=True):
self.fdata = fdata
self.strip_comments = strip_comments
self.iterator = iterator = self._gettoks(startloc)
- self.next = iterator.next
+ self.msgs_dumped = None if verbose else set()
+ self.next = getattr(iterator, nextattr)
def setstart(self, startloc):
''' Change the starting location.
@@ -206,6 +204,11 @@ class PdfTokens(object):
return default
def msg(self, msg, *arg):
+ dumped = self.msgs_dumped
+ if dumped is not None:
+ if msg in dumped:
+ return
+ dumped.add(msg)
if arg:
msg %= arg
fdata = self.fdata
@@ -215,14 +218,19 @@ class PdfTokens(object):
tok = fdata[begin:end].rstrip()
if len(tok) > 30:
tok = tok[:26] + ' ...'
- return '%s (line=%d, col=%d, token=%s)' % (msg, line, col, repr(tok))
+ return ('%s (line=%d, col=%d, token=%s)' %
+ (msg, line, col, repr(tok)))
return '%s (line=%d, col=%d)' % (msg, line, col)
def warning(self, *arg):
- log.warning(self.msg(*arg))
+ s = self.msg(*arg)
+ if s:
+ log.warning(s)
def error(self, *arg):
- log.error(self.msg(*arg))
+ s = self.msg(*arg)
+ if s:
+ log.error(s)
def exception(self, *arg):
raise PdfParseError(self.msg(*arg))