diff options
author | Matthias Klose <doko@debian.org> | 2014-07-13 17:50:59 +0200 |
---|---|---|
committer | Matthias Klose <doko@debian.org> | 2014-07-13 17:50:59 +0200 |
commit | a1959ba9c0c9f3881c3e593e5aef1046750880f2 (patch) | |
tree | e4fc630e9e26b227d9a7e41db65d80f6158e8ae9 /pdfrw/objects/pdfstring.py |
pdfrw (0.1-3) unstable; urgency=medium
* QA upload.
* Build using dh_python2
# imported from the archive
Diffstat (limited to 'pdfrw/objects/pdfstring.py')
-rw-r--r-- | pdfrw/objects/pdfstring.py | 73 |
1 files changed, 73 insertions, 0 deletions
diff --git a/pdfrw/objects/pdfstring.py b/pdfrw/objects/pdfstring.py new file mode 100644 index 0000000..7a7d1e4 --- /dev/null +++ b/pdfrw/objects/pdfstring.py @@ -0,0 +1,73 @@ +# A part of pdfrw (pdfrw.googlecode.com) +# Copyright (C) 2006-2012 Patrick Maupin, Austin, Texas +# MIT license -- See LICENSE.txt for details + +import re + +class PdfString(str): + ''' A PdfString is an encoded string. It has a decode + method to get the actual string data out, and there + is an encode class method to create such a string. + Like any PDF object, it could be indirect, but it + defaults to being a direct object. + ''' + indirect = False + unescape_dict = {'\\b':'\b', '\\f':'\f', '\\n':'\n', + '\\r':'\r', '\\t':'\t', + '\\\r\n': '', '\\\r':'', '\\\n':'', + '\\\\':'\\', '\\':'', + } + unescape_pattern = r'(\\\\|\\b|\\f|\\n|\\r|\\t|\\\r\n|\\\r|\\\n|\\[0-9]+|\\)' + unescape_func = re.compile(unescape_pattern).split + + hex_pattern = '([a-fA-F0-9][a-fA-F0-9]|[a-fA-F0-9])' + hex_func = re.compile(hex_pattern).split + + hex_pattern2 = '([a-fA-F0-9][a-fA-F0-9][a-fA-F0-9][a-fA-F0-9]|[a-fA-F0-9][a-fA-F0-9]|[a-fA-F0-9])' + hex_func2 = re.compile(hex_pattern2).split + + hex_funcs = hex_func, hex_func2 + + def decode_regular(self, remap=chr): + assert self[0] == '(' and self[-1] == ')' + mylist = self.unescape_func(self[1:-1]) + result = [] + unescape = self.unescape_dict.get + for chunk in mylist: + chunk = unescape(chunk, chunk) + if chunk.startswith('\\') and len(chunk) > 1: + value = int(chunk[1:], 8) + # FIXME: TODO: Handle unicode here + if value > 127: + value = 127 + chunk = remap(value) + if chunk: + result.append(chunk) + return ''.join(result) + + def decode_hex(self, remap=chr, twobytes=False): + data = ''.join(self.split()) + data = self.hex_funcs[twobytes](data) + chars = data[1::2] + other = data[0::2] + assert other[0] == '<' and other[-1] == '>' and ''.join(other) == '<>', self + return ''.join([remap(int(x, 16)) for x in chars]) + + def decode(self, remap=chr, twobytes=False): + if self.startswith('('): + return self.decode_regular(remap) + + else: + return self.decode_hex(remap, twobytes) + + def encode(cls, source, usehex=False): + assert not usehex, "Not supported yet" + if isinstance(source, unicode): + source = source.encode('utf-8') + else: + source = str(source) + source = source.replace('\\', '\\\\') + source = source.replace('(', '\\(') + source = source.replace(')', '\\)') + return cls('(' +source + ')') + encode = classmethod(encode) |