diff options
author | Matthias Klose <doko@debian.org> | 2014-07-13 17:50:59 +0200 |
---|---|---|
committer | Matthias Klose <doko@debian.org> | 2014-07-13 17:50:59 +0200 |
commit | a1959ba9c0c9f3881c3e593e5aef1046750880f2 (patch) | |
tree | e4fc630e9e26b227d9a7e41db65d80f6158e8ae9 /examples/rl2/decodegraphics.py |
pdfrw (0.1-3) unstable; urgency=medium
* QA upload.
* Build using dh_python2
# imported from the archive
Diffstat (limited to 'examples/rl2/decodegraphics.py')
-rw-r--r-- | examples/rl2/decodegraphics.py | 378 |
1 files changed, 378 insertions, 0 deletions
diff --git a/examples/rl2/decodegraphics.py b/examples/rl2/decodegraphics.py new file mode 100644 index 0000000..9fa5a05 --- /dev/null +++ b/examples/rl2/decodegraphics.py @@ -0,0 +1,378 @@ +# A part of pdfrw (pdfrw.googlecode.com) +# Copyright (C) 2006-2009 Patrick Maupin, Austin, Texas +# MIT license -- See LICENSE.txt for details + +''' +This file is an example parser that will parse a graphics stream +into a reportlab canvas. + +Needs work on fonts and unicode, but works on a few PDFs. + +Better to use Form XObjects for most things (see the example in rl1). + +''' +from inspect import getargspec + +import find_pdfrw +from pdfrw import PdfTokens +from pdfrw.pdfobjects import PdfString + +############################################################################# +# Graphics parsing + +def parse_array(self, token='[', params=None): + mylist = [] + for token in self.tokens: + if token == ']': + break + mylist.append(token) + self.params.append(mylist) + +def parse_savestate(self, token='q', params=''): + self.canv.saveState() + +def parse_restorestate(self, token='Q', params=''): + self.canv.restoreState() + +def parse_transform(self, token='cm', params='ffffff'): + self.canv.transform(*params) + +def parse_linewidth(self, token='w', params='f'): + self.canv.setLineWidth(*params) + +def parse_linecap(self, token='J', params='i'): + self.canv.setLineCap(*params) + +def parse_linejoin(self, token='j', params='i'): + self.canv.setLineJoin(*params) + +def parse_miterlimit(self, token='M', params='f'): + self.canv.setMiterLimit(*params) + +def parse_dash(self, token='d', params='as'): # Array, string + self.canv.setDash(*params) + +def parse_intent(self, token='ri', params='n'): + # TODO: add logging + pass + +def parse_flatness(self, token='i', params='i'): + # TODO: add logging + pass + +def parse_gstate(self, token='gs', params='n'): + # TODO: add logging + # Could parse stuff we care about from here later + pass + +def parse_move(self, token='m', params='ff'): + if self.gpath is None: + self.gpath = self.canv.beginPath() + self.gpath.moveTo(*params) + self.current_point = params + +def parse_line(self, token='l', params='ff'): + self.gpath.lineTo(*params) + self.current_point = params + +def parse_curve(self, token='c', params='ffffff'): + self.gpath.curveTo(*params) + self.current_point = params[-2:] + +def parse_curve1(self, token='v', params='ffff'): + parse_curve(self, token, tuple(self.current_point) + tuple(params)) + +def parse_curve2(self, token='y', params='ffff'): + parse_curve(self, token, tuple(params) + tuple(params[-2:])) + +def parse_close(self, token='h', params=''): + self.gpath.close() + +def parse_rect(self, token='re', params='ffff'): + if self.gpath is None: + self.gpath = self.canv.beginPath() + self.gpath.rect(*params) + self.current_point = params[-2:] + +def parse_stroke(self, token='S', params=''): + finish_path(self, 1, 0, 0) + +def parse_close_stroke(self, token='s', params=''): + self.gpath.close() + finish_path(self, 1, 0, 0) + +def parse_fill(self, token='f', params=''): + finish_path(self, 0, 1, 1) + +def parse_fill_compat(self, token='F', params=''): + finish_path(self, 0, 1, 1) + +def parse_fill_even_odd(self, token='f*', params=''): + finish_path(self, 0, 1, 0) + +def parse_fill_stroke_even_odd(self, token='B*', params=''): + finish_path(self, 1, 1, 0) + +def parse_fill_stroke(self, token='B', params=''): + finish_path(self, 1, 1, 1) + +def parse_close_fill_stroke_even_odd(self, token='b*', params=''): + self.gpath.close() + finish_path(self, 1, 1, 0) + +def parse_close_fill_stroke(self, token='b', params=''): + self.gpath.close() + finish_path(self, 1, 1, 1) + +def parse_nop(self, token='n', params=''): + finish_path(self, 0, 0, 0) + +def finish_path(self, stroke, fill, fillmode): + if self.gpath is not None: + canv = self.canv + canv._fillMode, oldmode = fillmode, canv._fillMode + canv.drawPath(self.gpath, stroke, fill) + canv._fillMode = oldmode + self.gpath = None + +def parse_clip_path(self, token='W', params=''): + # TODO: add logging + pass + +def parse_clip_path_even_odd(self, token='W*', params=''): + # TODO: add logging + pass + +def parse_stroke_gray(self, token='G', params='f'): + self.canv.setStrokeGray(*params) + +def parse_fill_gray(self, token='g', params='f'): + self.canv.setFillGray(*params) + +def parse_stroke_rgb(self, token='RG', params='fff'): + self.canv.setStrokeColorRGB(*params) + +def parse_fill_rgb(self, token='rg', params='fff'): + self.canv.setFillColorRGB(*params) + +def parse_stroke_cmyk(self, token='K', params='ffff'): + self.canv.setStrokeColorCMYK(*params) + +def parse_fill_cmyk(self, token='k', params='ffff'): + self.canv.setFillColorCMYK(*params) + +############################################################################# +# Text parsing + +def parse_begin_text(self, token='BT', params=''): + assert self.tpath is None + self.tpath = self.canv.beginText() + +def parse_text_transform(self, token='Tm', params='ffffff'): + path = self.tpath + + # Stoopid optimization to remove nop + try: + code = path._code + except AttributeError: + pass + else: + if code[-1] == '1 0 0 1 0 0 Tm': + code.pop() + + path.setTextTransform(*params) + +def parse_setfont(self, token='Tf', params='nf'): + fontinfo = self.fontdict[params[0]] + self.tpath._setFont(fontinfo.name, params[1]) + self.curfont = fontinfo + +def parse_text_out(self, token='Tj', params='t'): + text = params[0].decode(self.curfont.remap, self.curfont.twobyte) + self.tpath.textOut(text) + +def parse_TJ(self, token='TJ', params='a'): + remap = self.curfont.remap + twobyte = self.curfont.twobyte + result = [] + for x in params[0]: + if isinstance(x, PdfString): + result.append(x.decode(remap, twobyte)) + else: + # TODO: Adjust spacing between characters here + int(x) + text = ''.join(result) + self.tpath.textOut(text) + +def parse_end_text(self, token='ET', params=''): + assert self.tpath is not None + self.canv.drawText(self.tpath) + self.tpath=None + +def parse_move_cursor(self, token='Td', params='ff'): + self.tpath.moveCursor(params[0], -params[1]) + +def parse_set_leading(self, token='TL', params='f'): + self.tpath.setLeading(*params) + +def parse_text_line(self, token='T*', params=''): + self.tpath.textLine() + +def parse_set_char_space(self, token='Tc', params='f'): + self.tpath.setCharSpace(*params) + +def parse_set_word_space(self, token='Tw', params='f'): + self.tpath.setWordSpace(*params) + +def parse_set_hscale(self, token='Tz', params='f'): + self.tpath.setHorizScale(params[0] - 100) + +def parse_set_rise(self, token='Ts', params='f'): + self.tpath.setRise(*params) + +def parse_xobject(self, token='Do', params='n'): + # TODO: Need to do this + pass + +class FontInfo(object): + ''' Pretty basic -- needs a lot of work to work right for all fonts + ''' + lookup = { + 'BitstreamVeraSans' : 'Helvetica', # WRONG -- have to learn about font stuff... + } + + def __init__(self, source): + name = source.BaseFont[1:] + self.name = self.lookup.get(name, name) + self.remap = chr + self.twobyte = False + info = source.ToUnicode + if not info: + return + info = info.stream.split('beginbfchar')[1].split('endbfchar')[0] + info = list(PdfTokens(info)) + assert not len(info) & 1 + info2 = [] + for x in info: + assert x[0] == '<' and x[-1] == '>' and len(x) in (4,6), x + i = int(x[1:-1], 16) + info2.append(i) + self.remap = dict((x,chr(y)) for (x,y) in zip(info2[::2], info2[1::2])).get + self.twobyte = len(info[0]) > 4 + +############################################################################# +# Control structures + +def findparsefuncs(): + def checkname(n): + assert n.startswith('/') + return n + + def checkarray(a): + assert isinstance(a, list), a + return a + + def checktext(t): + assert isinstance(t, PdfString) + return t + + fixparam = dict(f=float, i=int, n=checkname, a=checkarray, s=str, t=checktext) + fixcache = {} + def fixlist(params): + try: + result = fixcache[params] + except KeyError: + result = tuple(fixparam[x] for x in params) + fixcache[params] = result + return result + + dispatch = {} + expected_args = 'self token params'.split() + for key, func in globals().iteritems(): + if key.startswith('parse_'): + args, varargs, keywords, defaults = getargspec(func) + assert args == expected_args and varargs is None \ + and keywords is None and len(defaults) == 2, \ + (key, args, varargs, keywords, defaults) + token, params = defaults + if params is not None: + params = fixlist(params) + value = func, params + assert dispatch.setdefault(token, value) is value, repr(token) + return dispatch + +class _ParseClass(object): + dispatch = findparsefuncs() + + @classmethod + def parsepage(cls, page, canvas=None): + self = cls() + contents = page.Contents + if contents.Filter is not None: + raise SystemExit('Cannot parse graphics -- page encoded with %s' % contents.Filter) + dispatch = cls.dispatch.get + self.tokens = tokens = iter(PdfTokens(contents.stream)) + self.params = params = [] + self.canv = canvas + self.gpath = None + self.tpath = None + self.fontdict = dict((x,FontInfo(y)) for (x, y) in page.Resources.Font.iteritems()) + + for token in self.tokens: + info = dispatch(token) + if info is None: + params.append(token) + continue + func, paraminfo = info + if paraminfo is None: + func(self, token, ()) + continue + delta = len(params) - len(paraminfo) + if delta: + if delta < 0: + print 'Operator %s expected %s parameters, got %s' % (token, len(paraminfo), params) + params[:] = [] + continue + else: + print "Unparsed parameters/commands:", params[:delta] + del params[:delta] + paraminfo = zip(paraminfo, params) + try: + params[:] = [x(y) for (x,y) in paraminfo] + except: + for i, (x,y) in enumerate(paraminfo): + try: + x(y) + except: + raise # For now + continue + func(self, token, params) + params[:] = [] + +def debugparser(undisturbed = set('parse_array'.split())): + def debugdispatch(): + def getvalue(oldval): + name = oldval[0].__name__ + def myfunc(self, token, params): + print '%s called %s(%s)' % (token, name, ', '.join(str(x) for x in params)) + if name in undisturbed: + myfunc = oldval[0] + return myfunc, oldval[1] + return dict((x, getvalue(y)) for (x,y) in _ParseClass.dispatch.iteritems()) + + class _DebugParse(_ParseClass): + dispatch = debugdispatch() + + return _DebugParse.parsepage + +parsepage = _ParseClass.parsepage + +if __name__ == '__main__': + import sys + from pdfreader import PdfReader + parse = debugparser() + fname, = sys.argv[1:] + pdf = PdfReader(fname) + for i, page in enumerate(pdf.pages): + print '\nPage %s ------------------------------------' % i + parse(page) |