path: root/utils/ofx2xlsx.py
blob: 32b19c65dbb0932b33b39272cc69ef948bac6c89
import argparse

import pandas as pd
from ofxparse import OfxParser

# TODO: automatically extract these field names from the transactions
fields = ['id', 'type', 'date', 'memo', 'payee', 'amount', 'checknum', 'mcc']
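# Each name is an attribute of the transaction objects that ofxparse
# returns (e.g. transaction.amount, transaction.payee); see the
# getattr() call below.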

parser = argparse.ArgumentParser(
    description='Convert multiple .qfx or .ofx files to .xlsx.\n'
                'Removes duplicate transactions that appear in different'
                ' files.\n'
                'Uses fixed columns: %s' % ', '.join(fields))
parser.add_argument('files', metavar='*.ofx *.qfx', type=str, nargs='+',
                    help='.qfx or .ofx file names')
parser.add_argument('--start', type=str, metavar='2014-01-01',
                    default='2014-01-01',
                    help="Don't take transactions before this date")
parser.add_argument('--end', type=str, metavar='2014-12-31',
                    default='2014-12-31',
                    help="Don't take transactions after this date")
parser.add_argument('--output', metavar='output.xlsx', type=str,
                    default='output.xlsx', help='Where to store the .xlsx')
parser.add_argument('--id-length', metavar='24', type=int, default=24,
                    help='Truncate a transaction ID to this many characters.'
                         ' This matters because this program removes'
                         ' transactions with duplicate IDs (after verifying'
                         ' that they are identical).'
                         ' If you feel unsafe, use a large number, but'
                         ' usually the last digits of a transaction ID are'
                         ' running numbers that change from download to'
                         ' download; as a result you will end up with'
                         ' duplicate transactions unless you truncate the ID.')


args = parser.parse_args()
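
# Example invocation (file names are hypothetical):
#   python ofx2xlsx.py jan.qfx feb.qfx \
#       --start 2014-01-01 --end 2014-06-30 --output checking.xlsx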


# Map each account number to a DataFrame of its transactions from all files.
data = {}
for fname in args.files:
    with open(fname, 'rb') as f:
        ofx = OfxParser.parse(f)
    for account in ofx.accounts:
        df = data.get(account.number, pd.DataFrame(columns=fields + ['fname']))
        for transaction in account.statement.transactions:
            s = pd.Series([getattr(transaction, f) for f in fields],
                          index=fields)
            s['fname'] = fname.split('/')[-1]
            df = pd.concat([df, s.to_frame().T], ignore_index=True)
        # Clip the last part of the ID, which changes from download to
        # download; see --id-length above.
        df['id'] = df['id'].str[:args.id_length]
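        # E.g. (hypothetical IDs): '20140301123456-0001' and
        # '20140301123456-0002' from two downloads can name the same
        # transaction; truncation keeps only the stable prefix.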
        data[account.number] = df

print "Writing result to", args.output
writer = pd.ExcelWriter(args.output)

for account_number, df in data.items():
    # A transaction is identified by the full set of `fields`.
    # Collapse repeated transactions from the same file into one row and
    # record how many times each repeated in a `samedayrepeat` column.
    df_count = df.groupby(fields + ['fname']).size()
    df_count = df_count.reset_index()
    df_count.columns = list(df_count.columns[:-1]) + ['samedayrepeat']
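    # Toy illustration of the collapse (made-up values): two identical
    # $4.50 charges on 2014-03-01 in jan.qfx become one row with
    # samedayrepeat == 2; a one-off charge gets samedayrepeat == 1.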

    # Two transactions from the same file are always distinct, but the same
    # transaction can appear in several files if their date ranges overlap.
    # Verify that a transaction has the same samedayrepeat in every file.
    df_size_fname_count = df_count.groupby(fields).samedayrepeat.nunique()
    assert (df_size_fname_count == 1).all(), \
        "Different samedayrepeat in different files"

    # Keep one file's copy of each transaction as the representative.
    df1 = df_count.groupby(fields + ['samedayrepeat']).first()
    df1 = df1.reset_index()

    # Expand the collapsed transactions back out by duplicating each row
    # according to its samedayrepeat count, as illustrated below.
    df2 = df1.copy()
    for i in range(2, int(df1.samedayrepeat.max()) + 1):
        df2 = pd.concat([df2, df1[i <= df1.samedayrepeat]])
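    # E.g. (made-up count): a row with samedayrepeat == 3 survives the
    # i == 2 and i == 3 passes, so df2 ends up holding it 3 times.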

    # Sort by date.
    df2 = df2.reset_index().set_index('date').sort_index()
    # Keep only transactions within the requested date range.
    df2 = df2.loc[args.start:args.end]

    # Clean up: keep only the output columns, in their fixed order.
    df2 = df2.reset_index()[fields]

    df2.to_excel(writer, sheet_name=account_number, index=False)

writer.close()