summaryrefslogtreecommitdiff
path: root/misc/packer.c
blob: ffd89fa24ad8b9c306baccbb83e91748b1da7c7f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
/* packer -- a program to pack a list of files into one */

/* The input is a file with one line per file to be packed.
   Each line starts with either 'a' or 'b' for ascii or binary.
   The second character is a space or tab (it's ignored).
   The third character to the end of the line is the file to be packed.

   The encoding is as follows:
   A file starts with '!' for ascii or '#' for binary in column one.
   The remainder of the line is a file name/path.
   Ascii files have a special character in column 1:
       'A'-'~' indicates 0 to 61 leading blanks; any further leading blanks
           are copied literally.  Leading tabs are no longer converted to
           blanks; they are encoded like any other control character ($I).
     '0'-'9' indicates 0 to 9 blank lines (always followed by an additional newline).
   Otherwise, all characters are just copied to output except:
   '$' is an escape character:
        $$ indicates $,
        $@ through $_ indicate 0x00 through 0x1F,
        $\n indicates an empty string (useful for avoiding long lines on output)
        After $\n, the following character is not treated specially even though
        it is in column 1.

   Should we be doing more to compress files?  It looks like the special
   handling of leading blanks compresses files about 4%.  This is not much,
   but the encoding allows us to put markers (! and #) in column 1 to
   separate files. Originally, leading blank handling also converted between
   8 and 4 character tab stops, but you can no longer assume tab stops under,
   say, Windows, are 8 characters wide. Source files should not have
   tabs.

   Further simple encoding such as run-length encoding and word
   substitution doesn't buy too much and was deemed not worth the effort.
   Run-length encoding seems to buy another couple of percent.
   Substitution for common words like int, print, return, the, register,
   etc. buys maybe .5% per word, but it seems unlikely this will buy
   more than a total of 10%, so we're looking at a max of 15% to 20%
   compression without starting to huffman encode at the bit level.

   For binary files, every 3 bytes are used to form a 24-bit number which is
   split into 4 fields of 6 bits.  Each field is encoded by adding ascii '0'.
   If only one or two bytes are left at the end of the file, the encoding is
   as if zeros were appended to the file, but only 2 or 3 ascii characters
   (instead of the usual 4) are output.  The binary file encoding is terminated
   with a period ('.').  Newlines are inserted to keep line lengths down but
   should be ignored by the reader.
 */


#include "switches.h"
#include "stdlib.h"
#include "string.h"
#include "cext.h"
#include "convert.h"
/* since we aren't using the cleanup package, expose exit(): */
#undef exit

#include "stdio.h"
#ifdef MACINTOSH
#include "console.h"
#endif

/* end-of-string marker (the NUL terminator) */
#define EOS 0

/* size of the buffers in main() that hold one line of the input list
   and the converted file name */
#define string_max 500

/* forward declarations (old-style, no parameter lists); the definitions
   appear later in this file */
void pack_newline();
void pack_ascii();
void pack_binary();
void put_binary();

/* main -- pack a list of files */
/**/
/* Usage: packer [-p] input-list-file output-file
 *
 * Reads input-list-file one line at a time.  Each non-blank line must be
 * 'a' or 'b' in column 1, one separator character, then a file name.  The
 * named file is appended to output-file by pack_ascii() or pack_binary().
 * The name is written to the output as given; a copy passed through
 * convert() is the one actually opened.
 *
 * The -p option sets pauseflag (declared outside this file); PAUSE is
 * presumably a test of that flag -- confirm in the project headers.
 *
 * Returns 0 on success and 1 if the output file could not be written
 * completely.  Usage errors and unopenable list/output files exit(1).
 */
int main(argc, argv)
  int argc;
  char *argv[];
{
    FILE *inf;  /* input file: a list of file names to pack */
    FILE *outf; /* the packed output */
    char filename[string_max];  /* holds names of input files */
    char convname[string_max];  /* filename converted to local syntax */
    size_t len;                 /* length of the current list line */
    int base = 1;               /* index in argv of input-list-file */
    int status = 0;             /* value returned to the caller */
#ifdef MACINTOSH
    argc = ccommand(&argv);
#endif
    if (argc != 3 && argc != 4) {
        fprintf(stderr, "Usage: packer [-p] input-list-file output-file\n");
        exit(1);
    }
    if (argc == 4) {
        base = 2;
        if (strcmp(argv[1], "-p") == 0) {
            pauseflag = 1;
        } else {
            fprintf(stderr, "Expected \"-p\" as 1st argument.\n");
            exit(1);
        }
    }
    inf = fopen(argv[base], "r");
    if (!inf) {
        fprintf(stderr, "Couldn't open |%s|\n", argv[base]);
        exit(1);
    }
    outf = fopen(argv[base+1], "w");
    if (!outf) {
        fclose(inf);
        fprintf(stderr, "Couldn't open |%s|\n", argv[base + 1]);
        exit(1);
    }
    printf("Using tab width of %d\n", TAB_WIDTH);
    while (fgets(filename, string_max, inf)) {
        /* Strip the newline only if one is really there: the last line of
           the list may lack it, and chopping unconditionally would eat the
           final character of that file name. */
        len = strlen(filename);
        if (len > 0 && filename[len - 1] == '\n') {
            filename[--len] = EOS;
        }
        if (len == 0) continue;                 /* skip blank lines */
        puts(filename);
        if (filename[0] != 'a' && filename[0] != 'b') {
            fprintf(stderr, "Bad file spec (expecting a or b in col 1): %s\n",
                     filename);
            if (PAUSE) getchar();
            continue;
        }
        /* The name starts at the third character; shorter lines have none,
           and filename + 2 would point past the terminator. */
        if (len < 3) {
            fprintf(stderr, "Bad file spec (missing file name): %s\n",
                     filename);
            if (PAUSE) getchar();
            continue;
        }
        strcpy(convname, filename + 2);
        convert(convname);
        if (filename[0] == 'a') pack_ascii(filename + 2, convname, outf);
        else pack_binary(filename + 2, convname, outf);
    }
    /* Buffered write errors may only show up at close time, so check both
       the stream's error flag and the result of fclose(). */
    if (ferror(outf)) status = 1;
    if (fclose(outf) != 0) status = 1;
    if (status) {
        fprintf(stderr, "Error writing |%s|\n", argv[base + 1]);
    }
    fclose(inf);
    return status;
}


/* pack_ascii -- open filename and append its encoding to outf */
/**/
/* filename is the name written on the '!' header line; convname is the
 * name actually passed to fopen().  If the file cannot be opened, a message
 * is printed and nothing is written to outf.
 *
 * Encoding of the body:
 *   - the start of every line is handled by pack_newline();
 *   - '$' is written as "$$";
 *   - characters below 32 (other than newline) are written as '$'
 *     followed by '@' + c;
 *   - once more than 70 characters are on an output line, a "$\n"
 *     continuation is written;
 *   - a character above 127 is a fatal error (exit(1)).
 *
 * If the file does not end in a newline, one is added and a warning is
 * printed.
 */
void pack_ascii(filename, convname, outf)
  char *filename;
  char *convname;
  FILE *outf;
{
    int line_len = 0;   /* characters written on the current output line */
    int mid_line;       /* nonzero while the current source line has
                           content that no newline has terminated yet */
    int c;
    FILE *inf;

    inf = fopen(convname, "r");
    /* printf("opened %lx\n", inf); */
    if (!inf) {
        fprintf(stderr, "Couldn't open |%s| - skipped\n", convname);
        if (PAUSE) getchar();
        return;
    }
    fprintf(outf, "!%s\n", filename);

    pack_newline(inf, outf, &line_len);
    /* pack_newline() leaves line_len nonzero only if it wrote a leading
       blank-count character for a new line */
    mid_line = (line_len != 0);
    while ((c = getc(inf)) != EOF) {
        if (c > 127) {
            fprintf(stderr, "non-ascii char 0x%x in %s.\n", c, convname);
            exit(1);
        } else if (c == '\n') {
            putc(c, outf);
            line_len = 0;
            pack_newline(inf, outf, &line_len);
            mid_line = (line_len != 0);
        } else {
            mid_line = 1;
            if (c == '$') {
                putc('$', outf);
                putc('$', outf);
                line_len += 2;
            } else if (c < 32) {
                putc('$', outf);
                putc('@' + c, outf);
                line_len += 2;
            } else {
                putc(c, outf);
                line_len++;
            }
        }
        if (line_len > 70) {
            /* continuation: resets the output column but does not end
               the source line, so mid_line is left alone */
            putc('$', outf);
            putc('\n', outf);
            line_len = 0;
        }
    }
    /* Test mid_line rather than line_len: line_len is also zero right
       after a "$\n" continuation.  A file ending there would otherwise
       finish with "$\n", and the next file's column-1 marker would be
       read as ordinary text. */
    if (mid_line) {
        fprintf(stderr, "missing newline added to the end of %s\n", convname);
        putc('\n', outf);
        if (PAUSE) getchar();
    }
    /* printf("closing %lx\n", inf); */
    fclose(inf);
}


/* pack_binary -- open binary filename and append its encoding to outf */
/**/
/* filename is the name written on the '#' header line; convname is the
 * name actually passed to fopen() (in "rb" mode).  If the file cannot be
 * opened, a message is printed and nothing is written to outf.
 *
 * Every 3 input bytes become 4 output characters via put_binary().  A
 * newline is written once 70 or more characters are on the output line.
 * A trailing group of 1 or 2 bytes is zero-padded to 24 bits and written
 * as 2 or 3 characters.  The data always ends with ".\n".  A warning is
 * printed if no byte above 127 was seen.
 */
void pack_binary(filename, convname, outf)
  char *filename;
  char *convname;
  FILE *outf;
{
    FILE *src;
    long acc = 0;           /* bytes collected for the current group */
    int pending = 0;        /* number of bytes held in acc */
    int col = 0;            /* characters on the current output line */
    int byte;
    boolean isbinary = false;

    src = fopen(convname, "rb");
    /* printf("opened %lx\n", src); */
    if (!src) {
        fprintf(stderr, "Couldn't open |%s| - skipped\n", convname);
        if (PAUSE) getchar();
        return;
    }
    fprintf(outf, "#%s\n", filename);

    for (;;) {
        byte = getc(src);
        if (byte == EOF) break;
        if (byte > 127) isbinary = true;
        acc = (acc << 8) | byte;
        pending++;
        if (pending < 3) continue;

        /* a full 24-bit group is ready */
        put_binary(acc, outf);
        acc = 0;
        pending = 0;
        col += 4;
        if (col >= 70) {
            putc('\n', outf);
            col = 0;
        }
    }

    /* leftover bytes: pad with zero bytes to 24 bits, write pending + 1
       characters */
    if (pending > 0) {
        acc = acc << (8 * (3 - pending));
        putc('0' + ((acc >> 18) & 0x3F), outf);
        putc('0' + ((acc >> 12) & 0x3F), outf);
        if (pending == 2) {
            putc('0' + ((acc >> 6) & 0x3F), outf);
        }
    }
    putc('.', outf);
    putc('\n', outf);
    if (!isbinary) {
        fprintf(stderr, "%s seems to be an ascii file.\n", convname);
        if (PAUSE) getchar();
    }
    /* printf("closing %lx\n", src); */
    fclose(src);
}


/* pack_newline -- newline sequence encoding to outf */
/**/
/* Called at the start of a source line.  Consumes any run of newlines
 * followed by any run of blanks from inf and writes their encoding:
 *   - each group of 10 newlines becomes "9\n"; a remainder of k (1..9)
 *     becomes the digit k-1 followed by a newline;
 *   - leading blanks become one character 'A' + count, capped at '~',
 *     with any blanks beyond the cap written literally.
 * The first character that is neither is pushed back onto inf.  Nothing
 * is written for the blank part when end of file follows with no blanks.
 *
 * *line_len is reset to 0 whenever a newline is written, and incremented
 * for every character written after it.
 */
void pack_newline(inf, outf, line_len)
  FILE *inf;    /* input file */
  FILE *outf;   /* where to write output */
  int *line_len;
{
    int ch;
    int newlines = 0;
    int blanks = 0;
    int lead;       /* number of blanks carried by the column-1 character */

    /* count the run of newlines; ch ends up holding the first
       non-newline character (or EOF) */
    for (ch = getc(inf); ch == '\n'; ch = getc(inf)) {
        newlines++;
    }
    for (; newlines >= 10; newlines -= 10) {
        fprintf(outf, "9\n");
        *line_len = 0;
    }
    if (newlines > 0) {
        fprintf(outf, "%c\n", '0' + newlines - 1);
        *line_len = 0;
    }

    /* now run-length encode leading blanks (tabs are not converted) */
    for (; ch == ' '; ch = getc(inf)) {
        blanks++;
    }
    if (ch == EOF && blanks == 0) {
        return;     /* nothing more in the file */
    }

    lead = blanks;
    if (lead > '~' - 'A') lead = '~' - 'A';
    putc('A' + lead, outf);
    *line_len += 1;
    for (blanks -= lead; blanks > 0; blanks--) {
        putc(' ', outf);
        *line_len += 1;
    }

    /* leave the rest of the line for the caller */
    if (ch != EOF) ungetc(ch, inf);
}


/* put_binary -- write 3 binary bytes as 4 ascii bytes */
/**/
/* The low 24 bits of data are split into four 6-bit fields, most
 * significant first; each field is written to outf as '0' plus its value.
 */
void put_binary(data, outf)
  long data;
  FILE *outf;
{
    int shift;

    for (shift = 18; shift >= 0; shift -= 6) {
        putc('0' + (int) ((data >> shift) & 0x3F), outf);
    }
}