summaryrefslogtreecommitdiff
path: root/morfologik-fsa/src/main/java/morfologik/fsa/FSABuilder.java
blob: 0cf7cc0585c42be94f4c9a077b07046840ca1ee8 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
package morfologik.fsa;

import java.util.*;

import morfologik.util.Arrays;

import static morfologik.fsa.ConstantArcSizeFSA.*;

/**
 * Fast, memory-conservative finite state automaton builder, returning a
 * byte-serialized {@link ConstantArcSizeFSA} (a tradeoff between construction
 * speed and memory consumption).
 */
public final class FSABuilder {
    /**
     * Debug and information constants.
     * 
     * @see FSABuilder#getInfo()
     */
    public enum InfoEntry {
        SERIALIZATION_BUFFER_SIZE("Serialization buffer size"),
        SERIALIZATION_BUFFER_REALLOCATIONS("Serialization buffer reallocs"),
        CONSTANT_ARC_AUTOMATON_SIZE("Constant arc FSA size"),
        MAX_ACTIVE_PATH_LENGTH("Max active path"),
        STATE_REGISTRY_TABLE_SLOTS("Registry hash slots"),
        STATE_REGISTRY_SIZE("Registry hash entries"),
        ESTIMATED_MEMORY_CONSUMPTION_MB("Estimated mem consumption (MB)");

        private final String stringified;
        
        InfoEntry(String stringified) {
            this.stringified = stringified;
        }

        @Override
        public String toString() {
            return stringified;
        }
    }

    /** A megabyte. */
    private final static int MB = 1024 * 1024;

    /**
     * Internal serialized FSA buffer expand ratio.
     */
    private final static int BUFFER_GROWTH_SIZE = 5 * MB;

    /**
     * Maximum number of labels from a single state.
     */
    private final static int MAX_LABELS = 256;

    /**
     * Comparator comparing full byte arrays consistently with 
     * {@link #compare(byte[], int, int, byte[], int, int)}.
     */
    public static final Comparator<byte[]> LEXICAL_ORDERING = new Comparator<byte[]>() {
        public int compare(byte[] o1, byte[] o2) {
            return FSABuilder.compare(o1, 0, o1.length, o2, 0, o2.length);
        }
    };

    /**
     * Internal serialized FSA buffer expand ratio.
     */
    private final int bufferGrowthSize;

    /**
     * Holds serialized and mutable states. Each state is a sequential list of
     * arcs, the last arc is marked with {@link #BIT_ARC_LAST}.
     */
    private byte[] serialized = new byte[0];

    /**
     * Number of bytes already taken in {@link #serialized}. Start from 1 to
     * keep 0 a sentinel value (for the hash set and final state).
     */
    private int size;

    /**
     * States on the "active path" (still mutable). Values are addresses of each
     * state's first arc.
     */
    private int[] activePath = new int[0];

    /**
     * Current length of the active path.
     */
    private int activePathLen;

    /**
     * The next offset at which an arc will be added to the given state on
     * {@link #activePath}.
     */
    private int[] nextArcOffset = new int[0];

    /**
     * Root state. If negative, the automaton has been built already and cannot be extended.
     */
    private int root;
    
    /**
     * An epsilon state. The first and only arc of this state points either 
     * to the root or to the terminal state, indicating an empty automaton. 
     */
    private int epsilon;

    /**
     * Hash set of state addresses in {@link #serialized}, hashed by
     * {@link #hash(int, int)}. Zero reserved for an unoccupied slot.
     */
    private int[] hashSet = new int[2];
    
    /**
     * Number of entries currently stored in {@link #hashSet}.
     */
    private int hashSize = 0;

    /**
     * Previous sequence added to the automaton in {@link #add(byte[], int, int)}. Used in assertions only.
     */
    private byte [] previous;

    /**
     * Information about the automaton and its compilation.
     */
    private TreeMap<InfoEntry, Object> info; 

    /**
     * {@link #previous} sequence's length, used in assertions only.
     */
    private int previousLength;

    /** */
    public FSABuilder() {
        this(BUFFER_GROWTH_SIZE);
    }
    
    /** */
    public FSABuilder(int bufferGrowthSize) {
        this.bufferGrowthSize = Math.max(bufferGrowthSize, ARC_SIZE * MAX_LABELS);

        // Allocate epsilon state.
        epsilon = allocateState(1);
        serialized[epsilon + FLAGS_OFFSET] |= BIT_ARC_LAST;

        // Allocate root, with an initial empty set of output arcs.
        expandActivePath(1);
        root = activePath[0];
    }

    /**
     * Add a single sequence of bytes to the FSA. The input must be lexicographically greater
     * than any previously added sequence.
     */
    public void add(byte[] sequence, int start, int len) {
        assert serialized != null : "Automaton already built.";
        assert previous == null || len == 0 || compare(previous, 0, previousLength, sequence, start, len) <= 0 : 
            "Input must be sorted: " 
                + Arrays.toString(previous, 0, previousLength) + " >= " 
                + Arrays.toString(sequence, start, len);
        assert setPrevious(sequence, start, len);
    
        // Determine common prefix length.
        final int commonPrefix = commonPrefix(sequence, start, len);
    
        // Make room for extra states on active path, if needed.
        expandActivePath(len);
    
        // Freeze all the states after the common prefix.
        for (int i = activePathLen - 1; i > commonPrefix; i--) {
            final int frozenState = freezeState(i);
            setArcTarget(nextArcOffset[i - 1] - ARC_SIZE, frozenState);
            nextArcOffset[i] = activePath[i];
        }

        // Create arcs to new suffix states.
        for (int i = commonPrefix + 1, j = start + commonPrefix; i <= len; i++) {
            final int p = nextArcOffset[i - 1];
    
            serialized[p + FLAGS_OFFSET] = (byte) (i == len ? BIT_ARC_FINAL : 0);
            serialized[p + LABEL_OFFSET] = sequence[j++];
            setArcTarget(p, i == len ? TERMINAL_STATE : activePath[i]);
    
            nextArcOffset[i - 1] = p + ARC_SIZE;
        }

        // Save last sequence's length so that we don't need to calculate it again.
        this.activePathLen = len;
    }

    /** Number of serialization buffer reallocations. */
    private int serializationBufferReallocations;
    
    /**
     * Complete the automaton.
     */
    public FSA complete() {
        add(new byte[0], 0, 0);
    
        if (nextArcOffset[0] - activePath[0] == 0) {
            // An empty FSA.
            setArcTarget(epsilon, TERMINAL_STATE);
        } else {
            // An automaton with at least a single arc from root.
            root = freezeState(0);
            setArcTarget(epsilon, root);
        }

        info = new TreeMap<InfoEntry, Object>();
        info.put(InfoEntry.SERIALIZATION_BUFFER_SIZE, serialized.length);
        info.put(InfoEntry.SERIALIZATION_BUFFER_REALLOCATIONS, serializationBufferReallocations);
        info.put(InfoEntry.CONSTANT_ARC_AUTOMATON_SIZE, size);
        info.put(InfoEntry.MAX_ACTIVE_PATH_LENGTH, activePath.length);
        info.put(InfoEntry.STATE_REGISTRY_TABLE_SLOTS, hashSet.length);
        info.put(InfoEntry.STATE_REGISTRY_SIZE, hashSize);
        info.put(InfoEntry.ESTIMATED_MEMORY_CONSUMPTION_MB, 
                (this.serialized.length + this.hashSet.length * 4) / (double) MB);

        final FSA fsa = new ConstantArcSizeFSA(java.util.Arrays.copyOf(this.serialized, this.size), epsilon);
        this.serialized = null;
        this.hashSet = null;
        return fsa;
    }

    /**
     * Build a minimal, deterministic automaton from a sorted list of byte sequences.
     */
    public static FSA build(byte[][] input) {
        final FSABuilder builder = new FSABuilder(); 
    
        for (byte [] chs : input)
            builder.add(chs, 0, chs.length);
    
        return builder.complete();
    }

    /**
     * Build a minimal, deterministic automaton from an iterable list of byte sequences.
     */
    public static FSA build(Iterable<byte[]> input) {
        final FSABuilder builder = new FSABuilder(); 
    
        for (byte [] chs : input)
            builder.add(chs, 0, chs.length);
    
        return builder.complete();
    }
    
    /**
     * Return various statistics concerning the FSA and its compilation.
     */
    public Map<InfoEntry, Object> getInfo() {
        return info;
    }
    
    /** Is this arc the state's last? */
    private boolean isArcLast(int arc) {
        return (serialized[arc + FLAGS_OFFSET] & BIT_ARC_LAST) != 0;
    }

    /** Is this arc final? */
    private boolean isArcFinal(int arc) {
        return (serialized[arc + FLAGS_OFFSET] & BIT_ARC_FINAL) != 0;
    }

    /** Get label's arc. */
    private byte getArcLabel(int arc) {
        return serialized[arc + LABEL_OFFSET];
    }

    /**
     * Fills the target state address of an arc.
     */
    private void setArcTarget(int arc, int state) {
        arc += ADDRESS_OFFSET + TARGET_ADDRESS_SIZE;
        for (int i = 0; i < TARGET_ADDRESS_SIZE; i++) {
            serialized[--arc] = (byte) state;
            state >>>= 8;
        }
    }

    /**
     * Returns the address of an arc.
     */
    private int getArcTarget(int arc) {
        arc += ADDRESS_OFFSET;
        return (serialized[arc]) << 24 | 
               (serialized[arc + 1] & 0xff) << 16 | 
               (serialized[arc + 2] & 0xff) << 8 |
               (serialized[arc + 3] & 0xff);
    }

    /**
     * @return The number of common prefix characters with the previous
     *         sequence.
     */
    private int commonPrefix(byte[] sequence, int start, int len) {
        // Empty root state case.
        final int max = Math.min(len, activePathLen);
        int i;
        for (i = 0; i < max; i++) {
            final int lastArc = nextArcOffset[i] - ARC_SIZE;
            if (sequence[start++] != getArcLabel(lastArc)) {
                break;
            }
        }

        return i;
    }

    /**
     * Freeze a state: try to find an equivalent state in the interned states
     * dictionary first, if found, return it, otherwise, serialize the mutable
     * state at <code>activePathIndex</code> and return it.
     */
    private int freezeState(final int activePathIndex) {
        final int start = activePath[activePathIndex];
        final int end = nextArcOffset[activePathIndex];
        final int len = end - start;

        // Set the last arc flag on the current active path's state.
        serialized[end - ARC_SIZE + FLAGS_OFFSET] |= BIT_ARC_LAST;

        // Try to locate a state with an identical content in the hash set.
        final int bucketMask = (hashSet.length - 1);
        int slot = hash(start, len) & bucketMask;
        for (int i = 0;;) {
            int state = hashSet[slot];
            if (state == 0) {
                state = hashSet[slot] = serialize(activePathIndex);
                if (++hashSize > hashSet.length / 2)
                    expandAndRehash();
                return state;
            } else if (equivalent(state, start, len)) {
                return state;
            }

            slot = (slot + (++i)) & bucketMask;
        }
    }

    /**
     * Reallocate and rehash the hash set.
     */
    private void expandAndRehash() {
        final int[] newHashSet = new int[hashSet.length * 2];
        final int bucketMask = (newHashSet.length - 1);

        for (int j = 0; j < hashSet.length; j++) {
            final int state = hashSet[j];
            if (state > 0) {
                int slot = hash(state, stateLength(state)) & bucketMask;
                for (int i = 0; newHashSet[slot] > 0;) {
                    slot = (slot + (++i)) & bucketMask;
                }
                newHashSet[slot] = state;
            }
        }
        this.hashSet = newHashSet;
    }

    /**
     * The total length of the serialized state data (all arcs). 
     */
    private int stateLength(int state) {
        int arc = state;
        while (!isArcLast(arc)) {
            arc += ARC_SIZE;
        }
        return arc - state + ARC_SIZE;
    }

    /** Return <code>true</code> if two regions in {@link #serialized} are identical. */
    private boolean equivalent(int start1, int start2, int len) {
        if (start1 + len > size || start2 + len > size)
            return false;

        while (len-- > 0)
            if (serialized[start1++] != serialized[start2++])
                return false;

        return true;
    }

    /**
     * Serialize a given state on the active path.
     */
    private int serialize(final int activePathIndex) {
        expandBuffers();

        final int newState = size;
        final int start = activePath[activePathIndex];
        final int len = nextArcOffset[activePathIndex] - start;
        System.arraycopy(serialized, start, serialized, newState, len);

        size += len;
        return newState;
    }

    /**
     * Hash code of a fragment of {@link #serialized} array.
     */
    private int hash(int start, int byteCount) {
        assert byteCount % ARC_SIZE == 0 : "Not an arc multiply?";

        int h = 0;
        for (int arcs = byteCount / ARC_SIZE; --arcs >= 0; start += ARC_SIZE) {
            h = 17 * h + getArcLabel(start);
            h = 17 * h + getArcTarget(start);
            if (isArcFinal(start)) h += 17;
        }

        return h;
    }

    /**
     * Append a new mutable state to the active path.
     */
    private void expandActivePath(int size) {
        if (activePath.length < size) {
            final int p = activePath.length;
            activePath = java.util.Arrays.copyOf(activePath, size);
            nextArcOffset = java.util.Arrays.copyOf(nextArcOffset, size);

            for (int i = p; i < size; i++) {
                nextArcOffset[i] = activePath[i] = 
                    allocateState(/* assume max labels count */ MAX_LABELS);
            }
        }
    }

    /**
     * Expand internal buffers for the next state.
     */
    private void expandBuffers() {
        if (this.serialized.length < size + ARC_SIZE * MAX_LABELS) {
            serialized = java.util.Arrays.copyOf(serialized, serialized.length + bufferGrowthSize);
            serializationBufferReallocations++;
        }
    }

    /**
     * Allocate space for a state with the given number of outgoing labels.
     * 
     * @return state offset
     */
    private int allocateState(int labels) {
        expandBuffers();
        final int state = size;
        size += labels * ARC_SIZE;
        return state;
    }
    
    /**
     * Copy <code>current</code> into an internal buffer.
     */
    private boolean setPrevious(byte [] sequence, int start, int length) {
        if (previous == null || previous.length < length) {
            previous = new byte [length];
        }

        System.arraycopy(sequence, start, previous, 0, length);
        previousLength = length;
        return true;
    }
    
    /**
     * Lexicographic order of input sequences. By default, consistent with the "C" sort
     * (absolute value of bytes, 0-255).
     */
    public static int compare(byte [] s1, int start1, int lens1, 
                              byte [] s2, int start2, int lens2) {
        final int max = Math.min(lens1, lens2);

        for (int i = 0; i < max; i++) {
            final byte c1 = s1[start1++];
            final byte c2 = s2[start2++];
            if (c1 != c2)
                return (c1 & 0xff) - (c2 & 0xff);
        }

        return lens1 - lens2;
    }
}