diff options
Diffstat (limited to 'runtime/Java/src/org/antlr/v4/runtime/atn/ATNDeserializer.java')
-rw-r--r-- | runtime/Java/src/org/antlr/v4/runtime/atn/ATNDeserializer.java | 117 |
1 files changed, 99 insertions, 18 deletions
diff --git a/runtime/Java/src/org/antlr/v4/runtime/atn/ATNDeserializer.java b/runtime/Java/src/org/antlr/v4/runtime/atn/ATNDeserializer.java index 9821b8b..0ebcdeb 100644 --- a/runtime/Java/src/org/antlr/v4/runtime/atn/ATNDeserializer.java +++ b/runtime/Java/src/org/antlr/v4/runtime/atn/ATNDeserializer.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012-2016 The ANTLR Project. All rights reserved. + * Copyright (c) 2012-2017 The ANTLR Project. All rights reserved. * Use of this file is governed by the BSD 3-clause license that * can be found in the LICENSE.txt file in the project root. */ @@ -45,6 +45,12 @@ public class ATNDeserializer { */ private static final UUID ADDED_LEXER_ACTIONS; /** + * This UUID indicates the serialized ATN contains two sets of + * IntervalSets, where the second set's values are encoded as + * 32-bit integers to support the full Unicode SMP range up to U+10FFFF. + */ + private static final UUID ADDED_UNICODE_SMP; + /** * This list contains all of the currently supported UUIDs, ordered by when * the feature first appeared in this branch. 
*/ @@ -61,15 +67,59 @@ public class ATNDeserializer { BASE_SERIALIZED_UUID = UUID.fromString("33761B2D-78BB-4A43-8B0B-4F5BEE8AACF3"); ADDED_PRECEDENCE_TRANSITIONS = UUID.fromString("1DA0C57D-6C06-438A-9B27-10BCB3CE0F61"); ADDED_LEXER_ACTIONS = UUID.fromString("AADB8D7E-AEEF-4415-AD2B-8204D6CF042E"); + ADDED_UNICODE_SMP = UUID.fromString("59627784-3BE5-417A-B9EB-8131A7286089"); SUPPORTED_UUIDS = new ArrayList<UUID>(); SUPPORTED_UUIDS.add(BASE_SERIALIZED_UUID); SUPPORTED_UUIDS.add(ADDED_PRECEDENCE_TRANSITIONS); SUPPORTED_UUIDS.add(ADDED_LEXER_ACTIONS); + SUPPORTED_UUIDS.add(ADDED_UNICODE_SMP); + + SERIALIZED_UUID = ADDED_UNICODE_SMP; + } + + interface UnicodeDeserializer { + // Wrapper for toInt() or toInt32() + int readUnicode(char[] data, int p); + + // Work around Java not allowing mutation of captured variables + // by returning amount by which to increment p after each read + int size(); + } - SERIALIZED_UUID = ADDED_LEXER_ACTIONS; + enum UnicodeDeserializingMode { + UNICODE_BMP, + UNICODE_SMP } + static UnicodeDeserializer getUnicodeDeserializer(UnicodeDeserializingMode mode) { + if (mode == UnicodeDeserializingMode.UNICODE_BMP) { + return new UnicodeDeserializer() { + @Override + public int readUnicode(char[] data, int p) { + return toInt(data[p]); + } + + @Override + public int size() { + return 1; + } + }; + } + else { + return new UnicodeDeserializer() { + @Override + public int readUnicode(char[] data, int p) { + return toInt32(data, p); + } + + @Override + public int size() { + return 2; + } + }; + } + } private final ATNDeserializationOptions deserializationOptions; @@ -98,7 +148,7 @@ public class ATNDeserializer { * serialized ATN at or after the feature identified by {@code feature} was * introduced; otherwise, {@code false}. 
*/ - protected boolean isFeatureSupported(UUID feature, UUID actualUuid) { + static protected boolean isFeatureSupported(UUID feature, UUID actualUuid) { int featureIndex = SUPPORTED_UUIDS.indexOf(feature); if (featureIndex < 0) { return false; @@ -110,7 +160,22 @@ public class ATNDeserializer { @SuppressWarnings("deprecation") public ATN deserialize(char[] data) { data = data.clone(); - // don't adjust the first value since that's the version number + + // Each char value in data is shifted by +2 at the entry to this method. + // This is an encoding optimization targeting the serialized values 0 + // and -1 (serialized to 0xFFFF), each of which is very common in the + // serialized form of the ATN. In the modified UTF-8 that Java uses for + // compiled string literals, these two character values have multi-byte + // forms. By shifting each value by +2, they become characters 2 and 1 + // prior to writing the string, each of which has a single-byte + // representation. Since the shift occurs in the tool during ATN + // serialization, each target is responsible for adjusting the values + // during deserialization. + // + // As a special case, note that the first element of data is not + // adjusted because it contains the major version number of the + // serialized ATN, which was fixed at 3 at the time the value shifting + // was implemented. for (int i = 1; i < data.length; i++) { data[i] = (char)(data[i] - 2); } @@ -243,22 +308,14 @@ public class ATNDeserializer { // SETS // List<IntervalSet> sets = new ArrayList<IntervalSet>(); - int nsets = toInt(data[p++]); - for (int i=0; i<nsets; i++) { - int nintervals = toInt(data[p]); - p++; - IntervalSet set = new IntervalSet(); - sets.add(set); - boolean containsEof = toInt(data[p++]) != 0; - if (containsEof) { - set.add(-1); - } + // First, read all sets with 16-bit Unicode code points <= U+FFFF. 
+ p = deserializeSets(data, p, sets, getUnicodeDeserializer(UnicodeDeserializingMode.UNICODE_BMP)); - for (int j=0; j<nintervals; j++) { - set.add(toInt(data[p]), toInt(data[p + 1])); - p += 2; - } + // Next, if the ATN was serialized with the Unicode SMP feature, + // deserialize sets with 32-bit arguments <= U+10FFFF. + if (isFeatureSupported(ADDED_UNICODE_SMP, uuid)) { + p = deserializeSets(data, p, sets, getUnicodeDeserializer(UnicodeDeserializingMode.UNICODE_SMP)); } // @@ -495,6 +552,30 @@ public class ATNDeserializer { return atn; } + private int deserializeSets(char[] data, int p, List<IntervalSet> sets, UnicodeDeserializer unicodeDeserializer) { + int nsets = toInt(data[p++]); + for (int i=0; i<nsets; i++) { + int nintervals = toInt(data[p]); + p++; + IntervalSet set = new IntervalSet(); + sets.add(set); + + boolean containsEof = toInt(data[p++]) != 0; + if (containsEof) { + set.add(-1); + } + + for (int j=0; j<nintervals; j++) { + int a = unicodeDeserializer.readUnicode(data, p); + p += unicodeDeserializer.size(); + int b = unicodeDeserializer.readUnicode(data, p); + p += unicodeDeserializer.size(); + set.add(a, b); + } + } + return p; + } + /** * Analyze the {@link StarLoopEntryState} states in the specified ATN to set * the {@link StarLoopEntryState#isPrecedenceDecision} field to the |