1 files changed, 99 insertions, 18 deletions
diff --git a/runtime/Java/src/org/antlr/v4/runtime/atn/ATNDeserializer.java b/runtime/Java/src/org/antlr/v4/runtime/atn/ATNDeserializer.java
index 9821b8b..0ebcdeb 100644
--- a/runtime/Java/src/org/antlr/v4/runtime/atn/ATNDeserializer.java
+++ b/runtime/Java/src/org/antlr/v4/runtime/atn/ATNDeserializer.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2012-2016 The ANTLR Project. All rights reserved.
+ * Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
  * Use of this file is governed by the BSD 3-clause license that
  * can be found in the LICENSE.txt file in the project root.
  */
@@ -45,6 +45,12 @@ public class ATNDeserializer {
 	 */
 	private static final UUID ADDED_LEXER_ACTIONS;
 	/**
+	 * This UUID indicates the serialized ATN contains two sets of
+	 * IntervalSets, where the second set's values are encoded as
+	 * 32-bit integers to support the full Unicode SMP range up to U+10FFFF.
+	 */
+	private static final UUID ADDED_UNICODE_SMP;
+	/**
 	 * This list contains all of the currently supported UUIDs, ordered by when
 	 * the feature first appeared in this branch.
 	 */
@@ -61,15 +67,59 @@ public class ATNDeserializer {
 		BASE_SERIALIZED_UUID = UUID.fromString("33761B2D-78BB-4A43-8B0B-4F5BEE8AACF3");
 		ADDED_PRECEDENCE_TRANSITIONS = UUID.fromString("1DA0C57D-6C06-438A-9B27-10BCB3CE0F61");
 		ADDED_LEXER_ACTIONS = UUID.fromString("AADB8D7E-AEEF-4415-AD2B-8204D6CF042E");
+		ADDED_UNICODE_SMP = UUID.fromString("59627784-3BE5-417A-B9EB-8131A7286089");
 
 		SUPPORTED_UUIDS = new ArrayList<UUID>();
 		SUPPORTED_UUIDS.add(BASE_SERIALIZED_UUID);
 		SUPPORTED_UUIDS.add(ADDED_PRECEDENCE_TRANSITIONS);
 		SUPPORTED_UUIDS.add(ADDED_LEXER_ACTIONS);
+		SUPPORTED_UUIDS.add(ADDED_UNICODE_SMP);
+
+		SERIALIZED_UUID = ADDED_UNICODE_SMP;
+	}
+
+	interface UnicodeDeserializer {
+		// Wrapper for toInt() or toInt32(); decodes the value that starts at data[p]
+		int readUnicode(char[] data, int p);
+
+		// Work around Java not allowing mutation of captured variables
+		// by returning the amount by which to increment p after each read
+		int size();
+	}
 
-		SERIALIZED_UUID = ADDED_LEXER_ACTIONS;
+	enum UnicodeDeserializingMode { // selects the reader built by getUnicodeDeserializer()
+		UNICODE_BMP, // values decoded with toInt(), one char each
+		UNICODE_SMP // values decoded with toInt32(), two chars each
 	}
 
+	static UnicodeDeserializer getUnicodeDeserializer(UnicodeDeserializingMode mode) { // returns a new reader object on every call
+		if (mode == UnicodeDeserializingMode.UNICODE_BMP) {
+			return new UnicodeDeserializer() {
+				@Override
+				public int readUnicode(char[] data, int p) {
+					return toInt(data[p]); // the value is the single char at p
+				}
+
+				@Override
+				public int size() {
+					return 1; // callers advance p by one char per value
+				}
+			};
+		}
+		else { // taken for every mode other than UNICODE_BMP, not only UNICODE_SMP
+			return new UnicodeDeserializer() {
+				@Override
+				public int readUnicode(char[] data, int p) {
+					return toInt32(data, p); // the value starts at data[p]
+				}
+
+				@Override
+				public int size() {
+					return 2; // callers advance p by two chars per value
+				}
+			};
+		}
+	}
 
 	private final ATNDeserializationOptions deserializationOptions;
 
@@ -98,7 +148,7 @@ public class ATNDeserializer {
 	 * serialized ATN at or after the feature identified by {@code feature} was
 	 * introduced; otherwise, {@code false}.
 	 */
-	protected boolean isFeatureSupported(UUID feature, UUID actualUuid) {
+	static protected boolean isFeatureSupported(UUID feature, UUID actualUuid) {
 		int featureIndex = SUPPORTED_UUIDS.indexOf(feature);
 		if (featureIndex < 0) {
 			return false;
@@ -110,7 +160,22 @@ public class ATNDeserializer {
 	@SuppressWarnings("deprecation")
 	public ATN deserialize(char[] data) {
 		data = data.clone();
-		// don't adjust the first value since that's the version number
+
+		// Each char value in data is shifted by +2 at the entry to this method.
+		// This is an encoding optimization targeting the serialized values 0
+		// and -1 (serialized to 0xFFFF), each of which is very common in the
+		// serialized form of the ATN. In the modified UTF-8 that Java uses for
+		// compiled string literals, these two character values have multi-byte
+		// forms. By shifting each value by +2, they become characters 2 and 1
+		// prior to writing the string, each of which has a single-byte
+		// representation. Since the shift occurs in the tool during ATN
+		// serialization, each target is responsible for adjusting the values
+		// during deserialization.
+		//
+		// As a special case, note that the first element of data is not
+		// adjusted because it contains the major version number of the
+		// serialized ATN, which was fixed at 3 at the time the value shifting
+		// was implemented.
 		for (int i = 1; i < data.length; i++) {
 			data[i] = (char)(data[i] - 2);
 		}
@@ -243,22 +308,14 @@ public class ATNDeserializer {
 		// SETS
 		//
 		List<IntervalSet> sets = new ArrayList<IntervalSet>();
-		int nsets = toInt(data[p++]);
-		for (int i=0; i<nsets; i++) {
-			int nintervals = toInt(data[p]);
-			p++;
-			IntervalSet set = new IntervalSet();
-			sets.add(set);
 
-			boolean containsEof = toInt(data[p++]) != 0;
-			if (containsEof) {
-				set.add(-1);
-			}
+		// First, read all sets with 16-bit Unicode code points <= U+FFFF.
+		p = deserializeSets(data, p, sets, getUnicodeDeserializer(UnicodeDeserializingMode.UNICODE_BMP));
 
-			for (int j=0; j<nintervals; j++) {
-				set.add(toInt(data[p]), toInt(data[p + 1]));
-				p += 2;
-			}
+		// Next, if the ATN was serialized with the Unicode SMP feature,
+		// deserialize sets with 32-bit arguments <= U+10FFFF.
+		if (isFeatureSupported(ADDED_UNICODE_SMP, uuid)) {
+			p = deserializeSets(data, p, sets, getUnicodeDeserializer(UnicodeDeserializingMode.UNICODE_SMP));
 		}
 
 		//
@@ -495,6 +552,30 @@ public class ATNDeserializer {
 		return atn;
 	}
 
+	private int deserializeSets(char[] data, int p, List<IntervalSet> sets, UnicodeDeserializer unicodeDeserializer) { // appends each decoded set to 'sets' and returns the index just past the data consumed
+		int nsets = toInt(data[p++]); // the section starts with the number of sets, stored in one char
+		for (int i=0; i<nsets; i++) {
+			int nintervals = toInt(data[p]); // each set starts with its interval count, stored in one char
+			p++;
+			IntervalSet set = new IntervalSet();
+			sets.add(set);
+
+			boolean containsEof = toInt(data[p++]) != 0; // next char is a flag: nonzero means the set contains EOF
+			if (containsEof) {
+				set.add(-1); // EOF is added to the set as -1
+			}
+
+			for (int j=0; j<nintervals; j++) {
+				int a = unicodeDeserializer.readUnicode(data, p); // first endpoint of the interval
+				p += unicodeDeserializer.size(); // endpoint width depends on the supplied reader
+				int b = unicodeDeserializer.readUnicode(data, p); // second endpoint of the interval
+				p += unicodeDeserializer.size();
+				set.add(a, b);
+			}
+		}
+		return p; // caller continues reading from here
+	}
+
 	/**
 	 * Analyze the {@link StarLoopEntryState} states in the specified ATN to set
 	 * the {@link StarLoopEntryState#isPrecedenceDecision} field to the