diff options
author | Andrius Merkys <andrius.merkys@gmail.com> | 2018-11-23 04:52:08 -0500 |
---|---|---|
committer | Andrius Merkys <andrius.merkys@gmail.com> | 2018-11-23 04:52:08 -0500 |
commit | 7b19e9be5be41c69c451b63c526bee059881f9b1 (patch) | |
tree | 699bf0523df6868d15843981ea9914ac096ee270 /tool/src/org/antlr/v4/misc | |
parent | 1d0464db4ec5e5c20b2ae62bb3c4eceaa6840bde (diff) |
New upstream version 4.7.1
Diffstat (limited to 'tool/src/org/antlr/v4/misc')
-rw-r--r-- | tool/src/org/antlr/v4/misc/CharSupport.java | 147 | ||||
-rw-r--r-- | tool/src/org/antlr/v4/misc/EscapeSequenceParsing.java | 186 | ||||
-rw-r--r-- | tool/src/org/antlr/v4/misc/FrequencySet.java | 2 | ||||
-rw-r--r-- | tool/src/org/antlr/v4/misc/Graph.java | 2 | ||||
-rw-r--r-- | tool/src/org/antlr/v4/misc/MutableInt.java | 2 | ||||
-rw-r--r-- | tool/src/org/antlr/v4/misc/OrderedHashMap.java | 2 | ||||
-rw-r--r-- | tool/src/org/antlr/v4/misc/Utils.java | 5 |
7 files changed, 296 insertions, 50 deletions
diff --git a/tool/src/org/antlr/v4/misc/CharSupport.java b/tool/src/org/antlr/v4/misc/CharSupport.java index 50db0d9..47e0033 100644 --- a/tool/src/org/antlr/v4/misc/CharSupport.java +++ b/tool/src/org/antlr/v4/misc/CharSupport.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012-2016 The ANTLR Project. All rights reserved. + * Copyright (c) 2012-2017 The ANTLR Project. All rights reserved. * Use of this file is governed by the BSD 3-clause license that * can be found in the LICENSE.txt file in the project root. */ @@ -7,6 +7,10 @@ package org.antlr.v4.misc; import org.antlr.v4.runtime.Lexer; +import org.antlr.v4.runtime.misc.Interval; +import org.antlr.v4.runtime.misc.IntervalSet; + +import java.util.Iterator; /** */ public class CharSupport { @@ -26,47 +30,48 @@ public class CharSupport { ANTLRLiteralEscapedCharValue['b'] = '\b'; ANTLRLiteralEscapedCharValue['f'] = '\f'; ANTLRLiteralEscapedCharValue['\\'] = '\\'; - ANTLRLiteralEscapedCharValue['\''] = '\''; - ANTLRLiteralEscapedCharValue['"'] = '"'; - ANTLRLiteralEscapedCharValue['-'] = '-'; - ANTLRLiteralEscapedCharValue[']'] = ']'; ANTLRLiteralCharValueEscape['\n'] = "\\n"; ANTLRLiteralCharValueEscape['\r'] = "\\r"; ANTLRLiteralCharValueEscape['\t'] = "\\t"; ANTLRLiteralCharValueEscape['\b'] = "\\b"; ANTLRLiteralCharValueEscape['\f'] = "\\f"; ANTLRLiteralCharValueEscape['\\'] = "\\\\"; - ANTLRLiteralCharValueEscape['\''] = "\\'"; } /** Return a string representing the escaped char for code c. E.g., If c - * has value 0x100, you will get "\u0100". ASCII gets the usual - * char (non-hex) representation. Control characters are spit out - * as unicode. While this is specially set up for returning Java strings, - * it can be used by any language target that has the same syntax. :) + * has value 0x100, you will get "\\u0100". ASCII gets the usual + * char (non-hex) representation. Non-ASCII characters are spit out + * as \\uXXXX or \\u{XXXXXX} escapes. */ public static String getANTLRCharLiteralForChar(int c) { - if ( c< Lexer.MIN_CHAR_VALUE ) { - return "'<INVALID>'"; - } - if ( c<ANTLRLiteralCharValueEscape.length && ANTLRLiteralCharValueEscape[c]!=null ) { - return '\''+ANTLRLiteralCharValueEscape[c]+'\''; + String result; + if ( c < Lexer.MIN_CHAR_VALUE ) { + result = "<INVALID>"; } - if ( Character.UnicodeBlock.of((char)c)==Character.UnicodeBlock.BASIC_LATIN && - !Character.isISOControl((char)c) ) { - if ( c=='\\' ) { - return "'\\\\'"; + else { + String charValueEscape = c < ANTLRLiteralCharValueEscape.length ? ANTLRLiteralCharValueEscape[c] : null; + if (charValueEscape != null) { + result = charValueEscape; + } + else if (Character.UnicodeBlock.of((char) c) == Character.UnicodeBlock.BASIC_LATIN && + !Character.isISOControl((char) c)) { + if (c == '\\') { + result = "\\\\"; + } + else if (c == '\'') { + result = "\\'"; + } + else { + result = Character.toString((char) c); + } } - if ( c=='\'') { - return "'\\''"; + else if (c <= 0xFFFF) { + result = String.format("\\u%04X", c); + } else { + result = String.format("\\u{%06X}", c); } - return '\''+Character.toString((char)c)+'\''; } - // turn on the bit above max "\uFFFF" value so that we pad with zeros - // then only take last 4 digits - String hex = Integer.toHexString(c|0x10000).toUpperCase().substring(1,5); - String unicodeStr = "'\\u"+hex+"'"; - return unicodeStr; + return '\'' + result + '\''; } /** Given a literal like (the 3 char sequence with single quotes) 'a', @@ -87,11 +92,26 @@ public class CharSupport { if ( literal.charAt(i) == '\\' ) { end = i+2; if ( i+1 < n && literal.charAt(i+1) == 'u' ) { - for (end = i + 2; end < i + 6; end++) { - if ( end>n ) return null; // invalid escape sequence. - char charAt = literal.charAt(end); - if (!Character.isDigit(charAt) && !(charAt >= 'a' && charAt <= 'f') && !(charAt >= 'A' && charAt <= 'F')) { - return null; // invalid escape sequence. + if ( i+2 < n && literal.charAt(i+2) == '{' ) { // extended escape sequence + end = i + 3; + while (true) { + if ( end + 1 > n ) return null; // invalid escape sequence. + char charAt = literal.charAt(end++); + if (charAt == '}') { + break; + } + if (!Character.isDigit(charAt) && !(charAt >= 'a' && charAt <= 'f') && !(charAt >= 'A' && charAt <= 'F')) { + return null; // invalid escape sequence. + } + } + } + else { + for (end = i + 2; end < i + 6; end++) { + if ( end>n ) return null; // invalid escape sequence. + char charAt = literal.charAt(end); + if (!Character.isDigit(charAt) && !(charAt >= 'a' && charAt <= 'f') && !(charAt >= 'A' && charAt <= 'F')) { + return null; // invalid escape sequence. + } } } } @@ -102,13 +122,13 @@ public class CharSupport { if ( c==-1 ) { return null; // invalid escape sequence. } - else buf.append((char)c); + else buf.appendCodePoint(c); i = end; } return buf.toString(); } - /** Given char x or \t or \u1234 return the char value; + /** Given char x or \\t or \\u1234 return the char value; * Unnecessary escapes like '\{' yield -1. */ public static int getCharValueFromCharInGrammarLiteral(String cstr) { @@ -119,28 +139,67 @@ public class CharSupport { case 2: if ( cstr.charAt(0)!='\\' ) return -1; // '\x' (antlr lexer will catch invalid char) - if ( Character.isDigit(cstr.charAt(1)) ) return -1; - int escChar = cstr.charAt(1); + char escChar = cstr.charAt(1); + if (escChar == '\'') return escChar; // escape quote only in string literals. int charVal = ANTLRLiteralEscapedCharValue[escChar]; - if ( charVal==0 ) return -1; + if (charVal == 0) return -1; return charVal; case 6: - // '\u1234' + // '\\u1234' or '\\u{12}' if ( !cstr.startsWith("\\u") ) return -1; - String unicodeChars = cstr.substring(2, cstr.length()); - int result = -1; - try { - result = Integer.parseInt(unicodeChars, 16); + int startOff; + int endOff; + if ( cstr.charAt(2) == '{' ) { + startOff = 3; + endOff = cstr.indexOf('}'); } - catch (NumberFormatException e) { + else { + startOff = 2; + endOff = cstr.length(); } - return result; + return parseHexValue(cstr, startOff, endOff); default: + if ( cstr.startsWith("\\u{") ) { + return parseHexValue(cstr, 3, cstr.indexOf('}')); + } return -1; } } + public static int parseHexValue(String cstr, int startOff, int endOff) { + if (startOff < 0 || endOff < 0) { + return -1; + } + String unicodeChars = cstr.substring(startOff, endOff); + int result = -1; + try { + result = Integer.parseInt(unicodeChars, 16); + } + catch (NumberFormatException e) { + } + return result; + } + public static String capitalize(String s) { return Character.toUpperCase(s.charAt(0)) + s.substring(1); } + + public static String getIntervalSetEscapedString(IntervalSet intervalSet) { + StringBuilder buf = new StringBuilder(); + Iterator<Interval> iter = intervalSet.getIntervals().iterator(); + while (iter.hasNext()) { + Interval interval = iter.next(); + buf.append(getRangeEscapedString(interval.a, interval.b)); + if (iter.hasNext()) { + buf.append(" | "); + } + } + return buf.toString(); + } + + public static String getRangeEscapedString(int codePointStart, int codePointEnd) { + return codePointStart != codePointEnd + ? getANTLRCharLiteralForChar(codePointStart) + ".." + getANTLRCharLiteralForChar(codePointEnd) + : getANTLRCharLiteralForChar(codePointStart); + } } diff --git a/tool/src/org/antlr/v4/misc/EscapeSequenceParsing.java b/tool/src/org/antlr/v4/misc/EscapeSequenceParsing.java new file mode 100644 index 0000000..d34988d --- /dev/null +++ b/tool/src/org/antlr/v4/misc/EscapeSequenceParsing.java @@ -0,0 +1,186 @@ +/* + * Copyright (c) 2012-2017 The ANTLR Project. All rights reserved. + * Use of this file is governed by the BSD 3-clause license that + * can be found in the LICENSE.txt file in the project root. + */ + +package org.antlr.v4.misc; + +import org.antlr.v4.runtime.misc.IntervalSet; +import org.antlr.v4.unicode.UnicodeData; + +import java.util.Objects; + +/** + * Utility class to parse escapes like: + * \\n + * \\uABCD + * \\u{10ABCD} + * \\p{Foo} + * \\P{Bar} + * \\p{Baz=Blech} + * \\P{Baz=Blech} + */ +public abstract class EscapeSequenceParsing { + public static class Result { + public enum Type { + INVALID, + CODE_POINT, + PROPERTY + }; + + public final Type type; + public final int codePoint; + public final IntervalSet propertyIntervalSet; + public final int startOffset; + public final int parseLength; + + public Result(Type type, int codePoint, IntervalSet propertyIntervalSet, int startOffset, int parseLength) { + this.type = type; + this.codePoint = codePoint; + this.propertyIntervalSet = propertyIntervalSet; + this.startOffset = startOffset; + this.parseLength = parseLength; + } + + @Override + public String toString() { + return String.format( + "%s type=%s codePoint=%d propertyIntervalSet=%s parseLength=%d", + super.toString(), + type, + codePoint, + propertyIntervalSet, + parseLength); + } + + @Override + public boolean equals(Object other) { + if (!(other instanceof Result)) { + return false; + } + Result that = (Result) other; + if (this == that) { + return true; + } + return Objects.equals(this.type, that.type) && + Objects.equals(this.codePoint, that.codePoint) && + Objects.equals(this.propertyIntervalSet, that.propertyIntervalSet) && + Objects.equals(this.parseLength, that.parseLength); + } + + @Override + public int hashCode() { + return Objects.hash(type, codePoint, propertyIntervalSet, parseLength); + } + } + + /** + * Parses a single escape sequence starting at {@code startOff}. + * + * Returns a type of INVALID if no valid escape sequence was found, a Result otherwise. + */ + public static Result parseEscape(String s, int startOff) { + int offset = startOff; + if (offset + 2 > s.length() || s.codePointAt(offset) != '\\') { + return invalid(startOff, s.length()-1); + } + // Move past backslash + offset++; + int escaped = s.codePointAt(offset); + // Move past escaped code point + offset += Character.charCount(escaped); + if (escaped == 'u') { + // \\u{1} is the shortest we support + if (offset + 3 > s.length()) { + return invalid(startOff, s.length()-1); + } + int hexStartOffset; + int hexEndOffset; // appears to be exclusive + if (s.codePointAt(offset) == '{') { + hexStartOffset = offset + 1; + hexEndOffset = s.indexOf('}', hexStartOffset); + if (hexEndOffset == -1) { + return invalid(startOff, s.length()-1); + } + offset = hexEndOffset + 1; + } + else { + if (offset + 4 > s.length()) { + return invalid(startOff, s.length()-1); + } + hexStartOffset = offset; + hexEndOffset = offset + 4; + offset = hexEndOffset; + } + int codePointValue = CharSupport.parseHexValue(s, hexStartOffset, hexEndOffset); + if (codePointValue == -1 || codePointValue > Character.MAX_CODE_POINT) { + return invalid(startOff, startOff+6-1); + } + return new Result( + Result.Type.CODE_POINT, + codePointValue, + IntervalSet.EMPTY_SET, + startOff, + offset - startOff); + } + else if (escaped == 'p' || escaped == 'P') { + // \p{L} is the shortest we support + if (offset + 3 > s.length()) { + return invalid(startOff, s.length()-1); + } + if (s.codePointAt(offset) != '{') { + return invalid(startOff, offset); + } + int openBraceOffset = offset; + int closeBraceOffset = s.indexOf('}', openBraceOffset); + if (closeBraceOffset == -1) { + return invalid(startOff, s.length()-1); + } + String propertyName = s.substring(openBraceOffset + 1, closeBraceOffset); + IntervalSet propertyIntervalSet = UnicodeData.getPropertyCodePoints(propertyName); + if (propertyIntervalSet == null) { + return invalid(startOff, closeBraceOffset); + } + offset = closeBraceOffset + 1; + if (escaped == 'P') { + propertyIntervalSet = propertyIntervalSet.complement(IntervalSet.COMPLETE_CHAR_SET); + } + return new Result( + Result.Type.PROPERTY, + -1, + propertyIntervalSet, + startOff, + offset - startOff); + } + else if (escaped < CharSupport.ANTLRLiteralEscapedCharValue.length) { + int codePoint = CharSupport.ANTLRLiteralEscapedCharValue[escaped]; + if (codePoint == 0) { + if (escaped != ']' && escaped != '-') { // escape ']' and '-' only in char sets. + return invalid(startOff, startOff+1); + } + else { + codePoint = escaped; + } + } + return new Result( + Result.Type.CODE_POINT, + codePoint, + IntervalSet.EMPTY_SET, + startOff, + offset - startOff); + } + else { + return invalid(startOff,s.length()-1); + } + } + + private static Result invalid(int start, int stop) { // start..stop is inclusive + return new Result( + Result.Type.INVALID, + 0, + IntervalSet.EMPTY_SET, + start, + stop - start + 1); + } +} diff --git a/tool/src/org/antlr/v4/misc/FrequencySet.java b/tool/src/org/antlr/v4/misc/FrequencySet.java index d3d4c73..343f12b 100644 --- a/tool/src/org/antlr/v4/misc/FrequencySet.java +++ b/tool/src/org/antlr/v4/misc/FrequencySet.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012-2016 The ANTLR Project. All rights reserved. + * Copyright (c) 2012-2017 The ANTLR Project. All rights reserved. * Use of this file is governed by the BSD 3-clause license that * can be found in the LICENSE.txt file in the project root. */ diff --git a/tool/src/org/antlr/v4/misc/Graph.java b/tool/src/org/antlr/v4/misc/Graph.java index ec4c147..ea10cc0 100644 --- a/tool/src/org/antlr/v4/misc/Graph.java +++ b/tool/src/org/antlr/v4/misc/Graph.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012-2016 The ANTLR Project. All rights reserved. + * Copyright (c) 2012-2017 The ANTLR Project. All rights reserved. * Use of this file is governed by the BSD 3-clause license that * can be found in the LICENSE.txt file in the project root. */ diff --git a/tool/src/org/antlr/v4/misc/MutableInt.java b/tool/src/org/antlr/v4/misc/MutableInt.java index 607b8f6..007f4c2 100644 --- a/tool/src/org/antlr/v4/misc/MutableInt.java +++ b/tool/src/org/antlr/v4/misc/MutableInt.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012-2016 The ANTLR Project. All rights reserved. + * Copyright (c) 2012-2017 The ANTLR Project. All rights reserved. * Use of this file is governed by the BSD 3-clause license that * can be found in the LICENSE.txt file in the project root. */ diff --git a/tool/src/org/antlr/v4/misc/OrderedHashMap.java b/tool/src/org/antlr/v4/misc/OrderedHashMap.java index 67f9deb..7a988f9 100644 --- a/tool/src/org/antlr/v4/misc/OrderedHashMap.java +++ b/tool/src/org/antlr/v4/misc/OrderedHashMap.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012-2016 The ANTLR Project. All rights reserved. + * Copyright (c) 2012-2017 The ANTLR Project. All rights reserved. * Use of this file is governed by the BSD 3-clause license that * can be found in the LICENSE.txt file in the project root. */ diff --git a/tool/src/org/antlr/v4/misc/Utils.java b/tool/src/org/antlr/v4/misc/Utils.java index d9a2473..abdedb7 100644 --- a/tool/src/org/antlr/v4/misc/Utils.java +++ b/tool/src/org/antlr/v4/misc/Utils.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012-2016 The ANTLR Project. All rights reserved. + * Copyright (c) 2012-2017 The ANTLR Project. All rights reserved. * Use of this file is governed by the BSD 3-clause license that * can be found in the LICENSE.txt file in the project root. */ @@ -122,7 +122,8 @@ public class Utils { public static void setSize(List<?> list, int size) { if (size < list.size()) { list.subList(size, list.size()).clear(); - } else { + } + else { while (size > list.size()) { list.add(null); } |