diff options
Diffstat (limited to 'tool/src/org/antlr/v4/misc/CharSupport.java')
-rw-r--r-- | tool/src/org/antlr/v4/misc/CharSupport.java | 153 |
1 files changed, 153 insertions, 0 deletions
diff --git a/tool/src/org/antlr/v4/misc/CharSupport.java b/tool/src/org/antlr/v4/misc/CharSupport.java new file mode 100644 index 0000000..22e9648 --- /dev/null +++ b/tool/src/org/antlr/v4/misc/CharSupport.java @@ -0,0 +1,153 @@ +/* + * [The "BSD license"] + * Copyright (c) 2012 Terence Parr + * Copyright (c) 2012 Sam Harwell + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +package org.antlr.v4.misc; + +import org.antlr.v4.runtime.Lexer; + +/** */ +public class CharSupport { + /** When converting ANTLR char and string literals, here is the + * value set of escape chars. + */ + public static int ANTLRLiteralEscapedCharValue[] = new int[255]; + + /** Given a char, we need to be able to show as an ANTLR literal. + */ + public static String ANTLRLiteralCharValueEscape[] = new String[255]; + + static { + ANTLRLiteralEscapedCharValue['n'] = '\n'; + ANTLRLiteralEscapedCharValue['r'] = '\r'; + ANTLRLiteralEscapedCharValue['t'] = '\t'; + ANTLRLiteralEscapedCharValue['b'] = '\b'; + ANTLRLiteralEscapedCharValue['f'] = '\f'; + ANTLRLiteralEscapedCharValue['\\'] = '\\'; + ANTLRLiteralEscapedCharValue['\''] = '\''; + ANTLRLiteralEscapedCharValue['"'] = '"'; + ANTLRLiteralCharValueEscape['\n'] = "\\n"; + ANTLRLiteralCharValueEscape['\r'] = "\\r"; + ANTLRLiteralCharValueEscape['\t'] = "\\t"; + ANTLRLiteralCharValueEscape['\b'] = "\\b"; + ANTLRLiteralCharValueEscape['\f'] = "\\f"; + ANTLRLiteralCharValueEscape['\\'] = "\\\\"; + ANTLRLiteralCharValueEscape['\''] = "\\'"; + } + + /** Return a string representing the escaped char for code c. E.g., If c + * has value 0x100, you will get "\u0100". ASCII gets the usual + * char (non-hex) representation. Control characters are spit out + * as unicode. While this is specially set up for returning Java strings, + * it can be used by any language target that has the same syntax. :) + */ + public static String getANTLRCharLiteralForChar(int c) { + if ( c< Lexer.MIN_CHAR_VALUE ) { + return "'<INVALID>'"; + } + if ( c<ANTLRLiteralCharValueEscape.length && ANTLRLiteralCharValueEscape[c]!=null ) { + return '\''+ANTLRLiteralCharValueEscape[c]+'\''; + } + if ( Character.UnicodeBlock.of((char)c)==Character.UnicodeBlock.BASIC_LATIN && + !Character.isISOControl((char)c) ) { + if ( c=='\\' ) { + return "'\\\\'"; + } + if ( c=='\'') { + return "'\\''"; + } + return '\''+Character.toString((char)c)+'\''; + } + // turn on the bit above max "\uFFFF" value so that we pad with zeros + // then only take last 4 digits + String hex = Integer.toHexString(c|0x10000).toUpperCase().substring(1,5); + String unicodeStr = "'\\u"+hex+"'"; + return unicodeStr; + } + + /** Given a literal like (the 3 char sequence with single quotes) 'a', + * return the int value of 'a'. Convert escape sequences here also. + * Return -1 if not single char. + */ + public static int getCharValueFromGrammarCharLiteral(String literal) { + if ( literal==null || literal.length()<3 ) return -1; + return getCharValueFromCharInGrammarLiteral(literal.substring(1,literal.length()-1)); + } + + /** Given char x or \t or \u1234 return the char value; + * Unnecessary escapes like '\{' yield -1. + */ + public static int getCharValueFromCharInGrammarLiteral(String cstr) { + switch ( cstr.length() ) { + case 1 : + // 'x' + return cstr.charAt(0); // no escape char + case 2 : + if ( cstr.charAt(0)!='\\' ) return -1; + // '\x' (antlr lexer will catch invalid char) + if ( Character.isDigit(cstr.charAt(1)) ) return -1; + int escChar = cstr.charAt(1); + int charVal = ANTLRLiteralEscapedCharValue[escChar]; + if ( charVal==0 ) return -1; + return charVal; + case 6 : + // '\u1234' + if ( !cstr.startsWith("\\u") ) return -1; + String unicodeChars = cstr.substring(2, cstr.length()); + return Integer.parseInt(unicodeChars, 16); + default : + return -1; + } + } + + public static String getStringFromGrammarStringLiteral(String literal) { + StringBuilder buf = new StringBuilder(); + int i = 1; // skip first quote + int n = literal.length()-1; // skip last quote + while ( i < n ) { // scan all but last quote + int end = i+1; + if ( literal.charAt(i) == '\\' ) { + end = i+2; + if ( (i+1)>=n ) break; // ignore spurious \ on end + if ( literal.charAt(i+1) == 'u' ) end = i+6; + } + if ( end>n ) break; + String esc = literal.substring(i, end); + int c = getCharValueFromCharInGrammarLiteral(esc); + if ( c==-1 ) { buf.append(esc); } + else buf.append((char)c); + i = end; + } + return buf.toString(); + } + + public static String capitalize(String s) { + return Character.toUpperCase(s.charAt(0)) + s.substring(1); + } +} |