com.ibm.icu.impl.PatternTokenizer Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of icu4j Show documentation
Show all versions of icu4j Show documentation
International Components for Unicode for Java (ICU4J) is a mature, widely used Java library
providing Unicode and Globalization support.
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2006-2009, Google, International Business Machines Corporation *
* and others. All Rights Reserved. *
*******************************************************************************
*/
package com.ibm.icu.impl;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
/**
* A simple parsing class for patterns and rules. Handles '...' quotations, \\uxxxx and \\Uxxxxxxxx, and simple syntax.
* The '' (two quotes) is treated as a single quote, inside or outside a quote.
*
* - Any ignorable characters are ignored in parsing.
* - Any syntax characters are broken into separate tokens
* - Quote characters can be specified: '...', "...", and \x
* - Other characters are treated as literals
*
*/
public class PatternTokenizer {
    // settings used in the interpretation of the pattern
    private UnicodeSet ignorableCharacters = new UnicodeSet();
    private UnicodeSet syntaxCharacters = new UnicodeSet();
    private UnicodeSet extraQuotingCharacters = new UnicodeSet();
    private UnicodeSet escapeCharacters = new UnicodeSet();
    private boolean usingSlash = false;
    private boolean usingQuote = false;

    // transient data, set when needed. Null it out for any changes in the above fields.
    private transient UnicodeSet needingQuoteCharacters = null;

    // data about the current pattern being parsed. start gets moved as we go along.
    private int start;
    private int limit;
    private String pattern;

    /**
     * Returns the characters that are ignored in parsing.
     * @return A copy of the ignorable character set; changes to it do not affect this tokenizer.
     */
    public UnicodeSet getIgnorableCharacters() {
        return (UnicodeSet) ignorableCharacters.clone();
    }

    /**
     * Sets the characters to be ignored in parsing, eg new UnicodeSet("[:pattern_whitespace:]");
     * @param ignorableCharacters Characters to be ignored. The set is copied.
     * @return A PatternTokenizer object in which characters are specified as ignored characters.
     */
    public PatternTokenizer setIgnorableCharacters(UnicodeSet ignorableCharacters) {
        this.ignorableCharacters = (UnicodeSet) ignorableCharacters.clone();
        needingQuoteCharacters = null;
        return this;
    }

    /**
     * Returns the characters that are interpreted as syntax characters in parsing.
     * @return A copy of the syntax character set; changes to it do not affect this tokenizer.
     */
    public UnicodeSet getSyntaxCharacters() {
        return (UnicodeSet) syntaxCharacters.clone();
    }

    /**
     * Returns the extra characters that are quoted in literals.
     * @return A copy of the extra quoting character set; changes to it do not affect this tokenizer.
     */
    public UnicodeSet getExtraQuotingCharacters() {
        return (UnicodeSet) extraQuotingCharacters.clone();
    }

    /**
     * Sets the characters to be interpreted as syntax characters in parsing, eg new UnicodeSet("[:pattern_syntax:]")
     * @param syntaxCharacters Characters to be set as syntax characters. The set is copied.
     * @return A PatternTokenizer object in which characters are specified as syntax characters.
     */
    public PatternTokenizer setSyntaxCharacters(UnicodeSet syntaxCharacters) {
        this.syntaxCharacters = (UnicodeSet) syntaxCharacters.clone();
        needingQuoteCharacters = null;
        return this;
    }

    /**
     * Sets the extra characters to be quoted in literals
     * @param syntaxCharacters Characters to be set as extra quoting characters. The set is copied.
     * @return A PatternTokenizer object in which characters are specified as extra quoting characters.
     */
    public PatternTokenizer setExtraQuotingCharacters(UnicodeSet syntaxCharacters) {
        this.extraQuotingCharacters = (UnicodeSet) syntaxCharacters.clone();
        needingQuoteCharacters = null;
        return this;
    }

    /**
     * Returns the characters that are hex-escaped in literals.
     * @return A copy of the escape character set; changes to it do not affect this tokenizer.
     */
    public UnicodeSet getEscapeCharacters() {
        return (UnicodeSet) escapeCharacters.clone();
    }

    /**
     * Set characters to be escaped in literals, in quoteLiteral and normalize, eg new UnicodeSet("[^\\u0020-\\u007E]");
     * @param escapeCharacters Characters to be set as escape characters. The set is copied.
     * @return A PatternTokenizer object in which characters are specified as escape characters.
     */
    public PatternTokenizer setEscapeCharacters(UnicodeSet escapeCharacters) {
        // The escape set is consulted directly by quoteLiteral and is not part of the
        // cached needingQuoteCharacters set, so the cache does not need to be reset here.
        this.escapeCharacters = (UnicodeSet) escapeCharacters.clone();
        return this;
    }

    /**
     * @return true if the single quote is treated as a quoting character.
     */
    public boolean isUsingQuote() {
        return usingQuote;
    }

    /**
     * Sets whether the single quote is treated as a quoting character ('...').
     * @param usingQuote true to enable single-quote quoting.
     * @return This PatternTokenizer, for chaining.
     */
    public PatternTokenizer setUsingQuote(boolean usingQuote) {
        this.usingQuote = usingQuote;
        needingQuoteCharacters = null;
        return this;
    }

    /**
     * @return true if the backslash is treated as a quoting character for the following character.
     */
    public boolean isUsingSlash() {
        return usingSlash;
    }

    /**
     * Sets whether a backslash quotes the character that follows it.
     * @param usingSlash true to enable backslash quoting.
     * @return This PatternTokenizer, for chaining.
     */
    public PatternTokenizer setUsingSlash(boolean usingSlash) {
        this.usingSlash = usingSlash;
        needingQuoteCharacters = null;
        return this;
    }

    /**
     * @return The index in the pattern at which parsing stops (exclusive).
     */
    public int getLimit() {
        return limit;
    }

    /**
     * Sets the index in the pattern at which parsing stops (exclusive). The value is stored as given.
     * @param limit The new limit.
     * @return This PatternTokenizer, for chaining.
     */
    public PatternTokenizer setLimit(int limit) {
        this.limit = limit;
        return this;
    }

    /**
     * @return The index in the pattern at which the next call to {@link #next(StringBuffer)} begins.
     */
    public int getStart() {
        return start;
    }

    /**
     * Sets the index in the pattern at which the next call to {@link #next(StringBuffer)} begins.
     * The value is stored as given.
     * @param start The new start index.
     * @return This PatternTokenizer, for chaining.
     */
    public PatternTokenizer setStart(int start) {
        this.start = start;
        return this;
    }

    /**
     * Sets the pattern to be parsed, resetting start to 0 and limit to the pattern length.
     * @param pattern The pattern to parse; must not be null.
     * @return This PatternTokenizer, for chaining.
     * @throws IllegalArgumentException if pattern is null.
     */
    public PatternTokenizer setPattern(CharSequence pattern) {
        // Check here so that both overloads report a null pattern the same way.
        if (pattern == null) {
            throw new IllegalArgumentException("Inconsistent arguments");
        }
        return setPattern(pattern.toString());
    }

    /**
     * Sets the pattern to be parsed, resetting start to 0 and limit to the pattern length.
     * @param pattern The pattern to parse; must not be null.
     * @return This PatternTokenizer, for chaining.
     * @throws IllegalArgumentException if pattern is null.
     */
    public PatternTokenizer setPattern(String pattern) {
        if (pattern == null) {
            throw new IllegalArgumentException("Inconsistent arguments");
        }
        this.start = 0;
        this.limit = pattern.length();
        this.pattern = pattern;
        return this;
    }

    public static final char SINGLE_QUOTE = '\'';
    public static final char BACK_SLASH = '\\';

    // states used by quoteLiteral: not inside a '...' run, or inside one that still needs closing
    private static final int NO_QUOTE = -1, IN_QUOTE = -2;

    /**
     * Quote a literal string, using the available settings.
     * @param string Text to quote; converted with toString().
     * @return The quoted form, as produced by {@link #quoteLiteral(String)}.
     */
    public String quoteLiteral(CharSequence string) {
        return quoteLiteral(string.toString());
    }

    /**
     * Quote a literal string, using the available settings. Thus syntax characters, quote characters, and ignorable characters will be put into quotes.
     * <p>Characters in the escape set are always written as \\uXXXX or \\UXXXXXXXX. Characters that need
     * quoting are written with a preceding backslash if usingSlash is set, otherwise inside '...' if
     * usingQuote is set, otherwise as a hex escape.
     * @param string String passed to quote a literal string.
     * @return A string using the available settings will place syntax, quote, or ignorable characters into quotes.
     */
    public String quoteLiteral(String string) {
        if (needingQuoteCharacters == null) {
            // lazily (re)build the cache; the setters null it out whenever an input changes
            needingQuoteCharacters = new UnicodeSet()
                    .addAll(syntaxCharacters)
                    .addAll(ignorableCharacters)
                    .addAll(extraQuotingCharacters);
            if (usingSlash) {
                needingQuoteCharacters.add(BACK_SLASH);
            }
            if (usingQuote) {
                needingQuoteCharacters.add(SINGLE_QUOTE);
            }
        }
        StringBuffer result = new StringBuffer();
        int quotedChar = NO_QUOTE;
        int cp;
        for (int i = 0; i < string.length(); i += UTF16.getCharCount(cp)) {
            cp = UTF16.charAt(string, i);
            if (escapeCharacters.contains(cp)) {
                // we may have to fix up previous characters: close any open quote first
                if (quotedChar == IN_QUOTE) {
                    result.append(SINGLE_QUOTE);
                    quotedChar = NO_QUOTE;
                }
                appendEscaped(result, cp);
                continue;
            }
            if (needingQuoteCharacters.contains(cp)) {
                // if we have already started a quote, just keep adding to it
                if (quotedChar == IN_QUOTE) {
                    UTF16.append(result, cp);
                    if (usingQuote && cp == SINGLE_QUOTE) { // double it
                        result.append(SINGLE_QUOTE);
                    }
                    continue;
                }
                // otherwise not already in quote; backslash quoting takes precedence
                if (usingSlash) {
                    result.append(BACK_SLASH);
                    UTF16.append(result, cp);
                    continue;
                }
                if (usingQuote) {
                    if (cp == SINGLE_QUOTE) { // double it and continue
                        result.append(SINGLE_QUOTE);
                        result.append(SINGLE_QUOTE);
                        continue;
                    }
                    // open a quote; it is closed when a character not needing quoting is seen
                    result.append(SINGLE_QUOTE);
                    UTF16.append(result, cp);
                    quotedChar = IN_QUOTE;
                    continue;
                }
                // we have no choice but to use \\u or \\U
                appendEscaped(result, cp);
                continue;
            }
            // otherwise cp doesn't need quoting
            // we may have to fix up previous characters
            if (quotedChar == IN_QUOTE) {
                result.append(SINGLE_QUOTE);
                quotedChar = NO_QUOTE;
            }
            UTF16.append(result, cp);
        }
        // all done.
        // we may have to fix up previous characters
        if (quotedChar == IN_QUOTE) {
            result.append(SINGLE_QUOTE);
        }
        return result.toString();
    }

    /**
     * Appends cp as a hex escape: \\u plus 4 digits for code points up to 0xFFFF, otherwise \\U plus 8 digits.
     */
    private static void appendEscaped(StringBuffer result, int cp) {
        if (cp <= 0xFFFF) {
            result.append("\\u").append(Utility.hex(cp, 4));
        } else {
            result.append("\\U").append(Utility.hex(cp, 8));
        }
    }

    /**
     * Re-emits the current pattern from the current start position: syntax tokens are copied as-is and
     * every other token is passed through {@link #quoteLiteral(CharSequence)}. The start position is
     * restored before returning.
     * @return The normalized pattern text.
     */
    public String normalize() {
        int oldStart = start;
        StringBuffer result = new StringBuffer();
        StringBuffer buffer = new StringBuffer();
        while (true) {
            buffer.setLength(0);
            int status = next(buffer);
            if (status == DONE) {
                start = oldStart;
                return result.toString();
            }
            if (status != SYNTAX) {
                result.append(quoteLiteral(buffer));
            } else {
                result.append(buffer);
            }
        }
    }

    /**
     * Status codes returned by {@link #next(StringBuffer)}.
     */
    public static final int DONE = 0, SYNTAX = 1, LITERAL = 2, BROKEN_QUOTE = 3, BROKEN_ESCAPE = 4, UNKNOWN = 5;

    // internal quoting states of next()
    private static final int AFTER_QUOTE = -1, NONE = 0, START_QUOTE = 1, NORMAL_QUOTE = 2, SLASH_START = 3, HEX = 4;

    /**
     * Parses the next token of the pattern, appending its text to buffer (the buffer is not cleared first)
     * and advancing the start position.
     * <ul>
     * <li>DONE: start had already reached limit; nothing was appended.</li>
     * <li>SYNTAX: a single syntax character was appended.</li>
     * <li>LITERAL: a run of literal text was appended, with quotes and escapes resolved.</li>
     * <li>BROKEN_QUOTE: the pattern ended inside a quote.</li>
     * <li>BROKEN_ESCAPE: a hex escape contained a non-hex character, was cut off by the end of the
     * pattern, or denoted a value outside the Unicode code point range; or usingSlash is set and the
     * pattern ended right after a backslash.</li>
     * <li>UNKNOWN: only ignorable characters remained.</li>
     * </ul>
     * @param buffer Receives the text of the token.
     * @return One of the status codes above.
     */
    public int next(StringBuffer buffer) {
        if (start >= limit) {
            return DONE;
        }
        int status = UNKNOWN;
        int lastQuote = UNKNOWN;
        int quoteStatus = NONE;
        int hexCount = 0;
        int hexValue = 0;
        int cp;

        main:
        for (int i = start; i < limit; i += UTF16.getCharCount(cp)) {
            cp = UTF16.charAt(pattern, i);
            // if we are in a quote, then handle it.
            switch (quoteStatus) {
            case SLASH_START:
                switch (cp) {
                case 'u':
                    quoteStatus = HEX;
                    hexCount = 4;
                    hexValue = 0;
                    continue main;
                case 'U':
                    quoteStatus = HEX;
                    hexCount = 8;
                    hexValue = 0;
                    continue main;
                default:
                    if (usingSlash) {
                        UTF16.append(buffer, cp);
                        quoteStatus = NONE;
                        continue main;
                    } else {
                        // the backslash was an ordinary literal; cp still has to be processed below
                        buffer.append(BACK_SLASH);
                        quoteStatus = NONE;
                    }
                }
                break; // fall through to NONE
            case HEX:
                hexValue <<= 4;
                hexValue += cp;
                switch (cp) {
                case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
                    hexValue -= '0'; break;
                case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
                    hexValue -= 'a' - 10; break;
                case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
                    hexValue -= 'A' - 10; break;
                default:
                    start = i;
                    return BROKEN_ESCAPE;
                }
                --hexCount;
                if (hexCount == 0) {
                    quoteStatus = NONE;
                    // An 8-digit escape can denote a value that is not a code point (negative after
                    // int overflow, or above U+10FFFF). Report it as a broken escape rather than
                    // letting UTF16.append throw; resume after the escape.
                    if (hexValue < 0 || hexValue > Character.MAX_CODE_POINT) {
                        start = i + UTF16.getCharCount(cp);
                        return BROKEN_ESCAPE;
                    }
                    UTF16.append(buffer, hexValue);
                }
                continue main;
            case AFTER_QUOTE:
                // see if we get another quote character
                // if we just ended a quote BUT the following character is the lastQuote character, then we have a situation like '...''...', so we restart the quote
                if (cp == lastQuote) {
                    UTF16.append(buffer, cp);
                    quoteStatus = NORMAL_QUOTE;
                    continue main;
                }
                quoteStatus = NONE;
                break; // fall through to NONE
            case START_QUOTE:
                // if we are at the very start of a quote, and we hit another quote mark then we emit a literal quote character and end the quote
                if (cp == lastQuote) {
                    UTF16.append(buffer, cp);
                    quoteStatus = NONE; // get out of quote, with no trace remaining
                    continue main;
                }
                // otherwise get into quote
                UTF16.append(buffer, cp);
                quoteStatus = NORMAL_QUOTE;
                continue main;
            case NORMAL_QUOTE:
                if (cp == lastQuote) {
                    quoteStatus = AFTER_QUOTE; // get out of quote
                    continue main;
                }
                UTF16.append(buffer, cp);
                continue main;
            }

            if (ignorableCharacters.contains(cp)) {
                continue;
            }
            // do syntax characters
            if (syntaxCharacters.contains(cp)) {
                if (status == UNKNOWN) {
                    UTF16.append(buffer, cp);
                    start = i + UTF16.getCharCount(cp);
                    return SYNTAX;
                } else { // LITERAL, so back up and break
                    start = i;
                    return status;
                }
            }
            // otherwise it is a literal; keep on going
            status = LITERAL;
            if (cp == BACK_SLASH) {
                quoteStatus = SLASH_START;
                continue;
            } else if (usingQuote && cp == SINGLE_QUOTE) {
                lastQuote = cp;
                quoteStatus = START_QUOTE;
                continue;
            }
            // normal literals
            UTF16.append(buffer, cp);
        }

        // handle final cleanup
        start = limit;
        switch (quoteStatus) {
        case HEX:
            status = BROKEN_ESCAPE;
            break;
        case SLASH_START:
            if (usingSlash) {
                status = BROKEN_ESCAPE;
            } else {
                buffer.append(BACK_SLASH);
            }
            break;
        case START_QUOTE: case NORMAL_QUOTE:
            status = BROKEN_QUOTE;
            break;
        }
        return status;
    }
}
//eof