All downloads are free. Search and download functionality uses the official Maven repository.

com.ibm.icu.impl.PatternTokenizer Maven / Gradle / Ivy

Go to download

International Components for Unicode for Java (ICU4J) is a mature, widely used Java library providing Unicode and globalization support.

There is a newer version: 76.1
Show newest version
/*
 *******************************************************************************
 * Copyright (C) 2006-2009, Google, International Business Machines Corporation *
 * and others. All Rights Reserved.                                            *
 *******************************************************************************
 */
package com.ibm.icu.impl;

import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;

/**
 * A simple parsing class for patterns and rules. Handles '...' quotations, \\uxxxx and \\Uxxxxxxxx, and symple syntax.
 * The '' (two quotes) is treated as a single quote, inside or outside a quote
 * 
    *
  • Any ignorable characters are ignored in parsing.
  • *
  • Any syntax characters are broken into separate tokens
  • *
  • Quote characters can be specified: '...', "...", and \x
  • *
  • Other characters are treated as literals
  • *
*/ public class PatternTokenizer { // settings used in the interpretation of the pattern private UnicodeSet ignorableCharacters = new UnicodeSet(); private UnicodeSet syntaxCharacters = new UnicodeSet(); private UnicodeSet extraQuotingCharacters = new UnicodeSet(); private UnicodeSet escapeCharacters = new UnicodeSet(); private boolean usingSlash = false; private boolean usingQuote = false; // transient data, set when needed. Null it out for any changes in the above fields. private transient UnicodeSet needingQuoteCharacters = null; // data about the current pattern being parsed. start gets moved as we go along. private int start; private int limit; private String pattern; public UnicodeSet getIgnorableCharacters() { return (UnicodeSet) ignorableCharacters.clone(); } /** * Sets the characters to be ignored in parsing, eg new UnicodeSet("[:pattern_whitespace:]"); * @param ignorableCharacters Characters to be ignored. * @return A PatternTokenizer object in which characters are specified as ignored characters. */ public PatternTokenizer setIgnorableCharacters(UnicodeSet ignorableCharacters) { this.ignorableCharacters = (UnicodeSet) ignorableCharacters.clone(); needingQuoteCharacters = null; return this; } public UnicodeSet getSyntaxCharacters() { return (UnicodeSet) syntaxCharacters.clone(); } public UnicodeSet getExtraQuotingCharacters() { return (UnicodeSet) extraQuotingCharacters.clone(); } /** * Sets the characters to be interpreted as syntax characters in parsing, eg new UnicodeSet("[:pattern_syntax:]") * @param syntaxCharacters Characters to be set as syntax characters. * @return A PatternTokenizer object in which characters are specified as syntax characters. 
*/ public PatternTokenizer setSyntaxCharacters(UnicodeSet syntaxCharacters) { this.syntaxCharacters = (UnicodeSet) syntaxCharacters.clone(); needingQuoteCharacters = null; return this; } /** * Sets the extra characters to be quoted in literals * @param syntaxCharacters Characters to be set as extra quoting characters. * @return A PatternTokenizer object in which characters are specified as extra quoting characters. */ public PatternTokenizer setExtraQuotingCharacters(UnicodeSet syntaxCharacters) { this.extraQuotingCharacters = (UnicodeSet) syntaxCharacters.clone(); needingQuoteCharacters = null; return this; } public UnicodeSet getEscapeCharacters() { return (UnicodeSet) escapeCharacters.clone(); } /** * Set characters to be escaped in literals, in quoteLiteral and normalize, eg new UnicodeSet("[^\\u0020-\\u007E]"); * @param escapeCharacters Characters to be set as escape characters. * @return A PatternTokenizer object in which characters are specified as escape characters. */ public PatternTokenizer setEscapeCharacters(UnicodeSet escapeCharacters) { this.escapeCharacters = (UnicodeSet) escapeCharacters.clone(); return this; } public boolean isUsingQuote() { return usingQuote; } public PatternTokenizer setUsingQuote(boolean usingQuote) { this.usingQuote = usingQuote; needingQuoteCharacters = null; return this; } public boolean isUsingSlash() { return usingSlash; } public PatternTokenizer setUsingSlash(boolean usingSlash) { this.usingSlash = usingSlash; needingQuoteCharacters = null; return this; } // public UnicodeSet getQuoteCharacters() { // return (UnicodeSet) quoteCharacters.clone(); // } // public PatternTokenizer setQuoteCharacters(UnicodeSet quoteCharacters) { // this.quoteCharacters = (UnicodeSet) quoteCharacters.clone(); // needingQuoteCharacters = null; // return this; // } public int getLimit() { return limit; } public PatternTokenizer setLimit(int limit) { this.limit = limit; return this; } public int getStart() { return start; } public PatternTokenizer 
setStart(int start) { this.start = start; return this; } public PatternTokenizer setPattern(CharSequence pattern) { return setPattern(pattern.toString()); } public PatternTokenizer setPattern(String pattern) { if (pattern == null) { throw new IllegalArgumentException("Inconsistent arguments"); } this.start = 0; this.limit = pattern.length(); this.pattern = pattern; return this; } public static final char SINGLE_QUOTE = '\''; public static final char BACK_SLASH = '\\'; private static int NO_QUOTE = -1, IN_QUOTE = -2; public String quoteLiteral(CharSequence string) { return quoteLiteral(string.toString()); } /** * Quote a literal string, using the available settings. Thus syntax characters, quote characters, and ignorable characters will be put into quotes. * @param string String passed to quote a literal string. * @return A string using the available settings will place syntax, quote, or ignorable characters into quotes. */ public String quoteLiteral(String string) { if (needingQuoteCharacters == null) { needingQuoteCharacters = new UnicodeSet().addAll(syntaxCharacters).addAll(ignorableCharacters).addAll(extraQuotingCharacters); // .addAll(quoteCharacters) if (usingSlash) needingQuoteCharacters.add(BACK_SLASH); if (usingQuote) needingQuoteCharacters.add(SINGLE_QUOTE); } StringBuffer result = new StringBuffer(); int quotedChar = NO_QUOTE; int cp; for (int i = 0; i < string.length(); i += UTF16.getCharCount(cp)) { cp = UTF16.charAt(string, i); if (escapeCharacters.contains(cp)) { // we may have to fix up previous characters if (quotedChar == IN_QUOTE) { result.append(SINGLE_QUOTE); quotedChar = NO_QUOTE; } appendEscaped(result, cp); continue; } if (needingQuoteCharacters.contains(cp)) { // if we have already started a quote if (quotedChar == IN_QUOTE) { UTF16.append(result, cp); if (usingQuote && cp == SINGLE_QUOTE) { // double it result.append(SINGLE_QUOTE); } continue; } // otherwise not already in quote if (usingSlash) { result.append(BACK_SLASH); 
UTF16.append(result, cp); continue; } if (usingQuote) { if (cp == SINGLE_QUOTE) { // double it and continue result.append(SINGLE_QUOTE); result.append(SINGLE_QUOTE); continue; } result.append(SINGLE_QUOTE); UTF16.append(result, cp); quotedChar = IN_QUOTE; continue; } // we have no choice but to use \\u or \\U appendEscaped(result, cp); continue; } // otherwise cp doesn't need quoting // we may have to fix up previous characters if (quotedChar == IN_QUOTE) { result.append(SINGLE_QUOTE); quotedChar = NO_QUOTE; } UTF16.append(result, cp); } // all done. // we may have to fix up previous characters if (quotedChar == IN_QUOTE) { result.append(SINGLE_QUOTE); } return result.toString(); } private void appendEscaped(StringBuffer result, int cp) { if (cp <= 0xFFFF) { result.append("\\u").append(Utility.hex(cp,4)); } else { result.append("\\U").append(Utility.hex(cp,8)); } } public String normalize() { int oldStart = start; StringBuffer result = new StringBuffer(); StringBuffer buffer = new StringBuffer(); while (true) { buffer.setLength(0); int status = next(buffer); if (status == DONE) { start = oldStart; return result.toString(); } if (status != SYNTAX) { result.append(quoteLiteral(buffer)); } else { result.append(buffer); } } } public static final int DONE = 0, SYNTAX = 1, LITERAL = 2, BROKEN_QUOTE = 3, BROKEN_ESCAPE = 4, UNKNOWN = 5; private static final int AFTER_QUOTE = -1, NONE = 0, START_QUOTE = 1, NORMAL_QUOTE = 2, SLASH_START = 3, HEX = 4; public int next(StringBuffer buffer) { if (start >= limit) return DONE; int status = UNKNOWN; int lastQuote = UNKNOWN; int quoteStatus = NONE; int hexCount = 0; int hexValue = 0; int cp; main: for (int i = start; i < limit; i += UTF16.getCharCount(cp)) { cp = UTF16.charAt(pattern, i); // if we are in a quote, then handle it. 
switch (quoteStatus) { case SLASH_START: switch (cp) { case 'u': quoteStatus = HEX; hexCount = 4; hexValue = 0; continue main; case 'U': quoteStatus = HEX; hexCount = 8; hexValue = 0; continue main; default: if (usingSlash) { UTF16.append(buffer, cp); quoteStatus = NONE; continue main; } else { buffer.append(BACK_SLASH); quoteStatus = NONE; } } break; // fall through to NONE case HEX: hexValue <<= 4; hexValue += cp; switch (cp) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': hexValue -= '0'; break; case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': hexValue -= 'a' - 10; break; case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': hexValue -= 'A' - 10; break; default: start = i; return BROKEN_ESCAPE; } --hexCount; if (hexCount == 0) { quoteStatus = NONE; UTF16.append(buffer, hexValue); } continue main; case AFTER_QUOTE: // see if we get another quote character // if we just ended a quote BUT the following character is the lastQuote character, then we have a situation like '...''...', so we restart the quote if (cp == lastQuote) { UTF16.append(buffer, cp); quoteStatus = NORMAL_QUOTE; continue main; } quoteStatus = NONE; break; // fall through to NONE case START_QUOTE: // if we are at the very start of a quote, and we hit another quote mark then we emit a literal quote character and end the quote if (cp == lastQuote) { UTF16.append(buffer, cp); quoteStatus = NONE; // get out of quote, with no trace remaining continue; } // otherwise get into quote UTF16.append(buffer, cp); quoteStatus = NORMAL_QUOTE; continue main; case NORMAL_QUOTE: if (cp == lastQuote) { quoteStatus = AFTER_QUOTE; // get out of quote continue main; } UTF16.append(buffer, cp); continue main; } if (ignorableCharacters.contains(cp)) { continue; } // do syntax characters if (syntaxCharacters.contains(cp)) { if (status == UNKNOWN) { UTF16.append(buffer, cp); start = i + UTF16.getCharCount(cp); return SYNTAX; } else { // LITERAL, so back 
up and break start = i; return status; } } // otherwise it is a literal; keep on going status = LITERAL; if (cp == BACK_SLASH) { quoteStatus = SLASH_START; continue; } else if (usingQuote && cp == SINGLE_QUOTE) { lastQuote = cp; quoteStatus = START_QUOTE; continue; } // normal literals UTF16.append(buffer, cp); } // handle final cleanup start = limit; switch (quoteStatus) { case HEX: status = BROKEN_ESCAPE; break; case SLASH_START: if (usingSlash) { status = BROKEN_ESCAPE; } else { buffer.append(BACK_SLASH); } break; case START_QUOTE: case NORMAL_QUOTE: status = BROKEN_QUOTE; break; } return status; } } //eof




© 2015 - 2024 Weber Informatics LLC | Privacy Policy