com.ibm.icu.impl.UnicodeRegex Maven / Gradle / Ivy
Show all versions of icu4j Show documentation
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2009-2015, Google, International Business Machines Corporation
* and others. All Rights Reserved.
*******************************************************************************
*/
package com.ibm.icu.impl;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.text.ParsePosition;
import java.util.Arrays;
import java.util.Comparator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeMap;
import java.util.regex.Pattern;
import com.ibm.icu.text.StringTransform;
import com.ibm.icu.text.SymbolTable;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.Freezable;
/**
* Contains utilities to supplement the JDK Regex, since it doesn't handle
* Unicode well.
*
* TODO: Move to com.ibm.icu.dev.somewhere.
* 2015-sep-03: This is used there, and also in CLDR and in UnicodeTools.
*
* @author markdavis
*/
public class UnicodeRegex implements Cloneable, Freezable, StringTransform {
// Matches a \U escape whose eight hex digits start with "00" and captures the remaining
// six digits; processSet() rewrites each match into the \x{...} form.
private static final Pattern SUPP_ESCAPE = Pattern.compile("\\\\U00([0-9a-fA-F]{6})");
// Note: we don't currently have any state, but intend to in the future,
// particularly for the regex style supported.
// Symbol table passed to UnicodeSet.applyPattern() by processSet(); has no initializer here.
private SymbolTable symbolTable;
/**
* Get the symbol table for internal processing
* @return the symbol table currently held by this instance
* @internal
*/
public SymbolTable getSymbolTable() {
return symbolTable;
}
/**
* Set the symbol table for internal processing
* @param symbolTable the symbol table to store on this instance
* @return this object, so that calls can be chained
* @internal
*/
public UnicodeRegex setSymbolTable(SymbolTable symbolTable) {
this.symbolTable = symbolTable;
return this;
}
/**
* Preprocesses a regex pattern so that its character classes are interpreted
* with UnicodeSet syntax and semantics.
* <p>
* This adds full Unicode property support, with the latest version of Unicode,
* to Java Regex, bringing it up to Level 1 (see
* http://www.unicode.org/reports/tr18/). The character classes (\p{...},
* \P{...}, [...]) found in the pattern string are parsed as UnicodeSets and
* written back out in their place. With this utility, Java regex expressions
* can be updated to work with the latest version of Unicode, and with all
* Unicode properties. Note that the UnicodeSet syntax has not yet, however,
* been updated to be completely consistent with Java regex, so be careful of
* the differences.
* <p>
* Not thread-safe; create a separate copy for different threads.
* <p>
* In the future, we may extend this to support other regex packages.
*
* @param regex A modified Java regex pattern, as in the input to
* Pattern.compile(), except that all "character classes" are
* processed as if they were UnicodeSet patterns. Example:
* "abc[:bc=N:]". See UnicodeSet for the differences in syntax.
* @return A processed Java regex pattern, suitable for input to
* Pattern.compile().
*/
@Override
public String transform(String regex) {
    // Scanner states. Quoting with \ or \Q...\E suppresses the search for sets.
    final int plain = 0;                 // not quoted: \ and '[' are significant
    final int afterBackslash = 1;        // the previous character was an unquoted \
    final int quoted = 2;                // inside \Q...
    final int quotedAfterBackslash = 3;  // inside \Q... and the previous character was \

    final StringBuilder out = new StringBuilder();
    final UnicodeSet scratch = new UnicodeSet();
    final ParsePosition cursor = new ParsePosition(0);

    int mode = plain;
    int index = 0;
    // Every character is copied to the output unmodified unless it starts a
    // UnicodeSet. Supplementary characters need no special care here, since
    // none of the syntax uses them.
    while (index < regex.length()) {
        final char c = regex.charAt(index);
        if (mode == plain) {
            // A \ that resembles a pattern should only happen with \p; a '['
            // that resembles a pattern is what looks like a UnicodeSet.
            if ((c == '\\' || c == '[') && UnicodeSet.resemblesPattern(regex, index)) {
                // processSet() returns the index of the last character it consumed.
                index = processSet(regex, index, out, scratch, cursor) + 1;
                continue;
            }
            if (c == '\\') {
                mode = afterBackslash;
            }
        } else if (mode == afterBackslash) {
            mode = (c == 'Q') ? quoted : plain;
        } else if (mode == quoted) {
            if (c == '\\') {
                mode = quotedAfterBackslash;
            }
        } else if (mode == quotedAfterBackslash) {
            if (c == 'E') {
                mode = plain;
            } else if (c != '\\') {
                mode = quoted;
            }
        }
        out.append(c);
        ++index;
    }
    return out.toString();
}
/**
* Convenience static function, using standard parameters.
* Delegates to transform() on the shared STANDARD instance.
* @param regex as in transform()
* @return processed regex pattern, as in transform()
*/
public static String fix(String regex) {
return STANDARD.transform(regex);
}
/**
* Runs the regex string through the same processing as fix(...) and compiles
* the result.
*
* @param regex Raw regex pattern, as in fix(...).
* @return the Pattern compiled from the processed regex
*/
public static Pattern compile(String regex) {
    final String processed = STANDARD.transform(regex);
    return Pattern.compile(processed);
}
/**
* Runs the regex string through the same processing as fix(...) and compiles
* the result with the given flags.
*
* @param regex Raw regex pattern, as in fix(...).
* @param options flags handed unchanged to Pattern.compile(String, int)
* @return the Pattern compiled from the processed regex
*/
public static Pattern compile(String regex, int options) {
    final String processed = STANDARD.transform(regex);
    return Pattern.compile(processed, options);
}
/**
* Composes a string from a set of BNF lines held in a single string; see the
* List version for more information.
*
* @param bnfLines Series of BNF lines, separated by \r\n, \r or \n.
* @return the composed string, as returned by the List version
*/
public String compileBnf(String bnfLines) {
    // Break the text into lines at \r\n, a lone \r, or \n.
    final String[] separated = bnfLines.split("\\r\\n?|\\n");
    return compileBnf(Arrays.asList(separated));
}
/**
* Compile a composed string from a set of BNF lines, such as for composing a regex
* expression. The lines can be in any order, but there must not be any
* cycles. The result can be used as input for fix().
* <p>
* Example:
* <pre>
* uri = (?: (scheme) \\:)? (host) (?: \\? (query))? (?: \\u0023 (fragment))?;
* scheme = reserved+;
* host = // reserved+;
* query = [\\=reserved]+;
* fragment = reserved+;
* reserved = [[:ascii:][:alphabetic:]];
* </pre>
* <p>
* Caveats: at this point the parsing is simple; for example, # cannot be
* quoted (use \\u0023); you can set it to null to disable.
* The equality sign and a few others can be reset with
* setBnfX().
*
* @param lines Series of lines that represent a BNF expression. The lines contain
* a series of statements that of the form x=y;. A statement can take
* multiple lines, but there can't be multiple statements on a line.
* A hash quotes to the end of the line.
* @return the definition of the single root variable, with the other variables
* substituted into it
* @throws IllegalArgumentException if the number of variables that were never
* substituted into another definition is not exactly one
*/
public String compileBnf(List<String> lines) {
    Map<String, String> variables = getVariables(lines);
    // Every variable starts out as a root candidate; one that gets substituted
    // into another definition is dropped from the candidates.
    Set<String> unused = new LinkedHashSet<>(variables.keySet());
    // brute force replacement; do twice to allow for different order
    // later on can optimize
    for (int pass = 0; pass < 2; ++pass) {
        for (Entry<String, String> entry : variables.entrySet()) {
            String variable = entry.getKey();
            String definition = entry.getValue();
            for (Entry<String, String> entry2 : variables.entrySet()) {
                String variable2 = entry2.getKey();
                if (variable.equals(variable2)) {
                    continue;
                }
                String definition2 = entry2.getValue();
                // Plain substring substitution of the variable name by its definition.
                String altered2 = definition2.replace(variable, definition);
                if (!altered2.equals(definition2)) {
                    unused.remove(variable);
                    // variable2 is already a key, so this only replaces its value.
                    variables.put(variable2, altered2);
                }
            }
        }
    }
    if (unused.size() != 1) {
        throw new IllegalArgumentException("Not a single root: " + unused);
    }
    return variables.get(unused.iterator().next());
}
/**
* Returns the BNF comment string held by this instance. The field is declared with
* the initial value "#"; the compileBnf documentation says a hash comments out the
* rest of a line and that the value can be set to null to disable that.
* @return the current BNF comment string
*/
public String getBnfCommentString() {
return bnfCommentString;
}
/**
* Replaces the BNF comment string held by this instance (declared with the initial
* value "#"). The compileBnf documentation says it can be set to null to disable.
* @param bnfCommentString the new comment string
*/
public void setBnfCommentString(String bnfCommentString) {
this.bnfCommentString = bnfCommentString;
}
/**
* Returns the BNF variable infix held by this instance. The field is declared with
* the initial value "=".
* NOTE(review): presumably the separator between a variable and its definition in
* "x=y;" statements -- confirm against the BNF line parser.
* @return the current BNF variable infix
*/
public String getBnfVariableInfix() {
return bnfVariableInfix;
}
/**
* Replaces the BNF variable infix held by this instance (declared with the initial
* value "=").
* @param bnfVariableInfix the new variable infix
*/
public void setBnfVariableInfix(String bnfVariableInfix) {
this.bnfVariableInfix = bnfVariableInfix;
}
/**
* Returns the BNF line separator held by this instance. The field is declared with
* the initial value "\n".
* NOTE(review): how the separator is applied is decided by the BNF line parser,
* which is not part of this accessor -- confirm there.
* @return the current BNF line separator
*/
public String getBnfLineSeparator() {
return bnfLineSeparator;
}
/**
* Replaces the BNF line separator held by this instance (declared with the initial
* value "\n").
* @param bnfLineSeparator the new line separator
*/
public void setBnfLineSeparator(String bnfLineSeparator) {
this.bnfLineSeparator = bnfLineSeparator;
}
/**
* Utility for loading lines from a file.
* The file is opened here and closed again before this method returns, whether
* or not reading succeeds.
* @param result The list that the lines are appended to.
* @param file The name of the file to open as an input stream.
* @param encoding if null, then UTF-8
* @return filled list (the same object as {@code result})
* @throws IOException If there were problems opening, reading or closing the file.
*/
public static List<String> appendLines(List<String> result, String file, String encoding) throws IOException {
    // try-with-resources closes the stream on every exit path.
    try (InputStream is = new FileInputStream(file)) {
        return appendLines(result, is, encoding);
    }
}
/**
* Utility for loading lines from an input stream.
* Each line returned by BufferedReader.readLine() is appended to {@code result}.
* The stream is not closed here; that is left to the caller.
* @param result The list that the lines are appended to.
* @param inputStream The input stream.
* @param encoding if null, then UTF-8
* @return filled list (the same object as {@code result})
* @throws UnsupportedEncodingException If the named encoding is not supported.
* @throws IOException If there were problems reading from the input stream.
*/
public static List<String> appendLines(List<String> result, InputStream inputStream, String encoding)
        throws UnsupportedEncodingException, IOException {
    String charsetName = (encoding == null) ? "UTF-8" : encoding;
    BufferedReader in = new BufferedReader(new InputStreamReader(inputStream, charsetName));
    // readLine() returns null at end of stream.
    for (String line = in.readLine(); line != null; line = in.readLine()) {
        result.add(line);
    }
    return result;
}
/**
* Returns a copy of this object made with {@code clone()}.
*
* @return the clone, cast to UnicodeRegex
* @throws IllegalArgumentException if clone() unexpectedly fails; the
* CloneNotSupportedException is attached as the cause
* @see com.ibm.icu.util.Freezable#cloneAsThawed()
*/
@Override
public UnicodeRegex cloneAsThawed() {
    try {
        return (UnicodeRegex) clone();
    } catch (CloneNotSupportedException e) {
        // should never happen; keep the cause so the failure can be diagnosed if it does
        throw new IllegalArgumentException("clone() failed for UnicodeRegex", e);
    }
}
/* (non-Javadoc)
* Performs no work and returns this same object.
* @see com.ibm.icu.util.Freezable#freeze()
*/
@Override
public UnicodeRegex freeze() {
// no action needed now.
return this;
}
/* (non-Javadoc)
* Unconditionally reports true.
* NOTE(review): the setters of this class assign their fields without consulting
* this method -- confirm whether that is intended.
* @see com.ibm.icu.util.Freezable#isFrozen()
*/
@Override
public boolean isFrozen() {
// at this point, always true
return true;
}
// ===== PRIVATES =====
/**
* Parses the UnicodeSet that starts at index i of regex and appends its
* pattern text to result.
*
* @param regex the regex being processed
* @param i index in regex where the set starts
* @param result receives the pattern text of the parsed set
* @param temp scratch set; cleared and then filled from the parsed pattern
* @param pos scratch parse position; set to i before parsing
* @return the parse position after the set, minus one, so that the caller's
* loop increment lands on the first character after the set
* @throws IllegalArgumentException wrapping any exception raised while
* processing, with the regex in the message
*/
private int processSet(String regex, int i, StringBuilder result, UnicodeSet temp, ParsePosition pos) {
    try {
        pos.setIndex(i);
        final UnicodeSet parsed = temp.clear().applyPattern(regex, pos, symbolTable, 0);
        parsed.complement().complement(); // hack to fix toPattern
        final String icuPattern = parsed.toPattern(false);
        // Escaping of supplementary code points differs between ICU UnicodeSet and Java regex.
        final String javaPattern = icuPattern.contains("\\U")
                ? SUPP_ESCAPE.matcher(icuPattern).replaceAll("\\\\x{$1}")
                : icuPattern;
        result.append(javaPattern);
        return pos.getIndex() - 1; // allow for the loop increment
    } catch (Exception e) {
        throw new IllegalArgumentException("Error in " + regex, e);
    }
}
// Shared instance used by the static fix() and compile() convenience methods.
private static final UnicodeRegex STANDARD = new UnicodeRegex();
// Initial values of the BNF settings; each has a public getter and setter.
private String bnfCommentString = "#";
private String bnfVariableInfix = "=";
private String bnfLineSeparator = "\n";
// private Appendable log = null;
private Comparator