// org.exist.util.Collations Maven / Gradle / Ivy
/*
* eXist Open Source Native XML Database
* Copyright (C) 2001-2017 The eXist Project
* http://exist-db.org
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the Free Software Foundation
* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
package org.exist.util;
import java.net.URI;
import java.net.URISyntaxException;
import java.text.StringCharacterIterator;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import java.util.Locale;
import java.util.StringTokenizer;
import java.util.concurrent.atomic.AtomicReference;
import java.util.stream.Collectors;
import com.ibm.icu.text.*;
import com.ibm.icu.util.ULocale;
import com.ibm.icu.util.VersionInfo;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.exist.xquery.ErrorCodes;
import org.exist.xquery.XPathException;
import javax.annotation.Nullable;
/**
* Utility methods dealing with collations.
*
* @author wolf
* @author Adam Retter
*/
public class Collations {
private static final Logger logger = LogManager.getLogger(Collations.class);

/**
 * URI of the default Unicode Codepoint Collation, as defined by the XQuery
 * spec.
 */
public static final String UNICODE_CODEPOINT_COLLATION_URI = "http://www.w3.org/2005/xpath-functions/collation/codepoint";

/**
 * Short string to select the default codepoint collation.
 */
public static final String CODEPOINT_SHORT = "codepoint";

/**
 * URI of the UCA (Unicode Collation Algorithm) collation, as defined by
 * the XQuery spec.
 */
public static final String UCA_COLLATION_URI = "http://www.w3.org/2013/collation/UCA";

/**
 * URI of the HTML ASCII Case-Insensitive Collation, as defined by the
 * XPath F&amp;O spec.
 */
public static final String HTML_ASCII_CASE_INSENSITIVE_COLLATION_URI = "http://www.w3.org/2005/xpath-functions/collation/html-ascii-case-insensitive";

/**
 * URI of the XQTS ASCII Case-blind Collation, as defined by the XQTS 3.1.
 */
public static final String XQTS_ASCII_CASE_BLIND_COLLATION_URI = "http://www.w3.org/2010/09/qt-fots-catalog/collation/caseblind";

/**
 * The URI used to select collations in eXist.
 */
public static final String EXIST_COLLATION_URI = "http://exist-db.org/collation";
/**
 * Lazy-initialized singleton Html Ascii Case Insensitive Collator.
 * Typed as {@code AtomicReference<Collator>} so that {@code get()} yields a
 * {@link Collator} without a cast.
 */
private static final AtomicReference<Collator> htmlAsciiCaseInsensitiveCollator = new AtomicReference<>();

/**
 * Lazy-initialized singleton XQTS Case Blind Collator.
 */
private static final AtomicReference<Collator> xqtsAsciiCaseBlindCollator = new AtomicReference<>();

/**
 * Lazy-initialized singleton Samisk Collator.
 */
private static final AtomicReference<Collator> samiskCollator = new AtomicReference<>();
/**
* Get a {@link Comparator}from the specified URI.
*
* The original code is from saxon (@linkplain http://saxon.sf.net).
*
* @param uri The URI describing the collation and settings
*
* @return The Collator for the URI, or null.
*
* @throws XPathException If an error occurs whilst constructing the Collator
*/
public static @Nullable Collator getCollationFromURI(final String uri) throws XPathException {
final Collator collator;
if (uri.startsWith(EXIST_COLLATION_URI) || uri.startsWith(UCA_COLLATION_URI) || uri.startsWith("?")) {
URI u;
try {
u = new URI(uri);
} catch (final URISyntaxException e) {
return null;
}
final String query = u.getQuery();
if (query == null) {
collator = Collator.getInstance();
} else {
boolean fallback = true; // default is "yes"
String lang = null;
String version = null;
String strength = null;
String maxVariable = "punct"; // default is punct
String alternate = "non-ignorable"; // default is non-ignorable
boolean backwards = false; // default is "no"
boolean normalization = false; // default is "no"
boolean caseLevel = false; // default is "no"
String caseFirst = null;
boolean numeric = false; // default is "no"
String reorder = null;
String decomposition = null;
final StringTokenizer queryTokenizer = new StringTokenizer(query, ";&");
while (queryTokenizer.hasMoreElements()) {
final String param = queryTokenizer.nextToken();
final int eq = param.indexOf('=');
if (eq > 0) {
final String kw = param.substring(0, eq);
if (kw != null) {
final String val = param.substring(eq + 1);
switch (kw) {
case "fallback":
fallback = "yes".equals(val);
break;
case "lang":
lang = val;
break;
case "version":
version = val;
break;
case "strength":
strength = val;
break;
case "maxVariable":
maxVariable = val;
break;
case "alternate":
alternate = val;
break;
case "backwards":
backwards = "yes".equals(val);
break;
case "normalization":
normalization = "yes".equals(val);
break;
case "caseLevel":
caseLevel = "yes".equals(val);
break;
case "caseFirst":
caseFirst = val;
break;
case "numeric":
numeric = "yes".equals(val);
break;
case "reorder":
reorder = val;
break;
case "decomposition":
decomposition = val;
break;
default:
logger.warn("Unrecognized Collation parameter: " + kw);
break;
}
}
}
}
collator = getCollationFromParams(fallback, lang, version,
strength, maxVariable, alternate, backwards,
normalization, caseLevel, caseFirst, numeric,
reorder, decomposition);
}
} else if(HTML_ASCII_CASE_INSENSITIVE_COLLATION_URI.equals(uri)) {
try {
collator = getHtmlAsciiCaseInsensitiveCollator();
} catch (final Exception e) {
throw new XPathException("Unable to instantiate HTML ASCII Case Insensitive Collator: " + e.getMessage(), e);
}
} else if(XQTS_ASCII_CASE_BLIND_COLLATION_URI.equals(uri)) {
try {
collator = getXqtsAsciiCaseBlindCollator();
} catch (final Exception e) {
throw new XPathException("Unable to instantiate XQTS ASCII Case Blind Collator: " + e.getMessage(), e);
}
} else if (uri.startsWith("java:")) {
// java class specified: this should be a subclass of
// com.ibm.icu.text.RuleBasedCollator
final String uriClassName = uri.substring("java:".length());
try {
final Class> collatorClass = Class.forName(uriClassName);
if (!Collator.class.isAssignableFrom(collatorClass)) {
final String msg = "The specified collator class '" + collatorClass.getName() + "' is not a subclass of com.ibm.icu.text.Collator";
logger.error(msg);
throw new XPathException(ErrorCodes.FOCH0002, msg);
}
collator = (Collator) collatorClass.newInstance();
} catch (final Exception e) {
final String msg = "The specified collator class " + uriClassName + " could not be found";
logger.error(msg);
throw new XPathException(ErrorCodes.FOCH0002, msg, e);
}
} else if (UNICODE_CODEPOINT_COLLATION_URI.equals(uri)) {
collator = null;
} else {
final String msg = "Unknown collation : '" + uri + "'";
logger.error(msg);
throw new XPathException(ErrorCodes.FOCH0002, msg);
}
if (collator != null) {
// make immutable and therefore thread-safe!
collator.freeze();
}
return collator;
}
/**
 * Tests two strings for equality under a Collation.
 *
 * @param collator The collation, or null to fall back to
 *     {@link String#equals(Object)}.
 * @param s1 The first string to compare against the second.
 * @param s2 The second string to compare against the first.
 *
 * @return true if the Strings are equal.
 */
public static boolean equals(@Nullable final Collator collator, final String s1, final String s2) {
    return collator == null
            ? s1.equals(s2)
            : collator.equals(s1, s2);
}
/**
 * Compares two strings with regards to a Collation.
 *
 * When no collator is given the strings are compared with
 * {@link String#compareTo(String)}; a {@code null} string sorts before any
 * non-null string and two {@code null} strings are equal.
 *
 * @param collator The collation, or null if no collation should be used.
 * @param s1 The first string to compare against the second.
 * @param s2 The second string to compare against the first.
 *
 * @return a negative integer, zero, or a positive integer if the
 *     {@code s1} is less than, equal to, or greater than {@code s2}.
 *
 * @throws UnsupportedOperationException if ICU4J does not support collation
 */
public static int compare(@Nullable final Collator collator, final String s1, final String s2) {
    if (collator != null) {
        return collator.compare(s1, s2);
    }

    if (s1 == null) {
        return s2 == null ? 0 : -1;
    }
    if (s2 == null) {
        // mirror of the branch above; String#compareTo(null) would throw
        return 1;
    }
    return s1.compareTo(s2);
}
/**
 * Tests whether one string starts with another under a Collation.
 *
 * With a collator, an empty {@code s2} always matches and an empty
 * {@code s1} never matches a non-empty {@code s2}; otherwise the first
 * collation match of {@code s2} in {@code s1} must be at index 0.
 *
 * @param collator The collation, or null if no collation should be used.
 *     A non-null collator is cast to {@link RuleBasedCollator}.
 * @param s1 The string to inspect.
 * @param s2 The prefix to look for.
 *
 * @return true if {@code s1} starts with {@code s2}.
 *
 * @throws UnsupportedOperationException if ICU4J does not support collation
 */
public static boolean startsWith(@Nullable final Collator collator, final String s1, final String s2) {
    if (collator == null) {
        return s1.startsWith(s2);
    }
    if (s2.isEmpty()) {
        return true;
    }
    if (s1.isEmpty()) {
        return false;
    }

    final StringSearch search =
            new StringSearch(s2, new StringCharacterIterator(s1), (RuleBasedCollator) collator);
    final int firstMatch = search.first();
    return firstMatch == 0;
}
/**
 * Determines if one string ends with another with regards to a Collation.
 *
 * With a collator, an empty {@code s2} always matches and an empty
 * {@code s1} never matches a non-empty {@code s2}; otherwise {@code s1}
 * ends with {@code s2} if any collation match of {@code s2} finishes
 * exactly at the end of {@code s1}. Overlapping matches are considered, so
 * that for example {@code "aaa"} is found to end with {@code "aa"}.
 *
 * @param collator The collation, or null if no collation should be used.
 *     A non-null collator is cast to {@link RuleBasedCollator}.
 * @param s1 The string to inspect.
 * @param s2 The suffix to look for.
 *
 * @return true if {@code s1} ends with {@code s2}.
 *
 * @throws UnsupportedOperationException if ICU4J does not support collation
 */
public static boolean endsWith(@Nullable final Collator collator, final String s1, final String s2) {
    if (collator == null) {
        return s1.endsWith(s2);
    }
    if (s2.length() == 0) {
        return true;
    }
    if (s1.length() == 0) {
        return false;
    }

    final SearchIterator searchIterator =
            new StringSearch(s2, new StringCharacterIterator(s1), (RuleBasedCollator) collator);
    // by default only non-overlapping matches are reported, which skips a
    // trailing match that shares characters with an earlier one
    searchIterator.setOverlapping(true);

    final int end = s1.length();
    for (int pos = searchIterator.first(); pos != SearchIterator.DONE;
            pos = searchIterator.next()) {
        if (pos + searchIterator.getMatchLength() == end) {
            return true;
        }
    }
    return false;
}
/**
 * Tests whether one string contains another under a Collation.
 *
 * With a collator, an empty {@code s2} is always contained and an empty
 * {@code s1} never contains a non-empty {@code s2}.
 *
 * @param collator The collation, or null if no collation should be used.
 *     A non-null collator is cast to {@link RuleBasedCollator}.
 * @param s1 The string to search in.
 * @param s2 The string to search for.
 *
 * @return true if {@code s1} contains {@code s2}.
 *
 * @throws UnsupportedOperationException if ICU4J does not support collation
 */
public static boolean contains(@Nullable final Collator collator, final String s1, final String s2) {
    if (collator == null) {
        return s1.contains(s2);
    }
    if (s2.isEmpty()) {
        return true;
    }
    if (s1.isEmpty()) {
        return false;
    }

    final StringSearch search =
            new StringSearch(s2, new StringCharacterIterator(s1), (RuleBasedCollator) collator);
    final int firstMatch = search.first();
    return firstMatch >= 0;
}
/**
 * Locates one string within another under a Collation.
 *
 * With a collator, an empty {@code s2} is found at index 0 and nothing
 * non-empty is found in an empty {@code s1}.
 *
 * @param collator The collation, or null if no collation should be used.
 *     A non-null collator is cast to {@link RuleBasedCollator}.
 * @param s1 The string to look for {@code s2} in
 * @param s2 The substring to look for in {@code s1}.
 *
 * @return the index of the first occurrence of the specified substring,
 *     or {@code -1} if there is no such occurrence.
 */
public static int indexOf(@Nullable final Collator collator, final String s1, final String s2) {
    if (collator == null) {
        return s1.indexOf(s2);
    }
    if (s2.isEmpty()) {
        return 0;
    }
    if (s1.isEmpty()) {
        return -1;
    }

    final StringSearch search =
            new StringSearch(s2, new StringCharacterIterator(s1), (RuleBasedCollator) collator);
    return search.first();
}
/**
 * Get a Collator with the provided settings.
 *
 * The returned collator is left unfrozen; the settings are applied in the
 * order of the parameters, with {@code decomposition} applied last so that
 * it overrides the decomposition mode chosen by {@code normalization}.
 *
 * @param fallback Determines whether the processor uses a fallback
 *     collation if a conformant collation is not available. Disabling
 *     fallback is not supported; a warning is logged instead.
 * @param lang language code: a string in the lexical space of xs:language.
 *     {@code "sme-SE"} selects the built-in Samisk rules.
 * @param version A UCA version; an error is raised if the version of the
 *     collator is lower than the requested one.
 * @param strength The collation strength as defined in UCA.
 * @param maxVariable Indicates that all characters in the specified group
 *     and earlier groups are treated as "noise" characters to be handled
 *     as defined by the alternate parameter. "space" | "punct" | "symbol"
 *     | "currency".
 * @param alternate Controls the handling of characters such as spaces and
 *     hyphens; specifically, the "noise" characters in the groups selected
 *     by the maxVariable parameter. "non-ignorable" | "shifted" |
 *     "blanked" ("blanked" is handled like "shifted").
 * @param backwards indicates that the last accent in the string is the
 *     most significant.
 * @param normalization Indicates whether strings are converted to
 *     normalization form D.
 * @param caseLevel When used with primary strength, setting caseLevel has
 *     the effect of ignoring accents while taking account of case. It is
 *     only applied when the collator strength is primary.
 * @param caseFirst Indicates whether upper-case precedes lower-case or
 *     vice versa. "upper" | "lower".
 * @param numeric When numeric is specified, a sequence of consecutive
 *     digits is interpreted as a number, for example chap2 sorts before
 *     chap12.
 * @param reorder Comma separated reorder codes determining the relative
 *     ordering of text in different scripts. Codes that are not supported
 *     are skipped.
 * @param decomposition The decomposition: "none" | "full" | "standard".
 *
 * @return The collator or null if a Collator could not be retrieved
 *
 * @throws XPathException if an error occurs whilst getting the Collator
 */
private static @Nullable Collator getCollationFromParams(
        final boolean fallback, @Nullable final String lang,
        @Nullable final String version, @Nullable final String strength,
        final String maxVariable, final String alternate,
        final boolean backwards, final boolean normalization,
        final boolean caseLevel, @Nullable final String caseFirst,
        final boolean numeric, @Nullable final String reorder,
        @Nullable final String decomposition) throws XPathException {

    final Collator collator;
    if ("sme-SE".equals(lang)) {
        try {
            // The cached Samisk collator is frozen and shared; the setters
            // below need a private, modifiable copy of it.
            collator = getSamiskCollator().cloneAsThawed();
        } catch (final Exception pe) {
            logger.error(pe.getMessage(), pe);
            return null;
        }
    } else {
        final ULocale locale = getLocale(lang);
        collator = Collator.getInstance(locale);
    }

    if (!fallback) {
        //TODO(AR) how to disable fallback in ICU?
        logger.warn("eXist-db does not yet support disabling collation fallback");
    }

    if (version != null) {
        final VersionInfo versionInfo;
        try {
            versionInfo = VersionInfo.getInstance(version);
        } catch (final IllegalArgumentException iae) {
            logger.error(iae.getMessage(), iae);
            throw new XPathException(iae.getMessage(), iae);
        }

        if (collator.getVersion().compareTo(versionInfo) < 0) {
            throw new XPathException("Requested UCA Collation version: " + version + ", however eXist-db only has ICU UCA: " + collator.getVersion().toString());
        }
    }

    if (strength != null) {
        switch (strength) {
            case "identical":
                // the default setting
                collator.setStrength(Collator.IDENTICAL);
                break;
            case "1":
            case "primary":
                collator.setStrength(Collator.PRIMARY);
                break;
            case "2":
            case "secondary":
                collator.setStrength(Collator.SECONDARY);
                break;
            case "3":
            case "tertiary":
                collator.setStrength(Collator.TERTIARY);
                break;
            case "4":
            case "quaternary":
                collator.setStrength(Collator.QUATERNARY);
                break;
            default:
                final String msg = "eXist-db only supports Collation strengths of 'identical', 'primary', 'secondary', 'tertiary' or 'quaternary', requested: " + strength;
                logger.error(msg);
                throw new XPathException(ErrorCodes.FOCH0002, msg);
        }
    }

    if (maxVariable != null) {
        switch (maxVariable) {
            case "space":
                collator.setMaxVariable(Collator.ReorderCodes.SPACE);
                break;
            case "punct":
                collator.setMaxVariable(Collator.ReorderCodes.PUNCTUATION);
                break;
            case "symbol":
                collator.setMaxVariable(Collator.ReorderCodes.SYMBOL);
                break;
            case "currency":
                collator.setMaxVariable(Collator.ReorderCodes.CURRENCY);
                break;
            default:
                final String msg = "eXist-db only supports Collation maxVariables of 'space', 'punct', 'symbol', or 'currency', requested: " + maxVariable;
                logger.error(msg);
                throw new XPathException(ErrorCodes.FOCH0002, msg);
        }
    }

    if (alternate != null) {
        switch (alternate) {
            case "non-ignorable":
                ((RuleBasedCollator) collator).setAlternateHandlingShifted(false);
                break;
            case "shifted":
            case "blanked":
                ((RuleBasedCollator) collator).setAlternateHandlingShifted(true);
                break;
            default:
                // report the offending 'alternate' value (not caseFirst)
                final String msg = "Collation alternate should be either 'non-ignorable', 'shifted' or 'blanked', but received: " + alternate;
                logger.error(msg);
                throw new XPathException(ErrorCodes.FOCH0002, msg);
        }
    }

    if (backwards) {
        ((RuleBasedCollator) collator).setFrenchCollation(true);
    }

    if (normalization) {
        collator.setDecomposition(Collator.CANONICAL_DECOMPOSITION);
    } else {
        collator.setDecomposition(Collator.NO_DECOMPOSITION);
    }

    if (caseLevel && collator.getStrength() == Collator.PRIMARY) {
        ((RuleBasedCollator) collator).setCaseLevel(true);
    }

    if (caseFirst != null) {
        switch (caseFirst) {
            case "upper":
                ((RuleBasedCollator) collator).setUpperCaseFirst(true);
                break;
            case "lower":
                ((RuleBasedCollator) collator).setLowerCaseFirst(true);
                break;
            default:
                final String msg = "Collation case first should be either 'upper' or 'lower', but received: " + caseFirst;
                logger.error(msg);
                throw new XPathException(ErrorCodes.FOCH0002, msg);
        }
    }

    if (numeric) {
        ((RuleBasedCollator) collator).setNumericCollation(true);
    }

    if (reorder != null) {
        // translate each requested code; unsupported codes (reported as a
        // negative value by the translation) are left out
        final int[] codes = Arrays.stream(reorder.split(","))
                .mapToInt(Collations::toICUCollatorReorderCode)
                .filter(code -> code > -1)
                .toArray();
        if (codes.length > 0) {
            collator.setReorderCodes(codes);
        }
    }

    if (decomposition != null) {
        switch (decomposition) {
            case "none":
                collator.setDecomposition(Collator.NO_DECOMPOSITION);
                break;
            case "full":
                collator.setDecomposition(Collator.FULL_DECOMPOSITION);
                break;
            case "standard":
            case "":
                // the default setting
                collator.setDecomposition(Collator.CANONICAL_DECOMPOSITION);
                break;
            default:
                final String msg = "Collation decomposition should be either 'none', 'full' or 'standard', but received: " + decomposition;
                logger.error(msg);
                throw new XPathException(ErrorCodes.FOCH0002, msg);
        }
    }

    return collator;
}
/**
 * Translates a reorder code name into the matching ICU
 * {@code Collator.ReorderCodes} constant.
 *
 * The name is matched case-insensitively. Lower-casing uses
 * {@link Locale#ROOT} so that the match does not depend on the default
 * locale of the JVM (e.g. "DIGIT" under a Turkish locale).
 *
 * NOTE(review): the caller in this file treats every result that is not
 * greater than -1 as "unsupported"; confirm that none of the ICU constants
 * returned here (notably {@code ReorderCodes.DEFAULT}) is negative,
 * otherwise that code is dropped without a warning.
 *
 * @param reorderCode The name of the reorder code
 *
 * @return The ICU reorder code, or -1 (after logging a warning) if the
 *     name is not supported
 */
private static int toICUCollatorReorderCode(final String reorderCode) {
    switch (reorderCode.toLowerCase(Locale.ROOT)) {
        case "default":
            return Collator.ReorderCodes.DEFAULT;
        case "none":
            return Collator.ReorderCodes.NONE;
        case "others":
            return Collator.ReorderCodes.OTHERS;
        case "space":
            return Collator.ReorderCodes.SPACE;
        case "first":
            return Collator.ReorderCodes.FIRST;
        case "punctuation":
            return Collator.ReorderCodes.PUNCTUATION;
        case "symbol":
            return Collator.ReorderCodes.SYMBOL;
        case "currency":
            return Collator.ReorderCodes.CURRENCY;
        case "digit":
            return Collator.ReorderCodes.DIGIT;
        default:
            logger.warn("eXist-db does not support the collation reorderCode: " + reorderCode);
            return -1;
    }
}
/**
 * Resolve the ICU locale for a language code.
 *
 * The code is split on {@code '-'} and its one, two or three components
 * are handed to the matching {@link ULocale} constructor.
 *
 * @param lang The language, or null for the default locale
 *
 * @return The locale
 *
 * @throws XPathException if the language does not consist of one, two or
 *     three components
 */
private static ULocale getLocale(@Nullable final String lang) throws XPathException {
    if (lang == null) {
        return ULocale.getDefault();
    }

    final String[] parts = lang.split("-");
    if (parts.length == 1) {
        return new ULocale(parts[0]);
    }
    if (parts.length == 2) {
        return new ULocale(parts[0], parts[1]);
    }
    if (parts.length == 3) {
        return new ULocale(parts[0], parts[1], parts[2]);
    }
    throw new XPathException(ErrorCodes.FOCH0002, "Unrecognized lang=" + lang);
}
/**
 * Returns the shared Samisk collator, building and publishing it on first
 * use. If several threads build it at once, only the first published
 * instance is kept and returned to all of them.
 *
 * @return The frozen Samisk collator
 *
 * @throws Exception if the collation rules cannot be turned into a collator
 */
private static Collator getSamiskCollator() throws Exception {
    final Collator cached = samiskCollator.get();
    if (cached != null) {
        return cached;
    }

    // Collation rules contained in a String object.
    // Codes for the representation of names of languages:
    // http://www.loc.gov/standards/iso639-2/englangn.html
    // UTF-8 characters from:
    // http://chouette.info/entities/table-utf8.php
    final String rules = "< a,A< \u00E1,\u00C1< b,B< c,C"
            + "< \u010d,\u010c< d,D< \u0111,\u0110< e,E"
            + "< f,F< g,G< h,H< i,I< j,J< k,K< l,L< m,M"
            + "< n,N< \u014b,\u014a< o,O< p,P< r,R< s,S"
            + "< \u0161,\u0160< t,T< \u0167,\u0166< u,U"
            + "< v,V< z,Z< \u017e,\u017d";
    samiskCollator.compareAndSet(null, new RuleBasedCollator(rules).freeze());
    return samiskCollator.get();
}
/**
 * Returns the shared HTML ASCII case-insensitive collator, building and
 * publishing it on first use. The rules make each ASCII letter equal to its
 * upper-case form and the strength is set to primary.
 *
 * @return The frozen collator
 *
 * @throws Exception if the collation rules cannot be turned into a collator
 */
private static Collator getHtmlAsciiCaseInsensitiveCollator() throws Exception {
    final Collator cached = htmlAsciiCaseInsensitiveCollator.get();
    if (cached != null) {
        return cached;
    }

    final String rules = "&a=A &b=B &c=C &d=D &e=E &f=F &g=G &h=H "
            + "&i=I &j=J &k=K &l=L &m=M &n=N &o=O &p=P &q=Q &r=R &s=S &t=T "
            + "&u=U &v=V &w=W &x=X &y=Y &z=Z";
    final RuleBasedCollator fresh = new RuleBasedCollator(rules);
    fresh.setStrength(Collator.PRIMARY);

    // keep whichever instance was published first
    htmlAsciiCaseInsensitiveCollator.compareAndSet(null, fresh.freeze());
    return htmlAsciiCaseInsensitiveCollator.get();
}
/**
 * Returns the shared XQTS ASCII case-blind collator, building and
 * publishing it on first use. The rules make each ASCII letter equal to its
 * upper-case form and the strength is set to primary.
 *
 * @return The frozen collator
 *
 * @throws Exception if the collation rules cannot be turned into a collator
 */
private static Collator getXqtsAsciiCaseBlindCollator() throws Exception {
    final Collator cached = xqtsAsciiCaseBlindCollator.get();
    if (cached != null) {
        return cached;
    }

    final String rules = "&a=A &b=B &c=C &d=D &e=E &f=F &g=G &h=H "
            + "&i=I &j=J &k=K &l=L &m=M &n=N &o=O &p=P &q=Q &r=R &s=S &t=T "
            + "&u=U &v=V &w=W &x=X &y=Y &z=Z";
    final RuleBasedCollator fresh = new RuleBasedCollator(rules);
    fresh.setStrength(Collator.PRIMARY);

    // keep whichever instance was published first
    xqtsAsciiCaseBlindCollator.compareAndSet(null, fresh.freeze());
    return xqtsAsciiCaseBlindCollator.get();
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy