org.terasoluna.gfw.common.codepoints.CodePoints Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of terasoluna-gfw-codepoints Show documentation
Show all versions of terasoluna-gfw-codepoints Show documentation
Functionalities to handle Codepoints
/*
* Copyright(c) 2013 NTT DATA Corporation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
* either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/
package org.terasoluna.gfw.common.codepoints;
import java.io.Serializable;
import java.lang.reflect.InvocationTargetException;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
/**
* Represents the collection of code point. This class holds immutable code points as {@link java.util.Set} and provides
*
* - check method if the code points in the given string are included
* - set operations (union, subtract, intersect)
*
* How to create an instance
* Use the factory method to create a cached instance
*
*
* CodePoints cp = CodePoints.of(ASCIIPrintableChars.class);
*
*
* The constructor can be also used. In this case, of course, the set of code points are not cached and created every time.
*
*
* CodePoints cp = new ASCIIPrintableChars();
*
*
* There are four types of constructor:
*
* - Pass {@code int} varargs
*
*
* CodePoints cp = new CodePoints(0x0061, 0x0062); // a b
*
*
*
* - Pass {@link java.util.Collection} of {@link java.lang.Integer}
*
*
* {@literal Set<Integer>} set = new {@literal HashSet<>}();
* set.add(0x0061); // a
* set.add(0x0062); // b
* CodePoints cp = new CodePoints(set);
*
*
*
* - Pass {@link java.lang.String} varargs including the target code points
*
*
* CodePoints cp = new CodePoints("ab");
* CodePoints cp = new CodePoints("a", "b"); // is same
*
*
*
* - Pass existing {@link CodePoints}. This type is intended to use for the definition of new code points. The set in the
* {@link CodePoints} are shared.
*
*
* CodePoints cp = ...;
* CodePoints newCp = new CodePoints(cp);
*
*
*
*
* How to check strings
* {@link #containsAll(String)} returns {@code true} if all code points in the given string are
* included in the target code points. Otherwise {@code false} is returned.
*
*
* CodePoints cp = new CodePoints(0x0061, 0x0062); // a b
* cp.containsAll("a"); // true
* cp.containsAll("b"); // true
* cp.containsAll("ab"); // true
* cp.containsAll("c"); // false
* cp.containsAll("abc"); // false
*
*
*
* {@link #firstExcludedCodePoint(String)} returns the first code point in the given string which is not included in the target
* code points.
*
*
* CodePoints cp = new CodePoints(0x0061, 0x0062); // a b
* cp.firstExcludedCodePoint("abc"); // 0x0063 (c)
* cp.firstExcludedCodePoint("abcad"); // 0x0063 (c)
* cp.firstExcludedCodePoint("ab"); // CodePoints#NOT_FOUND
*
*
*
* {@link #allExcludedCodePoints(String)} returns the set of code points in the given string which are not included in the
* target.
*
*
* CodePoints cp = new CodePoints(0x0061, 0x0062); // a b
* cp.allExcludedCodePoints("abc"); // [0x0063 (c)]
* cp.allExcludedCodePoints("abcad"); // [0x0063 (c), 0x0064 (d)]
* cp.allExcludedCodePoints("ab"); // []
*
*
*
* How to compose code points
*
* {@code CodePoints} provides composable APIs. Since a {@code CodePoints} instance is immutable, these APIs do not affect the
* state of {@code CodePoints} instances.
*
* Union
*
* Use {@link #union(CodePoints)}
*
*
*
* CodePoints ab = new CodePoints(0x0061 , 0x0062); // a b
* CodePoints cd = new CodePoints(0x0063, 0x0064); // c d
* CodePoints abcd = ab.union(cd); // a b c d
*
*
* Subtract
*
* Use {@link #subtract(CodePoints)}
*
*
*
* CodePoints abcd = new CodePoints(0x0061 , 0x0062, 0x0063, 0x0064); // a b c d
* CodePoints cd = new CodePoints(0x0063, 0x0064); // c d
* CodePoints ab = abcd.subtract(cd); // a b
*
*
* Intersect
*
* Use {@link #intersect(CodePoints)}
*
*
*
* CodePoints abcd = new CodePoints(0x0061 , 0x0062, 0x0063, 0x0064); // a b c d
* CodePoints cde = new CodePoints(0x0063, 0x0064, 0x0065); // c d e
* CodePoints cd = abcd.intersect(cde); // c d
*
*
* How to define new code points
*
* Extend {@link CodePoints} to define new code points. Following is a simple code points:
*
*
*
* public class ABCD extends CodePoints {
* public ABCD() {
* super(0x0061, 0x0062, 0x0063, 0x0064); // a b c d
* }
* }
*
*
* New code points can be created using the combination of existing code points.
*
*
*
* public class X_JIS_0208_Hiragana_Katakana extends CodePoints {
* public X_JIS_0208_Hiragana_Katakana() {
* super(new X_JIS_0208_Hiragana().union(new X_JIS_0208_Katakana()));
* }
* }
*
*
* Note that {@code new} is used here so that the temporary code points are not cached. If {@code X_JIS_0208_Hiragana} and
* {@code X_JIS_0208_Katakana} are also intended to be used, use {@link #of(Class)} instead of {@code new} so that
* these are cached:
*
*
*
* public class X_JIS_0208_Hiragana_Katakana extends CodePoints {
* public X_JIS_0208_Hiragana_Katakana() {
* super(CodePoints.of(X_JIS_0208_Hiragana.class).union(CodePoints.of(X_JIS_0208_Katakana.class)));
* }
* }
*
* @since 5.1.0
*/
public class CodePoints implements Serializable {

    private static final long serialVersionUID = 1L;

    /**
     * Value returned by {@link #firstExcludedCodePoint(String)} when no code point of the given string is missing from the
     * target code points.
     */
    public static final int NOT_FOUND = Integer.MIN_VALUE;

    /**
     * {@code CodePoints} cache, keyed by the concrete {@code CodePoints} class.
     */
    private static final ConcurrentMap<Class<? extends CodePoints>, CodePoints> cache = new ConcurrentHashMap<>();

    /**
     * Unmodifiable set of the code points held by this instance.
     */
    private final Set<Integer> set;

    /**
     * Constructor with the given {@code java.lang.Integer} code points.
     * @param codePoints array of actual code points
     */
    public CodePoints(Integer... codePoints) {
        Set<Integer> s = new HashSet<>(codePoints.length);
        Collections.addAll(s, codePoints);
        this.set = Collections.unmodifiableSet(s);
    }

    /**
     * Constructor with the given {@code java.lang.String}s. Every code point found in any of the strings is collected.
     * @param strings array of strings which include target code points
     */
    public CodePoints(String... strings) {
        Set<Integer> s = new HashSet<>();
        for (String str : strings) {
            int len = str.length();
            int codePoint;
            // advance by one or two chars so that surrogate pairs are read as a single code point
            for (int i = 0; i < len; i += Character.charCount(codePoint)) {
                codePoint = str.codePointAt(i);
                s.add(codePoint);
            }
        }
        this.set = Collections.unmodifiableSet(s);
    }

    /**
     * Constructor with the given collection of {@code java.lang.Integer} code points. The collection is copied.
     * @param codePoints collection of actual code points
     */
    public CodePoints(Collection<Integer> codePoints) {
        Set<Integer> s = new HashSet<>(codePoints);
        this.set = Collections.unmodifiableSet(s);
    }

    /**
     * Constructor with the given {@code CodePoints}. The {@code java.util.Set} object inside {@code CodePoints} is shared.
     * @param codePoints actual code points
     */
    public CodePoints(CodePoints codePoints) {
        this.set = codePoints.set;
    }

    /**
     * Returns whether all code points in the given string are included in the target code points.
     * @param s target string
     * @return {@code true} if all code points in the given string are included in the target code points (this is also the
     *         case for a {@code null} or empty string). Otherwise {@code false} is returned.
     */
    public boolean containsAll(String s) {
        return this.firstExcludedCodePoint(s) == NOT_FOUND;
    }

    /**
     * Returns the first code point in the given string which is not included in the target code points.
     * @param s target string
     * @return first code point in the given string which is not included in the target code points. {@link #NOT_FOUND} is
     *         returned if the string is {@code null} or empty, or if all code points in the given string are included in
     *         the target code points.
     */
    public int firstExcludedCodePoint(String s) {
        if (s == null || s.isEmpty()) {
            return NOT_FOUND;
        }
        // http://www.ibm.com/developerworks/jp/ysl/library/java/j-unicode_surrogate/
        int len = s.length();
        int codePoint;
        for (int i = 0; i < len; i += Character.charCount(codePoint)) {
            codePoint = s.codePointAt(i);
            if (!set.contains(codePoint)) {
                return codePoint;
            }
        }
        return NOT_FOUND;
    }

    /**
     * Returns the set of code points in the given string which are not included in the target.
     * @param s target string
     * @return set of code points in the given string which are not included in the target, in order of first occurrence.
     *         An empty set is returned if the string is {@code null} or empty, or if all code points in the given string
     *         are included in the target code points.
     */
    public Set<Integer> allExcludedCodePoints(String s) {
        if (s == null || s.isEmpty()) {
            return Collections.emptySet();
        }
        // LinkedHashSet keeps the excluded code points in the order they appear in the string
        Set<Integer> excludedCodePoints = new LinkedHashSet<>();
        // http://www.ibm.com/developerworks/jp/ysl/library/java/j-unicode_surrogate/
        int len = s.length();
        int codePoint;
        for (int i = 0; i < len; i += Character.charCount(codePoint)) {
            codePoint = s.codePointAt(i);
            if (!set.contains(codePoint)) {
                excludedCodePoints.add(codePoint);
            }
        }
        return excludedCodePoints;
    }

    /**
     * Unites two sets of code points. Neither this instance nor the argument is modified.
     * @param codePoints code points to unite
     * @return united code points
     */
    public CodePoints union(CodePoints codePoints) {
        Set<Integer> setTmp = new HashSet<>(this.set);
        setTmp.addAll(codePoints.set);
        return new CodePoints(setTmp);
    }

    /**
     * Subtracts the given code points from this one. Neither this instance nor the argument is modified.
     * @param codePoints code points to subtract
     * @return subtracted code points
     */
    public CodePoints subtract(CodePoints codePoints) {
        Set<Integer> setTmp = new HashSet<>(this.set);
        setTmp.removeAll(codePoints.set);
        return new CodePoints(setTmp);
    }

    /**
     * Intersects two sets of code points. Neither this instance nor the argument is modified.
     * @param codePoints code points to intersect
     * @return intersected code points
     */
    public CodePoints intersect(CodePoints codePoints) {
        Set<Integer> setTmp = new HashSet<>(this.set);
        setTmp.retainAll(codePoints.set);
        return new CodePoints(setTmp);
    }

    /**
     * Produces cached {@link CodePoints}. The first time, a new {@link CodePoints} is created through the no-argument
     * constructor of the given class. From the second time on, the same instance is returned. If several threads create
     * the instance at the same time, only the first one stored in the cache is returned to all of them.
     * @param clazz {@link CodePoints} class to create
     * @param <T> {@link CodePoints} class
     * @return cached instance
     * @throws IllegalArgumentException if the class has no accessible no-argument constructor or if the constructor fails
     */
    @SuppressWarnings("unchecked")
    public static <T extends CodePoints> T of(Class<T> clazz) {
        // the value stored under a Class<T> key is always an instance created from that very class, so the casts are safe
        CodePoints cached = cache.get(clazz);
        if (cached != null) {
            return (T) cached;
        }
        try {
            T codePoints = clazz.getDeclaredConstructor().newInstance();
            // putIfAbsent (rather than put) keeps the instance that won a concurrent race, so every caller sees the same
            // object. computeIfAbsent is deliberately not used: the constructor may itself call of(...) and update the
            // cache, which a ConcurrentHashMap mapping function must not do.
            CodePoints existing = cache.putIfAbsent(clazz, codePoints);
            return existing != null ? (T) existing : codePoints;
        } catch (NoSuchMethodException | SecurityException | IllegalAccessException | IllegalArgumentException e) {
            throw new IllegalArgumentException("public default constructor not found", e);
        } catch (InstantiationException | InvocationTargetException e) {
            throw new IllegalArgumentException("exception occurred while initializing", e);
        }
    }

    /**
     * Helper method to check whether all code points in the given string are included in any of the code points list.
     * Each code point of the string only has to be included in at least one of the given {@code CodePoints}.
     * @param s target string
     * @param codePointsList array of code points
     * @return {@code true} if all code points in the given string are included in any of the code points list (this is
     *         also the result when {@code codePointsList} is empty). Otherwise {@code false} is returned.
     */
    public static boolean containsAllInAnyCodePoints(String s,
            final CodePoints... codePointsList) {
        // code point -> number of CodePoints in the list that do not include it
        Map<Integer, Integer> excludedCounts = new HashMap<>();
        for (CodePoints codePoints : codePointsList) {
            Set<Integer> excluded = codePoints.allExcludedCodePoints(s);
            if (excluded.isEmpty()) {
                // return immediately if the given string consists of a code points.
                return true;
            }
            for (Integer codePoint : excluded) {
                // count the number of CodePoints in the given list which forbade the given code point
                Integer count = excludedCounts.get(codePoint);
                if (count != null) {
                    excludedCounts.put(codePoint, count + 1);
                } else {
                    excludedCounts.put(codePoint, 1);
                }
            }
        }
        for (int count : excludedCounts.values()) {
            if (count == codePointsList.length) {
                // All CodePoints forbade the given code point.
                // This means there are some code points which are not included in any given CodePoints' list
                return false;
            }
        }
        // OK if each code point is included in some CodePoints' list
        return true;
    }

    /**
     * Compares by runtime class and by the held set of code points. Instances of different classes are never equal, even
     * if they hold the same code points.
     * @param o object to check
     * @return {@code true} if the given object equals to this instance. {@code false} otherwise.
     */
    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (o == null || getClass() != o.getClass()) {
            return false;
        }
        CodePoints that = (CodePoints) o;
        return set.equals(that.set);
    }

    /**
     * Hash code of the instance, derived from the held set of code points.
     * @return hash code
     */
    @Override
    public int hashCode() {
        return set.hashCode();
    }
}