// edu.isi.nlp.StringNormalizers -- Maven / Gradle / Ivy
package edu.isi.nlp;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonSerialize;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.immutables.value.Value;
* Provides various normalizers for strings
* @author Ryan Gabbard, Jay DeYoung
public final class StringNormalizers {
private StringNormalizers() {
throw new UnsupportedOperationException();
/** A {@code StringNormalizer} which does nothing. */
public static StringNormalizer identity() {
return IdentityNormalizer.INSTANCE;
* Applies the NFKC Unicode normalization to the given String. You should probably do this or
* {@link #toNfc()} on any strings from multiple sources being compared in all multilingual code
* to deal with e.g. differences in composing accents. In particular, you probably want to apply
* this before applying any other normalizers.
* See Unicode Standard Annex #15 section 1.2
public static StringNormalizer toNfkc() {
return NfkcNormalizer.INSTANCE;
* Applies the NFC Unicode normalization to the given String. You should probably do either this
* or {@link #toNfkc()} on any strings from multiple sources being compared in all multilingual
* code to deal with e.g. differences in composing accents. In particular, you probably want to
* apply this before applying any other normalizers.
See Unicode Standard Annex #15 section 1.2
public static StringNormalizer toNfc() {
return NfcNormalizer.INSTANCE;
* A {@link StringNormalizer} which maps all Unicode codepoints matched by the provided {@link
* CodepointMatcher} to the specified {@code replacementCharacter}.
public static StringNormalizer translate(
CodepointMatcher codepointMatcher, char replacementCharacter) {
return CodepointTranslatorStringNormalizer.of(codepointMatcher, replacementCharacter);
* A {@link StringNormalizer} which collapses all consecutive characters which match the specified
* {@link CodepointMatcher} to the first such character.
public static StringNormalizer collapseConsecutive(CodepointMatcher codepointMatcher) {
return CollapseConsecutiveCharacters.of(codepointMatcher);
@EvalHack(eval = "LORELEI-Y2")
public static StringNormalizer stripFromEnd(CharMatcher toRemove) {
return StripFromEnd.of(toRemove);
* Converts the input to lower-case in a locale-sensitive way. Locales matter - for example the
* lower case verison of I in Turkish is not i but dot-less i!
public static StringNormalizer toLowercase(NlpLocale locale) {
return ToLowerCase.forLocale(locale);
* Converts the input to upper-case in a locale-sensitive way. Locales matter - for example the
* upper case verison of i in Turkish is not I but İ!
public static StringNormalizer toUppercase(NlpLocale locale) {
return ToUpperCase.forLocale(locale);
* A {@link StringNormalizer} which composes a sequence of {@link StringNormalizer}s. The first
* {@code StringNormalizer} in the provided sequence it applied first, then the second is applied
* to its output, and so on.
public static StringNormalizer compose(Iterable extends StringNormalizer> stringNormalizers) {
return CompositeStringNormalizer.of(stringNormalizers);
/** A {@link StringNormalizer} which composes a sequence of {@link StringNormalizer}s. */
public static StringNormalizer compose(StringNormalizer norm1, StringNormalizer... others) {
final List asList = new ArrayList<>();
return CompositeStringNormalizer.of(asList);
public static Function asFunction(StringNormalizer normalizer) {
return NormalizerAsFunction.of(normalizer);
enum IdentityNormalizer implements StringNormalizer {
public String normalize(final String input) {
return input;
public String toString() {
return "Identity";
/** See {@link StringNormalizers#translate(CodepointMatcher, char)} */
abstract class CodepointTranslatorStringNormalizer implements StringNormalizer {
public abstract CodepointMatcher codepointMatcher();
public abstract char replacementCharacter();
public String normalize(final String input) {
return codepointMatcher().replaceAll(input, replacementCharacter());
public static CodepointTranslatorStringNormalizer of(
CodepointMatcher matcher, char replacementCharacter) {
return ImmutableCodepointTranslatorStringNormalizer.of(matcher, replacementCharacter);
/** See {@link StringNormalizers#collapseConsecutive(CodepointMatcher)} */
abstract class CollapseConsecutiveCharacters implements StringNormalizer {
public abstract CodepointMatcher toCollapseMatcher();
public String normalize(final String input) {
final StringBuilder ret = new StringBuilder();
// -1 is out-of-range for Unicode
int lastCodePoint = -1;
for (int offset = 0; offset < input.length(); ) {
final int curCodePoint = input.codePointAt(offset);
final boolean curCashpointIsCollapsible = toCollapseMatcher().matches(curCodePoint);
if (!curCashpointIsCollapsible || curCodePoint != lastCodePoint) {
lastCodePoint = curCodePoint;
offset += Character.charCount(curCodePoint);
return ret.toString();
public static CollapseConsecutiveCharacters of(CodepointMatcher matcher) {
return ImmutableCollapseConsecutiveCharacters.of(matcher);
abstract class StripFromEnd implements StringNormalizer {
public abstract CharMatcher toStripMatcher();
public String normalize(final String input) {
return toStripMatcher().trimTrailingFrom(input);
public static StripFromEnd of(CharMatcher matcher) {
return ImmutableStripFromEnd.of(matcher);
/** See {@link StringNormalizers#compose(Iterable)} */
abstract class CompositeStringNormalizer implements StringNormalizer {
* The {@link StringNormalizer}s to apply. The first in the list will be applied first; the last
* in the list will be applied last. Each {@link StringNormalizer} is applied to the output of the
* previous one.
public abstract ImmutableList wordShapers();
public String normalize(final String input) {
String cur = input;
for (final StringNormalizer shaper : wordShapers()) {
cur = shaper.normalize(cur);
return cur;
public static CompositeStringNormalizer of(Iterable extends StringNormalizer> wordShapers) {
return ImmutableCompositeStringNormalizer.of(wordShapers);
abstract class ToLowerCase implements StringNormalizer {
public abstract NlpLocale locale();
public String normalize(final String input) {
return UCharacter.toLowerCase(locale().asIcuLocale(), input);
public static ToLowerCase forLocale(NlpLocale locale) {
return ImmutableToLowerCase.of(locale);
abstract class ToUpperCase implements StringNormalizer {
public abstract NlpLocale locale();
public String normalize(final String input) {
return UCharacter.toUpperCase(locale().asIcuLocale(), input);
public static ToUpperCase forLocale(NlpLocale locale) {
return ImmutableToUpperCase.of(locale);
/** See {@link StringNormalizers#toNfkc()} */
enum NfkcNormalizer implements StringNormalizer {
private static final Normalizer2 icuNormalizer = Normalizer2.getNFKCInstance();
public String normalize(final String input) {
return icuNormalizer.normalize(input);
public String toString() {
return "toNFKC()";
/** See {@link StringNormalizers#toNfc()} */
enum NfcNormalizer implements StringNormalizer {
private static final Normalizer2 icuNormalizer = Normalizer2.getNFCInstance();
public String normalize(final String input) {
return icuNormalizer.normalize(input);
public String toString() {
return "toNFC()";
/** See {@link StringNormalizers#asFunction(StringNormalizer)}. */
abstract class NormalizerAsFunction implements Function {
public static NormalizerAsFunction of(StringNormalizer normalizer) {
return ImmutableNormalizerAsFunction.of(normalizer);
public abstract StringNormalizer stringNormalizer();
public final String apply(String s) {
return stringNormalizer().normalize(s);