All downloads are free. Search and download functionality uses the official Maven repository.

edu.isi.nlp.StringNormalizers Maven / Gradle / Ivy

The newest version!
package edu.isi.nlp;

import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonSerialize;
import com.google.common.base.CharMatcher;
import com.google.common.base.Function;
import com.google.common.collect.ImmutableList;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.Normalizer2;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.immutables.value.Value;

/**
 * Provides various normalizers for strings
 *
 * @author Ryan Gabbard, Jay DeYoung
 */
public final class StringNormalizers {

  private StringNormalizers() {
    throw new UnsupportedOperationException();
  }

  /** A {@code StringNormalizer} which does nothing. */
  public static StringNormalizer identity() {
    return IdentityNormalizer.INSTANCE;
  }

  /**
   * Applies the NFKC Unicode normalization to the given String. You should probably do this or
   * {@link #toNfc()} on any strings from multiple sources being compared in all multilingual code
   * to deal with e.g. differences in composing accents. In particular, you probably want to apply
   * this before applying any other normalizers.
   *
   * 

See Unicode Standard Annex #15 section 1.2 http://www.unicode.org/reports/tr15/ */ public static StringNormalizer toNfkc() { return NfkcNormalizer.INSTANCE; } /** * Applies the NFC Unicode normalization to the given String. You should probably do either this * or {@link #toNfkc()} on any strings from multiple sources being compared in all multilingual * code to deal with e.g. differences in composing accents. In particular, you probably want to * apply this before applying any other normalizers. * *

See Unicode Standard Annex #15 section 1.2 http://www.unicode.org/reports/tr15/ */ public static StringNormalizer toNfc() { return NfcNormalizer.INSTANCE; } /** * A {@link StringNormalizer} which maps all Unicode codepoints matched by the provided {@link * CodepointMatcher} to the specified {@code replacementCharacter}. */ public static StringNormalizer translate( CodepointMatcher codepointMatcher, char replacementCharacter) { return CodepointTranslatorStringNormalizer.of(codepointMatcher, replacementCharacter); } /** * A {@link StringNormalizer} which collapses all consecutive characters which match the specified * {@link CodepointMatcher} to the first such character. */ public static StringNormalizer collapseConsecutive(CodepointMatcher codepointMatcher) { return CollapseConsecutiveCharacters.of(codepointMatcher); } @UnicodeUnsafe @EvalHack(eval = "LORELEI-Y2") public static StringNormalizer stripFromEnd(CharMatcher toRemove) { return StripFromEnd.of(toRemove); } /** * Converts the input to lower-case in a locale-sensitive way. Locales matter - for example the * lower case verison of I in Turkish is not i but dot-less i! */ public static StringNormalizer toLowercase(NlpLocale locale) { return ToLowerCase.forLocale(locale); } /** * Converts the input to upper-case in a locale-sensitive way. Locales matter - for example the * upper case verison of i in Turkish is not I but İ! */ public static StringNormalizer toUppercase(NlpLocale locale) { return ToUpperCase.forLocale(locale); } /** * A {@link StringNormalizer} which composes a sequence of {@link StringNormalizer}s. The first * {@code StringNormalizer} in the provided sequence it applied first, then the second is applied * to its output, and so on. */ public static StringNormalizer compose(Iterable stringNormalizers) { return CompositeStringNormalizer.of(stringNormalizers); } /** A {@link StringNormalizer} which composes a sequence of {@link StringNormalizer}s. 
*/ public static StringNormalizer compose(StringNormalizer norm1, StringNormalizer... others) { final List asList = new ArrayList<>(); asList.add(norm1); asList.addAll(Arrays.asList(others)); return CompositeStringNormalizer.of(asList); } public static Function asFunction(StringNormalizer normalizer) { return NormalizerAsFunction.of(normalizer); } } enum IdentityNormalizer implements StringNormalizer { INSTANCE; @Override public String normalize(final String input) { return input; } @Override public String toString() { return "Identity"; } } /** See {@link StringNormalizers#translate(CodepointMatcher, char)} */ @IsiNlpImmutable @Value.Immutable @JsonSerialize @JsonDeserialize abstract class CodepointTranslatorStringNormalizer implements StringNormalizer { @Value.Parameter public abstract CodepointMatcher codepointMatcher(); @Value.Parameter public abstract char replacementCharacter(); @Override public String normalize(final String input) { return codepointMatcher().replaceAll(input, replacementCharacter()); } public static CodepointTranslatorStringNormalizer of( CodepointMatcher matcher, char replacementCharacter) { return ImmutableCodepointTranslatorStringNormalizer.of(matcher, replacementCharacter); } } /** See {@link StringNormalizers#collapseConsecutive(CodepointMatcher)} */ @IsiNlpImmutable @Value.Immutable @JsonSerialize @JsonDeserialize abstract class CollapseConsecutiveCharacters implements StringNormalizer { @Value.Parameter public abstract CodepointMatcher toCollapseMatcher(); @Override public String normalize(final String input) { final StringBuilder ret = new StringBuilder(); // -1 is out-of-range for Unicode int lastCodePoint = -1; for (int offset = 0; offset < input.length(); ) { final int curCodePoint = input.codePointAt(offset); final boolean curCashpointIsCollapsible = toCollapseMatcher().matches(curCodePoint); if (!curCashpointIsCollapsible || curCodePoint != lastCodePoint) { ret.append(Character.toChars(curCodePoint)); } lastCodePoint = 
curCodePoint; offset += Character.charCount(curCodePoint); } return ret.toString(); } public static CollapseConsecutiveCharacters of(CodepointMatcher matcher) { return ImmutableCollapseConsecutiveCharacters.of(matcher); } } @IsiNlpImmutable @Value.Immutable @JsonSerialize @JsonDeserialize abstract class StripFromEnd implements StringNormalizer { @Value.Parameter public abstract CharMatcher toStripMatcher(); @Override @UnicodeUnsafe public String normalize(final String input) { return toStripMatcher().trimTrailingFrom(input); } public static StripFromEnd of(CharMatcher matcher) { return ImmutableStripFromEnd.of(matcher); } } /** See {@link StringNormalizers#compose(Iterable)} */ @IsiNlpImmutable @Value.Immutable @JsonSerialize @JsonDeserialize abstract class CompositeStringNormalizer implements StringNormalizer { /** * The {@link StringNormalizer}s to apply. The first in the list will be applied first; the last * in the list will be applied last. Each {@link StringNormalizer} is applied to the output of the * previous one. 
*/ @Value.Parameter public abstract ImmutableList wordShapers(); @Override public String normalize(final String input) { String cur = input; for (final StringNormalizer shaper : wordShapers()) { cur = shaper.normalize(cur); } return cur; } public static CompositeStringNormalizer of(Iterable wordShapers) { return ImmutableCompositeStringNormalizer.of(wordShapers); } } @IsiNlpImmutable @Value.Immutable @JsonSerialize @JsonDeserialize abstract class ToLowerCase implements StringNormalizer { @Value.Parameter public abstract NlpLocale locale(); @Override public String normalize(final String input) { return UCharacter.toLowerCase(locale().asIcuLocale(), input); } public static ToLowerCase forLocale(NlpLocale locale) { return ImmutableToLowerCase.of(locale); } } @IsiNlpImmutable @Value.Immutable @JsonSerialize @JsonDeserialize abstract class ToUpperCase implements StringNormalizer { @Value.Parameter public abstract NlpLocale locale(); @Override public String normalize(final String input) { return UCharacter.toUpperCase(locale().asIcuLocale(), input); } public static ToUpperCase forLocale(NlpLocale locale) { return ImmutableToUpperCase.of(locale); } } /** See {@link StringNormalizers#toNfkc()} */ enum NfkcNormalizer implements StringNormalizer { INSTANCE; private static final Normalizer2 icuNormalizer = Normalizer2.getNFKCInstance(); @Override public String normalize(final String input) { return icuNormalizer.normalize(input); } @Override public String toString() { return "toNFKC()"; } } /** See {@link StringNormalizers#toNfc()} */ enum NfcNormalizer implements StringNormalizer { INSTANCE; private static final Normalizer2 icuNormalizer = Normalizer2.getNFCInstance(); @Override public String normalize(final String input) { return icuNormalizer.normalize(input); } @Override public String toString() { return "toNFC()"; } } /** See {@link StringNormalizers#asFunction(StringNormalizer)}. 
*/ @IsiNlpImmutable @Value.Immutable @JsonSerialize @JsonDeserialize abstract class NormalizerAsFunction implements Function { public static NormalizerAsFunction of(StringNormalizer normalizer) { return ImmutableNormalizerAsFunction.of(normalizer); } @Value.Parameter public abstract StringNormalizer stringNormalizer(); @Override public final String apply(String s) { return stringNormalizer().normalize(s); } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy