All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.ibm.icu.text.UnicodeSetSpanner Maven / Gradle / Ivy

Go to download

International Component for Unicode for Java (ICU4J) is a mature, widely used Java library providing Unicode and Globalization support

There is a newer version: 76.1
Show newest version
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
 *******************************************************************************
 * Copyright (C) 2014-2016, International Business Machines Corporation and
 * others. All Rights Reserved.
 *******************************************************************************
 */
package com.ibm.icu.text;

import com.ibm.icu.text.UnicodeSet.SpanCondition;
import com.ibm.icu.util.OutputInt;

/**
 * A helper class used to count, replace, and trim CharSequences based on UnicodeSet matches.
 * An instance is immutable (and thus thread-safe) iff the source UnicodeSet is frozen.
 * 

Note: The counting, deletion, and replacement depend on alternating a {@link SpanCondition} with * its inverse. That is, the code spans, then spans for the inverse, then spans, and so on. * For the inverse, the following mapping is used: *

    *
  • {@link UnicodeSet.SpanCondition#SIMPLE} → {@link UnicodeSet.SpanCondition#NOT_CONTAINED}
  • *
  • {@link UnicodeSet.SpanCondition#CONTAINED} → {@link UnicodeSet.SpanCondition#NOT_CONTAINED}
  • *
  • {@link UnicodeSet.SpanCondition#NOT_CONTAINED} → {@link UnicodeSet.SpanCondition#SIMPLE}
  • *
* These are actually not complete inverses. However, the alternating works because there are no gaps. * For example, with [a{ab}{bc}], you get the following behavior when scanning forward: * * * * * *
SIMPLExxx[ab]cyyy
CONTAINEDxxx[abc]yyy
NOT_CONTAINED[xxx]ab[cyyy]
*

So here is what happens when you alternate: * *

* * * * *
start|xxxabcyyy
NOT_CONTAINEDxxx|abcyyy
CONTAINEDxxxabc|yyy
NOT_CONTAINEDxxxabcyyy|
*

The entire string is traversed. * * @stable ICU 54 */ public class UnicodeSetSpanner { private final UnicodeSet unicodeSet; /** * Create a spanner from a UnicodeSet. For speed and safety, the UnicodeSet should be frozen. However, this class * can be used with a non-frozen version to avoid the cost of freezing. * * @param source * the original UnicodeSet * * @stable ICU 54 */ public UnicodeSetSpanner(UnicodeSet source) { unicodeSet = source; } /** * Returns the UnicodeSet used for processing. It is frozen iff the original was. * * @return the construction set. * * @stable ICU 54 */ public UnicodeSet getUnicodeSet() { return unicodeSet; } /** * {@inheritDoc} * * @stable ICU 54 */ @Override public boolean equals(Object other) { return other instanceof UnicodeSetSpanner && unicodeSet.equals(((UnicodeSetSpanner) other).unicodeSet); } /** * {@inheritDoc} * * @stable ICU 54 */ @Override public int hashCode() { return unicodeSet.hashCode(); } /** * Options for replaceFrom and countIn to control how to treat each matched span. * It is similar to whether one is replacing [abc] by x, or [abc]* by x. * * @stable ICU 54 */ public enum CountMethod { /** * Collapse spans. That is, modify/count the entire matching span as a single item, instead of separate * set elements. * * @stable ICU 54 */ WHOLE_SPAN, /** * Use the smallest number of elements in the spanned range for counting and modification, * based on the {@link UnicodeSet.SpanCondition}. * If the set has no strings, this will be the same as the number of spanned code points. *

For example, in the string "abab" with SpanCondition.SIMPLE: *

    *
  • spanning with [ab] will count four MIN_ELEMENTS.
  • *
  • spanning with [{ab}] will count two MIN_ELEMENTS.
  • *
  • spanning with [ab{ab}] will also count two MIN_ELEMENTS.
  • *
* * @stable ICU 54 */ MIN_ELEMENTS, // Note: could in the future have an additional option MAX_ELEMENTS } /** * Returns the number of matching characters found in a character sequence, * counting by CountMethod.MIN_ELEMENTS using SpanCondition.SIMPLE. * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions. * @param sequence * the sequence to count characters in * @return the count. Zero if there are none. * * @stable ICU 54 */ public int countIn(CharSequence sequence) { return countIn(sequence, CountMethod.MIN_ELEMENTS, SpanCondition.SIMPLE); } /** * Returns the number of matching characters found in a character sequence, using SpanCondition.SIMPLE. * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions. * @param sequence * the sequence to count characters in * @param countMethod * whether to treat an entire span as a match, or individual elements as matches * @return the count. Zero if there are none. * * @stable ICU 54 */ public int countIn(CharSequence sequence, CountMethod countMethod) { return countIn(sequence, countMethod, SpanCondition.SIMPLE); } /** * Returns the number of matching characters found in a character sequence. * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions. * @param sequence * the sequence to count characters in * @param countMethod * whether to treat an entire span as a match, or individual elements as matches * @param spanCondition * the spanCondition to use. SIMPLE or CONTAINED means only count the elements in the span; * NOT_CONTAINED is the reverse. *
WARNING: when a UnicodeSet contains strings, there may be unexpected behavior in edge cases. * @return the count. Zero if there are none. * * @stable ICU 54 */ public int countIn(CharSequence sequence, CountMethod countMethod, SpanCondition spanCondition) { int count = 0; int start = 0; SpanCondition skipSpan = spanCondition == SpanCondition.NOT_CONTAINED ? SpanCondition.SIMPLE : SpanCondition.NOT_CONTAINED; final int length = sequence.length(); OutputInt spanCount = null; while (start != length) { int endOfSpan = unicodeSet.span(sequence, start, skipSpan); if (endOfSpan == length) { break; } if (countMethod == CountMethod.WHOLE_SPAN) { start = unicodeSet.span(sequence, endOfSpan, spanCondition); count += 1; } else { if (spanCount == null) { spanCount = new OutputInt(); } start = unicodeSet.spanAndCount(sequence, endOfSpan, spanCondition, spanCount); count += spanCount.value; } } return count; } /** * Delete all the matching spans in sequence, using SpanCondition.SIMPLE * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions. * @param sequence * charsequence to replace matching spans in. * @return modified string. * * @stable ICU 54 */ public String deleteFrom(CharSequence sequence) { return replaceFrom(sequence, "", CountMethod.WHOLE_SPAN, SpanCondition.SIMPLE); } /** * Delete all matching spans in sequence, according to the spanCondition. * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions. * @param sequence * charsequence to replace matching spans in. * @param spanCondition * specify whether to modify the matching spans (CONTAINED or SIMPLE) or the non-matching (NOT_CONTAINED) * @return modified string. * * @stable ICU 54 */ public String deleteFrom(CharSequence sequence, SpanCondition spanCondition) { return replaceFrom(sequence, "", CountMethod.WHOLE_SPAN, spanCondition); } /** * Replace all matching spans in sequence by the replacement, * counting by CountMethod.MIN_ELEMENTS using SpanCondition.SIMPLE. * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions. * @param sequence * charsequence to replace matching spans in. * @param replacement * replacement sequence. To delete, use "" * @return modified string. * * @stable ICU 54 */ public String replaceFrom(CharSequence sequence, CharSequence replacement) { return replaceFrom(sequence, replacement, CountMethod.MIN_ELEMENTS, SpanCondition.SIMPLE); } /** * Replace all matching spans in sequence by replacement, according to the CountMethod, using SpanCondition.SIMPLE. * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions. * * @param sequence * charsequence to replace matching spans in. * @param replacement * replacement sequence. To delete, use "" * @param countMethod * whether to treat an entire span as a match, or individual elements as matches * @return modified string. * * @stable ICU 54 */ public String replaceFrom(CharSequence sequence, CharSequence replacement, CountMethod countMethod) { return replaceFrom(sequence, replacement, countMethod, SpanCondition.SIMPLE); } /** * Replace all matching spans in sequence by replacement, according to the countMethod and spanCondition. * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions. * @param sequence * charsequence to replace matching spans in. * @param replacement * replacement sequence. To delete, use "" * @param countMethod * whether to treat an entire span as a match, or individual elements as matches * @param spanCondition * specify whether to modify the matching spans (CONTAINED or SIMPLE) or the non-matching * (NOT_CONTAINED) * @return modified string. * * @stable ICU 54 */ public String replaceFrom(CharSequence sequence, CharSequence replacement, CountMethod countMethod, SpanCondition spanCondition) { SpanCondition copySpan = spanCondition == SpanCondition.NOT_CONTAINED ? SpanCondition.SIMPLE : SpanCondition.NOT_CONTAINED; final boolean remove = replacement.length() == 0; StringBuilder result = new StringBuilder(); // TODO, we can optimize this to // avoid this allocation unless needed final int length = sequence.length(); OutputInt spanCount = null; for (int endCopy = 0; endCopy != length;) { int endModify; if (countMethod == CountMethod.WHOLE_SPAN) { endModify = unicodeSet.span(sequence, endCopy, spanCondition); } else { if (spanCount == null) { spanCount = new OutputInt(); } endModify = unicodeSet.spanAndCount(sequence, endCopy, spanCondition, spanCount); } if (remove || endModify == 0) { // do nothing } else if (countMethod == CountMethod.WHOLE_SPAN) { result.append(replacement); } else { for (int i = spanCount.value; i > 0; --i) { result.append(replacement); } } if (endModify == length) { break; } endCopy = unicodeSet.span(sequence, endModify, copySpan); result.append(sequence.subSequence(endModify, endCopy)); } return result.toString(); } /** * Options for the trim() method * * @stable ICU 54 */ public enum TrimOption { /** * Trim leading spans. * * @stable ICU 54 */ LEADING, /** * Trim leading and trailing spans. * * @stable ICU 54 */ BOTH, /** * Trim trailing spans. * * @stable ICU 54 */ TRAILING; } /** * Returns a trimmed sequence (using CharSequence.subsequence()), that omits matching elements at the start and * end of the string, using TrimOption.BOTH and SpanCondition.SIMPLE. For example: * *
     * {@code
     * 
     *   new UnicodeSet("[ab]").trim("abacatbab")}
     * 
* * ... returns {@code "cat"}. * @param sequence * the sequence to trim * @return a subsequence * * @stable ICU 54 */ public CharSequence trim(CharSequence sequence) { return trim(sequence, TrimOption.BOTH, SpanCondition.SIMPLE); } /** * Returns a trimmed sequence (using CharSequence.subsequence()), that omits matching elements at the start or * end of the string, using the trimOption and SpanCondition.SIMPLE. For example: * *
     * {@code
     * 
     *   new UnicodeSet("[ab]").trim("abacatbab", TrimOption.LEADING)}
     * 
* * ... returns {@code "catbab"}. * * @param sequence * the sequence to trim * @param trimOption * LEADING, TRAILING, or BOTH * @return a subsequence * * @stable ICU 54 */ public CharSequence trim(CharSequence sequence, TrimOption trimOption) { return trim(sequence, trimOption, SpanCondition.SIMPLE); } /** * Returns a trimmed sequence (using CharSequence.subsequence()), that omits matching elements at the start or * end of the string, depending on the trimOption and spanCondition. For example: * *
     * {@code
     * 
     *   new UnicodeSet("[ab]").trim("abacatbab", TrimOption.LEADING, SpanCondition.SIMPLE)}
     * 
* * ... returns {@code "catbab"}. * * @param sequence * the sequence to trim * @param trimOption * LEADING, TRAILING, or BOTH * @param spanCondition * SIMPLE, CONTAINED or NOT_CONTAINED * @return a subsequence * * @stable ICU 54 */ public CharSequence trim(CharSequence sequence, TrimOption trimOption, SpanCondition spanCondition) { int endLeadContained, startTrailContained; final int length = sequence.length(); if (trimOption != TrimOption.TRAILING) { endLeadContained = unicodeSet.span(sequence, spanCondition); if (endLeadContained == length) { return ""; } } else { endLeadContained = 0; } if (trimOption != TrimOption.LEADING) { startTrailContained = unicodeSet.spanBack(sequence, spanCondition); } else { startTrailContained = length; } return endLeadContained == 0 && startTrailContained == length ? sequence : sequence.subSequence( endLeadContained, startTrailContained); } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy