com.ibm.icu.text.StringMatcher Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of icu4j Show documentation
Show all versions of icu4j Show documentation
International Components for Unicode for Java (ICU4J) is a mature, widely used Java library
providing Unicode and Globalization support.
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
*******************************************************************************
* Copyright (C) 2001-2004, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package com.ibm.icu.text;
import com.ibm.icu.impl.Utility;
/**
* An object that matches a fixed input string, implementing the
* UnicodeMatcher API. This object also implements the
* UnicodeReplacer API, allowing it to emit the matched text as
* output. Since the match text may contain flexible match elements,
* such as UnicodeSets, the emitted text is not the match pattern, but
* instead a substring of the actual matched text. Following
* convention, the output text is the leftmost match seen up to this
* point.
*
* A StringMatcher may represent a segment, in which case it has a
* positive segment number. This affects how the matcher converts
* itself to a pattern but does not otherwise affect its function.
*
* A StringMatcher that is not a segment should not be used as a
* UnicodeReplacer.
*/
class StringMatcher implements UnicodeMatcher, UnicodeReplacer {
/**
* The text to be matched.
*/
private String pattern;
/**
* Start offset, in the match text, of the rightmost
* match.
*/
private int matchStart;
/**
* Limit offset, in the match text, of the rightmost
* match.
*/
private int matchLimit;
/**
* The segment number, 1-based, or 0 if not a segment.
*/
private int segmentNumber;
/**
* Context object that maps stand-ins to matcher and replacer
* objects.
*/
private final RuleBasedTransliterator.Data data;
/**
 * Creates a matcher for an entire pattern string. The new matcher
 * starts out with no recorded match (both match offsets are -1).
 *
 * @param theString the pattern to be matched, possibly containing
 * stand-ins that represent nested UnicodeMatcher objects
 * @param segmentNum the segment number from 1..n, or 0 if this
 * matcher is not a segment
 * @param theData context object mapping stand-ins to
 * UnicodeMatcher objects
 */
public StringMatcher(String theString,
                     int segmentNum,
                     RuleBasedTransliterator.Data theData) {
    this.pattern = theString;
    this.segmentNumber = segmentNum;
    this.data = theData;
    // Nothing has been matched yet.
    this.matchStart = -1;
    this.matchLimit = -1;
}
/**
 * Creates a matcher for the slice {@code theString.substring(start, limit)}
 * of a larger pattern string. All other setup is delegated to the
 * three-argument constructor.
 *
 * @param theString the string containing the pattern to be matched;
 * the pattern may contain stand-ins that represent nested
 * UnicodeMatcher objects
 * @param start index of the first character of theString to be matched
 * @param limit index just past the last character of theString to be
 * matched
 * @param segmentNum the segment number from 1..n, or 0 if this
 * matcher is not a segment
 * @param theData context object mapping stand-ins to
 * UnicodeMatcher objects
 */
public StringMatcher(String theString, int start, int limit, int segmentNum,
                     RuleBasedTransliterator.Data theData) {
    this(theString.substring(start, limit), segmentNum, theData);
}
/**
 * Implement UnicodeMatcher.
 *
 * Matches this object's pattern against text, walking the pattern one
 * 16-bit code unit at a time. A pattern character for which
 * data.lookupMatcher returns a matcher is delegated to that matcher;
 * any other character is compared literally against the text.
 *
 * NOTE(review): in this copy of the file only the reverse-direction
 * branch is intact. The forward-direction branch, the end of this
 * method, and the beginning of the following pattern-building method
 * appear to have been lost (see the note inside the body). The
 * documentation below covers only what is visible.
 *
 * @param text the text to match against
 * @param offset one-element array; offset[0] is the position at which
 * matching starts
 * @param limit boundary of the match; a value less than offset[0]
 * selects matching in the reverse direction
 * @param incremental passed through unchanged to nested matchers in
 * the visible code
 * @return in the visible code, U_MISMATCH when a literal character
 * fails to match, or the result of a nested matcher when that result
 * is not U_MATCH
 */
@Override
public int matches(Replaceable text,
int[] offset,
int limit,
boolean incremental) {
// Note (1): We process text in 16-bit code units, rather than
// 32-bit code points. This works because stand-ins are
// always in the BMP and because we are doing a literal match
// operation, which can be done 16-bits at a time.
int i;
// Work on a copy of the start position; the visible code reads
// offset[0] but does not write to it.
int[] cursor = new int[] { offset[0] };
if (limit < cursor[0]) {
// Match in the reverse direction
// The pattern is consumed from its last character to its first
// while the cursor moves toward limit.
for (i=pattern.length()-1; i>=0; --i) {
char keyChar = pattern.charAt(i); // OK; see note (1) above
UnicodeMatcher subm = data.lookupMatcher(keyChar);
if (subm == null) {
// No nested matcher for this character: literal comparison,
// allowed only while the cursor is still beyond limit.
if (cursor[0] > limit &&
keyChar == text.charAt(cursor[0])) { // OK; see note (1) above
--cursor[0];
} else {
return U_MISMATCH;
}
} else {
// Nested matcher: it receives the cursor array; any result
// other than U_MATCH is returned to the caller as-is.
int m =
subm.matches(text, cursor, limit, incremental);
if (m != U_MATCH) {
return m;
}
}
}
// Record the match position, but adjust for a normal
// forward start, limit, and only if a prior match does not
// exist -- we want the rightmost match.
if (matchStart < 0) {
matchStart = cursor[0]+1;
matchLimit = offset[0]+1;
}
} else {
// NOTE(review): the two loop headers below are not valid Java as
// written. The text between "i" and "0)" appears to be missing in
// each (probably everything from a '<' to the next '>' was dropped
// when the file was extracted). The names result, quoteBuf and
// escapeUnprintable used below are not declared anywhere in the
// visible code, which suggests the remainder of matches() and the
// header of a separate pattern-building method were lost here.
// Restore this region from the upstream ICU4J source before building.
for (i=0; i 0) { // i.e., if this is a segment
result.append('(');
}
for (int i=0; i 0) { // i.e., if this is a segment
result.append(')');
}
// Flush quoteBuf out to result
Utility.appendToRule(result, -1,
true, escapeUnprintable, quoteBuf);
return result.toString();
}
/**
 * Implement UnicodeMatcher. Only the start of the pattern is examined.
 *
 * @param v the index value to test
 * @return true if the pattern is empty; otherwise, if the value
 * obtained from UTF16.charAt(pattern, 0) maps to a nested matcher,
 * that matcher's answer for v; otherwise whether the low 8 bits of
 * that value equal v
 */
@Override
public boolean matchesIndexValue(int v) {
    if (pattern.isEmpty()) {
        // An empty pattern returns true for every index value.
        return true;
    }
    final int first = UTF16.charAt(pattern, 0);
    final UnicodeMatcher nested = data.lookupMatcher(first);
    if (nested != null) {
        // Stand-in: let the nested matcher decide.
        return nested.matchesIndexValue(v);
    }
    // Literal: compare only the low byte.
    return (first & 0xFF) == v;
}
/**
 * Implementation of UnicodeMatcher API. Union the set of all
 * characters that may be matched by this object into the given
 * set.
 *
 * NOTE(review): in this copy of the file the body of this method is
 * fused with the tail of a different method (see the note inside the
 * body), so the code below does not implement what is described here.
 *
 * @param toUnionTo the set into which to union the source characters
 */
@Override
public void addMatchSetTo(UnicodeSet toUnionTo) {
int ch;
// NOTE(review): the loop header below is not valid Java as written.
// The text between "i" and "= 0)" appears to be missing (probably
// everything from a '<' to the next '>' was dropped when the file was
// extracted). The names text, dest, outLen, start and limit used
// below are not declared anywhere in the visible code, and a value is
// returned from a void method, which suggests the rest of this method
// and the header of a separate replace-style method were lost here.
// Restore this region from the upstream ICU4J source before building.
for (int i=0; i= 0) {
// Copy only when the recorded match is non-empty.
if (matchStart != matchLimit) {
text.copy(matchStart, matchLimit, dest);
outLen = matchLimit - matchStart;
}
}
text.replace(start, limit, ""); // delete original text
return outLen;
}
/**
 * UnicodeReplacer API. Builds the pattern text for this replacer: a
 * '$' followed by the segment number as appended by
 * Utility.appendNumber.
 *
 * @param escapeUnprintable not used by this implementation
 * @return the '$'-prefixed segment reference
 */
@Override
public String toReplacerPattern(boolean escapeUnprintable) {
    // Expected to be called only for segments (segmentNumber > 0);
    // this is not enforced here.
    final StringBuffer buf = new StringBuffer();
    buf.append('$');
    // Arguments 10 and 1 are presumably the radix and the minimum
    // digit count -- confirm against Utility.appendNumber.
    Utility.appendNumber(buf, segmentNumber, 10, 1);
    return buf.toString();
}
/**
 * Discards any recorded match by setting both match offsets back to
 * -1. This must be called before performing a set of matches with
 * this segment.
 */
public void resetMatch() {
    matchStart = -1;
    matchLimit = -1;
}
/**
 * Union the set of all characters that may be output by this object
 * into the given set. This implementation intentionally adds nothing.
 * @param toUnionTo the set into which to union the output characters;
 * left unmodified by this implementation
 */
@Override
public void addReplacementSetTo(UnicodeSet toUnionTo) {
// The output of this replacer varies; it is the source text between
// matchStart and matchLimit. Since this varies depending on the
// input text, we can't compute it here. We can either do nothing
// or we can add ALL characters to the set. It's probably more useful
// to do nothing, so the body is deliberately empty.
}
}
//eof