com.ibm.icu.text.StringMatcher Maven / Gradle / Ivy
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
*******************************************************************************
* Copyright (C) 2001-2004, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package com.ibm.icu.text;
import com.ibm.icu.impl.Utility;
/**
* An object that matches a fixed input string, implementing the
* UnicodeMatcher API. This object also implements the
* UnicodeReplacer API, allowing it to emit the matched text as
* output. Since the match text may contain flexible match elements,
* such as UnicodeSets, the emitted text is not the match pattern, but
* instead a substring of the actual matched text. Following
* convention, the output text is the rightmost match seen up to this
* point.
*
* A StringMatcher may represent a segment, in which case it has a
* positive segment number. This affects how the matcher converts
* itself to a pattern but does not otherwise affect its function.
*
* A StringMatcher that is not a segment should not be used as a
* UnicodeReplacer.
*/
class StringMatcher implements UnicodeMatcher, UnicodeReplacer {
/**
* The text to be matched.
*/
private String pattern;
/**
* Start offset, in the match text, of the rightmost
* match.
*/
private int matchStart;
/**
* Limit offset, in the match text, of the rightmost
* match.
*/
private int matchLimit;
/**
* The segment number, 1-based, or 0 if not a segment.
*/
private int segmentNumber;
/**
* Context object that maps stand-ins to matcher and replacer
* objects.
*/
private final RuleBasedTransliterator.Data data;
/**
 * Creates a matcher for the whole of the given pattern string. The new
 * matcher starts out with no recorded match.
 *
 * @param theString the pattern to be matched, possibly containing
 * stand-ins that represent nested UnicodeMatcher objects.
 * @param segmentNum the segment number from 1..n, or 0 if this is
 * not a segment.
 * @param theData context object mapping stand-ins to
 * UnicodeMatcher objects.
 */
public StringMatcher(String theString, int segmentNum, RuleBasedTransliterator.Data theData) {
    this.pattern = theString;
    this.segmentNumber = segmentNum;
    this.data = theData;
    // Negative offsets mean "no match recorded yet".
    this.matchStart = -1;
    this.matchLimit = -1;
}
/**
 * Creates a matcher for the part of the given pattern string that lies
 * between {@code start} (inclusive) and {@code limit} (exclusive). The
 * substring is extracted here and everything else is delegated to the
 * three-argument constructor.
 *
 * @param theString the string containing the pattern to be matched; the
 * pattern may contain stand-ins that represent nested UnicodeMatcher
 * objects.
 * @param start index of the first character of theString to be matched
 * @param limit index after the last character of theString to be
 * matched.
 * @param segmentNum the segment number from 1..n, or 0 if this is
 * not a segment.
 * @param theData context object mapping stand-ins to
 * UnicodeMatcher objects.
 */
public StringMatcher(
        String theString,
        int start,
        int limit,
        int segmentNum,
        RuleBasedTransliterator.Data theData) {
    this(theString.substring(start, limit), segmentNum, theData);
}
/**
* Implement UnicodeMatcher
*/
@Override
public int matches(Replaceable text,
int[] offset,
int limit,
boolean incremental) {
// Note (1): We process text in 16-bit code units, rather than
// 32-bit code points. This works because stand-ins are
// always in the BMP and because we are doing a literal match
// operation, which can be done 16-bits at a time.
int i;
int[] cursor = new int[] { offset[0] };
if (limit < cursor[0]) {
// Match in the reverse direction
for (i=pattern.length()-1; i>=0; --i) {
char keyChar = pattern.charAt(i); // OK; see note (1) above
UnicodeMatcher subm = data.lookupMatcher(keyChar);
if (subm == null) {
if (cursor[0] > limit &&
keyChar == text.charAt(cursor[0])) { // OK; see note (1) above
--cursor[0];
} else {
return U_MISMATCH;
}
} else {
int m =
subm.matches(text, cursor, limit, incremental);
if (m != U_MATCH) {
return m;
}
}
}
// Record the match position, but adjust for a normal
// forward start, limit, and only if a prior match does not
// exist -- we want the rightmost match.
if (matchStart < 0) {
matchStart = cursor[0]+1;
matchLimit = offset[0]+1;
}
} else {
for (i=0; i 0) { // i.e., if this is a segment
result.append('(');
}
for (int i=0; i 0) { // i.e., if this is a segment
result.append(')');
}
// Flush quoteBuf out to result
Utility.appendToRule(result, -1,
true, escapeUnprintable, quoteBuf);
return result.toString();
}
/**
 * Implement UnicodeMatcher. Only the first code point of the pattern is
 * examined:
 * <ul>
 * <li>an empty pattern reports {@code true} for every {@code v};</li>
 * <li>if the first code point is a stand-in with a registered matcher,
 * the question is forwarded to that matcher;</li>
 * <li>otherwise the low eight bits of the first code point are compared
 * with {@code v}.</li>
 * </ul>
 *
 * @param v the index value to test
 * @return the result of the test described above
 */
@Override
public boolean matchesIndexValue(int v) {
    if (pattern.isEmpty()) {
        return true;
    }
    final int first = UTF16.charAt(pattern, 0);
    final UnicodeMatcher standIn = data.lookupMatcher(first);
    if (standIn != null) {
        return standIn.matchesIndexValue(v);
    }
    return (first & 0xFF) == v;
}
/**
* Implementation of UnicodeMatcher API. Union the set of all
* characters that may be matched by this object into the given
* set.
* @param toUnionTo the set into which to union the source characters
*/
@Override
public void addMatchSetTo(UnicodeSet toUnionTo) {
int ch;
for (int i=0; i= 0) {
if (matchStart != matchLimit) {
text.copy(matchStart, matchLimit, dest);
outLen = matchLimit - matchStart;
}
}
text.replace(start, limit, ""); // delete original text
return outLen;
}
/**
 * UnicodeReplacer API. Builds the replacer pattern for this object: a
 * {@code '$'} followed by the segment number as written by
 * {@code Utility.appendNumber}.
 *
 * @param escapeUnprintable not consulted by this implementation
 * @return the text {@code "$"} followed by the formatted segment number
 */
@Override
public String toReplacerPattern(boolean escapeUnprintable) {
    // Expected to be called only on a segment, i.e. when segmentNumber > 0.
    StringBuffer buf = new StringBuffer();
    buf.append('$');
    // NOTE(review): 10 and 1 are presumably the radix and the minimum
    // digit count -- confirm against Utility.appendNumber.
    Utility.appendNumber(buf, segmentNumber, 10, 1);
    return buf.toString();
}
/**
 * Discards any recorded match by setting both match offsets back to -1.
 * This must be called before performing a set of matches with this
 * segment.
 */
public void resetMatch() {
    matchLimit = -1;
    matchStart = -1;
}
/**
* Union the set of all characters that may be output by this object
* into the given set. This implementation deliberately adds nothing
* and leaves the given set untouched.
* @param toUnionTo the set into which to union the output characters
*/
@Override
public void addReplacementSetTo(UnicodeSet toUnionTo) {
// The output of this replacer varies; it is the source text between
// matchStart and matchLimit. Since this varies depending on the
// input text, we can't compute it here. We can either do nothing
// or we can add ALL characters to the set. It's probably more useful
// to do nothing.
}
}
//eof
© 2015 - 2025 Weber Informatics LLC | Privacy Policy