
software.amazon.event.ruler.ByteMachine Maven / Gradle / Ivy
package software.amazon.event.ruler;
import com.fasterxml.jackson.core.io.doubleparser.JavaDoubleParser;
import software.amazon.event.ruler.input.InputByte;
import software.amazon.event.ruler.input.InputCharacter;
import software.amazon.event.ruler.input.InputCharacterType;
import software.amazon.event.ruler.input.InputMultiByteSet;
import software.amazon.event.ruler.input.MultiByte;
import javax.annotation.concurrent.ThreadSafe;
import java.nio.charset.StandardCharsets;
import java.util.AbstractMap;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;
import static software.amazon.event.ruler.CompoundByteTransition.coalesce;
import static software.amazon.event.ruler.MatchType.ANYTHING_BUT_SUFFIX;
import static software.amazon.event.ruler.MatchType.EQUALS_IGNORE_CASE;
import static software.amazon.event.ruler.MatchType.EXACT;
import static software.amazon.event.ruler.MatchType.EXISTS;
import static software.amazon.event.ruler.MatchType.SUFFIX;
import static software.amazon.event.ruler.MatchType.SUFFIX_EQUALS_IGNORE_CASE;
import static software.amazon.event.ruler.input.DefaultParser.getParser;
import static software.amazon.event.ruler.input.MultiByte.MAX_CONTINUATION_BYTE;
import static software.amazon.event.ruler.input.MultiByte.MAX_FIRST_BYTE_FOR_ONE_BYTE_CHAR;
import static software.amazon.event.ruler.input.MultiByte.MAX_FIRST_BYTE_FOR_TWO_BYTE_CHAR;
import static software.amazon.event.ruler.input.MultiByte.MAX_NON_FIRST_BYTE;
import static software.amazon.event.ruler.input.MultiByte.MIN_CONTINUATION_BYTE;
import static software.amazon.event.ruler.input.MultiByte.MIN_FIRST_BYTE_FOR_ONE_BYTE_CHAR;
import static software.amazon.event.ruler.input.MultiByte.MIN_FIRST_BYTE_FOR_TWO_BYTE_CHAR;
/**
* Represents a UTF8-byte-level state machine that matches a Ruler state machine's field values.
* Each state is a map keyed by utf8 byte. getTransition(byte) yields a Target, which can contain either or
* both of the next ByteState, and the first of a chain of Matches, which indicate a match to some pattern.
*/
@ThreadSafe
class ByteMachine {
// Only these match types support shortcuts during traversal.
private static final Set SHORTCUT_MATCH_TYPES = Collections.unmodifiableSet(
new HashSet<>(Arrays.asList(EXACT)));
private final ByteState startState = new ByteState();
// For wildcard rule "*", the start state is a match.
private ByteMatch startStateMatch;
// non-zero if the machine has a numerical-comparison pattern included. In which case, values
// should be converted to ComparableNumber before testing for matches, if possible.
//
private final AtomicInteger hasNumeric = new AtomicInteger(0);
// as with the previous, but signals the presence of an attempt to match an IP address
//
private final AtomicInteger hasIP = new AtomicInteger(0);
// signal the presence of suffix match in current byteMachine instance
private final AtomicInteger hasSuffix = new AtomicInteger(0);
// For anything-but patterns, we assume they've matched unless we can prove they didn't. When we
// add an anything-but to the machine, we put it in just like an exact match and remember it in
// this structure. Then when we traverse the machine, if we get an exact match to the anything-but,
// that represents a failure, which we record in the failedAnythingButs hash, and then finally
// do a bit of set arithmetic to add the anythingButs, minus the failedAnythingButs, to the
// result set.
//
// We could do this more straightforwardly by filling in the whole byte state. For
// example, if you had "anything-but: foo", in the first state, "f" would transition to state 2, all 255
// other byte values to a "success" match. In state 2 and state 3, "o" would transition to the next state
// and all 255 others to success; then you'd have all 256 values in state 4 be success. This would be simple
// and fast, but VERY memory-expensive, because you'd have 4 fully-filled states, and a thousand or so
// different NameState objects. Now, there are optimizations to be had with default-values in ByteStates,
// and lazily allocating Matches only when you have unique values, but all of these things would add
// considerable complexity, and this is also easy to understand, and probably not much slower.
//
private final Map> anythingButs = new ConcurrentHashMap<>();
// Multiple different next-namestate steps can result from processing a single field value, for example
// "foot" matches "foot" exactly, "foo" as a prefix, and "hand" as an anything-but. So, this
// method returns a list.
Set transitionOn(final String valString) {
// not thread-safe, but this is only used in the scope of this method on one thread
final Set transitionTo = new HashSet<>();
// Do CIDR matching if there is at least one IP pattern, then move on to NUMERIC or STRING matching below.
if (hasIP.get() > 0) {
try {
// we'll try both the quoted and unquoted version to help people whose events aren't in JSON
String ipString;
if (valString.startsWith("\"") && valString.endsWith("\"")) {
ipString = CIDR.ipToString(valString.substring(1, valString.length() -1));
} else {
ipString = CIDR.ipToString(valString);
}
doTransitionOn(ipString, transitionTo, TransitionValueType.CIDR);
} catch (IllegalArgumentException e) {
// no-op, couldn't treat this as an IP address
}
}
// Do just one of NUMERIC or STRING matching. Do NUMERIC if machine has numeric patterns and provided value
// is a number. Otherwise, do STRING. We'll still get the correct behavior even if there are both numeric and
// string patterns present as no value can satisfy both a numeric and a string pattern. This is due to string
// patterns starting/ending with a double quotation, where as numeric patterns never do.
if (hasNumeric.get() > 0) {
try {
final double numerically = JavaDoubleParser.parseDouble(valString);
doTransitionOn(ComparableNumber.generate(numerically), transitionTo, TransitionValueType.NUMERIC);
return transitionTo;
} catch (Exception e) {
// no-op, couldn't treat this as a sensible number
}
}
doTransitionOn(valString, transitionTo, TransitionValueType.STRING);
return transitionTo;
}
boolean isEmpty() {
if (startState.hasNoTransitions() && startStateMatch == null) {
assert anythingButs.isEmpty();
assert hasNumeric.get() == 0;
assert hasIP.get() == 0;
return true;
}
return false;
}
// this is to support deleteRule(). It deletes the ByteStates that exist to support matching the provided pattern.
void deletePattern(final Patterns pattern) {
switch (pattern.type()) {
case NUMERIC_RANGE:
assert pattern instanceof Range;
deleteRangePattern((Range) pattern);
break;
case ANYTHING_BUT:
assert pattern instanceof AnythingBut;
deleteAnythingButPattern((AnythingBut) pattern);
break;
case ANYTHING_BUT_IGNORE_CASE:
assert pattern instanceof AnythingButEqualsIgnoreCase;
deleteAnythingButEqualsIgnoreCasePattern((AnythingButEqualsIgnoreCase) pattern);
break;
case EXACT:
case NUMERIC_EQ:
case PREFIX:
case PREFIX_EQUALS_IGNORE_CASE:
case SUFFIX:
case SUFFIX_EQUALS_IGNORE_CASE:
case ANYTHING_BUT_PREFIX:
case ANYTHING_BUT_SUFFIX:
case EQUALS_IGNORE_CASE:
case WILDCARD:
assert pattern instanceof ValuePatterns;
deleteMatchPattern((ValuePatterns) pattern);
break;
case EXISTS:
deleteExistencePattern(pattern);
break;
default:
throw new AssertionError(pattern + " is not implemented yet");
}
}
private void deleteExistencePattern(Patterns pattern) {
final InputCharacter[] characters = getParser().parse(pattern.type(), Patterns.EXISTS_BYTE_STRING);
deleteMatchStep(startState, 0, pattern, characters);
}
private void deleteAnythingButPattern(AnythingBut pattern) {
pattern.getValues().forEach(value ->
deleteMatchStep(startState, 0, pattern, getParser().parse(pattern.type(), value)));
}
private void deleteAnythingButEqualsIgnoreCasePattern(AnythingButEqualsIgnoreCase pattern) {
pattern.getValues().forEach(value ->
deleteMatchStep(startState, 0, pattern, getParser().parse(pattern.type(), value)));
}
private void deleteMatchPattern(ValuePatterns pattern) {
final InputCharacter[] characters = getParser().parse(pattern.type(), pattern.pattern());
if (characters.length == 1 && isWildcard(characters[0])) {
// Only character is wildcard. Remove the start state match.
startStateMatch = null;
return;
}
deleteMatchStep(startState, 0, pattern, characters);
}
private void deleteMatchStep(ByteState byteState, int charIndex, Patterns pattern,
final InputCharacter[] characters) {
final InputCharacter currentChar = characters[charIndex];
final ByteTransition trans = getTransition(byteState, currentChar);
for (SingleByteTransition eachTrans : trans.expand()) {
if (charIndex < characters.length - 1) {
// if it is shortcut trans, we delete it directly.
if (eachTrans.isShortcutTrans()) {
deleteMatch(currentChar, byteState, pattern, eachTrans);
} else {
ByteState nextByteState = eachTrans.getNextByteState();
if (nextByteState != null && nextByteState != byteState) {
deleteMatchStep(nextByteState, charIndex + 1, pattern, characters);
}
// Perform handling for certain wildcard cases.
eachTrans = deleteMatchStepForWildcard(byteState, charIndex, pattern, characters, eachTrans,
nextByteState);
if (nextByteState != null &&
(nextByteState.hasNoTransitions() || nextByteState.hasOnlySelfReferentialTransition())) {
// The transition's next state has no meaningful transitions, so compact the transition.
putTransitionNextState(byteState, currentChar, eachTrans, null);
}
}
} else {
deleteMatch(currentChar, byteState, pattern, eachTrans);
}
}
}
/**
* Performs delete match step handling for certain wildcard cases.
*
* @param byteState The current ByteState.
* @param charIndex The index of the current character in characters.
* @param pattern The pattern we are deleting.
* @param characters The array of InputCharacters corresponding to the pattern's value.
* @param transition One of the transitions from byteState using the current byte.
* @param nextByteState The next ByteState led to by transition.
* @return Transition, or a replacement for transition if the original instance no longer exists in the machine.
*/
private SingleByteTransition deleteMatchStepForWildcard(ByteState byteState, int charIndex, Patterns pattern,
InputCharacter[] characters,
SingleByteTransition transition, ByteState nextByteState) {
final InputCharacter currentChar = characters[charIndex];
// There will be a match using second last character on second last state when last character is
// a wildcard. This allows empty substring to satisfy wildcard.
if (charIndex == characters.length - 2 && isWildcard(characters[characters.length - 1])) {
// Delete the match.
SingleByteTransition updatedTransition = deleteMatch(currentChar, byteState, pattern, transition);
if (updatedTransition != null) {
return updatedTransition;
}
// Undo the machine changes for a wildcard as described in the Javadoc of addTransitionNextStateForWildcard.
} else if (nextByteState != null && isWildcard(currentChar)) {
// Remove transition for all possible byte values if it leads to an only self-referencing state
if (nextByteState.hasOnlySelfReferentialTransition()) {
byteState.removeTransitionForAllBytes(transition);
}
// Remove match on last char that exists for second-last char wildcard on second last state to be satisfied
// by empty substring.
if (charIndex == characters.length - 2 && isWildcard(characters[characters.length - 2])) {
deleteMatches(characters[charIndex + 1], byteState, pattern);
// Remove match on second last char that exists for third last and last char wildcard combination on third
// last state to be both satisfied by empty substrings. I.e. "ax" can match "a*x*".
} else if (charIndex == characters.length - 3 && isWildcard(characters[characters.length - 1]) &&
isWildcard(characters[characters.length - 3])) {
deleteMatches(characters[charIndex + 1], byteState, pattern);
}
// Remove transition that exists for wildcard to be satisfied by empty substring.
ByteTransition skipWildcardTransition = getTransition(byteState, characters[charIndex + 1]);
for (SingleByteTransition eachTrans : skipWildcardTransition.expand()) {
ByteState skipWildcardState = eachTrans.getNextByteState();
if (!eachTrans.getMatches().iterator().hasNext() && skipWildcardState != null &&
(skipWildcardState.hasNoTransitions() ||
skipWildcardState.hasOnlySelfReferentialTransition())) {
removeTransition(byteState, characters[charIndex + 1], eachTrans);
}
}
}
return transition;
}
// this method is only called after findRangePattern proves that matched pattern exist.
private void deleteRangePattern(Range range) {
// if range to be deleted does not exist, just return
if (findRangePattern(range) == null) {
return;
}
// Inside Range pattern, there are bottom value and top value, this function will traverse each byte of bottom
// and top value separately to hunt down all matches eligible to be deleted.
// The ArrayDequeue is used to save all byteStates in transition path since state associated with first byte of
// value to state associated with last byte of value. Then, checkAndDeleteStateAlongTraversedPath function will
// check each state saved in ArrayDequeue along reverse direction of traversing path and recursively check and
// delete match and state if it is eligible.
// Note: byteState is deletable only when it has no match and no transition to next byteState.
// Refer to definition of class ComparableNumber, the max length in bytes for Number type is 16,
// so here we take 16 as ArrayDeque capacity which is defined as ComparableNumber.MAX_BYTE_LENGTH.
final ArrayDeque> byteStatesTraversePathAlongRangeBottomValue =
new ArrayDeque<>(ComparableNumber.MAX_LENGTH_IN_BYTES);
final ArrayDeque> byteStatesTraversePathAlongRangeTopValue =
new ArrayDeque<>(ComparableNumber.MAX_LENGTH_IN_BYTES);
ByteTransition forkState = startState;
int forkOffset = 0;
byteStatesTraversePathAlongRangeBottomValue.addFirst(new AbstractMap.SimpleImmutableEntry<>(range.bottom[0], forkState));
byteStatesTraversePathAlongRangeTopValue.addFirst(new AbstractMap.SimpleImmutableEntry<>(range.top[0], forkState));
// bypass common prefix of range's bottom and top patterns
// we need move forward the state and save all states traversed for checking later.
while (range.bottom[forkOffset] == range.top[forkOffset]) {
forkState = findNextByteStateForRangePattern(forkState, range.bottom[forkOffset]);
assert forkState != null : "forkState != null";
byteStatesTraversePathAlongRangeBottomValue.addFirst(new AbstractMap.SimpleImmutableEntry<>(range.bottom[forkOffset], forkState));
byteStatesTraversePathAlongRangeTopValue.addFirst(new AbstractMap.SimpleImmutableEntry<>(range.top[forkOffset], forkState));
forkOffset++;
}
// when bottom byte on forkOffset position < top byte in same position, there must be matches existing
// in this state, go ahead to delete matches in the fork state.
for (byte bb : Range.digitSequence(range.bottom[forkOffset], range.top[forkOffset], false, false)) {
deleteMatches(getParser().parse(bb), forkState, range);
}
// process all the transitions on the bottom range bytes
ByteTransition state = forkState;
int lastMatchOffset = forkOffset;
// see explanation in addRangePattern(), we need delete state and match accordingly.
for (int offsetB = forkOffset + 1; offsetB < (range.bottom.length - 1); offsetB++) {
byte b = range.bottom[offsetB];
if (b < Constants.MAX_DIGIT) {
while (lastMatchOffset < offsetB) {
state = findNextByteStateForRangePattern(state, range.bottom[lastMatchOffset]);
assert state != null : "state must be existing for this pattern";
byteStatesTraversePathAlongRangeBottomValue.addFirst(
new AbstractMap.SimpleImmutableEntry<>(range.bottom[lastMatchOffset], state));
lastMatchOffset++;
}
assert lastMatchOffset == offsetB : "lastMatchOffset == offsetB";
for (byte bb : Range.digitSequence(b, Constants.MAX_DIGIT, false, true)) {
deleteMatches(getParser().parse(bb), state, range);
}
}
}
// now for last "bottom" digit
// see explanation in addRangePattern(), we need to delete states and matches accordingly.
final byte lastBottom = range.bottom[range.bottom.length - 1];
final byte lastTop = range.top[range.top.length - 1];
if ((lastBottom < Constants.MAX_DIGIT) || !range.openBottom) {
while (lastMatchOffset < range.bottom.length - 1) {
state = findNextByteStateForRangePattern(state, range.bottom[lastMatchOffset]);
assert state != null : "state != null";
byteStatesTraversePathAlongRangeBottomValue.addFirst(new AbstractMap.SimpleImmutableEntry<>(range.bottom[lastMatchOffset], state));
lastMatchOffset++;
}
assert lastMatchOffset == range.bottom.length - 1 : "lastMatchOffset == range.bottom.length - 1";
if (!range.openBottom) {
deleteMatches(getParser().parse(lastBottom), state, range);
}
// unless the last digit is also at the fork position, fill in the extra matches due to
// the strictly-less-than condition (see discussion above)
if (forkOffset < (range.bottom.length - 1)) {
for (byte bb : Range.digitSequence(lastBottom, Constants.MAX_DIGIT, false, true)) {
deleteMatches(getParser().parse(bb), state, range);
}
}
}
// now process transitions along the top range bytes
// see explanation in addRangePattern(), we need to delete states and matches accordingly.
state = forkState;
lastMatchOffset = forkOffset;
for (int offsetT = forkOffset + 1; offsetT < (range.top.length - 1); offsetT++) {
byte b = range.top[offsetT];
if (b > '0') {
while (lastMatchOffset < offsetT) {
state = findNextByteStateForRangePattern(state, range.top[lastMatchOffset]);
assert state != null : "state must be existing for this pattern";
byteStatesTraversePathAlongRangeTopValue.addFirst(new AbstractMap.SimpleImmutableEntry<>(range.top[lastMatchOffset], state));
lastMatchOffset++;
}
assert lastMatchOffset == offsetT : "lastMatchOffset == offsetT";
for (byte bb : Range.digitSequence((byte) '0', range.top[offsetT], true, false)) {
deleteMatches(getParser().parse(bb), state, range);
}
}
}
// now for last "top" digit.
// see explanation in addRangePattern(), we need to delete states and matches accordingly.
if ((lastTop > '0') || !range.openTop) {
while (lastMatchOffset < range.top.length - 1) {
state = findNextByteStateForRangePattern(state, range.top[lastMatchOffset]);
assert state != null : "state != null";
byteStatesTraversePathAlongRangeTopValue.addFirst(new AbstractMap.SimpleImmutableEntry<>(range.top[lastMatchOffset], state));
lastMatchOffset++;
}
assert lastMatchOffset == range.top.length - 1 : "lastMatchOffset == range.top.length - 1";
if (!range.openTop) {
deleteMatches(getParser().parse(lastTop), state, range);
}
// unless the last digit is also at the fork position, fill in the extra matches due to
// the strictly-less-than condition (see discussion above)
if (forkOffset < (range.top.length - 1)) {
for (byte bb : Range.digitSequence((byte) '0', lastTop, true, false)) {
deleteMatches(getParser().parse(bb), state, range);
}
}
}
// by now we should have deleted all matches in all associated byteStates,
// now we start cleaning up ineffective byteSates along states traversed path we saved before.
checkAndDeleteStateAlongTraversedPath(byteStatesTraversePathAlongRangeBottomValue);
checkAndDeleteStateAlongTraversedPath(byteStatesTraversePathAlongRangeTopValue);
// well done now, we have deleted all matches pattern matched and cleaned all empty state as if that pattern
// wasn't added into machine before.
}
private void checkAndDeleteStateAlongTraversedPath(
ArrayDeque> byteStateQueue) {
if (byteStateQueue.isEmpty()) {
return;
}
Byte childByteKey = byteStateQueue.pollFirst().getKey();
while (!byteStateQueue.isEmpty()) {
final AbstractMap.SimpleImmutableEntry parentStatePair = byteStateQueue.pollFirst();
final Byte parentByteKey = parentStatePair.getKey();
final ByteTransition parentByteTransition = parentStatePair.getValue();
for (SingleByteTransition singleParent : parentByteTransition.expand()) {
// Check all transitions from singleParent using childByteKey. Delete each transition that is a dead-end
// (i.e. transition that has no transitions itself).
ByteTransition transition = getTransition(singleParent, childByteKey);
for (SingleByteTransition eachTrans : transition.expand()) {
ByteState nextState = eachTrans.getNextByteState();
if (nextState != null && nextState.hasNoTransitions()) {
putTransitionNextState(singleParent.getNextByteState(), getParser().parse(childByteKey),
eachTrans, null);
}
}
}
childByteKey = parentByteKey;
}
}
private void doTransitionOn(final String valString, final Set transitionTo,
TransitionValueType valueType) {
final Map> failedAnythingButs = new HashMap<>();
final byte[] val = valString.getBytes(StandardCharsets.UTF_8);
// we need to add the name state for key existence
addExistenceMatch(transitionTo);
// attempt to harvest the possible suffix match
addSuffixMatch(val, transitionTo, failedAnythingButs);
if (startStateMatch != null) {
transitionTo.add(new NameStateWithPattern(startStateMatch.getNextNameState(), startStateMatch.getPattern()));
}
// we have to do old-school indexing rather than "for (byte b : val)" because there is some special-casing
// on transitions on the last byte in the value array
ByteTransition trans = startState;
for (int valIndex = 0; valIndex < val.length; valIndex++) {
final ByteTransition nextTrans = getTransition(trans, val[valIndex]);
attemptAddShortcutTransitionMatch(nextTrans, valString, EXACT, transitionTo);
if (!nextTrans.isShortcutTrans()) {
// process any matches hanging off this transition
for (ByteMatch match : nextTrans.getMatches()) {
switch (match.getPattern().type()) {
case EXACT:
case EQUALS_IGNORE_CASE:
case WILDCARD:
if (valIndex == (val.length - 1)) {
transitionTo.add(new NameStateWithPattern(match.getNextNameState(), match.getPattern()));
}
break;
case NUMERIC_EQ:
// only matches at last character
if (valueType == TransitionValueType.NUMERIC && valIndex == (val.length - 1)) {
transitionTo.add(new NameStateWithPattern(match.getNextNameState(), match.getPattern()));
}
break;
case PREFIX:
case PREFIX_EQUALS_IGNORE_CASE:
transitionTo.add(new NameStateWithPattern(match.getNextNameState(), match.getPattern()));
break;
case ANYTHING_BUT_SUFFIX:
case SUFFIX:
case SUFFIX_EQUALS_IGNORE_CASE:
case EXISTS:
// we already harvested these matches via separate functions due to special matching
// requirements, so just ignore them here.
break;
case NUMERIC_RANGE:
// as soon as you see the match, you've matched
Range range = (Range) match.getPattern();
if ((valueType == TransitionValueType.NUMERIC && !range.isCIDR) ||
(valueType != TransitionValueType.NUMERIC && range.isCIDR)) {
transitionTo.add(new NameStateWithPattern(match.getNextNameState(), match.getPattern()));
}
break;
case ANYTHING_BUT:
AnythingBut anythingBut = (AnythingBut) match.getPattern();
// only applies if at last character
if (valIndex == (val.length - 1) &&
anythingBut.isNumeric() == (valueType == TransitionValueType.NUMERIC)) {
addToAnythingButsMap(failedAnythingButs, match.getNextNameState(), match.getPattern());
}
break;
case ANYTHING_BUT_IGNORE_CASE:
// only applies if at last character
if (valIndex == (val.length - 1)) {
addToAnythingButsMap(failedAnythingButs, match.getNextNameState(), match.getPattern());
}
break;
case ANYTHING_BUT_PREFIX:
addToAnythingButsMap(failedAnythingButs, match.getNextNameState(), match.getPattern());
break;
default:
throw new RuntimeException("Not implemented yet");
}
}
}
trans = nextTrans.getTransitionForNextByteStates();
if (trans == null) {
break;
}
}
// This may look like premature optimization, but the first "if" here yields roughly 10x performance
// improvement. We exclude CIDR because the value will have been transformed, causing the anythingBut to always
// match. Wait for NUMERIC or STRING matching to harvest anythingBut matches.
if (!anythingButs.isEmpty() && valueType != TransitionValueType.CIDR) {
for (Map.Entry> entry : anythingButs.entrySet()) {
boolean failedAnythingButsContainsKey = failedAnythingButs.containsKey(entry.getKey());
for (Patterns pattern : entry.getValue()) {
if (!failedAnythingButsContainsKey ||
!failedAnythingButs.get(entry.getKey()).contains(pattern)) {
transitionTo.add(new NameStateWithPattern(entry.getKey(), pattern));
}
}
}
}
}
private void addToAnythingButsMap(Map> map, NameState nameState, Patterns pattern) {
if (!map.containsKey(nameState)) {
map.put(nameState, new ArrayList<>());
}
map.get(nameState).add(pattern);
}
private void removeFromAnythingButsMap(Map> map, NameState nameState, Patterns pattern) {
List patterns = map.get(nameState);
if (patterns != null) {
patterns.remove(pattern);
if (patterns.isEmpty()) {
map.remove(nameState);
}
}
}
private void addExistenceMatch(final Set transitionTo) {
final byte[] val = Patterns.EXISTS_BYTE_STRING.getBytes(StandardCharsets.UTF_8);
ByteTransition trans = startState;
ByteTransition nextTrans = null;
for (int valIndex = 0; valIndex < val.length && trans != null; valIndex++) {
nextTrans = getTransition(trans, val[valIndex]);
trans = nextTrans.getTransitionForNextByteStates();
}
if (nextTrans == null) {
return;
}
for (ByteMatch match : nextTrans.getMatches()) {
if (match.getPattern().type() == EXISTS) {
transitionTo.add(new NameStateWithPattern(match.getNextNameState(), match.getPattern()));
break;
}
}
}
private void addSuffixMatch(final byte[] val, final Set transitionTo,
final Map> failedAnythingButs) {
// we only attempt to evaluate suffix matches when there is suffix match in current byte machine instance.
// it works as performance level to avoid other type of matches from being affected by suffix checking.
if (hasSuffix.get() > 0) {
ByteTransition trans = startState;
// check the byte in reverse order in order to harvest suffix matches
for (int valIndex = val.length - 1; valIndex >= 0; valIndex--) {
final ByteTransition nextTrans = getTransition(trans, val[valIndex]);
for (ByteMatch match : nextTrans.getMatches()) {
// given we are traversing in reverse order (from right to left), only suffix matches are eligible
// to be collected.
MatchType patternType = match.getPattern().type();
if (patternType == SUFFIX || patternType == SUFFIX_EQUALS_IGNORE_CASE) {
transitionTo.add(new NameStateWithPattern(match.getNextNameState(), match.getPattern()));
} else if (patternType == ANYTHING_BUT_SUFFIX) {
addToAnythingButsMap(failedAnythingButs, match.getNextNameState(), match.getPattern());
}
}
trans = nextTrans.getTransitionForNextByteStates();
if (trans == null) {
break;
}
}
}
}
/**
* Evaluates if a provided transition is a shortcut transition with a match having a given match type and value. If
* so, adds match to transitionTo Set. Used to short-circuit traversal.
*
* Note: The adjustment mode can ensure the shortcut transition (if exists) is always at the tail of path. Refer to
* addEndOfMatch() function for details.
*
* @param transition Transition to evaluate.
* @param value Value desired in match's pattern.
* @param expectedMatchType Match type expected in match's pattern.
* @param transitionTo Set that match's next name state will be added to if desired match is found.
* @return True iff match was added to transitionTo Set.
*/
private boolean attemptAddShortcutTransitionMatch(final ByteTransition transition, final String value,
final MatchType expectedMatchType, final Set transitionTo) {
for (ShortcutTransition shortcut : transition.getShortcuts()) {
ByteMatch match = shortcut.getMatch();
assert match != null;
if (match.getPattern().type() == expectedMatchType) {
ValuePatterns valuePatterns = (ValuePatterns) match.getPattern();
if (valuePatterns.pattern().equals(value)) {
// Only one match is possible for shortcut transition
transitionTo.add(new NameStateWithPattern(match.getNextNameState(), match.getPattern()));
return true;
}
}
}
return false;
}
NameState addPattern(final Patterns pattern) {
return addPattern(pattern, null);
}
/**
* Adds one pattern to a byte machine.
*
* @param pattern The pattern to add.
* @param nameState If non-null, attempt to transition to this NameState from the ByteMatch. May be ignored if the
* ByteMatch already exists with its own NameState.
* @return NameState transitioned to from ByteMatch. May or may not equal provided NameState if it wasn't null.
*/
NameState addPattern(final Patterns pattern, final NameState nameState) {
switch (pattern.type()) {
case NUMERIC_RANGE:
assert pattern instanceof Range;
return addRangePattern((Range) pattern, nameState);
case ANYTHING_BUT:
assert pattern instanceof AnythingBut;
return addAnythingButPattern((AnythingBut) pattern, nameState);
case ANYTHING_BUT_IGNORE_CASE:
assert pattern instanceof AnythingButEqualsIgnoreCase;
return addAnythingButEqualsIgnoreCasePattern((AnythingButEqualsIgnoreCase) pattern, nameState);
case ANYTHING_BUT_SUFFIX:
case ANYTHING_BUT_PREFIX:
case EXACT:
case NUMERIC_EQ:
case PREFIX:
case PREFIX_EQUALS_IGNORE_CASE:
case SUFFIX:
case SUFFIX_EQUALS_IGNORE_CASE:
case EQUALS_IGNORE_CASE:
case WILDCARD:
assert pattern instanceof ValuePatterns;
return addMatchPattern((ValuePatterns) pattern, nameState);
case EXISTS:
return addExistencePattern(pattern, nameState);
default:
throw new AssertionError(pattern + " is not implemented yet");
}
}
private NameState addExistencePattern(Patterns pattern, NameState nameState) {
return addMatchValue(pattern, Patterns.EXISTS_BYTE_STRING, nameState);
}
private NameState addAnythingButPattern(AnythingBut pattern, NameState nameState) {
NameState nameStateToBeReturned = nameState;
NameState nameStateChecker = null;
for(String value : pattern.getValues()) {
nameStateToBeReturned = addMatchValue(pattern, value, nameStateToBeReturned);
if (nameStateChecker == null) {
nameStateChecker = nameStateToBeReturned;
}
// all the values in the list must point to the same NameState because they are sharing the same pattern
// object.
assert nameStateChecker == null || nameStateChecker == nameStateToBeReturned : " nameStateChecker == nameStateToBeReturned";
}
return nameStateToBeReturned;
}
private NameState addAnythingButEqualsIgnoreCasePattern(AnythingButEqualsIgnoreCase pattern, NameState nameState) {
NameState nameStateToBeReturned = nameState;
NameState nameStateChecker = null;
for(String value : pattern.getValues()) {
nameStateToBeReturned = addMatchValue(pattern, value, nameStateToBeReturned);
if (nameStateChecker == null) {
nameStateChecker = nameStateToBeReturned;
}
// all the values in the list must point to the same NameState because they are sharing the same pattern
// object.
assert nameStateChecker == null || nameStateChecker == nameStateToBeReturned : " nameStateChecker == nameStateToBeReturned";
}
return nameStateToBeReturned;
}
private NameState addMatchValue(Patterns pattern, String value, NameState nameStateToBeReturned) {
final InputCharacter[] characters = getParser().parse(pattern.type(), value);
ByteState byteState = startState;
ByteState prevByteState = null;
int i = 0;
for (; i < characters.length - 1; i++) {
ByteTransition trans = getTransition(byteState, characters[i]);
if (trans.isEmpty()) {
break;
}
ByteState stateToReuse = null;
for (SingleByteTransition single : trans.expand()) {
ByteState nextByteState = single.getNextByteState();
if (canReuseNextByteState(byteState, nextByteState, characters, i)) {
stateToReuse = nextByteState;
break;
}
}
if (stateToReuse == null) {
break;
}
prevByteState = byteState;
byteState = stateToReuse;
}
// we found our way through the machine with all characters except the last having matches or shortcut.
return addEndOfMatch(byteState, prevByteState, characters, i, pattern, nameStateToBeReturned);
}
private boolean canReuseNextByteState(ByteState byteState, ByteState nextByteState, InputCharacter[] characters,
int i) {
// We cannot re-use nextByteState if it is non-existent (null) or if it is the same as the current byteState,
// meaning we are looping on a self-referencing transition.
if (nextByteState == null || nextByteState == byteState) {
return false;
}
// Case 1: When we have a determinate prefix, we can re-use nextByteState except for the special case where we
// are on the second-last character with a final character wildcard. A composite is required for this
// case, so we will create a new state.
//
// Example 1: Take the following machine representing an exact match pattern.
// a b c
// -----> A -----> B -----> C
// With a byteState of A, a current char of b, and a next and final char of *, we would be unable to
// re-use B as nextByteState, since we need to create a new self-referencing composite state D.
if (!nextByteState.hasIndeterminatePrefix()) {
return !(i == characters.length - 2 && isWildcard(characters[i + 1]));
// Case 2: nextByteState has an indeterminate prefix and current character is not a wildcard. We can re-use
// nextByteState only if byteState does not have at least two transitions, one using the next
// InputCharacter, that eventually converge to a common state.
//
// Example 2a: Take the following machine representing a wildcard pattern.
// ____
// a * | * |
// -----> A -----> B <--
// | b b |
// --> C <--
// With a byteState of A, and a current char of b, we would be unable to re-use C as nextByteState,
// since A has a second transition to B that eventually leads to C as well. But we could re-use B.
//
// Example 2b: Take the following machine representing an equals_ignore_case pattern.
// x Y
// -----> A ------
// | y v
// -----> B
// With a byteState of A, and a current char of y, we would be unable to re-use B as nextByteState,
// since A has a second transition to B using char Y.
} else {
return !doMultipleTransitionsConvergeForInputByte(byteState, characters, i);
}
}
/**
* Returns true if the current InputCharacter is the first InputByte of a Java character and there exists at least
* two transitions, one using the next InputCharacter, away from byteState leading down paths that eventually
* converge to a common state.
*
* @param byteState State where we look for at least two transitions that eventually converge to a common state.
* @param characters The array of InputCharacters.
* @param i The current index in the characters array.
* @return True if and only if multiple transitions eventually converge to a common state.
*/
private boolean doMultipleTransitionsConvergeForInputByte(ByteState byteState, InputCharacter[] characters, int i) {
if (!isByte(characters[i])) {
return false;
}
boolean isNextCharacterForSuffixMatch = isNextCharacterFirstContinuationByteForSuffixMatch(characters, i);
if (!isNextCharacterFirstByteOfMultiByte(characters, i) && !isNextCharacterForSuffixMatch) {
// If we are in the midst of a multi-byte sequence, we know that we are dealing with single transitions.
return false;
}
// Scenario 1 where multiple transitions will later converge: wildcard leads to wildcard state and following
// character can be used to skip wildcard state. Check for a non-self-referencing wildcard transition.
ByteTransition byteStateTransitionForAllBytes = byteState.getTransitionForAllBytes();
if (!byteStateTransitionForAllBytes.isEmpty() && byteStateTransitionForAllBytes != byteState) {
return true;
}
// Scenario 2 where multiple transitions will later converge: equals_ignore_case lower and upper case paths.
// Parse the next Java character into lower and upper case representations. Check if there are multiple
// multibytes (paths) and that there exists a transition that both lead to.
String value = extractNextJavaCharacterFromInputCharacters(characters, i);
MatchType matchType = isNextCharacterForSuffixMatch ? SUFFIX_EQUALS_IGNORE_CASE : EQUALS_IGNORE_CASE;
InputCharacter[] inputCharacters = getParser().parse(matchType, value);
ByteTransition transition = getTransition(byteState, inputCharacters[0]);
return inputCharacters[0] instanceof InputMultiByteSet && transition != null;
}
private boolean isNextCharacterFirstByteOfMultiByte(InputCharacter[] characters, int i) {
byte firstByte = InputByte.cast(characters[i]).getByte();
// Since MIN_NON_FIRST_BYTE is (byte) 0x80 = -128 = Byte.MIN_VALUE, we can simply compare against
// MAX_NON_FIRST_BYTE to see if this is a first byte or not.
return firstByte > MAX_NON_FIRST_BYTE;
}
private boolean isNextCharacterFirstContinuationByteForSuffixMatch(InputCharacter[] characters, int i) {
if (hasSuffix.get() <= 0) {
return false;
}
// If the previous byte is a continuation byte, this means that we're in the middle of a multi-byte sequence.
return isContinuationByte(characters, i) && !isContinuationByte(characters, i - 1);
}
private boolean isContinuationByte(InputCharacter[] characters, int i) {
if (i < 0) {
return false;
}
byte continuationByte = InputByte.cast(characters[i]).getByte();
// Continuation bytes have bit 7 set, and bit 6 should be unset
return continuationByte >= MIN_CONTINUATION_BYTE && continuationByte <= MAX_CONTINUATION_BYTE;
}
private String extractNextJavaCharacterFromInputCharacters(InputCharacter[] characters, int i) {
if (isNextCharacterFirstByteOfMultiByte(characters, i)) {
return extractNextJavaCharacterFromInputCharactersForForwardArrays(characters, i);
} else {
return extractNextJavaCharacterFromInputCharactersForBackwardsArrays(characters, i);
}
}
private String extractNextJavaCharacterFromInputCharactersForBackwardsArrays(InputCharacter[] characters, int i) {
List bytesList = new ArrayList<>();
for (int multiByteIndex = i; multiByteIndex < characters.length; multiByteIndex++) {
if (!isContinuationByte(characters, multiByteIndex)) {
// This is the last byte of the suffix char
bytesList.add(InputByte.cast(characters[multiByteIndex]).getByte());
break;
}
bytesList.add(InputByte.cast(characters[multiByteIndex]).getByte());
}
// Undoing the reverse on the byte array to get the valid char
return new String(reverseBytesList(bytesList), StandardCharsets.UTF_8);
}
private byte[] reverseBytesList(List bytesList) {
byte[] byteArray = new byte[bytesList.size()];
for (int i = 0; i < bytesList.size(); i++) {
byteArray[bytesList.size() - i - 1] = bytesList.get(i);
}
return byteArray;
}
private static String extractNextJavaCharacterFromInputCharactersForForwardArrays(InputCharacter[] characters, int i) {
byte firstByte = InputByte.cast(characters[i]).getByte();
if (firstByte >= MIN_FIRST_BYTE_FOR_ONE_BYTE_CHAR && firstByte <= MAX_FIRST_BYTE_FOR_ONE_BYTE_CHAR) {
return new String(new byte[] { firstByte } , StandardCharsets.UTF_8);
} else if (firstByte >= MIN_FIRST_BYTE_FOR_TWO_BYTE_CHAR && firstByte <= MAX_FIRST_BYTE_FOR_TWO_BYTE_CHAR) {
return new String(new byte[] { firstByte, InputByte.cast(characters[i + 1]).getByte() },
StandardCharsets.UTF_8);
} else {
return new String(new byte[] { firstByte, InputByte.cast(characters[i + 1]).getByte(),
InputByte.cast(characters[i + 2]).getByte() }, StandardCharsets.UTF_8);
}
}
NameState findPattern(final Patterns pattern) {
switch (pattern.type()) {
case NUMERIC_RANGE:
assert pattern instanceof Range;
return findRangePattern((Range) pattern);
case ANYTHING_BUT:
assert pattern instanceof AnythingBut;
return findAnythingButPattern((AnythingBut) pattern);
case ANYTHING_BUT_IGNORE_CASE:
assert pattern instanceof AnythingButEqualsIgnoreCase;
return findAnythingButEqualsIgnoreCasePattern((AnythingButEqualsIgnoreCase) pattern);
case EXACT:
case NUMERIC_EQ:
case PREFIX:
case PREFIX_EQUALS_IGNORE_CASE:
case SUFFIX:
case SUFFIX_EQUALS_IGNORE_CASE:
case ANYTHING_BUT_SUFFIX:
case ANYTHING_BUT_PREFIX:
case EQUALS_IGNORE_CASE:
case WILDCARD:
assert pattern instanceof ValuePatterns;
return findMatchPattern((ValuePatterns) pattern);
case EXISTS:
return findMatchPattern(getParser().parse(pattern.type(), Patterns.EXISTS_BYTE_STRING), pattern);
default:
throw new AssertionError(pattern + " is not implemented yet");
}
}
private NameState findAnythingButPattern(AnythingBut pattern) {
Set nextNameStates = new HashSet<>(pattern.getValues().size());
for (String value : pattern.getValues()) {
NameState matchPattern = findMatchPattern(getParser().parse(pattern.type(), value), pattern);
if (matchPattern != null) {
nextNameStates.add(matchPattern);
}
}
if (!nextNameStates.isEmpty()) {
assert nextNameStates.size() == 1 : "nextNameStates.size() == 1";
return nextNameStates.iterator().next();
}
return null;
}
private NameState findAnythingButEqualsIgnoreCasePattern(AnythingButEqualsIgnoreCase pattern) {
Set nextNameStates = new HashSet<>(pattern.getValues().size());
for (String value : pattern.getValues()) {
NameState matchPattern = findMatchPattern(getParser().parse(pattern.type(), value), pattern);
if (matchPattern != null) {
nextNameStates.add(matchPattern);
}
}
if (!nextNameStates.isEmpty()) {
assert nextNameStates.size() == 1 : "nextNameStates.size() == 1";
return nextNameStates.iterator().next();
}
return null;
}
private NameState findMatchPattern(ValuePatterns pattern) {
return findMatchPattern(getParser().parse(pattern.type(), pattern.pattern()), pattern);
}
private NameState findMatchPattern(final InputCharacter[] characters, final Patterns pattern) {
Set transitionsToProcess = new HashSet<>();
transitionsToProcess.add(startState);
ByteTransition shortcutTrans = null;
// Iterate down all possible paths in machine that match characters.
outerLoop: for (int i = 0; i < characters.length - 1; i++) {
Set nextTransitionsToProcess = new HashSet<>();
for (SingleByteTransition trans : transitionsToProcess) {
ByteTransition nextTrans = getTransition(trans, characters[i]);
for (SingleByteTransition eachTrans : nextTrans.expand()) {
// Shortcut and stop outer character loop
if (SHORTCUT_MATCH_TYPES.contains(pattern.type()) && eachTrans.isShortcutTrans()) {
shortcutTrans = eachTrans;
break outerLoop;
}
SingleByteTransition nextByteState = eachTrans.getNextByteState();
if (nextByteState == null) {
continue;
}
// We are interested in the first state that hasn't simply led back to trans
if (trans != nextByteState) {
nextTransitionsToProcess.add(nextByteState);
}
}
}
transitionsToProcess = nextTransitionsToProcess;
}
// Get all possible transitions for final character.
Set finalTransitionsToProcess = new HashSet<>();
if (shortcutTrans != null) {
finalTransitionsToProcess.add(shortcutTrans);
} else {
for (SingleByteTransition trans : transitionsToProcess) {
finalTransitionsToProcess.add(getTransition(trans, characters[characters.length - 1]));
}
}
// Check matches for all possible final transitions until we find the pattern we are looking for.
for (ByteTransition nextTrans : finalTransitionsToProcess) {
for (ByteMatch match : nextTrans.getMatches()) {
if (match.getPattern().equals(pattern)) {
return match.getNextNameState();
}
}
}
return null;
}
// before we accept the delete range pattern, the input range pattern must exactly match Range ByteMatch
// in all path along the range.
private NameState findRangePattern(Range range) {
Set nextNameStates = new HashSet<>();
NameState nextNameState = null;
ByteTransition forkTrans = startState;
int forkOffset = 0;
// bypass common prefix of range's bottom and top patterns
while (range.bottom[forkOffset] == range.top[forkOffset]) {
forkTrans = findNextByteStateForRangePattern(forkTrans, range.bottom[forkOffset++]);
if (forkTrans == null) {
return null;
}
}
// fill in matches in the fork state
for (byte bb : Range.digitSequence(range.bottom[forkOffset], range.top[forkOffset], false, false)) {
nextNameState = findMatchForRangePattern(bb, forkTrans, range);
if (nextNameState == null) {
return null;
}
nextNameStates.add(nextNameState);
}
// process all the transitions on the bottom range bytes
ByteTransition trans = forkTrans;
int lastMatchOffset = forkOffset;
for (int offsetB = forkOffset + 1; offsetB < (range.bottom.length - 1); offsetB++) {
byte b = range.bottom[offsetB];
if (b < Constants.MAX_DIGIT) {
while (lastMatchOffset < offsetB) {
trans = findNextByteStateForRangePattern(trans, range.bottom[lastMatchOffset++]);
if (trans == null) {
return null;
}
}
assert lastMatchOffset == offsetB : "lastMatchOffset == offsetB";
for (byte bb : Range.digitSequence(b, Constants.MAX_DIGIT, false, true)) {
nextNameState = findMatchForRangePattern(bb, trans, range);
if (nextNameState == null) {
return null;
}
nextNameStates.add(nextNameState);
}
}
}
// now for last "bottom" digit
final byte lastBottom = range.bottom[range.bottom.length - 1];
final byte lastTop = range.top[range.top.length - 1];
if ((lastBottom < Constants.MAX_DIGIT) || !range.openBottom) {
while (lastMatchOffset < range.bottom.length - 1) {
trans = findNextByteStateForRangePattern(trans, range.bottom[lastMatchOffset++]);
if (trans == null) {
return null;
}
}
assert lastMatchOffset == (range.bottom.length - 1) : "lastMatchOffset == (range.bottom.length - 1)";
if (!range.openBottom) {
nextNameState = findMatchForRangePattern(lastBottom, trans, range);
if (nextNameState == null) {
return null;
}
nextNameStates.add(nextNameState);
}
// unless the last digit is also at the fork position, fill in the extra matches due to
// the strictly-less-than condition (see discussion above)
if (forkOffset < (range.bottom.length - 1)) {
for (byte bb : Range.digitSequence(lastBottom, Constants.MAX_DIGIT, false, true)) {
nextNameState = findMatchForRangePattern(bb, trans, range);
if (nextNameState == null) {
return null;
}
nextNameStates.add(nextNameState);
}
}
}
// now process transitions along the top range bytes
trans = forkTrans;
lastMatchOffset = forkOffset;
for (int offsetT = forkOffset + 1; offsetT < (range.top.length - 1); offsetT++) {
byte b = range.top[offsetT];
if (b > '0') {
while (lastMatchOffset < offsetT) {
trans = findNextByteStateForRangePattern(trans, range.top[lastMatchOffset++]);
if (trans == null) {
return null;
}
}
assert lastMatchOffset == offsetT : "lastMatchOffset == offsetT";
for (byte bb : Range.digitSequence((byte) '0', range.top[offsetT], true, false)) {
nextNameState = findMatchForRangePattern(bb, trans, range);
if (nextNameState == null) {
return null;
}
nextNameStates.add(nextNameState);
}
}
}
// now for last "top" digit
if ((lastTop > '0') || !range.openTop) {
while (lastMatchOffset < range.top.length - 1) {
trans = findNextByteStateForRangePattern(trans, range.top[lastMatchOffset++]);
if (trans == null) {
return null;
}
}
assert lastMatchOffset == (range.top.length - 1) : "lastMatchOffset == (range.top.length - 1)";
if (!range.openTop) {
nextNameState = findMatchForRangePattern(lastTop, trans, range);
if (nextNameState == null) {
return null;
}
nextNameStates.add(nextNameState);
}
// unless the last digit is also at the fork position, fill in the extra matches due to
// the strictly-less-than condition (see discussion above)
if (forkOffset < (range.top.length - 1)) {
for (byte bb : Range.digitSequence((byte) '0', lastTop, true, false)) {
nextNameState = findMatchForRangePattern(bb, trans, range);
if (nextNameState == null) {
return null;
}
nextNameStates.add(nextNameState);
}
}
}
// There must only have one nextNameState object returned by this range pattern refer to
// addRangePattern() where only one nextNameState is used by one pattern.
assert nextNameStates.size() == 1 : "nextNameStates.size() == 1";
return nextNameState;
}
// add a numeric range expression to the byte machine. Note; the code assumes that the patterns
// are encoded as pure strings containing only decimal digits, and that the top and bottom values
// are equal in length.
private NameState addRangePattern(final Range range, final NameState nameState) {
// we prepare for one new NameSate here which will be used for range match to point to next NameSate.
// however, it will not be used if match is already existing. in that case, we will reuse NameSate
// from that match.
NameState nextNameState = nameState == null ? new NameState() : nameState;
ByteState forkState = startState;
int forkOffset = 0;
// bypass common prefix of range's bottom and top patterns
while (range.bottom[forkOffset] == range.top[forkOffset]) {
forkState = findOrMakeNextByteStateForRangePattern(forkState, range.bottom, forkOffset++);
}
// now we've bypassed any initial positions where the top and bottom patterns are the same, and arrived
// at a position where the 'top' and 'bottom' bytes differ. Such a position must occur because we require
// that the bottom number be strictly less than the top number. Let's call the current state the fork state.
// At the fork state, any byte between the top and bottom byte values means success, the value must be strictly
// greater than bottom and less than top. That leaves the transitions for the bottom and top values.
// After the fork state, we arrive at a state where any digit greater than the next bottom value
// means success, because after the fork state we are already strictly less than the top value, and we
// know then that we are greater than the bottom value. A digit equal to the bottom value leads us
// to another state where the same applies; anything greater than the bottom value is success. Finally,
// when we come to the last digit, as before, anything greater than the bottom value is success, and
// being equal to the bottom value means failure if the interval is open, because the value is strictly
// equal to the bottom of the range.
// Following the top-byte transition out of the fork state leads to a state where the story is reversed;
// any digit lower than the top value means success, successive matches to the top value lead to similar
// states, and a final byte that matches the top value means failure if the interval is open at the top.
// There is a further complication. Consider the case [ > 00299 < 00500 ]. The machine we need to
// build is like this:
// State0 =0=> State1 ; State1 =0=> State2 ; State2 =3=> MATCH ; State2 =4=> MATCH
// That's it. Once you've seen 002 in the input, there's nothing that can follow that will be
// strictly greater than the remaining 299. Once you've seen 005 there's nothing that can
// follow that will be strictly less than the remaining 500
// But this only works when the suffix of the bottom range pattern is all 9's or if the suffix of the
// top range pattern is all 0's
// What could be simpler?
// fill in matches in the fork state
for (byte bb : Range.digitSequence(range.bottom[forkOffset], range.top[forkOffset], false, false)) {
nextNameState = insertMatchForRangePattern(bb, forkState, nextNameState, range);
}
// process all the transitions on the bottom range bytes
ByteState state = forkState;
// lastMatchOffset is the last offset where we know we have to put in a match
int lastMatchOffset = forkOffset;
for (int offsetB = forkOffset + 1; offsetB < (range.bottom.length - 1); offsetB++) {
// if b is Constants.MAX_DIGIT, then we should hold off adding transitions until we see a non-maxDigit digit
// because of the special case described above.
byte b = range.bottom[offsetB];
if (b < Constants.MAX_DIGIT) {
// add transitions for any 9's we bypassed
while (lastMatchOffset < offsetB) {
state = findOrMakeNextByteStateForRangePattern(state, range.bottom, lastMatchOffset++);
}
assert lastMatchOffset == offsetB : "lastMatchOffset == offsetB";
assert state != null : "state != null";
// now add transitions for values greater than this non-9 digit
for (byte bb : Range.digitSequence(b, Constants.MAX_DIGIT, false, true)) {
nextNameState = insertMatchForRangePattern(bb, state, nextNameState, range);
}
}
}
// now for last "bottom" digit
final byte lastBottom = range.bottom[range.bottom.length - 1];
final byte lastTop = range.top[range.top.length - 1];
// similarly, if the last digit is 9 and we have openBottom, there can be no matches so we're done.
if ((lastBottom < Constants.MAX_DIGIT) || !range.openBottom) {
// add transitions for any 9's we bypassed
while (lastMatchOffset < range.bottom.length - 1) {
state = findOrMakeNextByteStateForRangePattern(state, range.bottom, lastMatchOffset++);
}
assert lastMatchOffset == (range.bottom.length - 1) : "lastMatchOffset == (range.bottom.length - 1)";
assert state != null : "state != null";
// now we insert matches for possible values of last digit
if (!range.openBottom) {
nextNameState = insertMatchForRangePattern(lastBottom, state, nextNameState, range);
}
// unless the last digit is also at the fork position, fill in the extra matches due to
// the strictly-less-than condition (see discussion above)
if (forkOffset < (range.bottom.length - 1)) {
for (byte bb : Range.digitSequence(lastBottom, Constants.MAX_DIGIT, false, true)) {
nextNameState = insertMatchForRangePattern(bb, state, nextNameState, range);
}
}
}
// now process transitions along the top range bytes
// restore the state and last match offset to fork position to start analyzing top value bytes ...
state = forkState;
lastMatchOffset = forkOffset;
for (int offsetT = forkOffset + 1; offsetT < (range.top.length - 1); offsetT++) {
// if b is '0', we should hold off adding transitions until we see a non-'0' digit.
byte b = range.top[offsetT];
// if need to add transition
if (b > '0') {
while (lastMatchOffset < offsetT) {
state = findOrMakeNextByteStateForRangePattern(state, range.top, lastMatchOffset++);
}
assert lastMatchOffset == offsetT : "lastMatchOffset == offsetT";
assert state != null : "state != null";
// now add transitions for values less than this non-0 digit
for (byte bb : Range.digitSequence((byte) '0', range.top[offsetT], true, false)) {
nextNameState = insertMatchForRangePattern(bb, state, nextNameState, range);
}
}
}
// now for last "top" digit
// similarly, if the last digit is 0 and we have openTop, there can be no matches so we're done.
if ((lastTop > '0') || !range.openTop) {
// add transitions for any 0's we bypassed
while (lastMatchOffset < range.top.length - 1) {
state = findOrMakeNextByteStateForRangePattern(state, range.top, lastMatchOffset++);
}
assert lastMatchOffset == (range.top.length - 1) : "lastMatchOffset == (range.top.length - 1)";
assert state != null : "state != null";
// now we insert matches for possible values of last digit
if (!range.openTop) {
nextNameState = insertMatchForRangePattern(lastTop, state, nextNameState, range);
}
// unless the last digit is also at the fork position, fill in the extra matches due to
// the strictly-less-than condition (see discussion above)
if (forkOffset < (range.top.length - 1)) {
for (byte bb : Range.digitSequence((byte) '0', lastTop, true, false)) {
nextNameState = insertMatchForRangePattern(bb, state, nextNameState, range);
}
}
}
return nextNameState;
}
// If we meet shortcut trans, that means Range have byte overlapped with shortcut matches,
// To simplify the logic, we extend the shortcut to normal byte state chain, then decide whether need create
// new byte state for current call.
private ByteTransition extendShortcutTransition(final ByteState state, final ByteTransition trans,
final InputCharacter character, final int currentIndex) {
if (!trans.isShortcutTrans()) {
return trans;
}
ShortcutTransition shortcut = (ShortcutTransition) trans;
Patterns shortcutPattern = shortcut.getMatch().getPattern();
String valueInCurrentPos = ((ValuePatterns) shortcutPattern).pattern();
final InputCharacter[] charactersInCurrentPos = getParser().parse(shortcutPattern.type(), valueInCurrentPos);
ByteState firstNewState = null;
ByteState currentState = state;
for (int k = currentIndex; k < charactersInCurrentPos.length-1; k++) {
// we need keep the current state always pointed to last character.
final ByteState newByteState = new ByteState();
newByteState.setIndeterminatePrefix(currentState.hasIndeterminatePrefix());
if (k != currentIndex) {
putTransitionNextState(currentState, charactersInCurrentPos[k], shortcut, newByteState);
} else {
firstNewState = newByteState;
}
currentState = newByteState;
}
putTransitionMatch(currentState, charactersInCurrentPos[charactersInCurrentPos.length-1],
EmptyByteTransition.INSTANCE, shortcut.getMatch());
removeTransition(currentState, charactersInCurrentPos[charactersInCurrentPos.length-1], shortcut);
putTransitionNextState(state, character, shortcut, firstNewState);
return getTransition(state, character);
}
private ByteState findOrMakeNextByteStateForRangePattern(ByteState state, final byte[] utf8bytes,
int currentIndex) {
final InputCharacter character = getParser().parse(utf8bytes[currentIndex]);
ByteTransition trans = getTransition(state, character);
ByteState nextState = trans.getNextByteState();
if (nextState == null) {
// the next state is null => create a new state, set the transition's next state to the new state
nextState = new ByteState();
nextState.setIndeterminatePrefix(state.hasIndeterminatePrefix());
putTransitionNextState(state, character, trans.expand().iterator().next(), nextState);
}
return nextState;
}
// Return the next byte state after transitioning from state using character at currentIndex. Will create it if it
// doesn't exist.
private ByteState findOrMakeNextByteState(ByteState state, ByteState prevState, final InputCharacter[] characters,
int currentIndex, Patterns pattern, NameState nameState) {
final InputCharacter character = characters[currentIndex];
ByteTransition trans = getTransition(state, character);
trans = extendShortcutTransition(state, trans, character, currentIndex);
ByteState nextState = trans.getNextByteState();
if (nextState == null || nextState.hasIndeterminatePrefix() ||
(currentIndex == characters.length - 2 && isWildcard(characters[currentIndex + 1]))) {
// There are three cases for which we cannot re-use the next state and must add a new next state.
// 1. Next state is null (does not exist).
// 2. Next state has an indeterminate prefix, so using it would create unintended matches.
// 3. We're on second last char and last char is wildcard: next state will be composite with match so empty
// substring satisfies wildcard. The composite will self-reference and would create unintended matches.
nextState = new ByteState();
nextState.setIndeterminatePrefix(state.hasIndeterminatePrefix());
addTransitionNextState(state, character, characters, currentIndex, prevState, pattern, nextState,
nameState);
}
return nextState;
}
private ByteTransition findNextByteStateForRangePattern(ByteTransition trans, final byte b) {
if (trans == null) {
return null;
}
ByteTransition nextTrans = getTransition(trans, b);
return nextTrans.getTransitionForNextByteStates();
}
// add a match type pattern, i.e. anything but a numeric range, to the byte machine.
private NameState addMatchPattern(final ValuePatterns pattern, final NameState nameState) {
return addMatchValue(pattern, pattern.pattern(), nameState);
}
// We can reach to this function when we have checked the existing characters array from left to right and found we
// need add the match in the tail character or we find we can shortcut to tail directly without creating new byte
// transition in the middle. If we met the shortcut transition, we need compare the input value to adjust it
// accordingly. Please refer to detail comments in ShortcutTransition.java.
private NameState addEndOfMatch(ByteState state,
ByteState prevState,
final InputCharacter[] characters,
final int charIndex,
final Patterns pattern,
final NameState nameStateCandidate) {
final int length = characters.length;
NameState nameState = (nameStateCandidate == null) ? new NameState() : nameStateCandidate;
if (length == 1 && isWildcard(characters[0])) {
// Only character is '*'. Make the start state a match so empty input is matched.
startStateMatch = new ByteMatch(pattern, nameState);
return nameState;
}
ByteTransition trans = getTransition(state, characters[charIndex]);
// If it is shortcut transition, we need do adjustment first.
if (!trans.isEmpty() && trans.isShortcutTrans()) {
ShortcutTransition shortcut = (ShortcutTransition) trans;
ByteMatch match = shortcut.getMatch();
// In add/delete rule path, match must not be null and must not have other match
assert match != null && SHORTCUT_MATCH_TYPES.contains(match.getPattern().type());
// If it is the same pattern, just return.
if (pattern.equals(match.getPattern())) {
return match.getNextNameState();
}
// Have asserted current match pattern must be value patterns
String valueInCurrentPos = ((ValuePatterns) match.getPattern()).pattern();
final InputCharacter[] charactersInCurrentPos = getParser().parse(match.getPattern().type(),
valueInCurrentPos);
// find the position where the common prefix ends.
int m = charIndex;
for (; m < charactersInCurrentPos.length && m < length; m++) {
if (!charactersInCurrentPos[m].equals(characters[m])) {
break;
}
}
// Extend the prefix part in value to byte transitions, to avoid impact on concurrent read we need firstly
// make the new byte chain ready for using and leave the old transition removing to the last step.
// firstNewState will be head of new byte chain and, to avoid impact on concurrent match traffic in read
// path, it need be linked to current state chain after adjustment done.
ByteState firstNewState = null;
ByteState currentState = state;
for (int k = charIndex; k < m; k++) {
// we need keep the current state always pointed to last character.
if (k != charactersInCurrentPos.length -1) {
final ByteState newByteState = new ByteState();
newByteState.setIndeterminatePrefix(currentState.hasIndeterminatePrefix());
if (k != charIndex) {
putTransitionNextState(currentState, charactersInCurrentPos[k], shortcut, newByteState);
} else {
firstNewState = newByteState;
}
currentState = newByteState;
}
}
// If it reached to last character, link the previous read transition in this character, else create
// shortcut transition. Note: at this time, the previous transition can still keep working.
boolean isShortcutNeeded = m < charactersInCurrentPos.length - 1;
int indexToBeChange = isShortcutNeeded ? m : charactersInCurrentPos.length - 1;
putTransitionMatch(currentState, charactersInCurrentPos[indexToBeChange], isShortcutNeeded ?
new ShortcutTransition() : EmptyByteTransition.INSTANCE, match);
removeTransition(currentState, charactersInCurrentPos[indexToBeChange], shortcut);
// At last, we link the new created chain to the byte state path, so no uncompleted change can be felt by
// reading thread. Note: we already confirmed there is only old shortcut transition at charIndex position,
// now we have move it to new position, so we can directly replace previous transition with new transition
// pointed to new byte state chain.
putTransitionNextState(state, characters[charIndex], shortcut, firstNewState);
}
// If there is a exact match transition on tail of path, after adjustment target transitions, we start
// looking at current remaining characters.
// If this is tail transition, go directly analyse the remaining characters, traverse to tail of chain:
boolean isEligibleForShortcut = true;
int j = charIndex;
for (; j < (length - 1); j++) {
// We do not want to re-use an existing state for the second last character in the case of a final-character
// wildcard pattern. In this case, we will have a self-referencing composite match state, which allows zero
// or many character to satisfy the wildcard. The self-reference would lead to unintended matches for the
// existing patterns.
if (j == length - 2 && isWildcard(characters[j + 1])) {
break;
}
trans = getTransition(state, characters[j]);
if (trans.isEmpty()) {
break;
}
ByteState nextByteState = trans.getNextByteState();
if (nextByteState != null) {
// We cannot re-use a state with an indeterminate prefix without creating unintended matches.
if (nextByteState.hasIndeterminatePrefix()) {
// Since there is more path we are unable to traverse, this means we cannot insert shortcut without
// potentially ignoring matches further down path.
isEligibleForShortcut = false;
break;
}
prevState = state;
state = nextByteState;
} else {
// trans has match but no next state, we need prepare a next next state to add trans for either last
// character or shortcut byte.
final ByteState newByteState = new ByteState();
newByteState.setIndeterminatePrefix(state.hasIndeterminatePrefix());
// Stream will not be empty since trans has been verified as non-empty
SingleByteTransition single = trans.expand().iterator().next();
putTransitionNextState(state, characters[j], single, newByteState);
prevState = state;
state = newByteState;
}
}
// look for a chance to put in a shortcut transition.
// However, for the moment, we only do this for a JSON string match i.e beginning with ", not literals
// like true or false or numbers, because if we do this for numbers produced by
// ComparableNumber.generate(), they can be messed up by addRangePattern.
if (SHORTCUT_MATCH_TYPES.contains(pattern.type())) {
// For exactly match, if it is last character already, we just put the real transition with match there.
if (j == length - 1) {
return insertMatch(characters, j, state, nameState, pattern, prevState);
} else if (isEligibleForShortcut) {
// If current character is not last character, create the shortcut transition with the next
ByteMatch byteMatch = new ByteMatch(pattern, nameState);
addTransition(state, characters[j], new ShortcutTransition().setMatch(byteMatch));
addMatchReferences(byteMatch);
return nameState;
}
}
// For other match type, keep the old logic to extend all characters to byte state path and put the match in the
// tail state.
for (; j < (length - 1); j++) {
ByteState nextByteState = findOrMakeNextByteState(state, prevState, characters, j, pattern, nameState);
prevState = state;
state = nextByteState;
}
return insertMatch(characters, length - 1, state, nameState, pattern, prevState);
}
private NameState insertMatchForRangePattern(byte b, ByteState state, NameState nextNameState,
Patterns pattern) {
return insertMatch(new InputCharacter[] { getParser().parse(b) }, 0, state, nextNameState, pattern, null);
}
private NameState insertMatch(InputCharacter[] characters, int currentIndex, ByteState state,
NameState nextNameState, Patterns pattern, ByteState prevState) {
InputCharacter character = characters[currentIndex];
ByteMatch match = findMatch(getTransition(state, character).getMatches(), pattern);
if (match != null) {
// There is a match linked to the transition that's the same type, so we just re-use its nextNameState
return match.getNextNameState();
}
// we make a new NameState and hook it up
NameState nameState = nextNameState == null ? new NameState() : nextNameState;
match = new ByteMatch(pattern, nameState);
addMatchReferences(match);
if (isWildcard(character)) {
// Rule ends with '*'. Allow no characters up to many characters to match, using a composite match state
// that references a self-referencing composite match state. Two composite states are used so the count of
// matching rule prefixes is accurate in MachineComplexityEvaluator. If there is a previous state, then the
// first composite already exists and we will find it. Otherwise, create a brand new composite.
SingleByteTransition composite;
if (prevState != null) {
composite = getTransitionHavingNextByteState(prevState, state, characters[currentIndex - 1]);
assert composite != null : "Composite must exist";
} else {
composite = new ByteState().setMatch(match);
}
ByteState nextState = composite.getNextByteState();
CompositeByteTransition compositeClone = (CompositeByteTransition) composite.clone();
nextState.addTransitionForAllBytes(compositeClone);
nextState.setIndeterminatePrefix(true);
} else {
addTransition(state, character, match);
// If second last char was wildcard, the previous state will also need to transition to the match so that
// empty substring matches wildcard.
if (prevState != null && currentIndex == characters.length - 1 &&
isWildcard(characters[characters.length - 2])) {
addTransition(prevState, character, match);
}
}
return nameState;
}
/**
* Gets the first transition originating from origin on character that has a next byte state of toFind.
*
* @param origin The origin transition that we will explore transitions from.
* @param toFind The state that we are looking for from origin's transitions.
* @param character The character to retrieve transitions from origin.
* @return Origin's first transition on character that has a next byte state of toFind, or null if not found.
*/
private static SingleByteTransition getTransitionHavingNextByteState(SingleByteTransition origin,
SingleByteTransition toFind, InputCharacter character) {
for (SingleByteTransition eachTrans : getTransition(origin, character).expand()) {
if (eachTrans.getNextByteState() == toFind) {
return eachTrans;
}
}
return null;
}
private void addMatchReferences(ByteMatch match) {
Patterns pattern = match.getPattern();
switch (pattern.type()) {
case EXACT:
case PREFIX:
case PREFIX_EQUALS_IGNORE_CASE:
case EXISTS:
case EQUALS_IGNORE_CASE:
case WILDCARD:
break;
case SUFFIX:
case SUFFIX_EQUALS_IGNORE_CASE:
hasSuffix.incrementAndGet();
break;
case NUMERIC_EQ:
hasNumeric.incrementAndGet();
break;
case NUMERIC_RANGE:
final Range range = (Range) pattern;
if (range.isCIDR) {
hasIP.incrementAndGet();
} else {
hasNumeric.incrementAndGet();
}
break;
case ANYTHING_BUT:
addToAnythingButsMap(anythingButs, match.getNextNameState(), match.getPattern());
if (((AnythingBut) pattern).isNumeric()) {
hasNumeric.incrementAndGet();
}
break;
case ANYTHING_BUT_IGNORE_CASE:
addToAnythingButsMap(anythingButs, match.getNextNameState(), match.getPattern());
break;
case ANYTHING_BUT_SUFFIX:
hasSuffix.incrementAndGet();
addToAnythingButsMap(anythingButs, match.getNextNameState(), match.getPattern());
break;
case ANYTHING_BUT_PREFIX:
addToAnythingButsMap(anythingButs, match.getNextNameState(), match.getPattern());
break;
default:
throw new AssertionError("Not implemented yet");
}
}
private NameState findMatchForRangePattern(byte b, ByteTransition trans, Patterns pattern) {
if (trans == null) {
return null;
}
ByteTransition nextTrans = getTransition(trans, b);
ByteMatch match = findMatch(nextTrans.getMatches(), pattern);
return match == null ? null : match.getNextNameState();
}
/**
* Deletes any matches that exist on the given pattern that are accessed from ByteTransition transition using
* character.
*
* @param character The character used to transition.
* @param transition The transition we are transitioning from to look for matches.
* @param pattern The pattern whose match we are attempting to delete.
*/
private void deleteMatches(InputCharacter character, ByteTransition transition, Patterns pattern) {
if (transition == null) {
return;
}
for (SingleByteTransition single : transition.expand()) {
ByteTransition trans = getTransition(single, character);
for (SingleByteTransition eachTrans : trans.expand()) {
deleteMatch(character, single.getNextByteState(), pattern, eachTrans);
}
}
}
/**
* Deletes a match, if it exists, on the given pattern that is accessed from ByteState state using character to
* transition over SingleByteTransition trans.
*
* @param character The character used to transition.
* @param state The state we are transitioning from.
* @param pattern The pattern whose match we are attempting to delete.
* @param trans The transition to the match.
* @return The updated transition (may or may not be same object as trans) if the match was found. Null otherwise.
*/
private SingleByteTransition deleteMatch(InputCharacter character, ByteState state, Patterns pattern,
SingleByteTransition trans) {
if (state == null) {
return null;
}
ByteMatch match = trans.getMatch();
if (match != null && match.getPattern().equals(pattern)) {
updateMatchReferences(match);
return putTransitionMatch(state, character, trans, null);
}
return null;
}
private void updateMatchReferences(ByteMatch match) {
Patterns pattern = match.getPattern();
switch (pattern.type()) {
case EXACT:
case PREFIX:
case PREFIX_EQUALS_IGNORE_CASE:
case EXISTS:
case EQUALS_IGNORE_CASE:
case WILDCARD:
break;
case SUFFIX:
case SUFFIX_EQUALS_IGNORE_CASE:
hasSuffix.decrementAndGet();
break;
case NUMERIC_EQ:
hasNumeric.decrementAndGet();
break;
case NUMERIC_RANGE:
final Range range = (Range) pattern;
if (range.isCIDR) {
hasIP.decrementAndGet();
} else {
hasNumeric.decrementAndGet();
}
break;
case ANYTHING_BUT:
removeFromAnythingButsMap(anythingButs, match.getNextNameState(), match.getPattern());
if (((AnythingBut) pattern).isNumeric()) {
hasNumeric.decrementAndGet();
}
break;
case ANYTHING_BUT_IGNORE_CASE:
removeFromAnythingButsMap(anythingButs, match.getNextNameState(), match.getPattern());
break;
case ANYTHING_BUT_SUFFIX:
hasSuffix.decrementAndGet();
removeFromAnythingButsMap(anythingButs, match.getNextNameState(), match.getPattern());
break;
case ANYTHING_BUT_PREFIX:
removeFromAnythingButsMap(anythingButs, match.getNextNameState(), match.getPattern());
break;
default:
throw new AssertionError("Not implemented yet");
}
}
private static ByteTransition getTransition(ByteTransition trans, byte b) {
ByteTransition nextTrans = trans.getTransition(b);
return nextTrans == null ? EmptyByteTransition.INSTANCE : nextTrans;
}
private static ByteTransition getTransition(SingleByteTransition trans, InputCharacter character) {
switch (character.getType()) {
case WILDCARD:
return trans.getTransitionForAllBytes();
case MULTI_BYTE_SET:
return getTransitionForMultiBytes(trans, InputMultiByteSet.cast(character).getMultiBytes());
case BYTE:
return getTransition(trans, InputByte.cast(character).getByte());
default:
throw new AssertionError("Not implemented yet");
}
}
private static ByteTransition getTransitionForMultiBytes(SingleByteTransition trans, Set multiBytes) {
Set candidates = new HashSet<>();
for (MultiByte multiByte : multiBytes) {
ByteTransition currentTransition = trans;
for (byte b : multiByte.getBytes()) {
ByteTransition nextTransition = currentTransition.getTransition(b);
if (nextTransition == null) {
return EmptyByteTransition.INSTANCE;
}
currentTransition = nextTransition;
}
if (candidates.isEmpty()) {
currentTransition.expand().forEach(t -> candidates.add(t));
} else {
Set retainThese = new HashSet<>();
currentTransition.expand().forEach(t -> retainThese.add(t));
candidates.retainAll(retainThese);
if (candidates.isEmpty()) {
return EmptyByteTransition.INSTANCE;
}
}
}
return coalesce(candidates);
}
private void addTransitionNextState(ByteState state, InputCharacter character, InputCharacter[] characters,
int currentIndex, ByteState prevState, Patterns pattern,
ByteState nextState, NameState nameState) {
if (isWildcard(character)) {
addTransitionNextStateForWildcard(state, nextState);
} else {
// If the last character is the wildcard character, and we're on the second last character, set a match on
// the transition (turning it into a composite) before adding transitions from state and prevState.
SingleByteTransition single = nextState;
if (isWildcard(characters[characters.length - 1]) && currentIndex == characters.length - 2) {
ByteMatch match = new ByteMatch(pattern, nameState);
single = nextState.setMatch(match);
nextState.setIndeterminatePrefix(true);
} else if (isMultiByteSet(character)) {
nextState.setIndeterminatePrefix(true);
}
addTransition(state, character, single);
// If there was a previous state and it transitioned using the wildcard character, we need to add a
// transition from the previous state to allow wildcard match on empty substring.
if (prevState != null && isWildcard(characters[currentIndex - 1])) {
addTransition(prevState, character, single);
}
}
}
/**
* Assuming a current wildcard character, a next character of byte b, a current state of A, a next state of B, and a
* next next state of C, this function produces the following state machine:
*
* ____
* * | * |
* A ---> B <--
*
* When processing the next byte (b) of the current rule, which transitions to the next next state of C, the
* addTransitionNextState function will be invoked to transform the state machine to:
*
* ____ ____
* * | * | * | * |
* A ---> B <-- A -----> B <--
* b | =====> | b b |
* C <-- --> C <--
*
* A more naive implementation would skip B altogether, like:
*
* ____
* | * | b
* --> A ---> C
*
* But the naive implementation does not work in the case of multiple rules in one machine. Since the current state,
* A, may already have transitions for other rules, adding a self-referential * transition to A can lead to
* unintended matches using those existing rules. We must create a new state (B) that the wildcard byte transitions
* to so that we do not affect existing rules in the machine.
*
* @param state The current state.
* @param nextState The next state.
*/
private static void addTransitionNextStateForWildcard(ByteState state,
ByteState nextState) {
// Add self-referential * transition to next state (B).
nextState.addTransitionForAllBytes(nextState);
nextState.setIndeterminatePrefix(true);
// Add * transition from current state (A) to next state (B).
state.addTransitionForAllBytes(nextState);
}
private static void putTransitionNextState(ByteState state, InputCharacter character,
SingleByteTransition transition, ByteState nextState) {
// In order to avoid change being felt by the concurrent query thread in the middle of change, we clone the
// trans firstly and will not update state store until the changes have completely applied in the new trans.
ByteTransition trans = transition.clone();
SingleByteTransition single = trans.setNextByteState(nextState);
if (single != null && !single.isEmpty() && single != transition) {
addTransition(state, character, single);
}
removeTransition(state, character, transition);
}
/**
* Returns the cloned transition with the match set.
*/
private static SingleByteTransition putTransitionMatch(ByteState state, InputCharacter character,
SingleByteTransition transition, ByteMatch match) {
// In order to avoid change being felt by the concurrent query thread in the middle of change, we clone the
// trans firstly and will not update state store until the changes have completely applied in the new trans.
SingleByteTransition trans = (SingleByteTransition) transition.clone();
SingleByteTransition single = trans.setMatch(match);
if (single != null && !single.isEmpty() && single != transition) {
addTransition(state, character, single);
}
removeTransition(state, character, transition);
return single;
}
private static void removeTransition(ByteState state, InputCharacter character, SingleByteTransition transition) {
if (isWildcard(character)) {
state.removeTransitionForAllBytes(transition);
} else if (isMultiByteSet(character)) {
removeTransitionForMultiByteSet(state, InputMultiByteSet.cast(character), transition);
} else {
state.removeTransition(InputByte.cast(character).getByte(), transition);
}
}
private static void removeTransitionForMultiByteSet(ByteState state, InputMultiByteSet multiByteSet,
SingleByteTransition transition) {
for (MultiByte multiByte : multiByteSet.getMultiBytes()) {
removeTransitionForBytes(state, multiByte.getBytes(), 0, transition);
}
}
private static void removeTransitionForBytes(ByteState state, byte[] bytes, int index,
SingleByteTransition transition) {
if (index == bytes.length - 1) {
state.removeTransition(bytes[index], transition);
} else {
for (SingleByteTransition single : getTransition(state, bytes[index]).expand()) {
ByteState nextState = single.getNextByteState();
if (nextState != null) {
removeTransitionForBytes(nextState, bytes, index + 1, transition);
if (nextState.hasNoTransitions() || nextState.hasOnlySelfReferentialTransition()) {
state.removeTransition(bytes[index], single);
}
}
}
}
}
private static void addTransition(ByteState state, InputCharacter character, SingleByteTransition transition) {
if (isWildcard(character)) {
state.putTransitionForAllBytes(transition);
} else if (isMultiByteSet(character)) {
addTransitionForMultiByteSet(state, InputMultiByteSet.cast(character), transition);
} else {
state.addTransition(InputByte.cast(character).getByte(), transition);
}
}
private static void addTransitionForMultiByteSet(ByteState state, InputMultiByteSet multiByteSet,
SingleByteTransition transition) {
for (MultiByte multiByte : multiByteSet.getMultiBytes()) {
byte[] bytes = multiByte.getBytes();
ByteState currentState = state;
for (int i = 0; i < bytes.length - 1; i++) {
ByteState nextState = new ByteState();
nextState.setIndeterminatePrefix(true);
currentState.addTransition(bytes[i], nextState);
currentState = nextState;
}
currentState.addTransition(bytes[bytes.length - 1], transition);
}
}
private static ByteMatch findMatch(Iterable matches, Patterns pattern) {
for (ByteMatch match : matches) {
if (match.getPattern().equals(pattern)) {
return match;
}
}
return null;
}
private static boolean isByte(InputCharacter character) {
return character.getType() == InputCharacterType.BYTE;
}
private static boolean isWildcard(InputCharacter character) {
return character.getType() == InputCharacterType.WILDCARD;
}
private static boolean isMultiByteSet(InputCharacter character) {
return character.getType() == InputCharacterType.MULTI_BYTE_SET;
}
public int evaluateComplexity(MachineComplexityEvaluator evaluator) {
return (startStateMatch != null ? 1 : 0) + evaluator.evaluate(startState);
}
public void gatherObjects(Set
© 2015 - 2025 Weber Informatics LLC | Privacy Policy