org.apache.lucene.util.automaton.CompiledAutomaton Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of aem-sdk-api Show documentation
Show all versions of aem-sdk-api Show documentation
The Adobe Experience Manager SDK
/*
* COPIED FROM APACHE LUCENE 4.7.2
*
* Git URL: [email protected]:apache/lucene.git, tag: releases/lucene-solr/4.7.2, path: lucene/core/src/java
*
* (see https://issues.apache.org/jira/browse/OAK-10786 for details)
*/
package org.apache.lucene.util.automaton;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.PrefixTermsEnum;
import org.apache.lucene.index.SingleTermsEnum;
import org.apache.lucene.util.BytesRef;
/**
* Immutable class holding compiled details for a given
* Automaton. The Automaton is deterministic, must not have
* dead states but is not necessarily minimal.
*
* @lucene.experimental
*/
public class CompiledAutomaton {
/**
* Automata are compiled into different internal forms for the
* most efficient execution depending upon the language they accept.
*/
public enum AUTOMATON_TYPE {
/** Automaton that accepts no strings. */
NONE,
/** Automaton that accepts all possible strings. */
ALL,
/** Automaton that accepts only a single fixed string. */
SINGLE,
/** Automaton that matches all Strings with a constant prefix. */
PREFIX,
/** Catch-all for any other automata. */
NORMAL
};
public final AUTOMATON_TYPE type;
/**
* For {@link AUTOMATON_TYPE#PREFIX}, this is the prefix term;
* for {@link AUTOMATON_TYPE#SINGLE} this is the singleton term.
*/
public final BytesRef term;
/**
* Matcher for quickly determining if a byte[] is accepted.
* only valid for {@link AUTOMATON_TYPE#NORMAL}.
*/
public final ByteRunAutomaton runAutomaton;
// TODO: would be nice if these sortedTransitions had "int
// to;" instead of "State to;" somehow:
/**
* Two dimensional array of transitions, indexed by state
* number for traversal. The state numbering is consistent with
* {@link #runAutomaton}.
* Only valid for {@link AUTOMATON_TYPE#NORMAL}.
*/
public final Transition[][] sortedTransitions;
/**
* Shared common suffix accepted by the automaton. Only valid
* for {@link AUTOMATON_TYPE#NORMAL}, and only when the
* automaton accepts an infinite language.
*/
public final BytesRef commonSuffixRef;
/**
* Indicates if the automaton accepts a finite set of strings.
* Null if this was not computed.
* Only valid for {@link AUTOMATON_TYPE#NORMAL}.
*/
public final Boolean finite;
public CompiledAutomaton(Automaton automaton) {
this(automaton, null, true);
}
public CompiledAutomaton(Automaton automaton, Boolean finite, boolean simplify) {
if (simplify) {
// Test whether the automaton is a "simple" form and
// if so, don't create a runAutomaton. Note that on a
// large automaton these tests could be costly:
if (BasicOperations.isEmpty(automaton)) {
// matches nothing
type = AUTOMATON_TYPE.NONE;
term = null;
commonSuffixRef = null;
runAutomaton = null;
sortedTransitions = null;
this.finite = null;
return;
} else if (BasicOperations.isTotal(automaton)) {
// matches all possible strings
type = AUTOMATON_TYPE.ALL;
term = null;
commonSuffixRef = null;
runAutomaton = null;
sortedTransitions = null;
this.finite = null;
return;
} else {
final String commonPrefix;
final String singleton;
if (automaton.getSingleton() == null) {
commonPrefix = SpecialOperations.getCommonPrefix(automaton);
if (commonPrefix.length() > 0 && BasicOperations.sameLanguage(automaton, BasicAutomata.makeString(commonPrefix))) {
singleton = commonPrefix;
} else {
singleton = null;
}
} else {
commonPrefix = null;
singleton = automaton.getSingleton();
}
if (singleton != null) {
// matches a fixed string in singleton or expanded
// representation
type = AUTOMATON_TYPE.SINGLE;
term = new BytesRef(singleton);
commonSuffixRef = null;
runAutomaton = null;
sortedTransitions = null;
this.finite = null;
return;
} else if (BasicOperations.sameLanguage(automaton, BasicOperations.concatenate(
BasicAutomata.makeString(commonPrefix), BasicAutomata.makeAnyString()))) {
// matches a constant prefix
type = AUTOMATON_TYPE.PREFIX;
term = new BytesRef(commonPrefix);
commonSuffixRef = null;
runAutomaton = null;
sortedTransitions = null;
this.finite = null;
return;
}
}
}
type = AUTOMATON_TYPE.NORMAL;
term = null;
if (finite == null) {
this.finite = SpecialOperations.isFinite(automaton);
} else {
this.finite = finite;
}
Automaton utf8 = new UTF32ToUTF8().convert(automaton);
if (this.finite) {
commonSuffixRef = null;
} else {
commonSuffixRef = SpecialOperations.getCommonSuffixBytesRef(utf8);
}
runAutomaton = new ByteRunAutomaton(utf8, true);
sortedTransitions = utf8.getSortedTransitions();
}
//private static final boolean DEBUG = BlockTreeTermsWriter.DEBUG;
private BytesRef addTail(int state, BytesRef term, int idx, int leadLabel) {
// Find biggest transition that's < label
// TODO: use binary search here
Transition maxTransition = null;
for (Transition transition : sortedTransitions[state]) {
if (transition.min < leadLabel) {
maxTransition = transition;
}
}
assert maxTransition != null;
// Append floorLabel
final int floorLabel;
if (maxTransition.max > leadLabel-1) {
floorLabel = leadLabel-1;
} else {
floorLabel = maxTransition.max;
}
if (idx >= term.bytes.length) {
term.grow(1+idx);
}
//if (DEBUG) System.out.println(" add floorLabel=" + (char) floorLabel + " idx=" + idx);
term.bytes[idx] = (byte) floorLabel;
state = maxTransition.to.getNumber();
idx++;
// Push down to last accept state
while (true) {
Transition[] transitions = sortedTransitions[state];
if (transitions.length == 0) {
assert runAutomaton.isAccept(state);
term.length = idx;
//if (DEBUG) System.out.println(" return " + term.utf8ToString());
return term;
} else {
// We are pushing "top" -- so get last label of
// last transition:
assert transitions.length != 0;
Transition lastTransition = transitions[transitions.length-1];
if (idx >= term.bytes.length) {
term.grow(1+idx);
}
//if (DEBUG) System.out.println(" push maxLabel=" + (char) lastTransition.max + " idx=" + idx);
term.bytes[idx] = (byte) lastTransition.max;
state = lastTransition.to.getNumber();
idx++;
}
}
}
// TODO: should this take startTerm too? This way
// Terms.intersect could forward to this method if type !=
// NORMAL:
public TermsEnum getTermsEnum(Terms terms) throws IOException {
switch(type) {
case NONE:
return TermsEnum.EMPTY;
case ALL:
return terms.iterator(null);
case SINGLE:
return new SingleTermsEnum(terms.iterator(null), term);
case PREFIX:
// TODO: this is very likely faster than .intersect,
// but we should test and maybe cutover
return new PrefixTermsEnum(terms.iterator(null), term);
case NORMAL:
return terms.intersect(this, null);
default:
// unreachable
throw new RuntimeException("unhandled case");
}
}
/** Finds largest term accepted by this Automaton, that's
* <= the provided input term. The result is placed in
* output; it's fine for output and input to point to
* the same BytesRef. The returned result is either the
* provided output, or null if there is no floor term
* (ie, the provided input term is before the first term
* accepted by this Automaton). */
public BytesRef floor(BytesRef input, BytesRef output) {
output.offset = 0;
//if (DEBUG) System.out.println("CA.floor input=" + input.utf8ToString());
int state = runAutomaton.getInitialState();
// Special case empty string:
if (input.length == 0) {
if (runAutomaton.isAccept(state)) {
output.length = 0;
return output;
} else {
return null;
}
}
final List stack = new ArrayList();
int idx = 0;
while (true) {
int label = input.bytes[input.offset + idx] & 0xff;
int nextState = runAutomaton.step(state, label);
//if (DEBUG) System.out.println(" cycle label=" + (char) label + " nextState=" + nextState);
if (idx == input.length-1) {
if (nextState != -1 && runAutomaton.isAccept(nextState)) {
// Input string is accepted
if (idx >= output.bytes.length) {
output.grow(1+idx);
}
output.bytes[idx] = (byte) label;
output.length = input.length;
//if (DEBUG) System.out.println(" input is accepted; return term=" + output.utf8ToString());
return output;
} else {
nextState = -1;
}
}
if (nextState == -1) {
// Pop back to a state that has a transition
// <= our label:
while (true) {
Transition[] transitions = sortedTransitions[state];
if (transitions.length == 0) {
assert runAutomaton.isAccept(state);
output.length = idx;
//if (DEBUG) System.out.println(" return " + output.utf8ToString());
return output;
} else if (label-1 < transitions[0].min) {
if (runAutomaton.isAccept(state)) {
output.length = idx;
//if (DEBUG) System.out.println(" return " + output.utf8ToString());
return output;
}
// pop
if (stack.size() == 0) {
//if (DEBUG) System.out.println(" pop ord=" + idx + " return null");
return null;
} else {
state = stack.remove(stack.size()-1);
idx--;
//if (DEBUG) System.out.println(" pop ord=" + (idx+1) + " label=" + (char) label + " first trans.min=" + (char) transitions[0].min);
label = input.bytes[input.offset + idx] & 0xff;
}
} else {
//if (DEBUG) System.out.println(" stop pop ord=" + idx + " first trans.min=" + (char) transitions[0].min);
break;
}
}
//if (DEBUG) System.out.println(" label=" + (char) label + " idx=" + idx);
return addTail(state, output, idx, label);
} else {
if (idx >= output.bytes.length) {
output.grow(1+idx);
}
output.bytes[idx] = (byte) label;
stack.add(state);
state = nextState;
idx++;
}
}
}
public String toDot() {
StringBuilder b = new StringBuilder("digraph CompiledAutomaton {\n");
b.append(" rankdir = LR;\n");
int initial = runAutomaton.getInitialState();
for (int i = 0; i < sortedTransitions.length; i++) {
b.append(" ").append(i);
if (runAutomaton.isAccept(i)) b.append(" [shape=doublecircle,label=\"\"];\n");
else b.append(" [shape=circle,label=\"\"];\n");
if (i == initial) {
b.append(" initial [shape=plaintext,label=\"\"];\n");
b.append(" initial -> ").append(i).append("\n");
}
for (int j = 0; j < sortedTransitions[i].length; j++) {
b.append(" ").append(i);
sortedTransitions[i][j].appendDot(b);
}
}
return b.append("}\n").toString();
}
@Override
public int hashCode() {
final int prime = 31;
int result = 1;
result = prime * result + ((runAutomaton == null) ? 0 : runAutomaton.hashCode());
result = prime * result + ((term == null) ? 0 : term.hashCode());
result = prime * result + ((type == null) ? 0 : type.hashCode());
return result;
}
@Override
public boolean equals(Object obj) {
if (this == obj) return true;
if (obj == null) return false;
if (getClass() != obj.getClass()) return false;
CompiledAutomaton other = (CompiledAutomaton) obj;
if (type != other.type) return false;
if (type == AUTOMATON_TYPE.SINGLE || type == AUTOMATON_TYPE.PREFIX) {
if (!term.equals(other.term)) return false;
} else if (type == AUTOMATON_TYPE.NORMAL) {
if (!runAutomaton.equals(other.runAutomaton)) return false;
}
return true;
}
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy