
// org.apache.lucene.util.automaton.CompiledAutomaton Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.util.automaton;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.index.SingleTermsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.UnicodeUtil;
/**
* Immutable class holding compiled details for a given
* Automaton. The Automaton is deterministic, must not have
* dead states but is not necessarily minimal.
*
* @lucene.experimental
*/
public class CompiledAutomaton implements Accountable {
/** Result of {@code RamUsageEstimator.shallowSizeOfInstance} for this class, computed once; the base term of {@link #ramBytesUsed()}. */
private static final long BASE_RAM_BYTES = RamUsageEstimator.shallowSizeOfInstance(CompiledAutomaton.class);
/**
 * The internal form an automaton has been compiled into. The form is
 * picked according to the language the automaton accepts, so that it
 * can be executed as efficiently as possible.
 */
public enum AUTOMATON_TYPE {
  /** The automaton accepts no strings at all. */
  NONE,

  /** The automaton accepts every possible string. */
  ALL,

  /** The automaton accepts exactly one fixed string. */
  SINGLE,

  /** Any automaton that is not covered by the other constants. */
  NORMAL
}
/**
* The form this automaton was compiled into. If simplify is true this
* will be the "simplified" type; else, this is NORMAL.
*/
public final AUTOMATON_TYPE type;
/**
* For {@link AUTOMATON_TYPE#SINGLE} this is the singleton term.
*/
public final BytesRef term;
/**
* Matcher for quickly determining if a byte[] is accepted.
* Only valid for {@link AUTOMATON_TYPE#NORMAL}.
*/
public final ByteRunAutomaton runAutomaton;
/**
* The automaton whose transitions are looked up by state number
* for traversal. It is taken from {@link #runAutomaton}, so the
* state numbering is consistent with it.
* Only valid for {@link AUTOMATON_TYPE#NORMAL}.
*/
public final Automaton automaton;
/**
* Shared common suffix accepted by the automaton. Only valid
* for {@link AUTOMATON_TYPE#NORMAL}, and only when the
* automaton accepts an infinite language. This will be null
* if the common suffix is length 0.
*/
public final BytesRef commonSuffixRef;
/**
* Indicates if the automaton accepts a finite set of strings.
* Null if this was not computed.
* Only valid for {@link AUTOMATON_TYPE#NORMAL}.
*/
public final Boolean finite;
/** Which state, if any, accepts all suffixes, else -1. */
public final int sinkState;
/**
* Creates a compiled automaton by delegating to the three-argument
* constructor with finite={@code null} and simplify={@code true}, so that
* we try to simplify the automaton and determine if it is finite.
*
* @param automaton the automaton to compile
*/
public CompiledAutomaton(Automaton automaton) {
this(automaton, null, true);
}
/** Returns sink state, if present, else -1. */
private static int findSinkState(Automaton automaton) {
int numStates = automaton.getNumStates();
Transition t = new Transition();
int foundState = -1;
// NOTE(review): the next line is corrupted in this copy of the file. The loop
// header breaks off after "s" and the text resumes at " 1000) {" -- presumably
// everything from a '<' up to a later '>' was stripped as markup. The rest of
// findSinkState and the beginning of the code that follows are therefore
// missing. The statements below assign the final fields commonSuffixRef,
// runAutomaton, automaton and sinkState, so they appear to be the tail of a
// constructor rather than part of this static method. Restore this region
// from the upstream source before building.
for (int s=0;s 1000) {
commonSuffixRef = null;
} else {
BytesRef suffix = Operations.getCommonSuffixBytesRef(binary);
// An empty common suffix is stored as null rather than as an empty BytesRef.
if (suffix.length == 0) {
commonSuffixRef = null;
} else {
commonSuffixRef = suffix;
}
}
// This will determinize the binary automaton for us:
runAutomaton = new ByteRunAutomaton(binary, true, determinizeWorkLimit);
// Reuse the automaton held by runAutomaton so both share one state numbering.
this.automaton = runAutomaton.automaton;
// TODO: this is a bit fragile because if the automaton is not minimized there could be more than 1 sink state but auto-prefix will fail
// to run for those:
sinkState = findSinkState(this.automaton);
}
// Scratch Transition reused by addTail and floor when reading transitions of this.automaton.
private Transition transition = new Transition();
//private static final boolean DEBUG = BlockTreeTermsWriter.DEBUG;
private BytesRef addTail(int state, BytesRefBuilder term, int idx, int leadLabel) {
//System.out.println("addTail state=" + state + " term=" + term.utf8ToString() + " idx=" + idx + " leadLabel=" + (char) leadLabel);
//System.out.println(automaton.toDot());
// Find biggest transition that's < label
// TODO: use binary search here
int maxIndex = -1;
int numTransitions = automaton.initTransition(state, transition);
// NOTE(review): the next line is corrupted in this copy of the file. The loop
// header breaks off after "i" and the text resumes at " runAutomaton);" --
// presumably everything from a '<' up to a later '>' was stripped as markup.
// The rest of addTail and the beginning of the following method are missing.
// The case labels below (NONE, ALL, SINGLE) belong to a switch whose header
// is not visible, presumably a switch over `type`; `visitor`, `parent` and
// `field` are not declared anywhere in the visible text. Restore this region
// from the upstream source before building.
for(int i=0;i runAutomaton);
break;
case NONE:
break;
case ALL:
visitor.consumeTermsMatching(parent, field, () -> new ByteRunAutomaton(Automata.makeAnyString()));
break;
case SINGLE:
visitor.consumeTerms(parent, new Term(field, term));
break;
}
}
}
/**
 * Finds the largest term accepted by this automaton that is
 * &lt;= the provided input term. The result is placed in
 * {@code output}; it's fine for output and input to point to
 * the same bytes.
 *
 * @param input the term whose floor is wanted
 * @param output builder that receives the floor term
 * @return the term built in {@code output} (via {@code output.get()}), or
 *     {@code null} if there is no floor term (ie, the provided input term
 *     is before the first term accepted by this automaton)
 */
public BytesRef floor(BytesRef input, BytesRefBuilder output) {
  int state = 0;

  // Special case empty string:
  if (input.length == 0) {
    if (runAutomaton.isAccept(state)) {
      output.clear();
      return output.get();
    } else {
      return null;
    }
  }

  // One entry per input byte consumed so far: the state we were in before
  // stepping on that byte, so that we can back up when we get stuck.
  final List<Integer> stack = new ArrayList<>();

  int idx = 0;
  while (true) {
    int label = input.bytes[input.offset + idx] & 0xff;
    int nextState = runAutomaton.step(state, label);

    if (idx == input.length - 1) {
      if (nextState != -1 && runAutomaton.isAccept(nextState)) {
        // Input string is accepted
        output.grow(1 + idx);
        output.setByteAt(idx, (byte) label);
        output.setLength(input.length);
        return output.get();
      } else {
        // Last byte consumed without landing on an accept state: treat it
        // like a dead end so the back-up logic below runs.
        nextState = -1;
      }
    }

    if (nextState == -1) {
      // Pop back to a state that has a transition
      // <= our label:
      while (true) {
        int numTransitions = automaton.getNumTransitions(state);
        if (numTransitions == 0) {
          assert runAutomaton.isAccept(state);
          output.setLength(idx);
          return output.get();
        } else {
          // NOTE(review): this writes into the instance-level scratch
          // `transition`, so concurrent floor() calls on one instance
          // presumably interfere with each other -- confirm with callers.
          automaton.getTransition(state, 0, transition);

          if (label - 1 < transition.min) {
            // Transition 0 starts above label-1, so this state offers
            // nothing below our label; the prefix itself is the floor
            // if the state accepts.
            if (runAutomaton.isAccept(state)) {
              output.setLength(idx);
              return output.get();
            }
            // pop
            if (stack.isEmpty()) {
              return null;
            } else {
              state = stack.remove(stack.size() - 1);
              idx--;
              label = input.bytes[input.offset + idx] & 0xff;
            }
          } else {
            break;
          }
        }
      }

      return addTail(state, output, idx, label);
    } else {
      output.grow(1 + idx);
      output.setByteAt(idx, (byte) label);
      stack.add(state);
      state = nextState;
      idx++;
    }
  }
}
/**
 * Combines the hash codes of {@code runAutomaton}, {@code term} and
 * {@code type}, in that order, with the usual multiply-by-31 fold;
 * a {@code null} component contributes 0.
 */
@Override
public int hashCode() {
  int hash = 1;
  for (Object component : new Object[] {runAutomaton, term, type}) {
    hash = 31 * hash + (component == null ? 0 : component.hashCode());
  }
  return hash;
}
/**
 * Two compiled automata are equal when they have the same runtime class
 * and the same {@code type}, and additionally the same {@code term} for
 * {@link AUTOMATON_TYPE#SINGLE} or the same {@code runAutomaton} for
 * {@link AUTOMATON_TYPE#NORMAL}. For the remaining types nothing beyond
 * the type is compared.
 */
@Override
public boolean equals(Object obj) {
  if (obj == this) {
    return true;
  }
  if (obj == null || obj.getClass() != getClass()) {
    return false;
  }
  final CompiledAutomaton that = (CompiledAutomaton) obj;
  if (that.type != type) {
    return false;
  }
  if (type == AUTOMATON_TYPE.SINGLE) {
    return term.equals(that.term);
  }
  if (type == AUTOMATON_TYPE.NORMAL) {
    return runAutomaton.equals(that.runAutomaton);
  }
  return true;
}
/**
 * Returns {@code BASE_RAM_BYTES} plus the
 * {@code RamUsageEstimator.sizeOfObject} result for each of
 * {@code automaton}, {@code commonSuffixRef}, {@code runAutomaton},
 * {@code term} and the scratch {@code transition}.
 */
@Override
public long ramBytesUsed() {
  long total = BASE_RAM_BYTES;
  total += RamUsageEstimator.sizeOfObject(automaton);
  total += RamUsageEstimator.sizeOfObject(commonSuffixRef);
  total += RamUsageEstimator.sizeOfObject(runAutomaton);
  total += RamUsageEstimator.sizeOfObject(term);
  total += RamUsageEstimator.sizeOfObject(transition);
  return total;
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy