All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.lucene.index.AutomatonTermsEnum Maven / Gradle / Ivy

There is a newer version: 6.4.2_1
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.index;

import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.automaton.ByteRunnable;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.Transition;
import org.apache.lucene.util.automaton.TransitionAccessor;

/**
 * A FilteredTermsEnum that enumerates terms based upon what is accepted by a DFA.
 *
 * 

The algorithm is such: * *

    *
  1. As long as matches are successful, keep reading sequentially. *
  2. When a match fails, skip to the next string in lexicographic order that does not enter a * reject state. *
* *

The algorithm does not attempt to actually skip to the next string that is completely * accepted. This is not possible when the language accepted by the FSM is not finite (i.e. * * operator). * * @lucene.internal */ public class AutomatonTermsEnum extends FilteredTermsEnum { // a tableized array-based form of the DFA private final ByteRunnable byteRunnable; // common suffix of the automaton private final BytesRef commonSuffixRef; // true if the automaton accepts a finite language private final boolean finite; // array of sorted transitions for each state, indexed by state number private final TransitionAccessor transitionAccessor; // Used for visited state tracking: each short records gen when we last // visited the state; we use gens to avoid having to clear private short[] visited; private short curGen; // the reference used for seeking forwards through the term dictionary private final BytesRefBuilder seekBytesRef = new BytesRefBuilder(); // true if we are enumerating an infinite portion of the DFA. // in this case it is faster to drive the query based on the terms dictionary. // when this is true, linearUpperBound indicate the end of range // of terms where we should simply do sequential reads instead. private boolean linear; private final BytesRef linearUpperBound = new BytesRef(); private final Transition transition = new Transition(); private final IntsRefBuilder savedStates = new IntsRefBuilder(); /** * Construct an enumerator based upon an automaton, enumerating the specified field, working on a * supplied TermsEnum * * @lucene.experimental * @param compiled CompiledAutomaton */ public AutomatonTermsEnum(TermsEnum tenum, CompiledAutomaton compiled) { super(tenum); if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) { throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead"); } this.finite = compiled.finite; this.byteRunnable = compiled.getByteRunnable(); this.transitionAccessor = compiled.getTransitionAccessor(); this.commonSuffixRef = compiled.commonSuffixRef; // No need to track visited states for a finite language without loops. visited = finite ? null : new short[byteRunnable.getSize()]; } /** Records the given state has been visited. */ private void setVisited(int state) { if (!finite) { if (state >= visited.length) { visited = ArrayUtil.grow(visited, state + 1); } visited[state] = curGen; } } /** Indicates whether the given state has been visited. */ private boolean isVisited(int state) { return !finite && state < visited.length && visited[state] == curGen; } /** * Returns true if the term matches the automaton. Also stashes away the term to assist with smart * enumeration. */ @Override protected AcceptStatus accept(final BytesRef term) { if (commonSuffixRef == null || StringHelper.endsWith(term, commonSuffixRef)) { if (byteRunnable.run(term.bytes, term.offset, term.length)) return linear ? AcceptStatus.YES : AcceptStatus.YES_AND_SEEK; else return (linear && term.compareTo(linearUpperBound) < 0) ? AcceptStatus.NO : AcceptStatus.NO_AND_SEEK; } else { return (linear && term.compareTo(linearUpperBound) < 0) ? AcceptStatus.NO : AcceptStatus.NO_AND_SEEK; } } @Override protected BytesRef nextSeekTerm(final BytesRef term) throws IOException { // System.out.println("ATE.nextSeekTerm term=" + term); if (term == null) { assert seekBytesRef.length() == 0; // return the empty term, as it's valid if (byteRunnable.isAccept(0)) { return seekBytesRef.get(); } } else { seekBytesRef.copyBytes(term); } // seek to the next possible string; if (nextString()) { return seekBytesRef.get(); // reposition } else { return null; // no more possible strings can match } } /** * Sets the enum to operate in linear fashion, as we have found a looping transition at position: * we set an upper bound and act like a TermRangeQuery for this portion of the term space. */ private void setLinear(int position) { assert linear == false; int state = 0; int maxInterval = 0xff; // System.out.println("setLinear pos=" + position + " seekbytesRef=" + seekBytesRef); for (int i = 0; i < position; i++) { state = byteRunnable.step(state, seekBytesRef.byteAt(i) & 0xff); assert state >= 0 : "state=" + state; } final int numTransitions = transitionAccessor.getNumTransitions(state); transitionAccessor.initTransition(state, transition); for (int i = 0; i < numTransitions; i++) { transitionAccessor.getNextTransition(transition); if (transition.min <= (seekBytesRef.byteAt(position) & 0xff) && (seekBytesRef.byteAt(position) & 0xff) <= transition.max) { maxInterval = transition.max; break; } } // 0xff terms don't get the optimization... not worth the trouble. if (maxInterval != 0xff) maxInterval++; int length = position + 1; /* position + maxTransition */ if (linearUpperBound.bytes.length < length) { linearUpperBound.bytes = new byte[ArrayUtil.oversize(length, Byte.BYTES)]; } System.arraycopy(seekBytesRef.bytes(), 0, linearUpperBound.bytes, 0, position); linearUpperBound.bytes[position] = (byte) maxInterval; linearUpperBound.length = length; linear = true; } /** * Increments the byte buffer to the next String in binary order after s that will not put the * machine into a reject state. If such a string does not exist, returns false. * *

The correctness of this method depends upon the automaton being deterministic, and having no * transitions to dead states. * * @return true if more possible solutions exist for the DFA */ private boolean nextString() { int state; int pos = 0; savedStates.grow(seekBytesRef.length() + 1); savedStates.setIntAt(0, 0); while (true) { if (!finite && ++curGen == 0) { // Clear the visited states every time curGen wraps (so very infrequently to not impact // average perf). Arrays.fill(visited, (short) -1); } linear = false; // walk the automaton until a character is rejected. for (state = savedStates.intAt(pos); pos < seekBytesRef.length(); pos++) { setVisited(state); int nextState = byteRunnable.step(state, seekBytesRef.byteAt(pos) & 0xff); if (nextState == -1) break; savedStates.setIntAt(pos + 1, nextState); // we found a loop, record it for faster enumeration if (!linear && isVisited(nextState)) { setLinear(pos); } state = nextState; } // take the useful portion, and the last non-reject state, and attempt to // append characters that will match. if (nextString(state, pos)) { return true; } else { /* no more solutions exist from this useful portion, backtrack */ if ((pos = backtrack(pos)) < 0) { /* no more solutions at all */ return false; } final int newState = byteRunnable.step(savedStates.intAt(pos), seekBytesRef.byteAt(pos) & 0xff); if (newState >= 0 && byteRunnable.isAccept(newState)) { /* String is good to go as-is */ return true; } /* else advance further */ // TODO: paranoia? if we backtrack thru an infinite DFA, the loop detection is important! // for now, restart from scratch for all infinite DFAs if (!finite) pos = 0; } } } /** * Returns the next String in lexicographic order that will not put the machine into a reject * state. * *

This method traverses the DFA from the given position in the String, starting at the given * state. * *

If this cannot satisfy the machine, returns false. This method will walk the minimal path, * in lexicographic order, as long as possible. * *

If this method returns false, then there might still be more solutions, it is necessary to * backtrack to find out. * * @param state current non-reject state * @param position useful portion of the string * @return true if more possible solutions exist for the DFA from this position */ private boolean nextString(int state, int position) { /* * the next lexicographic character must be greater than the existing * character, if it exists. */ int c = 0; if (position < seekBytesRef.length()) { c = seekBytesRef.byteAt(position) & 0xff; // if the next byte is 0xff and is not part of the useful portion, // then by definition it puts us in a reject state, and therefore this // path is dead. there cannot be any higher transitions. backtrack. if (c++ == 0xff) return false; } seekBytesRef.setLength(position); setVisited(state); final int numTransitions = transitionAccessor.getNumTransitions(state); transitionAccessor.initTransition(state, transition); // find the minimal path (lexicographic order) that is >= c for (int i = 0; i < numTransitions; i++) { transitionAccessor.getNextTransition(transition); if (transition.max >= c) { int nextChar = Math.max(c, transition.min); // append either the next sequential char, or the minimum transition seekBytesRef.append((byte) nextChar); state = transition.dest; /* * as long as is possible, continue down the minimal path in * lexicographic order. if a loop or accept state is encountered, stop. */ while (!isVisited(state) && !byteRunnable.isAccept(state)) { setVisited(state); /* * Note: we work with a DFA with no transitions to dead states. * so the below is ok, if it is not an accept state, * then there MUST be at least one transition. */ transitionAccessor.initTransition(state, transition); transitionAccessor.getNextTransition(transition); state = transition.dest; // append the minimum transition seekBytesRef.append((byte) transition.min); // we found a loop, record it for faster enumeration if (!linear && isVisited(state)) { setLinear(seekBytesRef.length() - 1); } } return true; } } return false; } /** * Attempts to backtrack thru the string after encountering a dead end at some given position. * Returns false if no more possible strings can match. * * @param position current position in the input String * @return {@code position >= 0} if more possible solutions exist for the DFA */ private int backtrack(int position) { while (position-- > 0) { int nextChar = seekBytesRef.byteAt(position) & 0xff; // if a character is 0xff it's a dead-end too, // because there is no higher character in binary sort order. if (nextChar++ != 0xff) { seekBytesRef.setByteAt(position, (byte) nextChar); seekBytesRef.setLength(position + 1); return position; } } return -1; /* all solutions exhausted */ } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy