// All downloads are free. Search and download functionality uses the official Maven repository.
//
// org.apache.lucene.codecs.uniformsplit.IntersectBlockReader (Maven / Gradle / Ivy)

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.codecs.uniformsplit;

import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.automaton.ByteRunnable;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.Transition;
import org.apache.lucene.util.automaton.TransitionAccessor;

/**
 * The "intersect" {@link TermsEnum} response to {@link
 * UniformSplitTerms#intersect(CompiledAutomaton, BytesRef)}, intersecting the terms with an
 * automaton.
 *
 * 

By design of the UniformSplit block keys, it is less efficient than {@code * org.apache.lucene.backward_codecs.lucene40.blocktree.IntersectTermsEnum} for {@link * org.apache.lucene.search.FuzzyQuery} (-37%). It is slightly slower for {@link * org.apache.lucene.search.WildcardQuery} (-5%) and slightly faster for {@link * org.apache.lucene.search.PrefixQuery} (+5%). * * @lucene.experimental */ public class IntersectBlockReader extends BlockReader { /** * Block iteration order. Whether to move next block, jump to a block away, or end the iteration. */ protected enum BlockIteration { NEXT, SEEK, END } /** * Threshold that controls when to attempt to jump to a block away. * *

This counter is 0 when entering a block. It is incremented each time a term is rejected by * the automaton. When the counter is greater than or equal to this threshold, then we compute the * next term accepted by the automaton, with {@link AutomatonNextTermCalculator}, and we jump to a * block away if the next term accepted is greater than the immediate next term in the block. * *

A low value, for example 1, improves the performance of automatons requiring many jumps, for * example {@link org.apache.lucene.search.FuzzyQuery} and most {@link * org.apache.lucene.search.WildcardQuery}. A higher value improves the performance of automatons * with less or no jump, for example {@link org.apache.lucene.search.PrefixQuery}. A threshold of * 4 seems to be a good balance. */ protected final int NUM_CONSECUTIVELY_REJECTED_TERMS_THRESHOLD = 4; protected final TransitionAccessor transitionAccessor; protected final ByteRunnable byteRunnable; protected final boolean finite; protected final BytesRef commonSuffix; // maybe null protected final int minTermLength; protected final AutomatonNextTermCalculator nextStringCalculator; /** Set this when our current mode is seeking to this term. Set to null after. */ protected BytesRef seekTerm; /** Number of bytes accepted by the automaton when validating the current term. */ protected int numMatchedBytes; /** * Automaton states reached when validating the current term, from 0 to {@link #numMatchedBytes} - * 1. */ protected int[] states; /** Block iteration order determined when scanning the terms in the current block. */ protected BlockIteration blockIteration; /** * Counter of the number of consecutively rejected terms. Depending on {@link * #NUM_CONSECUTIVELY_REJECTED_TERMS_THRESHOLD}, this may trigger a jump to a block away. 
*/ protected int numConsecutivelyRejectedTerms; protected IntersectBlockReader( CompiledAutomaton compiled, BytesRef startTerm, IndexDictionary.BrowserSupplier dictionaryBrowserSupplier, IndexInput blockInput, PostingsReaderBase postingsReader, FieldMetadata fieldMetadata, BlockDecoder blockDecoder) throws IOException { super(dictionaryBrowserSupplier, blockInput, postingsReader, fieldMetadata, blockDecoder); this.byteRunnable = compiled.getByteRunnable(); this.transitionAccessor = compiled.getTransitionAccessor(); finite = compiled.finite; commonSuffix = compiled.commonSuffixRef; minTermLength = getMinTermLength(); nextStringCalculator = new AutomatonNextTermCalculator(compiled); seekTerm = startTerm; } /** * Computes the minimal length of the terms accepted by the automaton. This speeds up the term * scanning for automatons accepting a finite language. */ protected int getMinTermLength() { // Automatons accepting infinite language (e.g. PrefixQuery and WildcardQuery) do not benefit // much from // min term length while it takes time to compute it. More precisely, by skipping this // computation PrefixQuery // is significantly boosted while WildcardQuery might be slightly degraded on average. This min // term length // mainly boosts FuzzyQuery. int commonSuffixLength = commonSuffix == null ? 0 : commonSuffix.length; if (!finite) { return commonSuffixLength; } // Since we are only dealing with finite language, there is no loop to detect. int commonPrefixLength = 0; int state = 0; Transition t = null; while (true) { if (byteRunnable.isAccept(state)) { // The common prefix reaches a final state. So common prefix and common suffix overlap. // Min term length is the max between common prefix and common suffix lengths. 
return Math.max(commonPrefixLength, commonSuffixLength); } if (transitionAccessor.getNumTransitions(state) == 1) { if (t == null) { t = new Transition(); } transitionAccessor.getTransition(state, 0, t); if (t.min == t.max) { state = t.dest; commonPrefixLength++; continue; } } break; } // Min term length is the sum of common prefix and common suffix lengths. return commonPrefixLength + commonSuffixLength; } @Override public BytesRef next() throws IOException { if (blockHeader == null) { if (!seekFirstBlock()) { return null; } states = new int[32]; blockIteration = BlockIteration.NEXT; } termState = null; do { BytesRef term = nextTermInBlockMatching(); if (term != null) { return term; } } while (nextBlock()); return null; } protected boolean seekFirstBlock() throws IOException { seekTerm = nextStringCalculator.nextSeekTerm(seekTerm); if (seekTerm == null) { return false; } long blockStartFP = getOrCreateDictionaryBrowser().seekBlock(seekTerm); if (blockStartFP == -1) { blockStartFP = fieldMetadata.getFirstBlockStartFP(); } else if (isBeyondLastTerm(seekTerm, blockStartFP)) { return false; } initializeHeader(seekTerm, blockStartFP); return blockHeader != null; } /** * Finds the next block line that matches (accepted by the automaton), or null when at end of * block. * * @return The next term in the current block that is accepted by the automaton; or null if none. 
*/ protected BytesRef nextTermInBlockMatching() throws IOException { if (seekTerm == null) { if (readLineInBlock() == null) { return null; } } else { SeekStatus seekStatus = seekInBlock(seekTerm); seekTerm = null; if (seekStatus == SeekStatus.END) { return null; } assert numMatchedBytes == 0; assert numConsecutivelyRejectedTerms == 0; } while (true) { TermBytes lineTermBytes = blockLine.getTermBytes(); BytesRef lineTerm = lineTermBytes.getTerm(); assert lineTerm.offset == 0; if (states.length <= lineTerm.length) { states = ArrayUtil.growExact(states, ArrayUtil.oversize(lineTerm.length + 1, Byte.BYTES)); } // Since terms are delta encoded, we may start the automaton steps from the last state reached // by the previous term. int index = Math.min(lineTermBytes.getSuffixOffset(), numMatchedBytes); // Skip this term early if it is shorter than the min term length, or if it does not end with // the common suffix // accepted by the automaton. if (lineTerm.length >= minTermLength && (commonSuffix == null || endsWithCommonSuffix(lineTerm.bytes, lineTerm.length))) { int state = states[index]; while (true) { if (index == lineTerm.length) { if (byteRunnable.isAccept(state)) { // The automaton accepts the current term. Record the number of matched bytes and // return the term. assert byteRunnable.run(lineTerm.bytes, 0, lineTerm.length); numMatchedBytes = index; if (numConsecutivelyRejectedTerms > 0) { numConsecutivelyRejectedTerms = 0; } assert blockIteration == BlockIteration.NEXT; return lineTerm; } break; } state = byteRunnable.step(state, lineTerm.bytes[index] & 0xff); if (state == -1) { // The automaton rejects the current term. break; } // Record the reached automaton state. states[++index] = state; } } // The current term is not accepted by the automaton. // Still record the reached automaton state to start the next term steps from there. 
assert !byteRunnable.run(lineTerm.bytes, 0, lineTerm.length); numMatchedBytes = index; // If the number of consecutively rejected terms reaches the threshold, // then determine whether it is worthwhile to jump to a block away. if (++numConsecutivelyRejectedTerms >= NUM_CONSECUTIVELY_REJECTED_TERMS_THRESHOLD && lineIndexInBlock < blockHeader.getLinesCount() - 1 && !nextStringCalculator.isLinearState(lineTerm)) { // Compute the next term accepted by the automaton after the current term. if ((seekTerm = nextStringCalculator.nextSeekTerm(lineTerm)) == null) { blockIteration = BlockIteration.END; return null; } // It is worthwhile to jump to a block away if the next term accepted is after the next term // in the block. // Actually the block away may be the current block, but this is a good heuristic. readLineInBlock(); if (seekTerm.compareTo(blockLine.getTermBytes().getTerm()) > 0) { // Stop scanning this block terms and set the iteration order to jump to a block away by // seeking seekTerm. blockIteration = BlockIteration.SEEK; return null; } seekTerm = null; // If it is not worthwhile to jump to a block away, do not attempt anymore for the current // block. numConsecutivelyRejectedTerms = Integer.MIN_VALUE; } else if (readLineInBlock() == null) { // No more terms in the block. The iteration order is to open the very next block. assert blockIteration == BlockIteration.NEXT; return null; } } } /** * Indicates whether the given term ends with the automaton common suffix. This allows to quickly * skip terms that the automaton would reject eventually. */ protected boolean endsWithCommonSuffix(byte[] termBytes, int termLength) { byte[] suffixBytes = commonSuffix.bytes; int suffixLength = commonSuffix.length; int offset = termLength - suffixLength; assert offset >= 0; // We already checked minTermLength. for (int i = 0; i < suffixLength; i++) { if (termBytes[offset + i] != suffixBytes[i]) { return false; } } return true; } /** * Opens the next block. 
Depending on the {@link #blockIteration} order, it may be the very next * block, or a block away that may contain {@link #seekTerm}. * * @return true if the next block is opened; false if there is no blocks anymore and the iteration * is over. */ protected boolean nextBlock() throws IOException { long blockStartFP; switch (blockIteration) { case NEXT: assert seekTerm == null; blockStartFP = blockInput.getFilePointer(); break; case SEEK: assert seekTerm != null; blockStartFP = getOrCreateDictionaryBrowser().seekBlock(seekTerm); if (isBeyondLastTerm(seekTerm, blockStartFP)) { return false; } blockIteration = BlockIteration.NEXT; break; case END: return false; default: throw new UnsupportedOperationException( "Unsupported " + BlockIteration.class.getSimpleName()); } numMatchedBytes = 0; numConsecutivelyRejectedTerms = 0; initializeHeader(seekTerm, blockStartFP); return blockHeader != null; } @Override public boolean seekExact(BytesRef text) { throw new UnsupportedOperationException(); } @Override public void seekExact(long ord) { throw new UnsupportedOperationException(); } @Override public void seekExact(BytesRef term, TermState state) { throw new UnsupportedOperationException(); } @Override public SeekStatus seekCeil(BytesRef text) { throw new UnsupportedOperationException(); } /** * This is mostly a copy of AutomatonTermsEnum. Since it's an inner class, the outer class can * call methods that ATE does not expose. It'd be nice if ATE's logic could be more extensible. */ protected class AutomatonNextTermCalculator { // for path tracking: each short records gen when we last // visited the state; we use gens to avoid having to clear protected short[] visited; protected short curGen; // the reference used for seeking forwards through the term dictionary protected final BytesRefBuilder seekBytesRef = new BytesRefBuilder(); // true if we are enumerating an infinite portion of the DFA. // in this case it is faster to drive the query based on the terms dictionary. 
// when this is true, linearUpperBound indicate the end of range // of terms where we should simply do sequential reads instead. protected boolean linear; protected final BytesRef linearUpperBound = new BytesRef(); protected final Transition transition = new Transition(); protected final IntsRefBuilder savedStates = new IntsRefBuilder(); protected AutomatonNextTermCalculator(CompiledAutomaton compiled) { visited = compiled.finite ? null : new short[byteRunnable.getSize()]; } /** Records the given state has been visited. */ private void setVisited(int state) { if (finite == false) { if (state >= visited.length) { visited = ArrayUtil.grow(visited, state + 1); } visited[state] = curGen; } } /** Indicates whether the given state has been visited. */ private boolean isVisited(int state) { return !finite && state < visited.length && visited[state] == curGen; } /** True if the current state of the automata is best iterated linearly (without seeking). */ protected boolean isLinearState(BytesRef term) { return linear && term.compareTo(linearUpperBound) < 0; } /** * @see org.apache.lucene.index.FilteredTermsEnum#nextSeekTerm(BytesRef) */ protected BytesRef nextSeekTerm(final BytesRef term) { // System.out.println("ATE.nextSeekTerm term=" + term); if (term == null) { assert seekBytesRef.length() == 0; // return the empty term, as it's valid if (byteRunnable.isAccept(0)) { return seekBytesRef.get(); } } else { seekBytesRef.copyBytes(term); } // seek to the next possible string; if (nextString()) { return seekBytesRef.get(); // reposition } else { return null; // no more possible strings can match } } /** * Sets the enum to operate in linear fashion, as we have found a looping transition at * position: we set an upper bound and act like a TermRangeQuery for this portion of the term * space. 
*/ protected void setLinear(int position) { assert linear == false; int state = 0; int maxInterval = 0xff; // System.out.println("setLinear pos=" + position + " seekbytesRef=" + seekBytesRef); for (int i = 0; i < position; i++) { state = byteRunnable.step(state, seekBytesRef.byteAt(i) & 0xff); assert state >= 0 : "state=" + state; } final int numTransitions = transitionAccessor.getNumTransitions(state); transitionAccessor.initTransition(state, transition); for (int i = 0; i < numTransitions; i++) { transitionAccessor.getNextTransition(transition); if (transition.min <= (seekBytesRef.byteAt(position) & 0xff) && (seekBytesRef.byteAt(position) & 0xff) <= transition.max) { maxInterval = transition.max; break; } } // 0xff terms don't get the optimization... not worth the trouble. if (maxInterval != 0xff) maxInterval++; int length = position + 1; /* position + maxTransition */ if (linearUpperBound.bytes.length < length) { linearUpperBound.bytes = new byte[ArrayUtil.oversize(length, Byte.BYTES)]; } System.arraycopy(seekBytesRef.bytes(), 0, linearUpperBound.bytes, 0, position); linearUpperBound.bytes[position] = (byte) maxInterval; linearUpperBound.length = length; linear = true; } /** * Increments the byte buffer to the next String in binary order after s that will not put the * machine into a reject state. If such a string does not exist, returns false. * *

The correctness of this method depends upon the automaton being deterministic, and having * no transitions to dead states. * * @return true if more possible solutions exist for the DFA */ protected boolean nextString() { int state; int pos = 0; savedStates.grow(seekBytesRef.length() + 1); savedStates.setIntAt(0, 0); while (true) { if (!finite && ++curGen == 0) { // Clear the visited states every time curGen wraps (so very infrequently to not impact // average perf). Arrays.fill(visited, (short) -1); } linear = false; // walk the automaton until a character is rejected. for (state = savedStates.intAt(pos); pos < seekBytesRef.length(); pos++) { setVisited(state); int nextState = byteRunnable.step(state, seekBytesRef.byteAt(pos) & 0xff); if (nextState == -1) break; savedStates.setIntAt(pos + 1, nextState); // we found a loop, record it for faster enumeration if (!linear && isVisited(nextState)) { setLinear(pos); } state = nextState; } // take the useful portion, and the last non-reject state, and attempt to // append characters that will match. if (nextString(state, pos)) { return true; } else { /* no more solutions exist from this useful portion, backtrack */ if ((pos = backtrack(pos)) < 0) /* no more solutions at all */ return false; final int newState = byteRunnable.step(savedStates.intAt(pos), seekBytesRef.byteAt(pos) & 0xff); if (newState >= 0 && byteRunnable.isAccept(newState)) /* String is good to go as-is */ return true; /* else advance further */ // paranoia? if we backtrack thru an infinite DFA, the loop detection is important! // for now, restart from scratch for all infinite DFAs if (!finite) pos = 0; } } } /** * Returns the next String in lexicographic order that will not put the machine into a reject * state. * *

This method traverses the DFA from the given position in the String, starting at the given * state. * *

If this cannot satisfy the machine, returns false. This method will walk the minimal path, * in lexicographic order, as long as possible. * *

If this method returns false, then there might still be more solutions, it is necessary to * backtrack to find out. * * @param state current non-reject state * @param position useful portion of the string * @return true if more possible solutions exist for the DFA from this position */ protected boolean nextString(int state, int position) { /* * the next lexicographic character must be greater than the existing * character, if it exists. */ int c = 0; if (position < seekBytesRef.length()) { c = seekBytesRef.byteAt(position) & 0xff; // if the next byte is 0xff and is not part of the useful portion, // then by definition it puts us in a reject state, and therefore this // path is dead. there cannot be any higher transitions. backtrack. if (c++ == 0xff) return false; } seekBytesRef.setLength(position); setVisited(state); final int numTransitions = transitionAccessor.getNumTransitions(state); transitionAccessor.initTransition(state, transition); // find the minimal path (lexicographic order) that is >= c for (int i = 0; i < numTransitions; i++) { transitionAccessor.getNextTransition(transition); if (transition.max >= c) { int nextChar = Math.max(c, transition.min); // append either the next sequential char, or the minimum transition seekBytesRef.append((byte) nextChar); state = transition.dest; /* * as long as is possible, continue down the minimal path in * lexicographic order. if a loop or accept state is encountered, stop. */ while (!isVisited(state) && !byteRunnable.isAccept(state)) { setVisited(state); /* * Note: we work with a DFA with no transitions to dead states. * so the below is ok, if it is not an accept state, * then there MUST be at least one transition. 
*/ transitionAccessor.initTransition(state, transition); transitionAccessor.getNextTransition(transition); state = transition.dest; // append the minimum transition seekBytesRef.append((byte) transition.min); // we found a loop, record it for faster enumeration if (!linear && isVisited(state)) { setLinear(seekBytesRef.length() - 1); } } return true; } } return false; } /** * Attempts to backtrack thru the string after encountering a dead end at some given position. * Returns false if no more possible strings can match. * * @param position current position in the input String * @return {@code position >= 0} if more possible solutions exist for the DFA */ protected int backtrack(int position) { while (position-- > 0) { int nextChar = seekBytesRef.byteAt(position) & 0xff; // if a character is 0xff it's a dead-end too, // because there is no higher character in binary sort order. if (nextChar++ != 0xff) { seekBytesRef.setByteAt(position, (byte) nextChar); seekBytesRef.setLength(position + 1); return position; } } return -1; /* all solutions exhausted */ } } }





// © 2015 - 2025 Weber Informatics LLC | Privacy Policy