// org.apache.lucene.queries.intervals.Intervals Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.queries.intervals;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.function.Predicate;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.RegexpQuery;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.RegExp;
/**
 * Factory functions for creating {@link IntervalsSource interval sources}.
 *
 * <p>These sources implement minimum-interval algorithms taken from the paper <a
 * href="https://vigna.di.unimi.it/ftp/papers/EfficientLazy.pdf">Efficient Optimally Lazy Algorithms
 * for Minimal-Interval Semantics</a>
 *
 * <p><em>Note:</em> by default, sources that are sensitive to internal gaps (e.g. {@code PHRASE}
 * and {@code MAXGAPS}) will rewrite their sub-sources so that disjunctions of different lengths are
 * pulled up to the top of the interval tree. For example, {@code PHRASE(or(PHRASE("a", "b", "c"),
 * "b"), "c")} will automatically rewrite itself to {@code OR(PHRASE("a", "b", "c", "c"),
 * PHRASE("b", "c"))} to ensure that documents containing {@code "b c"} are matched. This can lead
 * to less efficient queries, as more terms need to be loaded (for example, the {@code "c"} iterator
 * above is loaded twice), so if you care more about speed than about accuracy you can use the
 * {@link #or(boolean, IntervalsSource...)} factory method to prevent rewriting.
 */
public final class Intervals {

  /**
   * The default number of expansions in:
   *
   * <ul>
   *   <li>{@link #prefix(BytesRef)}
   *   <li>{@link #wildcard(BytesRef)}
   *   <li>{@link #regexp(BytesRef)}
   *   <li>{@link #range(BytesRef, BytesRef, boolean, boolean)}
   *   <li>{@link #fuzzyTerm(String, int)}
   *   <li>{@link #multiterm(CompiledAutomaton, String)}
   * </ul>
   */
  public static final int DEFAULT_MAX_EXPANSIONS = 128;

  /** Static factory holder; never instantiated. */
  private Intervals() {}

  /**
   * Return an {@link IntervalsSource} exposing intervals for a term
   *
   * @param term the term, as raw bytes
   */
  public static IntervalsSource term(BytesRef term) {
    return new TermIntervalsSource(term);
  }

  /**
   * Return an {@link IntervalsSource} exposing intervals for a term
   *
   * @param term the term text; it is converted to a {@link BytesRef} before use
   */
  public static IntervalsSource term(String term) {
    return new TermIntervalsSource(new BytesRef(term));
  }

  /**
   * Return an {@link IntervalsSource} exposing intervals for a term, filtered by the value of the
   * term's payload at each position
   *
   * @param term the term text; it is converted to a {@link BytesRef} before use
   * @param payloadFilter predicate applied to the payload at each position
   */
  public static IntervalsSource term(String term, Predicate<BytesRef> payloadFilter) {
    return term(new BytesRef(term), payloadFilter);
  }

  /**
   * Return an {@link IntervalsSource} exposing intervals for a term, filtered by the value of the
   * term's payload at each position
   *
   * @param term the term, as raw bytes
   * @param payloadFilter predicate applied to the payload at each position
   */
  public static IntervalsSource term(BytesRef term, Predicate<BytesRef> payloadFilter) {
    return new PayloadFilteredTermIntervalsSource(term, payloadFilter);
  }

  /**
   * Return an {@link IntervalsSource} exposing intervals for a phrase consisting of a list of terms
   *
   * <p>A single term is returned as a plain {@link #term(String)} source rather than being wrapped
   * in a phrase.
   *
   * @param terms the terms of the phrase, in order
   */
  public static IntervalsSource phrase(String... terms) {
    if (terms.length == 1) {
      return Intervals.term(terms[0]);
    }
    IntervalsSource[] sources = new IntervalsSource[terms.length];
    for (int i = 0; i < terms.length; i++) {
      sources[i] = term(terms[i]);
    }
    return phrase(sources);
  }

  /**
   * Return an {@link IntervalsSource} exposing intervals for a phrase consisting of a list of
   * {@link IntervalsSource interval sources}
   *
   * @param subSources the sources making up the phrase, in order
   */
  public static IntervalsSource phrase(IntervalsSource... subSources) {
    return BlockIntervalsSource.build(Arrays.asList(subSources));
  }

  /**
   * Return an {@link IntervalsSource} over the disjunction of a set of sub-sources
   *
   * <p>Automatically rewrites if wrapped by an interval source that is sensitive to internal gaps
   *
   * @param subSources the sources to combine
   */
  public static IntervalsSource or(IntervalsSource... subSources) {
    return or(true, Arrays.asList(subSources));
  }

  /**
   * Return an {@link IntervalsSource} over the disjunction of a set of sub-sources
   *
   * @param rewrite if {@code false}, do not rewrite intervals that are sensitive to internal gaps;
   *     this may run more efficiently, but can miss valid hits due to minimization
   * @param subSources the sources to combine
   */
  public static IntervalsSource or(boolean rewrite, IntervalsSource... subSources) {
    return or(rewrite, Arrays.asList(subSources));
  }

  /**
   * Return an {@link IntervalsSource} over the disjunction of a set of sub-sources
   *
   * <p>Equivalent to calling {@link #or(boolean, List)} with {@code rewrite} set to {@code true}.
   *
   * @param subSources the sources to combine
   */
  public static IntervalsSource or(List<IntervalsSource> subSources) {
    return or(true, subSources);
  }

  /**
   * Return an {@link IntervalsSource} over the disjunction of a set of sub-sources
   *
   * @param rewrite if {@code false}, do not rewrite intervals that are sensitive to internal gaps;
   *     this may run more efficiently, but can miss valid hits due to minimization
   * @param subSources the sources to combine
   */
  public static IntervalsSource or(boolean rewrite, List<IntervalsSource> subSources) {
    return DisjunctionIntervalsSource.create(subSources, rewrite);
  }

  /**
   * Return an {@link IntervalsSource} over the disjunction of all terms that begin with a prefix
   *
   * @param prefix the prefix to expand
   * @throws IllegalStateException if the prefix expands to more than {@link
   *     #DEFAULT_MAX_EXPANSIONS} terms
   */
  public static IntervalsSource prefix(BytesRef prefix) {
    return prefix(prefix, DEFAULT_MAX_EXPANSIONS);
  }

  /**
   * Expert: Return an {@link IntervalsSource} over the disjunction of all terms that begin with a
   * prefix
   *
   * <p>WARNING: Setting {@code maxExpansions} to higher than the default value of {@link
   * #DEFAULT_MAX_EXPANSIONS} can be both slow and memory-intensive
   *
   * @param prefix the prefix to expand
   * @param maxExpansions the maximum number of terms to expand to
   * @throws IllegalStateException if the prefix expands to more than {@code maxExpansions} terms
   */
  public static IntervalsSource prefix(BytesRef prefix, int maxExpansions) {
    CompiledAutomaton ca =
        new CompiledAutomaton(PrefixQuery.toAutomaton(prefix), false, true, true);
    // The pattern string is the prefix followed by '*'.
    return new MultiTermIntervalsSource(ca, maxExpansions, prefix.utf8ToString() + "*");
  }

  /**
   * Return an {@link IntervalsSource} over the disjunction of all terms that match a wildcard glob
   *
   * @param wildcard the glob to expand
   * @throws IllegalStateException if the wildcard glob expands to more than {@link
   *     #DEFAULT_MAX_EXPANSIONS} terms
   * @see WildcardQuery for glob format
   */
  public static IntervalsSource wildcard(BytesRef wildcard) {
    return wildcard(wildcard, DEFAULT_MAX_EXPANSIONS);
  }

  /**
   * Expert: Return an {@link IntervalsSource} over the disjunction of all terms that match a
   * wildcard glob
   *
   * <p>WARNING: Setting {@code maxExpansions} to higher than the default value of {@link
   * #DEFAULT_MAX_EXPANSIONS} can be both slow and memory-intensive
   *
   * @param wildcard the glob to expand
   * @param maxExpansions the maximum number of terms to expand to
   * @throws IllegalStateException if the wildcard glob expands to more than {@code maxExpansions}
   *     terms
   * @see WildcardQuery for glob format
   */
  public static IntervalsSource wildcard(BytesRef wildcard, int maxExpansions) {
    CompiledAutomaton ca =
        new CompiledAutomaton(
            WildcardQuery.toAutomaton(
                new Term("", wildcard), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT));
    return new MultiTermIntervalsSource(ca, maxExpansions, wildcard.utf8ToString());
  }

  /**
   * Return an {@link IntervalsSource} over the disjunction of all terms that match a regular
   * expression
   *
   * @param regexp regular expression
   * @throws IllegalStateException if the regex expands to more than {@link #DEFAULT_MAX_EXPANSIONS}
   *     terms
   * @see RegexpQuery for regexp format
   */
  public static IntervalsSource regexp(BytesRef regexp) {
    return regexp(regexp, DEFAULT_MAX_EXPANSIONS);
  }

  /**
   * Expert: Return an {@link IntervalsSource} over the disjunction of all terms that match a
   * regular expression
   *
   * <p>WARNING: Setting {@code maxExpansions} to higher than the default value of {@link
   * #DEFAULT_MAX_EXPANSIONS} can be both slow and memory-intensive
   *
   * @param regexp regular expression
   * @param maxExpansions the maximum number of terms to expand to
   * @throws IllegalStateException if the regex expands to more than {@code maxExpansions} terms
   * @see RegexpQuery for regexp format
   */
  public static IntervalsSource regexp(BytesRef regexp, int maxExpansions) {
    Automaton automaton = new RegExp(new Term("", regexp).text()).toAutomaton();
    // Determinize explicitly before compiling, bounded by the default work limit.
    automaton = Operations.determinize(automaton, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
    CompiledAutomaton ca = new CompiledAutomaton(automaton, false, true, false);
    return new MultiTermIntervalsSource(ca, maxExpansions, regexp.utf8ToString());
  }

  /**
   * Return an {@link IntervalsSource} over the disjunction of all terms that fall within the given
   * range
   *
   * @param lowerTerm The term text at the lower end of the range
   * @param upperTerm The term text at the upper end of the range
   * @param includeLower If true, the <code>lowerTerm</code> is included in the range
   * @param includeUpper If true, the <code>upperTerm</code> is included in the range
   * @throws IllegalStateException if the range expands to more than {@link #DEFAULT_MAX_EXPANSIONS}
   *     terms
   */
  public static IntervalsSource range(
      BytesRef lowerTerm, BytesRef upperTerm, boolean includeLower, boolean includeUpper) {
    return range(lowerTerm, upperTerm, includeLower, includeUpper, DEFAULT_MAX_EXPANSIONS);
  }

  /**
   * Expert: Return an {@link IntervalsSource} over the disjunction of all terms that fall within
   * the given range
   *
   * <p>WARNING: Setting {@code maxExpansions} to higher than the default value of {@link
   * #DEFAULT_MAX_EXPANSIONS} can be both slow and memory-intensive
   *
   * @param lowerTerm The term text at the lower end of the range
   * @param upperTerm The term text at the upper end of the range
   * @param includeLower If true, the <code>lowerTerm</code> is included in the range
   * @param includeUpper If true, the <code>upperTerm</code> is included in the range
   * @param maxExpansions the maximum number of terms to expand to
   * @throws IllegalStateException if the range expands to more than {@code maxExpansions} terms
   */
  public static IntervalsSource range(
      BytesRef lowerTerm,
      BytesRef upperTerm,
      boolean includeLower,
      boolean includeUpper,
      int maxExpansions) {
    Automaton automaton =
        TermRangeQuery.toAutomaton(lowerTerm, upperTerm, includeLower, includeUpper);
    CompiledAutomaton ca = new CompiledAutomaton(automaton, false, true, true);
    // TermRangeQuery documents a null endpoint as an open bound, so a null term must not be
    // dereferenced when building the pattern string; render it as '*' instead.
    String lower = lowerTerm == null ? "*" : lowerTerm.utf8ToString();
    String upper = upperTerm == null ? "*" : upperTerm.utf8ToString();
    return new MultiTermIntervalsSource(ca, maxExpansions, "{" + lower + "," + upper + "}");
  }

  /**
   * A fuzzy term {@link IntervalsSource} matches the disjunction of intervals of terms that are
   * within the specified {@code maxEdits} from the provided term.
   *
   * <p>Uses {@link FuzzyQuery#defaultPrefixLength}, {@link FuzzyQuery#defaultTranspositions} and
   * {@link #DEFAULT_MAX_EXPANSIONS} for the remaining settings.
   *
   * @see #fuzzyTerm(String, int, int, boolean, int)
   * @param term the term to search for
   * @param maxEdits must be {@code >= 0} and {@code <=} {@link
   *     LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE}, use {@link FuzzyQuery#defaultMaxEdits} for
   *     the default, if needed.
   */
  public static IntervalsSource fuzzyTerm(String term, int maxEdits) {
    return fuzzyTerm(
        term,
        maxEdits,
        FuzzyQuery.defaultPrefixLength,
        FuzzyQuery.defaultTranspositions,
        DEFAULT_MAX_EXPANSIONS);
  }

  /**
   * A fuzzy term {@link IntervalsSource} matches the disjunction of intervals of terms that are
   * within the specified {@code maxEdits} from the provided term.
   *
   * <p>The implementation is delegated to a {@link #multiterm(CompiledAutomaton, int, String)}
   * interval source, with an automaton sourced from {@link org.apache.lucene.search.FuzzyQuery}.
   *
   * @param term the term to search for
   * @param maxEdits must be {@code >= 0} and {@code <=} {@link
   *     LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE}, use {@link FuzzyQuery#defaultMaxEdits} for
   *     the default, if needed.
   * @param prefixLength length of common (non-fuzzy) prefix
   * @param transpositions true if transpositions should be treated as a primitive edit operation.
   *     If this is false, comparisons will implement the classic Levenshtein algorithm.
   * @param maxExpansions the maximum number of terms to match. Setting {@code maxExpansions} to
   *     higher than the default value of {@link #DEFAULT_MAX_EXPANSIONS} can be both slow and
   *     memory-intensive
   */
  public static IntervalsSource fuzzyTerm(
      String term, int maxEdits, int prefixLength, boolean transpositions, int maxExpansions) {
    return Intervals.multiterm(
        FuzzyQuery.getFuzzyAutomaton(term, maxEdits, prefixLength, transpositions),
        maxExpansions,
        term + "~" + maxEdits);
  }

  /**
   * Expert: Return an {@link IntervalsSource} over the disjunction of all terms that are accepted
   * by the given automaton
   *
   * @param ca an automaton accepting matching terms
   * @param pattern string representation of the given automaton, mostly used in exception messages
   * @throws IllegalStateException if the automaton accepts more than {@link
   *     #DEFAULT_MAX_EXPANSIONS} terms
   */
  public static IntervalsSource multiterm(CompiledAutomaton ca, String pattern) {
    return multiterm(ca, DEFAULT_MAX_EXPANSIONS, pattern);
  }

  /**
   * Expert: Return an {@link IntervalsSource} over the disjunction of all terms that are accepted
   * by the given automaton
   *
   * <p>WARNING: Setting {@code maxExpansions} to higher than the default value of {@link
   * #DEFAULT_MAX_EXPANSIONS} can be both slow and memory-intensive
   *
   * @param ca an automaton accepting matching terms
   * @param maxExpansions the maximum number of terms to expand to
   * @param pattern string representation of the given automaton, mostly used in exception messages
   * @throws IllegalStateException if the automaton accepts more than {@code maxExpansions} terms
   */
  public static IntervalsSource multiterm(CompiledAutomaton ca, int maxExpansions, String pattern) {
    return new MultiTermIntervalsSource(ca, maxExpansions, pattern);
  }

  /**
   * Create an {@link IntervalsSource} that filters a sub-source by the width of its intervals
   *
   * @param width the maximum width of intervals in the sub-source to filter
   * @param subSource the sub-source to filter
   */
  public static IntervalsSource maxwidth(int width, IntervalsSource subSource) {
    return FilteredIntervalsSource.maxWidth(subSource, width);
  }

  /**
   * Create an {@link IntervalsSource} that filters a sub-source by its gaps
   *
   * @param gaps the maximum number of gaps in the sub-source to filter
   * @param subSource the sub-source to filter
   */
  public static IntervalsSource maxgaps(int gaps, IntervalsSource subSource) {
    return FilteredIntervalsSource.maxGaps(subSource, gaps);
  }

  /**
   * Create an {@link IntervalsSource} that wraps another source, extending its intervals by a
   * number of positions before and after.
   *
   * <p>This can be useful for adding defined gaps in a block query; for example, to find 'a b [2
   * arbitrary terms] c', you can call:
   *
   * <pre>{@code
   * Intervals.phrase(Intervals.term("a"), Intervals.extend(Intervals.term("b"), 0, 2), Intervals.term("c"));
   * }</pre>
   *
   * Note that calling {@link IntervalIterator#gaps()} on iterators returned by this source
   * delegates directly to the wrapped iterator, and does not include the extensions.
   *
   * @param source the source to extend
   * @param before how many positions to extend before the delegated interval
   * @param after how many positions to extend after the delegated interval
   */
  public static IntervalsSource extend(IntervalsSource source, int before, int after) {
    return new ExtendedIntervalsSource(source, before, after);
  }

  /**
   * Create an ordered {@link IntervalsSource}
   *
   * <p>Returns intervals in which the subsources all appear in the given order
   *
   * @param subSources an ordered set of {@link IntervalsSource} objects
   */
  public static IntervalsSource ordered(IntervalsSource... subSources) {
    return OrderedIntervalsSource.build(Arrays.asList(subSources));
  }

  /**
   * Create an unordered {@link IntervalsSource}. Note that if multiple eligible intervals end at
   * the same position, only the narrowest one will be returned. For example, if asking for {@code
   * unordered(term("apple"), term("banana"))} on a field of "apple wolf apple orange banana", only
   * the "apple orange banana" interval will be returned.
   *
   * <p>Returns intervals in which all the subsources appear. The subsources may overlap
   *
   * @param subSources an unordered set of {@link IntervalsSource}s
   */
  public static IntervalsSource unordered(IntervalsSource... subSources) {
    return UnorderedIntervalsSource.build(Arrays.asList(subSources));
  }

  /**
   * Create an unordered {@link IntervalsSource} allowing no overlaps between subsources
   *
   * <p>Returns intervals in which both the subsources appear and do not overlap. This is built as
   * the disjunction of the two possible orderings of {@code a} and {@code b}.
   *
   * @param a the first source
   * @param b the second source
   */
  public static IntervalsSource unorderedNoOverlaps(IntervalsSource a, IntervalsSource b) {
    return Intervals.or(Intervals.ordered(a, b), Intervals.ordered(b, a));
  }

  /**
   * Create an {@link IntervalsSource} that always returns intervals from a specific field
   *
   * <p>This is useful for comparing intervals across multiple fields, for example fields that have
   * been analyzed differently, allowing you to search for stemmed terms near unstemmed terms, etc.
   *
   * @param field the field to take intervals from
   * @param source the source to evaluate against {@code field}
   */
  public static IntervalsSource fixField(String field, IntervalsSource source) {
    return new FixedFieldIntervalsSource(field, source);
  }

  /**
   * Create a non-overlapping IntervalsSource
   *
   * <p>Returns intervals of the minuend that do not overlap with intervals from the subtrahend
   *
   * @param minuend the {@link IntervalsSource} to filter
   * @param subtrahend the {@link IntervalsSource} to filter by
   */
  public static IntervalsSource nonOverlapping(
      IntervalsSource minuend, IntervalsSource subtrahend) {
    return new NonOverlappingIntervalsSource(minuend, subtrahend);
  }

  /**
   * Returns intervals from a source that overlap with intervals from another source
   *
   * @param source the source to filter
   * @param reference the source to filter by
   */
  public static IntervalsSource overlapping(IntervalsSource source, IntervalsSource reference) {
    return new OverlappingIntervalsSource(source, reference);
  }

  /**
   * Create a not-within {@link IntervalsSource}
   *
   * <p>Returns intervals of the minuend that do not appear within a set number of positions of
   * intervals from the subtrahend query
   *
   * @param minuend the {@link IntervalsSource} to filter
   * @param positions the minimum distance that intervals from the minuend may occur from intervals
   *     of the subtrahend
   * @param subtrahend the {@link IntervalsSource} to filter by
   */
  public static IntervalsSource notWithin(
      IntervalsSource minuend, int positions, IntervalsSource subtrahend) {
    // Widen the subtrahend by `positions` on both sides, then exclude anything overlapping it.
    return new NonOverlappingIntervalsSource(
        minuend, Intervals.extend(subtrahend, positions, positions));
  }

  /**
   * Returns intervals of the source that appear within a set number of positions of intervals from
   * the reference
   *
   * @param source the {@link IntervalsSource} to filter
   * @param positions the maximum distance that intervals of the source may occur from intervals of
   *     the reference
   * @param reference the {@link IntervalsSource} to filter by
   */
  public static IntervalsSource within(
      IntervalsSource source, int positions, IntervalsSource reference) {
    // Widen the reference by `positions` on both sides, then keep what it contains.
    return containedBy(source, Intervals.extend(reference, positions, positions));
  }

  /**
   * Create a not-containing {@link IntervalsSource}
   *
   * <p>Returns intervals from the minuend that do not contain intervals of the subtrahend
   *
   * @param minuend the {@link IntervalsSource} to filter
   * @param subtrahend the {@link IntervalsSource} to filter by
   */
  public static IntervalsSource notContaining(IntervalsSource minuend, IntervalsSource subtrahend) {
    return NotContainingIntervalsSource.build(minuend, subtrahend);
  }

  /**
   * Create a containing {@link IntervalsSource}
   *
   * <p>Returns intervals from the big source that contain one or more intervals from the small
   * source
   *
   * @param big the {@link IntervalsSource} to filter
   * @param small the {@link IntervalsSource} to filter by
   */
  public static IntervalsSource containing(IntervalsSource big, IntervalsSource small) {
    return ContainingIntervalsSource.build(big, small);
  }

  /**
   * Create a not-contained-by {@link IntervalsSource}
   *
   * <p>Returns intervals from the small {@link IntervalsSource} that do not appear within intervals
   * from the big {@link IntervalsSource}.
   *
   * @param small the {@link IntervalsSource} to filter
   * @param big the {@link IntervalsSource} to filter by
   */
  public static IntervalsSource notContainedBy(IntervalsSource small, IntervalsSource big) {
    return NotContainedByIntervalsSource.build(small, big);
  }

  /**
   * Create a contained-by {@link IntervalsSource}
   *
   * <p>Returns intervals from the small query that appear within intervals of the big query
   *
   * @param small the {@link IntervalsSource} to filter
   * @param big the {@link IntervalsSource} to filter by
   */
  public static IntervalsSource containedBy(IntervalsSource small, IntervalsSource big) {
    return ContainedByIntervalsSource.build(small, big);
  }

  /**
   * Return intervals that span combinations of intervals from {@code minShouldMatch} of the sources
   *
   * <p>If {@code minShouldMatch} equals the number of sources this is the same as {@link
   * #unordered(IntervalsSource...)}; if it exceeds the number of sources, a source that produces no
   * intervals is returned.
   *
   * @param minShouldMatch the minimum number of sources that must contribute an interval
   * @param sources the candidate sources
   */
  public static IntervalsSource atLeast(int minShouldMatch, IntervalsSource... sources) {
    if (minShouldMatch == sources.length) {
      return unordered(sources);
    }
    if (minShouldMatch > sources.length) {
      return new NoMatchIntervalsSource(
          "Too few sources to match minimum of ["
              + minShouldMatch
              + "]: "
              + Arrays.toString(sources));
    }
    return new MinimumShouldMatchIntervalsSource(sources, minShouldMatch);
  }

  /**
   * Returns intervals from the source that appear before intervals from the reference
   *
   * @param source the {@link IntervalsSource} to filter
   * @param reference the {@link IntervalsSource} to filter by
   */
  public static IntervalsSource before(IntervalsSource source, IntervalsSource reference) {
    return ContainedByIntervalsSource.build(
        source, Intervals.extend(new OffsetIntervalsSource(reference, true), Integer.MAX_VALUE, 0));
  }

  /**
   * Returns intervals from the source that appear after intervals from the reference
   *
   * @param source the {@link IntervalsSource} to filter
   * @param reference the {@link IntervalsSource} to filter by
   */
  public static IntervalsSource after(IntervalsSource source, IntervalsSource reference) {
    return ContainedByIntervalsSource.build(
        source,
        Intervals.extend(new OffsetIntervalsSource(reference, false), 0, Integer.MAX_VALUE));
  }

  /**
   * Returns a source that produces no intervals
   *
   * @param reason A reason string that will appear in the toString output of this source
   */
  public static IntervalsSource noIntervals(String reason) {
    return new NoMatchIntervalsSource(reason);
  }

  /**
   * Returns intervals that correspond to tokens from a {@link TokenStream} returned for {@code
   * text} by applying the provided {@link Analyzer} as if {@code text} was the content of the given
   * {@code field}. The intervals can be ordered or unordered and can have optional gaps inside.
   *
   * @param text The text to analyze.
   * @param analyzer The {@link Analyzer} to use to acquire a {@link TokenStream} which is then
   *     converted into intervals.
   * @param field The field {@code text} should be parsed as.
   * @param maxGaps Maximum number of allowed gaps between sub-intervals resulting from tokens.
   * @param ordered Whether sub-intervals should enforce token ordering or not.
   * @return Returns an {@link IntervalsSource} that matches tokens acquired from analysis of {@code
   *     text}. Possibly an empty interval source, never {@code null}.
   * @throws IOException If an I/O exception occurs.
   */
  public static IntervalsSource analyzedText(
      String text, Analyzer analyzer, String field, int maxGaps, boolean ordered)
      throws IOException {
    // The token stream is closed once the intervals source has been built.
    try (TokenStream ts = analyzer.tokenStream(field, text)) {
      return analyzedText(ts, maxGaps, ordered);
    }
  }

  /**
   * Returns intervals that correspond to tokens from the provided {@link TokenStream}. This is a
   * low-level counterpart to {@link #analyzedText(String, Analyzer, String, int, boolean)}. The
   * intervals can be ordered or unordered and can have optional gaps inside.
   *
   * @param tokenStream The token stream to produce intervals for. The token stream may be fully or
   *     partially consumed after returning from this method.
   * @param maxGaps Maximum number of allowed gaps between sub-intervals resulting from tokens.
   * @param ordered Whether sub-intervals should enforce token ordering or not.
   * @return Returns an {@link IntervalsSource} that matches tokens acquired from {@code
   *     tokenStream}. Possibly an empty interval source, never {@code null}.
   * @throws IOException If an I/O exception occurs.
   */
  public static IntervalsSource analyzedText(TokenStream tokenStream, int maxGaps, boolean ordered)
      throws IOException {
    // Reuse the stream when it is already a CachingTokenFilter; otherwise wrap it in one.
    CachingTokenFilter stream =
        tokenStream instanceof CachingTokenFilter
            ? (CachingTokenFilter) tokenStream
            : new CachingTokenFilter(tokenStream);
    return IntervalBuilder.analyzeText(stream, maxGaps, ordered);
  }
}