All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.lucene.queries.intervals.Intervals Maven / Gradle / Ivy

There is a newer version: 8.10.2
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.queries.intervals;

import java.util.Arrays;
import java.util.List;
import java.util.function.Predicate;

import org.apache.lucene.index.Term;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.CompiledAutomaton;

/**
 * Constructor functions for {@link IntervalsSource} types
 *
 * These sources implement minimum-interval algorithms taken from the paper
 * 
 * Efficient Optimally Lazy Algorithms for Minimal-Interval Semantics
 *
 * By default, sources that are sensitive to internal gaps (e.g. PHRASE and MAXGAPS) will
 * rewrite their sub-sources so that disjunctions of different lengths are pulled up
 * to the top of the interval tree.  For example, PHRASE(or(PHRASE("a", "b", "c"), "b"), "c")
 * will automatically rewrite itself to OR(PHRASE("a", "b", "c", "c"), PHRASE("b", "c"))
 * to ensure that documents containing "b c" are matched.  This can lead to less efficient
 * queries, as more terms need to be loaded (for example, the "c" iterator above is loaded
 * twice), so if you care more about speed than about accuracy you can use the
 * {@link #or(boolean, IntervalsSource...)} factory method to prevent rewriting.
 */
public final class Intervals {

  private Intervals() {}

  /**
   * Return an {@link IntervalsSource} exposing intervals for a term
   */
  public static IntervalsSource term(BytesRef term) {
    return new TermIntervalsSource(term);
  }

  /**
   * Return an {@link IntervalsSource} exposing intervals for a term
   */
  public static IntervalsSource term(String term) {
    return new TermIntervalsSource(new BytesRef(term));
  }

  /**
   * Return an {@link IntervalsSource} exposing intervals for a term, filtered by the value
   * of the term's payload at each position
   */
  public static IntervalsSource term(String term, Predicate payloadFilter) {
    return term(new BytesRef(term), payloadFilter);
  }

  /**
   * Return an {@link IntervalsSource} exposing intervals for a term, filtered by the value
   * of the term's payload at each position
   */
  public static IntervalsSource term(BytesRef term, Predicate payloadFilter) {
    return new PayloadFilteredTermIntervalsSource(term, payloadFilter);
  }

  /**
   * Return an {@link IntervalsSource} exposing intervals for a phrase consisting of a list of terms
   */
  public static IntervalsSource phrase(String... terms) {
    if (terms.length == 1) {
      return Intervals.term(terms[0]);
    }
    IntervalsSource[] sources = new IntervalsSource[terms.length];
    int i = 0;
    for (String term : terms) {
      sources[i] = term(term);
      i++;
    }
    return phrase(sources);
  }

  /**
   * Return an {@link IntervalsSource} exposing intervals for a phrase consisting of a list of IntervalsSources
   */
  public static IntervalsSource phrase(IntervalsSource... subSources) {
    return BlockIntervalsSource.build(Arrays.asList(subSources));
  }

  /**
   * Return an {@link IntervalsSource} over the disjunction of a set of sub-sources
   *
   * Automatically rewrites if wrapped by an interval source that is sensitive to
   * internal gaps
   */
  public static IntervalsSource or(IntervalsSource... subSources) {
    return or(true, Arrays.asList(subSources));
  }

  /**
   * Return an {@link IntervalsSource} over the disjunction of a set of sub-sources
   *
   * @param rewrite      if {@code false}, do not rewrite intervals that are sensitive to
   *                     internal gaps; this may run more efficiently, but can miss valid
   *                     hits due to minimization
   * @param subSources   the sources to combine
   */
  public static IntervalsSource or(boolean rewrite, IntervalsSource... subSources) {
    return or(rewrite, Arrays.asList(subSources));
  }

  /**
   * Return an {@link IntervalsSource} over the disjunction of a set of sub-sources
   */
  public static IntervalsSource or(List subSources) {
    return or(true, subSources);
  }

  /**
   * Return an {@link IntervalsSource} over the disjunction of a set of sub-sources
   *
   * @param rewrite      if {@code false}, do not rewrite intervals that are sensitive to
   *                     internal gaps; this may run more efficiently, but can miss valid
   *                     hits due to minimization
   * @param subSources   the sources to combine
   */
  public static IntervalsSource or(boolean rewrite, List subSources) {
    return DisjunctionIntervalsSource.create(subSources, rewrite);
  }

  /**
   * Return an {@link IntervalsSource} over the disjunction of all terms that begin with a prefix
   *
   * @throws IllegalStateException if the prefix expands to more than 128 terms
   */
  public static IntervalsSource prefix(BytesRef prefix) {
    return prefix(prefix, 128);
  }

  /**
   * Expert: Return an {@link IntervalsSource} over the disjunction of all terms that begin with a prefix
   *
   * WARNING: Setting {@code maxExpansions} to higher than the default value of 128
   * can be both slow and memory-intensive
   *
   * @param prefix        the prefix to expand
   * @param maxExpansions the maximum number of terms to expand to
   *
   * @throws IllegalStateException if the prefix expands to more than {@code maxExpansions} terms
   */
  public static IntervalsSource prefix(BytesRef prefix, int maxExpansions) {
    CompiledAutomaton ca = new CompiledAutomaton(PrefixQuery.toAutomaton(prefix));
    return new MultiTermIntervalsSource(ca, maxExpansions, prefix.utf8ToString() + "*");
  }

  /**
   * Return an {@link IntervalsSource} over the disjunction of all terms that match a wildcard glob
   *
   * @throws IllegalStateException if the wildcard glob expands to more than 128 terms
   *
   * @see WildcardQuery for glob format
   */
  public static IntervalsSource wildcard(BytesRef wildcard) {
    return wildcard(wildcard, 128);
  }

  /**
   * Expert: Return an {@link IntervalsSource} over the disjunction of all terms that match a wildcard glob
   *
   * WARNING: Setting {@code maxExpansions} to higher than the default value of 128
   * can be both slow and memory-intensive
   *
   * @param wildcard the glob to expand
   * @param maxExpansions the maximum number of terms to expand to
   *
   * @throws IllegalStateException if the wildcard glob expands to more than {@code maxExpansions} terms
   *
   * @see WildcardQuery for glob format
   */
  public static IntervalsSource wildcard(BytesRef wildcard, int maxExpansions) {
    CompiledAutomaton ca = new CompiledAutomaton(WildcardQuery.toAutomaton(new Term("", wildcard)));
    return new MultiTermIntervalsSource(ca, maxExpansions, wildcard.utf8ToString());
  }

  /**
   * Expert: Return an {@link IntervalsSource} over the disjunction of all terms that's accepted by the given automaton 
   *
   * @param automaton accepts terms for to expand to
   * @param pattern string representation of the given automaton, mostly used in exception messages  
   *
   * @throws IllegalStateException if the automaton accepts more than 128 terms
   *
   * @deprecated use {@link #multiterm(CompiledAutomaton, String)}
   */
  @Deprecated
  public static IntervalsSource multiterm(Automaton automaton, String pattern) {
    return multiterm(automaton, 128, pattern);
  }

  /**
   * Expert: Return an {@link IntervalsSource} over the disjunction of all terms that's accepted by the given automaton 
   *
   * WARNING: Setting {@code maxExpansions} to higher than the default value of 128
   * can be both slow and memory-intensive
   *
   * @param automaton accepts terms for to expand to
   * @param maxExpansions the maximum number of terms to expand to
   * @param pattern string representation of the given automaton, mostly used in exception messages  
   *
   * @throws IllegalStateException if the automaton accepts more than {@code maxExpansions} terms
   *
   * @deprecated use {@link #multiterm(CompiledAutomaton, int, String)}
   */
  @Deprecated
  public static IntervalsSource multiterm(Automaton automaton, int maxExpansions, String pattern) {
    CompiledAutomaton ca = new CompiledAutomaton(automaton);
    return new MultiTermIntervalsSource(ca, maxExpansions, pattern);
  }

  /**
   * Expert: Return an {@link IntervalsSource} over the disjunction of all terms that's accepted by the given automaton
   *
   * @param ca      an automaton accepting matching terms
   * @param pattern string representation of the given automaton, mostly used in exception messages
   *
   * @throws IllegalStateException if the automaton accepts more than 128 terms
   */
  public static IntervalsSource multiterm(CompiledAutomaton ca, String pattern) {
    return multiterm(ca, 128, pattern);
  }

  /**
   * Expert: Return an {@link IntervalsSource} over the disjunction of all terms that's accepted by the given automaton
   *
   * WARNING: Setting {@code maxExpansions} to higher than the default value of 128
   * can be both slow and memory-intensive
   *
   * @param ca            an automaton accepting matching terms
   * @param maxExpansions the maximum number of terms to expand to
   * @param pattern string representation of the given automaton, mostly used in exception messages
   *
   * @throws IllegalStateException if the automaton accepts more than {@code maxExpansions} terms
   */
  public static IntervalsSource multiterm(CompiledAutomaton ca, int maxExpansions, String pattern) {
    return new MultiTermIntervalsSource(ca, maxExpansions, pattern);
  }
  
  /**
   * Create an {@link IntervalsSource} that filters a sub-source by the width of its intervals
   * @param width       the maximum width of intervals in the sub-source to filter
   * @param subSource   the sub-source to filter
   */
  public static IntervalsSource maxwidth(int width, IntervalsSource subSource) {
    return FilteredIntervalsSource.maxWidth(subSource, width);
  }

  /**
   * Create an {@link IntervalsSource} that filters a sub-source by its gaps
   * @param gaps        the maximum number of gaps in the sub-source to filter
   * @param subSource   the sub-source to filter
   */
  public static IntervalsSource maxgaps(int gaps, IntervalsSource subSource) {
    return FilteredIntervalsSource.maxGaps(subSource, gaps);
  }

  /**
   * Create an {@link IntervalsSource} that wraps another source, extending its
   * intervals by a number of positions before and after.
   *
   * This can be useful for adding defined gaps in a block query; for example,
   * to find 'a b [2 arbitrary terms] c', you can call:
   * 
   *   Intervals.phrase(Intervals.term("a"), Intervals.extend(Intervals.term("b"), 0, 2), Intervals.term("c"));
   * 
* * Note that calling {@link IntervalIterator#gaps()} on iterators returned by this source * delegates directly to the wrapped iterator, and does not include the extensions. * * @param source the source to extend * @param before how many positions to extend before the delegated interval * @param after how many positions to extend after the delegated interval */ public static IntervalsSource extend(IntervalsSource source, int before, int after) { return new ExtendedIntervalsSource(source, before, after); } /** * Create an ordered {@link IntervalsSource} * * Returns intervals in which the subsources all appear in the given order * * @param subSources an ordered set of {@link IntervalsSource} objects */ public static IntervalsSource ordered(IntervalsSource... subSources) { return OrderedIntervalsSource.build(Arrays.asList(subSources)); } /** * Create an unordered {@link IntervalsSource} * * Returns intervals in which all the subsources appear. The subsources may overlap * * @param subSources an unordered set of {@link IntervalsSource}s */ public static IntervalsSource unordered(IntervalsSource... subSources) { return UnorderedIntervalsSource.build(Arrays.asList(subSources)); } /** * Create an unordered {@link IntervalsSource} allowing no overlaps between subsources * * Returns intervals in which both the subsources appear and do not overlap. */ public static IntervalsSource unorderedNoOverlaps(IntervalsSource a, IntervalsSource b) { return Intervals.or(Intervals.ordered(a, b), Intervals.ordered(b, a)); } /** * Create an {@link IntervalsSource} that always returns intervals from a specific field * * This is useful for comparing intervals across multiple fields, for example fields that * have been analyzed differently, allowing you to search for stemmed terms near unstemmed * terms, etc. */ public static IntervalsSource fixField(String field, IntervalsSource source) { return new FixedFieldIntervalsSource(field, source); } /** * Create a non-overlapping IntervalsSource * * Returns intervals of the minuend that do not overlap with intervals from the subtrahend * @param minuend the {@link IntervalsSource} to filter * @param subtrahend the {@link IntervalsSource} to filter by */ public static IntervalsSource nonOverlapping(IntervalsSource minuend, IntervalsSource subtrahend) { return new NonOverlappingIntervalsSource(minuend, subtrahend); } /** * Returns intervals from a source that overlap with intervals from another source * @param source the source to filter * @param reference the source to filter by */ public static IntervalsSource overlapping(IntervalsSource source, IntervalsSource reference) { return new OverlappingIntervalsSource(source, reference); } /** * Create a not-within {@link IntervalsSource} * * Returns intervals of the minuend that do not appear within a set number of positions of * intervals from the subtrahend query * * @param minuend the {@link IntervalsSource} to filter * @param positions the minimum distance that intervals from the minuend may occur from intervals * of the subtrahend * @param subtrahend the {@link IntervalsSource} to filter by */ public static IntervalsSource notWithin(IntervalsSource minuend, int positions, IntervalsSource subtrahend) { return new NonOverlappingIntervalsSource(minuend, Intervals.extend(subtrahend, positions, positions)); } /** * Returns intervals of the source that appear within a set number of positions of intervals from * the reference * * @param source the {@link IntervalsSource} to filter * @param positions the maximum distance that intervals of the source may occur from intervals of the reference * @param reference the {@link IntervalsSource} to filter by */ public static IntervalsSource within(IntervalsSource source, int positions, IntervalsSource reference) { return containedBy(source, Intervals.extend(reference, positions, positions)); } /** * Create a not-containing {@link IntervalsSource} * * Returns intervals from the minuend that do not contain intervals of the subtrahend * * @param minuend the {@link IntervalsSource} to filter * @param subtrahend the {@link IntervalsSource} to filter by */ public static IntervalsSource notContaining(IntervalsSource minuend, IntervalsSource subtrahend) { return NotContainingIntervalsSource.build(minuend, subtrahend); } /** * Create a containing {@link IntervalsSource} * * Returns intervals from the big source that contain one or more intervals from * the small source * * @param big the {@link IntervalsSource} to filter * @param small the {@link IntervalsSource} to filter by */ public static IntervalsSource containing(IntervalsSource big, IntervalsSource small) { return ContainingIntervalsSource.build(big, small); } /** * Create a not-contained-by {@link IntervalsSource} * * Returns intervals from the small {@link IntervalsSource} that do not appear within * intervals from the big {@link IntervalsSource}. * * @param small the {@link IntervalsSource} to filter * @param big the {@link IntervalsSource} to filter by */ public static IntervalsSource notContainedBy(IntervalsSource small, IntervalsSource big) { return NotContainedByIntervalsSource.build(small, big); } /** * Create a contained-by {@link IntervalsSource} * * Returns intervals from the small query that appear within intervals of the big query * * @param small the {@link IntervalsSource} to filter * @param big the {@link IntervalsSource} to filter by */ public static IntervalsSource containedBy(IntervalsSource small, IntervalsSource big) { return ContainedByIntervalsSource.build(small, big); } /** * Return intervals that span combinations of intervals from {@code minShouldMatch} of the sources */ public static IntervalsSource atLeast(int minShouldMatch, IntervalsSource... sources) { if (minShouldMatch == sources.length) { return unordered(sources); } if (minShouldMatch > sources.length) { return new NoMatchIntervalsSource( "Too few sources to match minimum of [" + minShouldMatch + "]: " + Arrays.toString(sources)); } return new MinimumShouldMatchIntervalsSource(sources, minShouldMatch); } /** * Returns intervals from the source that appear before intervals from the reference */ public static IntervalsSource before(IntervalsSource source, IntervalsSource reference) { return ContainedByIntervalsSource.build(source, Intervals.extend(new OffsetIntervalsSource(reference, true), Integer.MAX_VALUE, 0)); } /** * Returns intervals from the source that appear after intervals from the reference */ public static IntervalsSource after(IntervalsSource source, IntervalsSource reference) { return ContainedByIntervalsSource.build(source, Intervals.extend(new OffsetIntervalsSource(reference, false), 0, Integer.MAX_VALUE)); } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy