org.pageseeder.flint.lucene.search.Terms Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of pso-flint-lucene Show documentation
Flint framework
The newest version!
/*
 * Copyright 2015 Allette Systems (Australia)
 * http://www.allette.com.au
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.pageseeder.flint.lucene.search;

import org.apache.lucene.index.*;
import org.apache.lucene.search.*;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.pageseeder.flint.lucene.util.Beta;
import org.pageseeder.flint.lucene.util.Bucket;
import org.pageseeder.flint.lucene.util.Bucket.Entry;
import org.pageseeder.xmlwriter.XMLWriter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.*;

/**
 * A collection of utility methods to manipulate and extract terms.
 *
 * @author Christophe Lauret
 * @version 18 March 2011
 */
public final class Terms {

  /**
   * Logger for this class.
   */
  private static final Logger LOGGER = LoggerFactory.getLogger(Terms.class);

  /**
   * Compares terms using their text value only; the field name is ignored.
   *
   * <p>Two terms holding the same text in different fields therefore compare as equal.
   */
  private static final Comparator<Term> TEXT_COMPARATOR = Comparator.comparing(Term::text);

  /** Utility class: the private constructor prevents instantiation. */
  private Terms() {
  }

  /**
   * Returns a comparator to order terms using their text value.
   *
   * <p>The same shared instance is returned on every call.
   *
   * @return a comparator to order terms using their text value.
   */
  public static Comparator<Term> textComparator() {
    return TEXT_COMPARATOR;
  }

  /**
   * Extracts the terms used by the specified query.
   *
   * <p>The query is first rewritten against the reader; the terms are then
   * collected from the rewritten query.
   *
   * @param query  the query
   * @param reader the reader, used to rewrite the query
   *
   * @return the set of terms collected from the rewritten query (a new, modifiable set)
   *
   * @throws IOException if rewriting the query failed
   */
  public static Set<Term> extractTerms(Query query, IndexReader reader) throws IOException {
    Set<Term> allTerms = new HashSet<>();
    Query rewritten = query.rewrite(reader);
    rewritten.visit(QueryVisitor.termCollector(allTerms));
    return allTerms;
  }
  /**
   * Returns the list of terms based on the given list of fields and texts.
   *
   * <p>One term is created for every (field, text) pair, so the number of terms
   * returned is (number of fields) x (number of texts). Terms are grouped by
   * field, in the iteration order of the two lists.
   *
   * @param fields The list of fields.
   * @param texts  The list of texts.
   *
   * @return The corresponding list of terms (a new, modifiable list).
   */
  public static List<Term> terms(List<String> fields, List<String> texts) {
    List<Term> terms = new ArrayList<>();
    for (String field : fields) {
      for (String text : texts) {
        terms.add(new Term(field, text));
      }
    }
    return terms;
  }

  /**
   * Returns the list of fuzzy terms given a term and using the specified index reader.
   *
   * <p>The values are the text of the matching terms; the text of the searched
   * term itself is not included.
   *
   * @param reader Index reader to use.
   * @param term   The term to use.
   *
   * @return The corresponding list of fuzzy term values (a new, modifiable list).
   *
   * @throws IOException If an error is thrown by the fuzzy term enumeration.
   */
  public static List<String> fuzzy(IndexReader reader, Term term) throws IOException {
    List<String> values = new ArrayList<>();
    fuzzy(reader, values, term);
    return values;
  }

  /**
   * Returns the list of prefix terms given a term and using the specified index reader.
   *
   * <p>The values are the text of the terms in the term's field that start with
   * the text of the specified term.
   *
   * @param reader Index reader to use.
   * @param term   The term to use.
   *
   * @return The corresponding list of prefix term values (a new, modifiable list).
   *
   * @throws IOException If an error is thrown by the prefix term enumeration.
   */
  public static List<String> prefix(IndexReader reader, Term term) throws IOException {
    List<String> values = new ArrayList<>();
    prefix(reader, values, term);
    return values;
  }

  /**
   * Loads all the fuzzy terms in the list of values given the reader.
   *
   * <p>Delegates to the four-argument overload with a value of <code>2</code>.
   *
   * @param reader Index reader to use.
   * @param values The list of term values to load; matching values are appended to it.
   * @param term   The term to use.
   *
   * @throws IOException If an error is thrown by the fuzzy term enumeration.
   */
  public static void fuzzy(IndexReader reader, List<String> values, Term term) throws IOException {
    fuzzy(reader, values, term, 2);
  }

  /**
   * Loads all the fuzzy terms in the list of values given the reader.
   *
   * <p>Nothing is added when the field of the term has no terms in the index.
   * The searched term itself is never added to the list.
   *
   * @param reader        Index reader to use.
   * @param values        The list of term values to load; matching values are appended to it.
   * @param term          The term to use.
   * @param minSimilarity Despite its name, this value is handed to Lucene's
   *                      <code>FuzzyTermsEnum</code> as the maximum number of edits
   *                      (NOTE(review): Lucene only supports 0 to 2 here - confirm).
   *
   * @throws IOException If an error is thrown by the fuzzy term enumeration.
   */
  public static void fuzzy(IndexReader reader, List<String> values, Term term, int minSimilarity) throws IOException {
    org.apache.lucene.index.Terms terms = MultiTerms.getTerms(reader, term.field());
    if (terms == null) return;
    // No required prefix (0); the last flag (transpositions) is off for this overload
    FuzzyTermsEnum fuzzy = new FuzzyTermsEnum(terms, term, minSimilarity, 0, false);
    BytesRef searched = term.bytes();
    BytesRef val;
    while ((val = fuzzy.next()) != null) {
      // Only report terms that differ from the one searched
      if (!searched.bytesEquals(val)) {
        values.add(val.utf8ToString());
      }
    }
  }

  /**
   * Loads all the fuzzy terms in the bucket given the reader.
   *
   * <p>Delegates to the four-argument overload with a value of <code>2</code>.
   *
   * @param reader  Index reader to use.
   * @param bucket  Where to store the terms.
   * @param term    The term to use.
   *
   * @throws IOException If an error is thrown by the fuzzy term enumeration.
   */
  @Beta
  public static void fuzzy(IndexReader reader, Bucket<Term> bucket, Term term) throws IOException {
    fuzzy(reader, bucket, term, 2);
  }

  /**
   * Loads all the fuzzy terms in the bucket given the reader.
   *
   * <p>Each matching term is added to the bucket together with its document
   * frequency in the reader. Nothing is added when the field of the term has no
   * terms in the index, and the searched term itself is never added.
   *
   * @param reader        Index reader to use.
   * @param bucket        Where to store the terms.
   * @param term          The term to use.
   * @param minSimilarity Despite its name, this value is handed to Lucene's
   *                      <code>FuzzyTermsEnum</code> as the maximum number of edits
   *                      (NOTE(review): Lucene only supports 0 to 2 here - confirm).
   *
   * @throws IOException If an error is thrown by the fuzzy term enumeration.
   */
  @Beta
  public static void fuzzy(IndexReader reader, Bucket<Term> bucket, Term term, int minSimilarity) throws IOException {
    org.apache.lucene.index.Terms terms = MultiTerms.getTerms(reader, term.field());
    if (terms == null) return;
    // No required prefix (0); the last flag (transpositions) is on for this overload
    FuzzyTermsEnum fuzzy = new FuzzyTermsEnum(terms, term, minSimilarity, 0, true);
    BytesRef searched = term.bytes();
    BytesRef val;
    while ((val = fuzzy.next()) != null) {
      if (!searched.bytesEquals(val)) {
        // Copy the bytes: the term must not share the reference returned by the enumeration
        Term t = new Term(term.field(), BytesRef.deepCopyOf(val));
        bucket.add(t, reader.docFreq(t));
      }
    }
  }

  /**
   * Loads all the prefix terms in the list of values given the reader.
   *
   * <p>Every term of the field whose bytes start with the bytes of the specified
   * term is appended to the list, including a term exactly equal to the prefix.
   * Nothing is added when the field has no terms in the index.
   *
   * <p>The comparison is made directly on the indexed bytes, so prefixes and
   * terms containing non-ASCII characters are matched correctly.
   *
   * @param reader  Index reader to use.
   * @param values  The list of values to load; matching values are appended to it.
   * @param term    The term to use.
   *
   * @throws IOException If an error is thrown by the prefix term enumeration.
   */
  public static void prefix(IndexReader reader, List<String> values, Term term) throws IOException {
    org.apache.lucene.index.Terms terms = MultiTerms.getTerms(reader, term.field());
    if (terms == null) return;
    BytesRef prefix = term.bytes();
    TermsEnum candidates = terms.iterator();
    // Terms are enumerated in byte order, so all the terms sharing the prefix are
    // contiguous and start at the first term greater than or equal to the prefix
    if (candidates.seekCeil(prefix) == TermsEnum.SeekStatus.END) return;
    BytesRef val = candidates.term();
    while (val != null && StringHelper.startsWith(val, prefix)) {
      values.add(val.utf8ToString());
      val = candidates.next();
    }
  }

  /**
   * Loads all the prefix terms in the bucket given the reader.
   *
   * <p>Every term of the field whose bytes start with the bytes of the specified
   * term, and which comes strictly after it, is added to the bucket together with
   * its document frequency in the reader. A term exactly equal to the prefix is
   * therefore not added (NOTE(review): this differs from the list-based overload,
   * which includes the exact match - confirm this is intended).
   *
   * <p>The comparison is made directly on the indexed bytes, so prefixes and
   * terms containing non-ASCII characters are matched correctly.
   *
   * @param reader  Index reader to use.
   * @param bucket  Where to store the terms.
   * @param term    The term to use.
   *
   * @throws IOException If an error is thrown by the prefix term enumeration.
   */
  public static void prefix(IndexReader reader, Bucket<Term> bucket, Term term) throws IOException {
    org.apache.lucene.index.Terms terms = MultiTerms.getTerms(reader, term.field());
    if (terms == null) return;
    BytesRef prefix = term.bytes();
    TermsEnum candidates = terms.iterator();
    // Terms are enumerated in byte order, so all the terms sharing the prefix are
    // contiguous and start at the first term greater than or equal to the prefix
    TermsEnum.SeekStatus status = candidates.seekCeil(prefix);
    if (status == TermsEnum.SeekStatus.END) return;
    // Skip an exact match: only terms strictly after the prefix are reported
    BytesRef val = status == TermsEnum.SeekStatus.FOUND ? candidates.next() : candidates.term();
    while (val != null && StringHelper.startsWith(val, prefix)) {
      // Copy the bytes: the term must not share the reference returned by the enumeration
      Term t = new Term(term.field(), BytesRef.deepCopyOf(val));
      bucket.add(t, reader.docFreq(t));
      val = candidates.next();
    }
  }

  /**
   * Returns the list of indexed field names for the specified reader.
   *
   * @param reader The index reader
   *
   * @return the list of field names (a new, modifiable list)
   */
  @Beta public static List<String> fields(IndexReader reader) {
    LOGGER.debug("Loading fields");
    return new ArrayList<>(FieldInfos.getIndexedFields(reader));
  }


  /**
   * Returns the list of terms for the specified field.
   *
   * <p>The terms are returned in the order in which the index enumerates them.
   * An empty list is returned when the field has no terms in the index.
   *
   * @param reader The index reader
   * @param field  The field
   *
   * @return the list of terms for this field
   *
   * @throws IOException should any IO error be reported.
   */
  @Beta public static List<Term> terms(IndexReader reader, String field) throws IOException {
    LOGGER.debug("Loading terms for field {}", field);
    org.apache.lucene.index.Terms terms = MultiTerms.getTerms(reader, field);
    if (terms == null) return Collections.emptyList();
    TermsEnum termsEnum = terms.iterator();
    if (termsEnum == TermsEnum.EMPTY) return Collections.emptyList();
    // Collect straight into a list: the enumeration yields each term once, and its
    // BytesRef may be reused between calls so it must not be kept as a map key
    List<Term> termsList = new ArrayList<>();
    BytesRef t;
    while ((t = termsEnum.next()) != null) {
      termsList.add(new Term(field, BytesRef.deepCopyOf(t)));
    }
    return termsList;
  }

  /**
   * Returns the fields, among the candidates provided, which are found in the
   * search results of the query provided.
   *
   * <p>The query is run once per candidate field, using a
   * <code>FieldDocumentChecker</code> to report whether the field was found.
   * Fields are returned in the order of the candidates.
   *
   * @param searcher   a searcher on the index desired
   * @param query      the base query
   * @param candidates the list of candidate fields
   *
   * @return the list of fields with search results (a new, modifiable list)
   *
   * @throws IOException should any IO error be reported when querying the index.
   */
  @Beta public static List<String> fields(IndexSearcher searcher, Query query, List<String> candidates) throws IOException {
    LOGGER.debug("Loading fields for query {}", query);
    List<String> fields = new ArrayList<>();
    for (String field : candidates) {
      FieldDocumentChecker checker = new FieldDocumentChecker(field);
      searcher.search(query, checker);
      if (checker.fieldFound()) fields.add(field);
    }
    return fields;
  }

  /**
   * Returns the list of term values for the specified field.
   *
   * <p>The values are the text of each term, in the order in which the index
   * enumerates them. An empty list is returned when the field has no terms.
   *
   * @param reader The index reader to use
   * @param field  The field
   *
   * @return the list of term values for this field (a new, modifiable list)
   *
   * @throws IOException should any IO error be reported.
   */
  @Beta public static List<String> values(IndexReader reader, String field) throws IOException {
    LOGGER.debug("Loading term values for field {}", field);
    List<String> values = new ArrayList<>();
    org.apache.lucene.index.Terms terms = MultiTerms.getTerms(reader, field);
    if (terms == null) return values;
    TermsEnum termsEnum = terms.iterator();
    if (termsEnum == TermsEnum.EMPTY) return values;
    BytesRef t;
    while ((t = termsEnum.next()) != null) {
      values.add(t.utf8ToString());
    }
    return values;
  }

  // XML Serialisers ==============================================================================

  /**
   * Writes the XML for a list of terms.
   *
   * <p>One <code>term</code> element is written per term, in list order.
   *
   * @param xml   The XML writer.
   * @param terms The list of terms to serialise as XML.
   *
   * @throws IOException Any I/O error thrown by the XML writer.
   */
  public static void toXML(XMLWriter xml, List<Term> terms) throws IOException {
    for (Term t : terms) {
      toXML(xml, t);
    }
  }

  /**
   * Writes the XML for a bucket of terms.
   *
   * <p>One <code>term</code> element is written per bucket entry, using the
   * entry's count as the term frequency.
   *
   * @param xml   The XML writer.
   * @param terms The bucket of terms to serialise as XML.
   *
   * @throws IOException Any I/O error thrown by the XML writer.
   */
  public static void toXML(XMLWriter xml, Bucket<Term> terms) throws IOException {
    for (Entry<Term> t : terms.entrySet()) {
      toXML(xml, t.item(), t.count());
    }
  }

  /**
   * Writes a single term as a <code>term</code> element carrying the
   * <code>field</code> and <code>text</code> attributes.
   *
   * @param xml The XML writer.
   * @param t   Term to serialise as XML.
   *
   * @throws IOException Any I/O error thrown by the XML writer.
   */
  public static void toXML(XMLWriter xml, Term t) throws IOException {
    xml.openElement("term");
    final String fieldName = t.field();
    xml.attribute("field", fieldName);
    final String termText = t.text();
    xml.attribute("text", termText);
    xml.closeElement();
  }

  /**
   * Writes a single term as a <code>term</code> element carrying the
   * <code>field</code>, <code>text</code> and <code>frequency</code> attributes.
   *
   * @param xml       The XML writer.
   * @param t         Term to serialise as XML.
   * @param frequency The term document frequency.
   *
   * @throws IOException Any I/O error thrown by the XML writer.
   */
  public static void toXML(XMLWriter xml, Term t, int frequency) throws IOException {
    xml.openElement("term");
    final String fieldName = t.field();
    xml.attribute("field", fieldName);
    final String termText = t.text();
    xml.attribute("text", termText);
    xml.attribute("frequency", frequency);
    xml.closeElement();
  }

}