All Downloads are FREE. The search and download functionality uses the official Maven repository.

org.pageseeder.flint.lucene.search.Terms Maven / Gradle / Ivy

The newest version!
/*
 * Copyright 2015 Allette Systems (Australia)
 * http://www.allette.com.au
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.pageseeder.flint.lucene.search;

import org.apache.lucene.index.*;
import org.apache.lucene.search.*;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.pageseeder.flint.lucene.util.Beta;
import org.pageseeder.flint.lucene.util.Bucket;
import org.pageseeder.flint.lucene.util.Bucket.Entry;
import org.pageseeder.xmlwriter.XMLWriter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.*;

/**
 * A collection of utility methods to manipulate and extract terms.
 *
 * @author Christophe Lauret
 * @version 18 March 2011
 */
public final class Terms {

  /**
   * private logger
   */
  private final static Logger LOGGER = LoggerFactory.getLogger(Terms.class);

  /**
   * Compares terms using their text value instead of their field value.
   */
  private static final Comparator TEXT_COMPARATOR = new Comparator<>() {
    /**
     * {@inheritDoc}
     */
    @Override
    public int compare(Term t1, Term t2) {
      return t1.text().compareTo(t2.text());
    }
  };

  /** Utility class. */
  private Terms() {
  }

  /**
   * Returns a comparator to order terms using their text value.
   *
   * @return a comparator to order terms using their text value.
   */
  public static Comparator textComparator() {
    return TEXT_COMPARATOR;
  }

  /**
   * Extract the terms from the query
   * @param query  the query
   * @param reader the reader, used to rewrite the query
   * @return the set of terms
   * @throws IOException if rewriting the query failed
   */
  public static Set extractTerms(Query query, IndexReader reader) throws IOException {
    Set allTerms = new HashSet<>();
    query.rewrite(reader).visit(QueryVisitor.termCollector(allTerms));
    return allTerms;
  }
  /**
   * Returns the list of terms based on the given list of fields and texts.
   *
   * 

The number of the terms returns is (number of fields) x (number of texts). * * @param fields The list of fields. * @param texts The list of texts. * * @return The corresponding list of terms. */ public static List terms(List fields, List texts) { List terms = new ArrayList<>(); for (String field : fields) { for (String text : texts) { terms.add(new Term(field, text)); } } return terms; } /** * Returns the list of fuzzy terms given a term and using the specified index reader. * * @param reader Index reader to use. * @param term The term to use. * * @return The corresponding list of fuzzy terms. * * @throws IOException If an error is thrown by the fuzzy term enumeration. */ public static List fuzzy(IndexReader reader, Term term) throws IOException { List values = new ArrayList<>(); fuzzy(reader, values, term); return values; } /** * Returns the list of prefix terms given a term and using the specified index reader. * * @param reader Index reader to use. * @param term The term to use. * * @return The corresponding list of prefix terms. * * @throws IOException If an error is thrown by the prefix term enumeration. */ public static List prefix(IndexReader reader, Term term) throws IOException { List terms = new ArrayList<>(); prefix(reader, terms, term); return terms; } /** * Loads all the fuzzy terms in the list of terms given the reader. * * @param reader Index reader to use. * @param values The list of terms to load. * @param term The term to use. * * @throws IOException If an error is thrown by the fuzzy term enumeration. */ public static void fuzzy(IndexReader reader, List values, Term term) throws IOException { fuzzy(reader, values, term, 2); } /** * Loads all the fuzzy terms in the list of terms given the reader. * * @param reader Index reader to use. * @param values The list of terms to load. * @param term The term to use. * * @throws IOException If an error is thrown by the fuzzy term enumeration. 
*/ public static void fuzzy(IndexReader reader, List values, Term term, int minSimilarity) throws IOException { org.apache.lucene.index.Terms terms = MultiTerms.getTerms(reader, term.field()); if (terms == null) return; FuzzyTermsEnum fuzzy = new FuzzyTermsEnum(terms, term, minSimilarity, 0, false); BytesRef val; BytesRef searched = term.bytes(); while ((val = fuzzy.next()) != null) { if (!searched.bytesEquals(val)) values.add(val.utf8ToString()); } } /** * Loads all the fuzzy terms in the list of terms given the reader. * * @param reader Index reader to use. * @param bucket Where to store the terms. * @param term The term to use. * * @throws IOException If an error is thrown by the fuzzy term enumeration. */ @Beta public static void fuzzy(IndexReader reader, Bucket bucket, Term term) throws IOException { fuzzy(reader, bucket, term, 2); } /** * Loads all the fuzzy terms in the list of terms given the reader. * * @param reader Index reader to use. * @param bucket Where to store the terms. * @param term The term to use. * * @throws IOException If an error is thrown by the fuzzy term enumeration. */ @Beta public static void fuzzy(IndexReader reader, Bucket bucket, Term term, int minSimilarity) throws IOException { org.apache.lucene.index.Terms terms = MultiTerms.getTerms(reader, term.field()); if (terms == null) return; FuzzyTermsEnum fuzzy = new FuzzyTermsEnum(terms, term, minSimilarity, 0, true); BytesRef val; BytesRef searched = term.bytes(); while ((val = fuzzy.next()) != null) { if (!searched.bytesEquals(val)) { Term t = new Term(term.field(), BytesRef.deepCopyOf(val)); bucket.add(t, reader.docFreq(t)); } } } /** * Loads all the prefix terms in the list of terms given the reader. * * @param reader Index reader to use. * @param values The list of values to load. * @param term The term to use. * * @throws IOException If an error is thrown by the prefix term enumeration. 
*/ public static void prefix(IndexReader reader, List values, Term term) throws IOException { org.apache.lucene.index.Terms terms = MultiTerms.getTerms(reader, term.field()); if (terms == null) return; TermsEnum prefixes = terms.intersect(new CompiledAutomaton(PrefixQuery.toAutomaton(term.bytes())), null); BytesRef val; while ((val = prefixes.next()) != null) { values.add(val.utf8ToString()); } } /** * Loads all the prefix terms in the list of terms given the reader. * * @param reader Index reader to use. * @param bucket Where to store the terms. * @param term The term to use. * * @throws IOException If an error is thrown by the prefix term enumeration. */ public static void prefix(IndexReader reader, Bucket bucket, Term term) throws IOException { org.apache.lucene.index.Terms terms = MultiTerms.getTerms(reader, term.field()); if (terms == null) return; TermsEnum prefixes = terms.intersect(new CompiledAutomaton(PrefixQuery.toAutomaton(term.bytes())), term.bytes()); BytesRef val; while ((val = prefixes.next()) != null) { Term t = new Term(term.field(), BytesRef.deepCopyOf(val)); bucket.add(t, reader.docFreq(t)); } } /** * Returns the list of field names for the specified reader. * * @param reader The index reader * * @return the list of field names * */ @Beta public static List fields(IndexReader reader) { LOGGER.debug("Loading fields"); return new ArrayList<>(FieldInfos.getIndexedFields(reader)); } /** * Returns the list of terms for the specified field. * * @param reader The index reader * @param field The field * * @return the list of terms for this field * * @throws IOException should any IO error be reported. 
*/ @Beta public static List terms(IndexReader reader, String field) throws IOException { LOGGER.debug("Loading terms for field {}", field); org.apache.lucene.index.Terms terms = MultiTerms.getTerms(reader, field); if (terms == null) return Collections.emptyList(); TermsEnum termsEnum = terms.iterator(); if (termsEnum == TermsEnum.EMPTY) return Collections.emptyList(); Map termsList = new HashMap<>(); while (termsEnum.next() != null) { BytesRef t = termsEnum.term(); if (t == null) break; termsList.put(t, new Term(field, BytesRef.deepCopyOf(t))); } return new ArrayList<>(termsList.values()); } /** * Returns the list of term fields from the list of the fields provided which are in the search results of the query provided. * * @param searcher a searcher on the index desired * @param query the base query * @param candidates the list of candidate fields * * @return the list of fields with search results * * @throws IOException should any IO error be reported when querying the index. */ @Beta public static List fields(IndexSearcher searcher, Query query, List candidates) throws IOException { LOGGER.debug("Loading fields for query {}", query); List fields = new ArrayList<>(); for (String field : candidates) { FieldDocumentChecker checker = new FieldDocumentChecker(field); searcher.search(query, checker); if (checker.fieldFound()) fields.add(field); } return fields; } /** * Returns the list of term values for the specified field. * * @param reader The index reader to use * @param field The field * * @return the list of terms for this field * * @throws IOException should any IO error be reported. 
*/ @Beta public static List values(IndexReader reader, String field) throws IOException { LOGGER.debug("Loading term values for field {}", field); List values = new ArrayList<>(); org.apache.lucene.index.Terms terms = MultiTerms.getTerms(reader, field); if (terms == null) return values; TermsEnum termsEnum = terms.iterator(); if (termsEnum == TermsEnum.EMPTY) return values; while (termsEnum.next() != null) { BytesRef t = termsEnum.term(); if (t == null) break; values.add(t.utf8ToString()); } return values; } // XML Serialisers ============================================================================== /** * Returns the XML for a list of terms. * * @param xml The XML writer. * @param terms The list of terms to serialise as XML. * * @throws IOException Any I/O error thrown by the XML writer. */ public static void toXML(XMLWriter xml, List terms) throws IOException { for (Term t : terms) { toXML(xml, t); } } /** * Returns the XML for a list of terms. * * @param xml The XML writer. * @param terms The list of terms to serialise as XML. * * @throws IOException Any I/O error thrown by the XML writer. */ public static void toXML(XMLWriter xml, Bucket terms) throws IOException { for (Entry t : terms.entrySet()) { toXML(xml, t.item(), t.count()); } } /** * Returns the XML for a term. * * @param xml The XML writer. * @param t Term to serialise as XML. * * @throws IOException Any I/O error thrown by the XML writer. */ public static void toXML(XMLWriter xml, Term t) throws IOException { xml.openElement("term"); xml.attribute("field", t.field()); xml.attribute("text", t.text()); xml.closeElement(); } /** * Returns the XML for a term. * * @param xml The XML writer. * @param t Term to serialise as XML. * @param frequency The term document frequency. * * @throws IOException Any I/O error thrown by the XML writer. 
*/ public static void toXML(XMLWriter xml, Term t, int frequency) throws IOException { xml.openElement("term"); xml.attribute("field", t.field()); xml.attribute("text", t.text()); xml.attribute("frequency", frequency); xml.closeElement(); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy