org.apache.lucene.search.suggest.DocumentDictionary Maven / Gradle / Ivy

Go to download
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.suggest;

import java.io.IOException;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.MultiBits;
import org.apache.lucene.index.MultiDocValues;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.StoredFields;
import org.apache.lucene.search.spell.Dictionary;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;

/**
 * Dictionary with terms, weights, payload (optional) and contexts (optional) information taken from
 * stored/indexed fields in a Lucene index. <b>NOTE:</b>
 *
 * <ul>
 *   <li>The term field has to be stored; if it is missing, the document is skipped.
 *   <li>The payload and contexts fields are optional and are not required to be stored.
 *   <li>The weight field can be stored or can be a {@link NumericDocValues}. If the weight field is
 *       not defined, the value of the weight is 0
 * </ul>
 */
public class DocumentDictionary implements Dictionary {

  /** {@link IndexReader} to load documents from */
  protected final IndexReader reader;

  /** {@link StoredFields} for this reader */
  protected final StoredFields storedFields;

  /** Field to read payload from */
  protected final String payloadField;

  /** Field to read contexts from */
  protected final String contextsField;

  /** Field to read the terms from. */
  private final String field;

  /** Field to read the weights from; may be {@code null}, in which case every weight is 0. */
  private final String weightField;

  /**
   * Creates a new dictionary with the contents of the fields named {@code field} for the terms
   * and {@code weightField} for the weights that will be used for the corresponding terms.
   *
   * @param reader index to read documents from
   * @param field name of the field holding the terms
   * @param weightField name of the field holding the weights
   * @throws IOException if the stored fields of {@code reader} cannot be obtained
   */
  public DocumentDictionary(IndexReader reader, String field, String weightField)
      throws IOException {
    this(reader, field, weightField, null);
  }

  /**
   * Creates a new dictionary with the contents of the fields named {@code field} for the terms,
   * {@code weightField} for the weights that will be used for the corresponding terms and
   * {@code payloadField} for the corresponding payloads for the entry.
   *
   * @param reader index to read documents from
   * @param field name of the field holding the terms
   * @param weightField name of the field holding the weights
   * @param payloadField name of the field holding the payloads, or {@code null} for no payloads
   * @throws IOException if the stored fields of {@code reader} cannot be obtained
   */
  public DocumentDictionary(
      IndexReader reader, String field, String weightField, String payloadField)
      throws IOException {
    this(reader, field, weightField, payloadField, null);
  }

  /**
   * Creates a new dictionary with the contents of the fields named {@code field} for the terms,
   * {@code weightField} for the weights that will be used for the corresponding terms,
   * {@code payloadField} for the corresponding payloads for the entry and {@code contextsField}
   * for associated contexts.
   *
   * @param reader index to read documents from
   * @param field name of the field holding the terms
   * @param weightField name of the field holding the weights
   * @param payloadField name of the field holding the payloads, or {@code null} for no payloads
   * @param contextsField name of the field holding the contexts, or {@code null} for no contexts
   * @throws IOException if the stored fields of {@code reader} cannot be obtained
   */
  public DocumentDictionary(
      IndexReader reader,
      String field,
      String weightField,
      String payloadField,
      String contextsField)
      throws IOException {
    this.reader = reader;
    this.storedFields = reader.storedFields();
    this.field = field;
    this.weightField = weightField;
    this.payloadField = payloadField;
    this.contextsField = contextsField;
  }

  /**
   * Returns a fresh iterator over the entries of this dictionary. Payloads and contexts are
   * enabled exactly when the corresponding field name is non-null.
   */
  @Override
  public InputIterator getEntryIterator() throws IOException {
    return new DocumentInputIterator(payloadField != null, contextsField != null);
  }

  /**
   * Converts a field value to a {@link BytesRef}: the binary value is preferred, the string value
   * is used otherwise, and {@code null} is returned when the field carries neither.
   */
  private static BytesRef asBytesRef(IndexableField fieldValue) {
    BytesRef binary = fieldValue.binaryValue();
    if (binary != null) {
      return binary;
    }
    String text = fieldValue.stringValue();
    return (text != null) ? new BytesRef(text) : null;
  }

  /** Implements {@link InputIterator} from stored fields. */
  protected class DocumentInputIterator implements InputIterator {

    /** Highest document id to visit ({@code maxDoc - 1}); -1 for an empty reader. */
    private final int docCount;

    /** Names of the fields that are loaded for every visited document. */
    private final Set<String> relevantFields;

    private final boolean hasPayloads;
    private final boolean hasContexts;

    /** Live-docs bits used to skip deleted documents; {@code null} means no filtering. */
    private final Bits liveDocs;

    private int currentDocId = -1;
    private long currentWeight = 0;
    private BytesRef currentPayload = null;
    private Set<BytesRef> currentContexts;

    /** Doc values for the weight field; {@code null} when no weight field is configured. */
    private final NumericDocValues weightValues;

    /** Term field values of the document currently being iterated. */
    IndexableField[] currentDocFields = new IndexableField[0];

    /** Index into {@link #currentDocFields} of the next value to hand out. */
    int nextFieldsPosition = 0;

    /**
     * Creates an iterator over term, weight, payload and context fields from the lucene index.
     * Setting {@code hasPayloads} to false implies an iterator without payloads; setting {@code
     * hasContexts} to false implies an iterator without contexts.
     *
     * @param hasPayloads whether payloads are read from the payload field
     * @param hasContexts whether contexts are read from the contexts field
     * @throws IOException if the weight doc values cannot be obtained from the reader
     */
    public DocumentInputIterator(boolean hasPayloads, boolean hasContexts) throws IOException {
      this.hasPayloads = hasPayloads;
      this.hasContexts = hasContexts;
      docCount = reader.maxDoc() - 1;
      weightValues =
          (weightField != null) ? MultiDocValues.getNumericValues(reader, weightField) : null;
      liveDocs = (reader.leaves().size() > 0) ? MultiBits.getLiveDocs(reader) : null;
      relevantFields = getRelevantFields(field, weightField, payloadField, contextsField);
    }

    @Override
    public long weight() {
      return currentWeight;
    }

    /**
     * Returns the next term, or {@code null} once every document has been visited. A document
     * with several values in the term field yields one entry per usable value, all sharing that
     * document's weight, payload and contexts. Deleted documents and documents without any term
     * value are skipped.
     */
    @Override
    public BytesRef next() throws IOException {
      while (true) {
        if (nextFieldsPosition < currentDocFields.length) {
          // Still values left from the current document; values that have neither a binary nor a
          // string representation are skipped.
          BytesRef term = asBytesRef(currentDocFields[nextFieldsPosition++]);
          if (term != null) {
            return term;
          }
          continue;
        }

        if (currentDocId == docCount) {
          // Iterated over all the documents.
          return null;
        }

        currentDocId++;
        if (liveDocs != null && !liveDocs.get(currentDocId)) {
          continue;
        }

        Document doc = storedFields.document(currentDocId, relevantFields);

        currentDocFields = doc.getFields(field);
        nextFieldsPosition = 0;
        if (currentDocFields.length == 0) { // no values in this document
          continue;
        }

        // Publish this document's metadata BEFORE any of its terms is handed out. Previously the
        // metadata was only updated once the first term value turned out to be usable, so later
        // values of the same document could be returned with the previous document's
        // weight/payload/contexts.
        currentPayload = hasPayloads ? readPayload(doc) : null;
        currentContexts = hasContexts ? readContexts(doc) : Collections.emptySet();
        currentWeight = getWeight(doc, currentDocId);
        // The terms themselves are emitted by the branch at the top of the loop.
      }
    }

    /**
     * Reads the payload of {@code doc}. An empty {@link BytesRef} is used instead of {@code null}
     * when the document has no usable payload value.
     */
    private BytesRef readPayload(Document doc) {
      IndexableField payload = doc.getField(payloadField);
      BytesRef value = (payload != null) ? asBytesRef(payload) : null;
      return (value != null) ? value : new BytesRef();
    }

    /** Collects every usable value of the contexts field of {@code doc} into a new set. */
    private Set<BytesRef> readContexts(Document doc) {
      Set<BytesRef> contexts = new HashSet<>();
      for (IndexableField contextField : doc.getFields(contextsField)) {
        BytesRef context = asBytesRef(contextField);
        if (context != null) {
          contexts.add(context);
        }
      }
      return contexts;
    }

    @Override
    public BytesRef payload() {
      return currentPayload;
    }

    @Override
    public boolean hasPayloads() {
      return hasPayloads;
    }

    /**
     * Returns the value of the {@code weightField} for the current document. Retrieves the value
     * for the {@code weightField} if it's stored (using {@code doc}) or if it's indexed as {@link
     * NumericDocValues} (using {@code docId}) for the document. A stored value takes precedence
     * over doc values. If no value is found, then the weight is 0.
     *
     * @param doc the loaded stored fields of the current document
     * @param docId id of the current document
     * @return the weight of the document, or 0 when it has none
     * @throws IOException if advancing the doc values fails
     */
    protected long getWeight(Document doc, int docId) throws IOException {
      IndexableField weight = doc.getField(weightField);
      if (weight != null) { // found weight as stored
        Number stored = weight.numericValue();
        return (stored != null) ? stored.longValue() : 0;
      }
      if (weightValues == null) { // fall back
        return 0;
      }
      // found weight as NumericDocValue; only move the iterator when it is behind docId
      if (weightValues.docID() < docId) {
        weightValues.advance(docId);
      }
      // if the iterator is not positioned on docId, the value is missing for this document
      return (weightValues.docID() == docId) ? weightValues.longValue() : 0;
    }

    /** Returns the non-null entries of {@code fields} as a set. */
    private Set<String> getRelevantFields(String... fields) {
      Set<String> relevantFields = new HashSet<>();
      for (String relevantField : fields) {
        if (relevantField != null) {
          relevantFields.add(relevantField);
        }
      }
      return relevantFields;
    }

    /**
     * Returns the contexts of the current document, or {@code null} when this iterator was
     * created without contexts.
     */
    @Override
    public Set<BytesRef> contexts() {
      if (hasContexts) {
        return currentContexts;
      }
      return null;
    }

    @Override
    public boolean hasContexts() {
      return hasContexts;
    }
  }
}