// io.anserini.index.generator.DefaultLuceneDocumentGenerator Maven / Gradle / Ivy

// Go to download
/*
 * Anserini: A Lucene toolkit for reproducible information retrieval research
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package io.anserini.index.generator;

import io.anserini.collection.InvalidContentsException;
import io.anserini.collection.MultifieldSourceDocument;
import io.anserini.collection.SourceDocument;
import io.anserini.index.IndexArgs;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.util.BytesRef;

import java.util.Arrays;

/**
 * Converts a {@link SourceDocument} into a Lucene {@link Document}, ready to be indexed.
 *
 * <p>Every generated document carries the collection docid (both as a stored {@link StringField}
 * and as a {@link BinaryDocValuesField}), the main contents field, optionally the raw source, and,
 * for a {@link MultifieldSourceDocument}, whichever additional fields were selected through
 * {@code args.fields}.
 *
 * @param <T> type of the source document
 */
public class DefaultLuceneDocumentGenerator<T extends SourceDocument> implements LuceneDocumentGenerator<T> {
  /** Indexing configuration consulted on every call to {@link #createDocument}. */
  protected IndexArgs args;

  /**
   * Creates a generator without configuration. {@link #createDocument} dereferences {@link #args},
   * so a subclass using this constructor has to assign it before any document is generated.
   */
  protected DefaultLuceneDocumentGenerator() {
  }

  /**
   * Creates a generator driven by the given indexing configuration.
   *
   * @param args configuration arguments
   */
  public DefaultLuceneDocumentGenerator(IndexArgs args) {
    this.args = args;
  }

  /**
   * Builds the Lucene document for a single source document.
   *
   * @param src source document to convert
   * @return a new Lucene document holding the id, the contents and any configured extra fields
   * @throws GeneratorException an {@code InvalidDocumentException} when the source cannot produce
   *     its contents, or an {@code EmptyDocumentException} when the contents are blank after trimming
   */
  @Override
  public Document createDocument(T src) throws GeneratorException {
    String id = src.id();
    String contents;

    try {
      contents = src.contents();
    } catch (InvalidContentsException e) {
      // Catch and rethrow; indexer will eat the exception at top level and increment counters accordingly.
      throw new InvalidDocumentException();
    }

    if (contents.trim().isEmpty()) {
      throw new EmptyDocumentException();
    }

    // Make a new, empty document.
    final Document document = new Document();

    // Store the collection docid.
    document.add(new StringField(IndexArgs.ID, id, Field.Store.YES));
    // This is needed to break score ties by docid.
    document.add(new BinaryDocValuesField(IndexArgs.ID, new BytesRef(id)));

    if (args.storeRaw) {
      document.add(new StoredField(IndexArgs.RAW, src.raw()));
    }

    // A single field type is shared by the contents field and every extra indexed field below.
    final FieldType fieldType = new FieldType();
    fieldType.setStored(args.storeContents);

    // Are we storing document vectors?
    if (args.storeDocvectors) {
      fieldType.setStoreTermVectors(true);
      fieldType.setStoreTermVectorPositions(true);
    }

    // Are we building a "positional" or "count" index?
    if (args.storePositions) {
      fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
    } else {
      fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
    }

    document.add(new Field(IndexArgs.CONTENTS, contents, fieldType));

    // If this document has other fields, then we want to index them also.
    // Currently, we just use all the settings of the main "content" field.
    if (src instanceof MultifieldSourceDocument) {
      ((MultifieldSourceDocument) src).fields().forEach((k, v) -> {
        // Compare by value: a field name built at runtime is not the same object as the constant,
        // so a reference comparison (==) would silently route entity fields to the wrong branch.
        if (IndexArgs.ENTITY.equals(k)) {
          // The entity field is only stored, never indexed.
          document.add(new StoredField(IndexArgs.ENTITY, v));
        } else if (ArrayUtils.contains(args.fields, k)) {
          // Only index fields that have been explicitly referenced in -fields parameter of indexing program.
          document.add(new Field(k, v, fieldType));
        }
      });
    }

    return document;
  }
}