All Downloads are FREE. Search and download functionalities are using the official Maven repository.

io.anserini.index.generator.DenseVectorDocumentGenerator Maven / Gradle / Ivy

/*
 * Anserini: A Lucene toolkit for reproducible information retrieval research
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package io.anserini.index.generator;

import io.anserini.collection.SourceDocument;
import io.anserini.index.Constants;
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.KnnFloatVectorField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.util.BytesRef;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

/**
 * A document generator for creating Lucene documents with Parquet dense
 * vector data.
 * Implements the LuceneDocumentGenerator interface.
 *
 * @param  the type of SourceDocument
 */
public class DenseVectorDocumentGenerator implements LuceneDocumentGenerator {
  private static final Logger LOG = LogManager.getLogger(DenseVectorDocumentGenerator.class);

  /**
   * Creates a Lucene document from the source document.
   *
   * @param src the source document
   * @return the created Lucene document
   * @throws InvalidDocumentException if the document is invalid
   */
  @Override
  public Document createDocument(T src) throws InvalidDocumentException {

    try {
      float[] contents = src.vector();
      if (contents == null || contents.length == 0) {
        LOG.error("Vector data is null or empty for document ID: " + src.id());
        throw new InvalidDocumentException();
      }

      // Create and populate the Lucene document
      final Document document = new Document();
      document.add(new StringField(Constants.ID, src.id(), Field.Store.YES));
      document.add(new BinaryDocValuesField(Constants.ID, new BytesRef(src.id())));
      document.add(new KnnFloatVectorField(Constants.VECTOR, contents, VectorSimilarityFunction.DOT_PRODUCT));

      return document;

    } catch (InvalidDocumentException e) {
      throw e;
    } catch (Exception e) {
      LOG.error("Unexpected error creating document for ID: " + src.id(), e);
      throw new InvalidDocumentException();
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy