All Downloads are FREE. Search and download functionalities are using the official Maven repository.

io.anserini.index.generator.LuceneDenseVectorDocumentGenerator Maven / Gradle / Ivy

/*
 * Anserini: A Lucene toolkit for reproducible information retrieval research
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package io.anserini.index.generator;

import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.JsonMappingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import io.anserini.collection.SourceDocument;
import io.anserini.index.IndexDenseVectors;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.KnnVectorField;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.VectorSimilarityFunction;

import java.util.ArrayList;

/**
 * Converts a {@link SourceDocument} into a Lucene {@link Document}, ready to be indexed.
 *
 * @param  type of the source document
 */
public class LuceneDenseVectorDocumentGenerator implements LuceneDocumentGenerator {
  protected IndexDenseVectors.Args args;

  protected LuceneDenseVectorDocumentGenerator() {
  }

  /**
   * Constructor with config and counters
   *
   * @param args configuration arguments
   */
  public LuceneDenseVectorDocumentGenerator(IndexDenseVectors.Args args) {
    this.args = args;
  }

  private float[] convertJsonArray(String vectorString) throws JsonMappingException, JsonProcessingException {
    ObjectMapper mapper = new ObjectMapper();
    ArrayList denseVector = mapper.readValue(vectorString, new TypeReference>(){});
    int length = denseVector.size();
    float[] vector = new float[length];
    int i = 0;
    for (Float f : denseVector) {
      vector[i++] = f;
    }
    return vector;
  }

  @Override
  public Document createDocument(T src) throws InvalidDocumentException {
    String id = src.id();
    float[] contents;

    try {
      contents = convertJsonArray(src.contents());
    } catch (Exception e) {
      throw new InvalidDocumentException();
    }

    // Make a new, empty document.
    final Document document = new Document();

    // Store the collection docid.
    document.add(new StringField(IndexDenseVectors.Args.ID, id, Field.Store.YES));
    // This is needed to break score ties by docid.
    document.add(new KnnVectorField(IndexDenseVectors.Args.VECTOR, contents, VectorSimilarityFunction.DOT_PRODUCT));
    if (args.storeRaw) {
      document.add(new StoredField(IndexDenseVectors.Args.RAW, src.raw()));
    }
    return document;
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy