All Downloads are FREE. Search and download functionalities are using the official Maven repository.

io.anserini.rerank.ScoredDocuments Maven / Gradle / Ivy

/*
 * Anserini: A Lucene toolkit for reproducible information retrieval research
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package io.anserini.rerank;

import io.anserini.index.Constants;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

/**
 * ScoredDocuments object that converts TopDocs from the searcher into an Anserini format
 */
public class ScoredDocuments {
  private static final Logger LOG = LogManager.getLogger(ScoredDocuments.class);
  // Array of document objects
  public Document[] documents;
  // The docIds as used by the index reader
  public int[] ids;
  // Scores returned from the searcher's similarity
  public float[] scores;
  
  public static ScoredDocuments fromTopDocs(TopDocs rs, IndexSearcher searcher) {
    ScoredDocuments scoredDocs = new ScoredDocuments();
    scoredDocs.documents = new Document[rs.scoreDocs.length];
    scoredDocs.ids = new int[rs.scoreDocs.length];
    scoredDocs.scores = new float[rs.scoreDocs.length];

    for (int i=0; i qrels, IndexReader reader) throws IOException {
    ScoredDocuments scoredDocs = new ScoredDocuments();

    List documentList = new ArrayList<>();
    List idList = new ArrayList<>();
    List scoreList = new ArrayList<>();

    IndexSearcher searcher;
    int i = 0;
    for (Map.Entry qrelsDocScorePair : qrels.entrySet()) {
      String externalDocid = qrelsDocScorePair.getKey();
      searcher = new IndexSearcher(reader);
      Query q = new TermQuery(new Term(Constants.ID, externalDocid));
      TopDocs rs = searcher.search(q, 1);
      try {
        documentList.add(searcher.doc(rs.scoreDocs[0].doc));
        idList.add(rs.scoreDocs[0].doc);
        scoreList.add(Float.valueOf(qrelsDocScorePair.getValue().floatValue()));
        i++;
      } catch (IOException e) {
        e.printStackTrace();
        documentList.add(null);
      } catch (ArrayIndexOutOfBoundsException e){
        // e.printStackTrace();
        LOG.warn("Cannot find document " + externalDocid);
      }
    }

    int length = documentList.size();
    scoredDocs.documents = new Document[length];
    scoredDocs.ids = new int[length];
    scoredDocs.scores = new float[length];
    scoredDocs.documents = documentList.toArray(scoredDocs.documents);
    scoredDocs.ids = ArrayUtils.toPrimitive(idList.toArray(new Integer[length]));
    scoredDocs.scores = ArrayUtils.toPrimitive(scoreList.toArray(new Float[length]), Float.NaN);

    return scoredDocs;
  }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy