All downloads are free. The search and download functionality uses the official Maven repository.

org.terrier.structures.CollectionStatistics Maven / Gradle / Ivy

The newest version!
/*
 * Terrier - Terabyte Retriever 
 * Webpage: http://terrier.org 
 * Contact: terrier{a.}dcs.gla.ac.uk
 * University of Glasgow - School of Computing Science
 * http://www.gla.ac.uk/
 * 
 * The contents of this file are subject to the Mozilla Public License
 * Version 1.1 (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS"
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
 * the License for the specific language governing rights and limitations
 * under the License.
 *
 * The Original Code is CollectionStatistics.java.
 *
 * The Original Code is Copyright (C) 2004-2020 the University of Glasgow.
 * All Rights Reserved.
 *
 * Contributor(s):
 *   Vassilis Plachouras  
 */
package org.terrier.structures;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.io.Serializable;
import java.util.Arrays;

import org.apache.hadoop.io.Writable;

/**
 * This class provides basic statistics for the indexed
 * collection of documents, such as the average length of documents,
 * or the total number of documents in the collection.
 * After indexing, statistics are saved in the associated index object.
 *
 * <p>The raw counts can be persisted with {@link #write(DataOutput)} and
 * restored with {@link #readFields(DataInput)}. The average lengths are not
 * written out; they are recomputed from the counts whenever the counts change.
 *
 * @author Gianni Amati, Vassilis Plachouras, Craig Macdonald
 */
@ConcurrentReadable
public class CollectionStatistics implements Serializable,Writable {
	private static final long serialVersionUID = 1L;

	/** Number of fields used to index */
	protected int numberOfFields;

	/** Number of tokens in each field */
	protected long[] fieldTokens;

	/** Average length of each field */
	protected double[] avgFieldLengths;

	/** Field names */
	protected String[] fieldNames;

	/** Total number of documents in the collection.*/
	protected int numberOfDocuments;

	/** Total number of tokens in the collection.*/
	protected long numberOfTokens;

	/**
	 * Total number of pointers in the inverted file.
	 * This corresponds to the sum of the document frequencies for
	 * the terms in the lexicon.
	 */
	protected long numberOfPointers;

	/**
	 * Total number of unique terms in the collection.
	 * This corresponds to the number of entries in the lexicon.
	 */
	protected int numberOfUniqueTerms;

	/**
	 * Average length of a document in the collection.
	 */
	protected double averageDocumentLength;

	/** Does the index have positions */
	protected boolean hasPositions;

	/**
	 * Constructs an instance of the class for an index without positions.
	 *
	 * @param numDocs the number of documents in the collection.
	 * @param numTerms the number of terms in the collection.
	 * @param numTokens the number of tokens in the collection.
	 * @param numPointers the number of pointers in the inverted file.
	 * @param _fieldTokens the number of tokens in each field.
	 * @param _fieldNames the field names.
	 * @deprecated use the constructor that also takes the positions flag.
	 */
	@Deprecated
	public CollectionStatistics(int numDocs, int numTerms, long numTokens, long numPointers,
			final long[] _fieldTokens, final String[] _fieldNames) {
		this(numDocs, numTerms, numTokens, numPointers, _fieldTokens, _fieldNames, false);
	}

	/**
	 * Constructs an instance of the class. The two arrays are stored as given
	 * (they are not copied) and must have the same length.
	 *
	 * @param numDocs the number of documents in the collection.
	 * @param numTerms the number of terms in the collection.
	 * @param numTokens the number of tokens in the collection.
	 * @param numPointers the number of pointers in the inverted file.
	 * @param _fieldTokens the number of tokens in each field.
	 * @param _fieldNames the field names.
	 * @param positions whether the index has position information.
	 */
	public CollectionStatistics(int numDocs, int numTerms, long numTokens, long numPointers,
			final long[] _fieldTokens, final String[] _fieldNames, boolean positions) {
		assert _fieldTokens != null && _fieldNames != null && _fieldNames.length ==_fieldTokens.length;
		numberOfDocuments = numDocs;
		numberOfUniqueTerms = numTerms;
		numberOfTokens = numTokens;
		numberOfPointers = numPointers;
		fieldTokens = _fieldTokens;
		fieldNames = _fieldNames;
		numberOfFields = _fieldTokens.length;
		avgFieldLengths = new double[numberOfFields];
		hasPositions = positions;
		recalculateAverageLengths();
	}

	/**
	 * Default constructor. All counts are zero and the per-field arrays are
	 * left unset until {@link #readFields(DataInput)} populates them.
	 */
	public CollectionStatistics() {
	}

	/**
	 * Recomputes the average document length and the average length of each
	 * field from the current counts. All averages are zero when the
	 * collection has no documents.
	 */
	protected void recalculateAverageLengths() {
		// A default-constructed instance has no averages array yet; without
		// this guard the Arrays.fill below would throw a NullPointerException.
		if (avgFieldLengths == null || avgFieldLengths.length < numberOfFields) {
			avgFieldLengths = new double[numberOfFields];
		}
		if (numberOfDocuments != 0) {
			averageDocumentLength = (double)numberOfTokens / (double)numberOfDocuments;
			for (int fi = 0; fi < numberOfFields; fi++)
				avgFieldLengths[fi] = (double)fieldTokens[fi] / (double)numberOfDocuments;
		} else {
			averageDocumentLength = 0.0D;
			Arrays.fill(avgFieldLengths, 0.0d);
		}
	}

	/**
	 * Returns a multi-line, human-readable summary of the main counts.
	 */
	@Override
	public String toString() {
		return
			"Number of documents: " + getNumberOfDocuments() + "\n" +
			"Number of terms: " + getNumberOfUniqueTerms() + "\n" +
			"Number of postings: " + getNumberOfPostings() + "\n" +
			"Number of fields: " + getNumberOfFields() + "\n" +
			"Number of tokens: " + getNumberOfTokens() + "\n" +
			"Field names: " + Arrays.toString(getFieldNames()) + "\n" +
			"Positions: " + hasPositions + "\n";
	}

	/** Returns true if the inverted index will have position information */
	public boolean hasPositions() {
		return hasPositions;
	}

	/**
	 * Returns the documents' average length.
	 *
	 * @return the average length of the documents in the collection.
	 */
	public double getAverageDocumentLength() {
		return averageDocumentLength;
	}

	/**
	 * Returns the total number of documents in the collection.
	 *
	 * @return the total number of documents in the collection.
	 */
	public int getNumberOfDocuments() {
		return numberOfDocuments;
	}

	/**
	 * Returns the total number of postings in the collection.
	 *
	 * @return the total number of postings in the collection.
	 * @deprecated use {@link #getNumberOfPostings()}, which returns the same value.
	 */
	@Deprecated
	public long getNumberOfPointers() {
		return numberOfPointers;
	}

	/**
	 * Returns the total number of postings in the collection.
	 *
	 * @return the total number of postings in the collection.
	 */
	public long getNumberOfPostings() {
		return numberOfPointers;
	}

	/**
	 * Returns the total number of tokens in the collection.
	 * @return the total number of tokens in the collection.
	 */
	public long getNumberOfTokens() {
		return numberOfTokens;
	}

	/**
	 * Returns the total number of unique terms in the lexicon.
	 *
	 * @return the total number of unique terms in the lexicon.
	 */
	public int getNumberOfUniqueTerms() {
		return numberOfUniqueTerms;
	}

	/**
	 * Returns the number of fields being used to index.
	 *
	 * @return the number of fields being used to index.
	 */
	public int getNumberOfFields() {
		return numberOfFields;
	}

	/**
	 * Returns the length of each field in tokens. The internal array is
	 * returned directly, not a copy.
	 *
	 * @return the length of each field in tokens.
	 */
	public long[] getFieldTokens() {
		return fieldTokens;
	}

	/**
	 * Returns the average length of each field in tokens. The internal array
	 * is returned directly, not a copy.
	 *
	 * @return the average length of each field in tokens.
	 */
	public double[] getAverageFieldLengths() {
		return avgFieldLengths;
	}

	/**
	 * Returns the field names. The internal array is returned directly, not
	 * a copy.
	 *
	 * @return the field names.
	 */
	public String[] getFieldNames() {
		return fieldNames;
	}

	/**
	 * Increment the collection statistics with the provided collection statistics.
	 * Documents, postings and tokens are summed; the number of unique terms
	 * becomes the larger of the two values. Field token counts are summed
	 * position by position for the fields both objects have, and the averages
	 * are then recomputed.
	 *
	 * @param cs the collection statistics to use to increment.
	 */
	public void addStatistics(final CollectionStatistics cs) {
		numberOfDocuments += cs.getNumberOfDocuments();
		numberOfPointers += cs.getNumberOfPostings();
		numberOfTokens += cs.getNumberOfTokens();
		numberOfUniqueTerms = Math.max(cs.getNumberOfUniqueTerms(), numberOfUniqueTerms);
		final long[] otherFieldTokens = cs.getFieldTokens();
		// The other object may be default-constructed (no array) or have fewer
		// fields; only add what it actually provides instead of failing after
		// the counters above were already updated.
		if (otherFieldTokens != null) {
			final int sharedFields = Math.min(numberOfFields, otherFieldTokens.length);
			for (int fi = 0; fi < sharedFields; fi++)
				fieldTokens[fi] += otherFieldTokens[fi];
		}
		recalculateAverageLengths();
	}

	/**
	 * Restores the statistics from the layout produced by
	 * {@link #write(DataOutput)}, including the trailing positions flag, and
	 * recomputes the averages.
	 *
	 * @param in the input to read from.
	 * @throws IOException if reading fails or the stored field count is negative.
	 */
	@Override
	public void readFields(DataInput in) throws IOException {
		readCountsAndFields(in);
		hasPositions = in.readBoolean();
		recalculateAverageLengths();
	}

	/**
	 * Restores the statistics from a layout that has no trailing positions
	 * flag; {@code hasPositions} is left unchanged. The method name suggests
	 * this is the format of version 5 indices.
	 *
	 * @param in the input to read from.
	 * @throws IOException if reading fails or the stored field count is negative.
	 */
	public void readFieldsV5(DataInput in) throws IOException {
		readCountsAndFields(in);
		recalculateAverageLengths();
	}

	/**
	 * Reads the part of the layout shared by both read methods: the four
	 * collection counts, the field count, then the token count and name of
	 * each field. Fresh per-field arrays are allocated.
	 */
	private void readCountsAndFields(DataInput in) throws IOException {
		numberOfDocuments = in.readInt();
		numberOfUniqueTerms = in.readInt();
		numberOfTokens = in.readLong();
		numberOfPointers = in.readLong();
		final int storedFieldCount = in.readInt();
		// Report corrupt input as an I/O problem rather than letting the array
		// allocation fail with NegativeArraySizeException.
		if (storedFieldCount < 0)
			throw new IOException("Invalid number of fields: " + storedFieldCount);
		numberOfFields = storedFieldCount;
		fieldTokens = new long[numberOfFields];
		avgFieldLengths = new double[numberOfFields];
		fieldNames = new String[numberOfFields];
		for (int fi = 0; fi < numberOfFields; fi++) {
			fieldTokens[fi] = in.readLong();
			fieldNames[fi] = in.readUTF();
		}
	}

	/**
	 * Writes the four collection counts, the field count, the token count and
	 * name of each field, and finally the positions flag. Averages are not
	 * written.
	 *
	 * @param out the output to write to.
	 * @throws IOException if writing fails.
	 */
	@Override
	public void write(DataOutput out) throws IOException {
		out.writeInt(numberOfDocuments);
		out.writeInt(numberOfUniqueTerms);
		out.writeLong(numberOfTokens);
		out.writeLong(numberOfPointers);
		out.writeInt(numberOfFields);
		for (int fi = 0; fi < numberOfFields; fi++) {
			out.writeLong(fieldTokens[fi]);
			out.writeUTF(fieldNames[fi]);
		}
		out.writeBoolean(hasPositions);
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy