// Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance. Project price only 1 $
// You can buy this project and download/modify it as often as you want.
/*
* Terrier - Terabyte Retriever
* Webpage: http://terrier.org
* Contact: terrier{a.}dcs.gla.ac.uk
* University of Glasgow - School of Computing Science
* http://www.gla.ac.uk/
*
* The contents of this file are subject to the Mozilla Public License
* Version 1.1 (the "License"); you may not use this file except in
* compliance with the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS"
* basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
* the License for the specific language governing rights and limitations
* under the License.
*
* The Original Code is BasicSinglePassIndexer.java.
*
* The Original Code is Copyright (C) 2004-2020 the University of Glasgow.
* All Rights Reserved.
*
* Contributor(s):
* Roi Blanco
* Craig Macdonald
*/
package org.terrier.structures.indexing.singlepass;
import java.io.IOException;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Map;
import java.util.Queue;
import org.terrier.indexing.Collection;
import org.terrier.indexing.Document;
import org.terrier.structures.BasicLexiconEntry;
import org.terrier.structures.DocumentIndexEntry;
import org.terrier.structures.FSOMapFileLexiconOutputStream;
import org.terrier.structures.FieldDocumentIndexEntry;
import org.terrier.structures.FieldLexiconEntry;
import org.terrier.structures.Index;
import org.terrier.structures.IndexOnDisk;
import org.terrier.structures.LexiconOutputStream;
import org.terrier.structures.SimpleDocumentIndexEntry;
import org.terrier.structures.indexing.CompressionFactory.BitCompressionConfiguration;
import org.terrier.structures.indexing.DocumentIndexBuilder;
import org.terrier.structures.indexing.DocumentPostingList;
import org.terrier.structures.indexing.classical.BasicIndexer;
import org.terrier.structures.postings.bit.BasicIterablePosting;
import org.terrier.structures.postings.bit.FieldIterablePosting;
import org.terrier.utility.ApplicationSetup;
import org.terrier.utility.ArrayUtils;
import org.terrier.utility.FieldScore;
import org.terrier.utility.Files;
import org.terrier.utility.MemoryChecker;
import org.terrier.utility.RuntimeMemoryChecker;
import org.terrier.utility.UnitUtils;
/**
* This class indexes a document collection (skipping the direct file construction). It implements a single-pass algorithm
* that operates in two phases:
* First, it traverses the document collection, passes the terms through the TermPipeline and builds an in-memory
* representation of the posting lists. When it has exhausted the main memory, it flushes the sorted postings to disk, along
* with the lexicon (collectively known as a run), and continues traversing the collection.
* The second phase merges the sorted runs (with their partial lexicons) on disk to create the final inverted file.
* This class follows the template pattern, so the main bulk of the code is reused for block (and fields) indexing. There are a few hook methods
* that choose the right classes to instantiate, depending on the indexing options defined.
*
* Memory tracking is a key concern in this class. Five properties are provided for checking the amount of memory
* consumed, how regularly to check the memory, and (optional) maximums on the amount of memory that
* can be used for the postings, or on the number of documents before a flush is committed.
*
* <b>Properties:</b>
* <ul>
* <li><code>memory.reserved</code> - amount of free memory threshold before a run is committed.
* Default is 50 000 000 (50MB) and 100 000 000 (100MB) for 32bit and 64bit JVMs respectively.</li>
* <li><code>memory.heap.usage</code> - proportion of max heap allocated to JVM before a run is committed. Default 0.70.</li>
* <li><code>indexing.singlepass.max.postings.memory</code> - maximum amount of memory that the postings can consume before a run is committed. Default is 0, which is no limit.</li>
* <li><code>indexing.singlepass.max.documents.flush</code> - maximum number of documents before a run is committed. Default is 0, which is no limit.</li>
* <li><code>docs.check</code> - number of documents indexed between checks of the amount of free memory. Default is 20 - check memory consumption every 20 documents.</li>
* </ul>
*
* @author Roi Blanco
*/
public class BasicSinglePassIndexer extends BasicIndexer{
/** Current document Id */
protected int currentId = 0;
/** NOTE(review): undocumented in the original. Presumably the upper bound on memory usable
 * by the postings (cf. the indexing.singlepass.max.postings.memory property in the class
 * documentation) - confirm against the code that assigns it. */
protected long maxMemory = 0;
/** Memory Checker - provides the method for checking to see if
* the system is running low on memory */
protected MemoryChecker memoryCheck = null;
/** Number of documents read per memory check */
protected int docsPerCheck;
/** NOTE(review): undocumented in the original. Presumably the maximum number of documents
 * indexed before a run is flushed (cf. the indexing.singlepass.max.documents.flush property
 * in the class documentation) - confirm against the code that assigns it. */
protected int maxDocsPerFlush;
/** Runtime system JVM running this instance of Terrier */
protected static final Runtime runtime = Runtime.getRuntime();
/** Number of documents read since the memory consumption was last checked */
protected int numberOfDocsSinceCheck = 0;
/** Number of documents read since the memory runs were last flushed to disk */
protected int numberOfDocsSinceFlush = 0;
/** Memory status after flush. Starts at -1; indexDocuments stores runtime.freeMemory()
 * here when it begins indexing. */
protected long memoryAfterFlush = -1;
/** Queue with the file names for the runs in disk.
 * NOTE(review): declared with raw types, so the element type is not visible here -
 * confirm how a run's file names are represented before adding type arguments. */
protected Queue fileNames = new LinkedList();
/** Number of the current Run to be written in disk */
protected int currentFile = 0;
/** Structure that keeps the posting lists in memory */
protected MemoryPostings mp;
/** Structure for merging the run */
protected RunsMerger merger;
/** Number of documents indexed */
protected int numberOfDocuments = 0;
/** Number of tokens indexed.
 * NOTE(review): indexDocuments also declares a local variable named numberOfTokens,
 * which shadows this field inside that method - confirm this is intended. */
protected long numberOfTokens = 0;
/** Number of unique terms indexed */
protected int numberOfUniqueTerms = 0;
/** Number of pointers indexed */
protected long numberOfPointers = 0;
/** what class should be used to read the generated inverted index? */
protected String invertedIndexClass = org.terrier.structures.bit.BitPostingIndex.class.getName();
/** Class name of the posting iterator for the inverted index; initialised to BasicIterablePosting.
 * Presumably used when fields are not indexed - confirm. */
protected String basicInvertedIndexPostingIteratorClass = BasicIterablePosting.class.getName();
/** Class name of the posting iterator for the inverted index; initialised to FieldIterablePosting.
 * Presumably used when fields are indexed - confirm. */
protected String fieldInvertedIndexPostingIteratorClass = FieldIterablePosting.class.getName();
/** what class should be used to read the inverted index as a stream? */
protected String invertedIndexInputStreamClass = org.terrier.structures.bit.BitPostingIndexInputStream.class.getName();
/**
* Constructs an instance of a BasicSinglePassIndexer, using the given path name
* for storing the data structures.
* @param pathname String the path where the datastructures will be created. This is assumed to be
* absolute.
* @param prefix String the prefix of the index, usually "data".
*/
public BasicSinglePassIndexer(String pathname, String prefix) {
super(pathname, prefix);
//delay the execution of init() if we are a parent class
if (this.getClass() == BasicSinglePassIndexer.class)
init();
if (! (this.compressionInvertedConfig instanceof BitCompressionConfiguration ))
{
throw new Error("Sorry, only default BitCompressionConfiguration is supported by " + this.getClass().getName()
+ " - you can recompress the inverted index later using IndexRecompressor");
}
}
/**
 * Constructor reserved for child classes. It does nothing except forward its three
 * arguments, unchanged and in order, to the corresponding superclass constructor.
 * The meaning of the arguments is defined by the superclass.
 *
 * @param a first value passed through to the superclass constructor
 * @param b second value passed through to the superclass constructor
 * @param c third value passed through to the superclass constructor
 */
protected BasicSinglePassIndexer(long a, long b, long c) {
    super(a, b, c);
}
/**
 * Handles a request to build the direct index by forwarding the collection,
 * unchanged, to {@code createInvertedIndex(Collection)}.
 *
 * @param collection the document collection (an {@code org.terrier.indexing.Collection})
 *        to index
 */
@Override
public void createDirectIndex(Collection collection) {
    this.createInvertedIndex(collection);
}
public void indexDocuments(Iterator, DocumentPostingList>> iterDocs) {
logger.info("Creating IF (no direct file)..");
final boolean FIELDS = (FieldScore.FIELDS_COUNT > 0);
long startCollection, endCollection;
fileNames = new LinkedList();
numberOfDocuments = currentId = numberOfDocsSinceCheck = numberOfDocsSinceFlush = numberOfUniqueTerms = 0;
numberOfTokens = numberOfPointers = 0;
createMemoryPostings();
currentIndex = IndexOnDisk.createNewIndex(path, prefix);
docIndexBuilder = new DocumentIndexBuilder(currentIndex, "document", FIELDS);
metaBuilder = createMetaIndexBuilder();
emptyDocIndexEntry = FIELDS ? new FieldDocumentIndexEntry(FieldScore.FIELDS_COUNT) : new SimpleDocumentIndexEntry();
boolean stopIndexing = false;
memoryAfterFlush = runtime.freeMemory();
logger.debug("Starting free memory: "+memoryAfterFlush/1000000+"M");
startCollection = System.currentTimeMillis();
long numberOfTokens = 0;
while(iterDocs.hasNext()) {
Map.Entry