org.terrier.structures.indexing.singlepass.BasicSinglePassIndexer

/*
 * Terrier - Terabyte Retriever 
 * Webpage: http://terrier.org 
 * Contact: terrier{a.}dcs.gla.ac.uk
 * University of Glasgow - School of Computing Science
 * http://www.gla.ac.uk/
 * 
 * The contents of this file are subject to the Mozilla Public License
 * Version 1.1 (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS"
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
 * the License for the specific language governing rights and limitations
 * under the License.
 *
 * The Original Code is BasicSinglePassIndexer.java.
 *
 * The Original Code is Copyright (C) 2004-2020 the University of Glasgow.
 * All Rights Reserved.
 *
 * Contributor(s):
 *  Roi Blanco
 *  Craig Macdonald
 */

package org.terrier.structures.indexing.singlepass;

import java.io.IOException;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Map;
import java.util.Queue;

import org.terrier.indexing.Collection;
import org.terrier.indexing.Document;
import org.terrier.structures.BasicLexiconEntry;
import org.terrier.structures.DocumentIndexEntry;
import org.terrier.structures.FSOMapFileLexiconOutputStream;
import org.terrier.structures.FieldDocumentIndexEntry;
import org.terrier.structures.FieldLexiconEntry;
import org.terrier.structures.Index;
import org.terrier.structures.IndexOnDisk;
import org.terrier.structures.LexiconOutputStream;
import org.terrier.structures.SimpleDocumentIndexEntry;
import org.terrier.structures.indexing.CompressionFactory.BitCompressionConfiguration;
import org.terrier.structures.indexing.DocumentIndexBuilder;
import org.terrier.structures.indexing.DocumentPostingList;
import org.terrier.structures.indexing.classical.BasicIndexer;
import org.terrier.structures.postings.bit.BasicIterablePosting;
import org.terrier.structures.postings.bit.FieldIterablePosting;
import org.terrier.utility.ApplicationSetup;
import org.terrier.utility.ArrayUtils;
import org.terrier.utility.FieldScore;
import org.terrier.utility.Files;
import org.terrier.utility.MemoryChecker;
import org.terrier.utility.RuntimeMemoryChecker;
import org.terrier.utility.UnitUtils;
/**
 * This class indexes a document collection (skipping the direct file construction). It implements a single-pass
 * algorithm that operates in two phases:
 * <p>First, it traverses the document collection, passes the terms through the TermPipeline and builds an in-memory
 * representation of the posting lists. When it has exhausted the main memory, it flushes the sorted postings to disk,
 * along with the lexicon (collectively known as a run), and continues traversing the collection.
 * <p>The second phase merges the sorted runs (with their partial lexicons) on disk to create the final inverted file.
 * This class follows the template pattern, so the main bulk of the code is reused for block (and fields) indexing.
 * There are a few hook methods that choose the right classes to instantiate, depending on the indexing options defined.
 * <p>
 * Memory tracking is a key concern in this class. Properties are provided for controlling how much memory may be
 * consumed, how regularly memory is checked, and (optional) maximums on the amount of memory that can be used for
 * the postings, or on the number of documents that may be indexed before a flush is committed.
 * <p><b>Properties:</b>
 * <ul>
 * <li><tt>memory.reserved</tt> - free memory threshold before a run is committed.
 * Default is 50 000 000 (50MB) and 100 000 000 (100MB) for 32bit and 64bit JVMs respectively.</li>
 * <li><tt>memory.heap.usage</tt> - proportion of the maximum JVM heap that may be in use before a run is committed. Default 0.70.</li>
 * <li><tt>indexing.singlepass.max.postings.memory</tt> - maximum amount of memory that the postings can consume before a run is committed. Default is 0, meaning no limit.</li>
 * <li><tt>indexing.singlepass.max.documents.flush</tt> - maximum number of documents before a run is committed. Default is 0, meaning no limit.</li>
 * <li><tt>docs.check</tt> - how many documents are indexed between checks of the amount of free memory. Default is 20, i.e. check memory consumption every 20 documents.</li>
 * </ul>
 * @author Roi Blanco
 */
public class BasicSinglePassIndexer extends BasicIndexer {

    /** Current document Id */
    protected int currentId = 0;
    protected long maxMemory = 0;

    /** Memory Checker - provides the method for checking to see if
     * the system is running low on memory */
    protected MemoryChecker memoryCheck = null;

    /** Number of documents read per memory check */
    protected int docsPerCheck;
    protected int maxDocsPerFlush;

    /** Runtime system JVM running this instance of Terrier */
    protected static final Runtime runtime = Runtime.getRuntime();

    /** Number of documents read since the memory consumption was last checked */
    protected int numberOfDocsSinceCheck = 0;
    /** Number of documents read since the memory runs were last flushed to disk */
    protected int numberOfDocsSinceFlush = 0;
    /** Memory status after flush */
    protected long memoryAfterFlush = -1;
    /** Queue with the file names for the runs in disk */
    protected Queue<String[]> fileNames = new LinkedList<String[]>();
    /** Number of the current Run to be written in disk */
    protected int currentFile = 0;
    /** Structure that keeps the posting lists in memory */
    protected MemoryPostings mp;
    /** Structure for merging the run */
    protected RunsMerger merger;

    /** Number of documents indexed */
    protected int numberOfDocuments = 0;
    /** Number of tokens indexed */
    protected long numberOfTokens = 0;
    /** Number of unique terms indexed */
    protected int numberOfUniqueTerms = 0;
    /** Number of pointers indexed */
    protected long numberOfPointers = 0;

    /** what class should be used to read the generated inverted index? */
    protected String invertedIndexClass = org.terrier.structures.bit.BitPostingIndex.class.getName();
    protected String basicInvertedIndexPostingIteratorClass = BasicIterablePosting.class.getName();
    protected String fieldInvertedIndexPostingIteratorClass = FieldIterablePosting.class.getName();
    /** what class should be used to read the inverted index as a stream? */
    protected String invertedIndexInputStreamClass = org.terrier.structures.bit.BitPostingIndexInputStream.class.getName();

    /**
     * Constructs an instance of a BasicSinglePassIndexer, using the given path name
     * for storing the data structures.
     * @param pathname String the path where the data structures will be created. This is assumed to be absolute.
     * @param prefix String the prefix of the index, usually "data".
     */
    public BasicSinglePassIndexer(String pathname, String prefix) {
        super(pathname, prefix);
        // delay the execution of init() if we are a parent class
        if (this.getClass() == BasicSinglePassIndexer.class)
            init();
        if (! (this.compressionInvertedConfig instanceof BitCompressionConfiguration)) {
            throw new Error("Sorry, only default BitCompressionConfiguration is supported by "
                + this.getClass().getName()
                + " - you can recompress the inverted index later using IndexRecompressor");
        }
    }

    /** Protected do-nothing constructor for use by child classes */
    protected BasicSinglePassIndexer(long a, long b, long c) {
        super(a, b, c);
    }

    @Override
    public void createDirectIndex(Collection collection) {
        createInvertedIndex(collection);
    }

    public void indexDocuments(Iterator<Map.Entry<Map<String,String>, DocumentPostingList>> iterDocs) {
        logger.info("Creating IF (no direct file)..");
        final boolean FIELDS = (FieldScore.FIELDS_COUNT > 0);
        long startCollection, endCollection;
        fileNames = new LinkedList<String[]>();
        numberOfDocuments = currentId = numberOfDocsSinceCheck = numberOfDocsSinceFlush = numberOfUniqueTerms = 0;
        numberOfTokens = numberOfPointers = 0;
        createMemoryPostings();
        currentIndex = IndexOnDisk.createNewIndex(path, prefix);
        docIndexBuilder = new DocumentIndexBuilder(currentIndex, "document", FIELDS);
        metaBuilder = createMetaIndexBuilder();
        emptyDocIndexEntry = FIELDS ? new FieldDocumentIndexEntry(FieldScore.FIELDS_COUNT) : new SimpleDocumentIndexEntry();

        boolean stopIndexing = false;
        memoryAfterFlush = runtime.freeMemory();
        logger.debug("Starting free memory: "+memoryAfterFlush/1000000+"M");
        startCollection = System.currentTimeMillis();

        long numberOfTokens = 0;
        while (iterDocs.hasNext()) {
            Map.Entry<Map<String,String>, DocumentPostingList> me = iterDocs.next();
            if (me == null) {
                continue;
            }
            DocumentPostingList _termsInDocument = me.getValue();
            Map<String,String> props = me.getKey();
            try {
                if (_termsInDocument.getDocumentLength() == 0) {
                    /* this document is empty, add the minimum to the document index */
                    indexEmpty(props);
                    if (IndexEmptyDocuments) {
                        currentId++;
                        numberOfDocuments++;
                    }
                } else {
                    /* index this document */
                    numberOfTokens += numOfTokensInDocument;
                    indexDocument(props, _termsInDocument);
                }
            } catch (Exception ioe) {
                logger.error("Failed to index "+props.get("docno"), ioe);
                throw new RuntimeException(ioe);
            }
        }

        try {
            forceFlush();
            endCollection = System.currentTimeMillis();
            long partialTime = (endCollection-startCollection)/1000;
            logger.info("Collection took "+partialTime+" seconds to build the runs for "+numberOfDocuments+" documents\n");
            docIndexBuilder.finishedCollections();
            if (FieldScore.FIELDS_COUNT > 0) {
                currentIndex.addIndexStructure("document-factory", FieldDocumentIndexEntry.Factory.class.getName(),
                    "java.lang.String", "${index.inverted.fields.count}");
            } else {
                currentIndex.addIndexStructure("document-factory", SimpleDocumentIndexEntry.Factory.class.getName(), "", "");
            }
            currentIndex.setIndexProperty("termpipelines", ApplicationSetup.getProperty("termpipelines", "Stopwords,PorterStemmer"));
            metaBuilder.close();
            currentIndex.flush();
            logger.info("Merging "+fileNames.size()+" runs...");
            startCollection = System.currentTimeMillis();
            performMultiWayMerge();
            currentIndex.flush();
            endCollection = System.currentTimeMillis();
            logger.info("Collection took "+((endCollection-startCollection)/1000)+" seconds to merge\n ");
            if (emptyDocCount > 0)
                logger.warn("Indexed " + emptyDocCount + " empty documents");
        } catch (Exception e) {
            logger.error("Problem finishing index", e);
        }
        finishedInvertedIndexBuild();
    }

    /**
     * Builds the inverted file and lexicon file for the given collections.
     * Loops through each document in each of the collections,
     * extracting terms and pushing these through the Term Pipeline
     * (e.g. stemming, stopping, lowercase).
     * @param collection Collection the collection to be indexed.
     */
    public void createInvertedIndex(Collection collection) {
        long startCollection, endCollection;
        startCollection = System.currentTimeMillis();
        CollectionConsumer iterDocs = new CollectionConsumer(collection);
        indexDocuments(iterDocs);
        endCollection = System.currentTimeMillis();
        logger.info("Collection total time "+((endCollection-startCollection)/1000));
        long secs = ((endCollection-startCollection)/1000);
        if (secs > 3600)
            logger.info("Rate: "+((double)numberOfDocuments/((double)secs/3600.0d))+" docs/hour");
    }

    /** check to see if a flush is required, and perform if necessary */
    protected void checkFlush() throws IOException {
        if (numberOfDocsSinceCheck < docsPerCheck)
            return;
        numberOfDocsSinceCheck = 0;
        final long consumed = mp.getMemoryConsumption();
        boolean doFlush = false;
        final boolean memCheck = memoryCheck.checkMemory();
        String msg = null;
        logger.debug(msg = "Run "+currentFile+" maxAllowedMemory="+maxMemory+" consumed="+consumed
            +" maxDocsPerFlush="+maxDocsPerFlush+" numberOfDocsSinceFlush="+numberOfDocsSinceFlush
            +" memcheck="+memCheck);
        if (memCheck) {
            doFlush = true;
            msg += " (memory check threshold hit: " + memoryCheck.toString()+")";
        }
        if (maxDocsPerFlush > 0 && numberOfDocsSinceFlush >= maxDocsPerFlush) {
            msg += " (doc threshold hit)";
            doFlush = true;
        }
        if (maxMemory > 0 && consumed > maxMemory) {
            msg += " (posting memory threshold hit)";
            doFlush = true;
        }
        if (doFlush) {
            logger.info("Flush forced: " + msg);
            forceFlush();
        }
    }

    @edu.umd.cs.findbugs.annotations.SuppressWarnings(
        value="DM_GC",
        justification="Forcing GC is an essential part of releasing"
            + "memory for further indexing")
    /** causes the posting lists built up in memory to be flushed out */
    protected void forceFlush() throws IOException {
        mp.finish(finishMemoryPosting());
        System.gc();
        createMemoryPostings();
        memoryCheck.reset();
        numberOfDocsSinceFlush = 0;
    }

    /**
     * {@inheritDoc}.
     * This implementation only places content in the runs in memory, which will eventually be flushed to disk.
     */
    @Override
    protected void indexDocument(Map<String,String> docProperties, DocumentPostingList termsInDocument) throws Exception {
        if (termsInDocument.getDocumentLength() > 0) {
            numberOfDocsSinceCheck++;
            numberOfDocsSinceFlush++;
            checkFlush();
            mp.addTerms(termsInDocument, currentId);
            DocumentIndexEntry die = termsInDocument.getDocumentStatistics();
            docIndexBuilder.addEntryToBuffer((FieldScore.FIELDS_COUNT > 0) ? die : new SimpleDocumentIndexEntry(die));
            metaBuilder.writeDocumentEntry(docProperties);
            currentId++;
            numberOfDocuments++;
        }
    }

    /**
     * Adds the name of the current run + partial lexicon to be flushed in disk.
     * @return the two dimensional String[] array with the names of the run and partial lexicon to write.
     */
    protected String[] finishMemoryPosting() {
        String[] names = new String[2];
        names[0] = fileNameNoExtension + "Run."+(currentFile);
        names[1] = fileNameNoExtension + "Run."+(currentFile++)+".str";
        fileNames.add(names);
        return names;
    }

    /**
     * Uses the merger class to perform a k multiway merge
     * in a set of previously written runs.
     * The file names and the number of runs are given by the private queue.
     */
    public void performMultiWayMerge() throws IOException {
        String[][] _fileNames = getFileNames();
        this.currentIndex.setIndexProperty("max.term.length", ApplicationSetup.getProperty("max.term.length", ""+20));
        LexiconOutputStream<String> lexStream = new FSOMapFileLexiconOutputStream(this.currentIndex, "lexicon",
            (super.numFields > 0 ? FieldLexiconEntry.Factory.class : BasicLexiconEntry.Factory.class));
        try {
            if (useFieldInformation)
                createFieldRunMerger(_fileNames);
            else
                createRunMerger(_fileNames);
            merger.beginMerge(_fileNames.length, path + ApplicationSetup.FILE_SEPARATOR + prefix + ".inverted.bf");
            while (!merger.isDone()) {
                merger.mergeOne(lexStream);
            }
            merger.endMerge(lexStream);
            lexStream.close();
            //the constructor for FieldLexiconEntry is wrong - replace it
            if (super.numFields > 0) {
                this.currentIndex.addIndexStructure("lexicon-valuefactory",
                    FieldLexiconEntry.Factory.class.getName(), "java.lang.String", "${index.inverted.fields.count}");
            }
            numberOfUniqueTerms = merger.getNumberOfTerms();
            numberOfPointers = merger.getNumberOfPointers();
            // Delete the runs files
            for (int i = 0; i < _fileNames.length; i++) {
                Files.delete(_fileNames[i][0]);
                Files.delete(_fileNames[i][1]);
            }
            currentIndex.setIndexProperty("num.Terms", ""+numberOfUniqueTerms);
            currentIndex.setIndexProperty("num.Pointers", ""+numberOfPointers);
            currentIndex.setIndexProperty("num.Tokens", ""+numberOfTokens);
            currentIndex.addIndexStructure(
                "inverted",
                invertedIndexClass,
                "org.terrier.structures.IndexOnDisk,java.lang.String,org.terrier.structures.DocumentIndex,java.lang.Class",
                "index,structureName,document,"
                    + (FieldScore.FIELDS_COUNT > 0 ? fieldInvertedIndexPostingIteratorClass : basicInvertedIndexPostingIteratorClass));
            currentIndex.addIndexStructureInputStream(
                "inverted",
                invertedIndexInputStreamClass,
                "org.terrier.structures.IndexOnDisk,java.lang.String,java.util.Iterator,java.lang.Class",
                "index,structureName,lexicon-entry-inputstream,"
                    + (FieldScore.FIELDS_COUNT > 0 ? fieldInvertedIndexPostingIteratorClass : basicInvertedIndexPostingIteratorClass));
            currentIndex.setIndexProperty("index.inverted.fields.count", ""+FieldScore.FIELDS_COUNT);
            currentIndex.setIndexProperty("index.inverted.fields.names", ArrayUtils.join(FieldScore.FIELD_NAMES, ","));
        } catch (Exception e) {
            logger.error("Problem in performMultiWayMerge", e);
        }
    }

    /**
     * @return the String[][] structure with the name of the runs files and partial lexicons.
     */
    protected String[][] getFileNames() {
        String[][] files = new String[fileNames.size()][2];
        int i = 0;
        while (!fileNames.isEmpty()) {
            files[i++] = fileNames.poll();
        }
        return files;
    }

    /**
     * Hook method that creates a FieldRunMerger instance
     * @throws IOException if an I/O error occurs.
     */
    protected void createFieldRunMerger(String[][] files) throws Exception {
        merger = new RunsMerger(new FileRunIteratorFactory(files, FieldPostingInRun.class, super.numFields));
    }

    /**
     * Hook method that creates a RunsMerger instance
     * @throws IOException if an I/O error occurs.
     */
    protected void createRunMerger(String[][] files) throws Exception {
        merger = new RunsMerger(new FileRunIteratorFactory(files,
            useFieldInformation ? FieldPostingInRun.class : SimplePostingInRun.class, 0));
    }

    /**
     * Hook method that creates the right type of MemoryPostings class.
     */
    protected void createMemoryPostings() {
        if (useFieldInformation)
            mp = new FieldsMemoryPostings();
        else
            mp = new MemoryPostings();
    }

    @Override
    protected void load_indexer_properties() {
        super.load_indexer_properties();
        docsPerCheck = ApplicationSetup.DOCS_CHECK_SINGLEPASS;
        maxDocsPerFlush = Integer.parseInt(ApplicationSetup.getProperty("indexing.singlepass.max.documents.flush", "0"));
        memoryCheck = new RuntimeMemoryChecker();
        logger.info("Checking memory usage every " + docsPerCheck + " documents, maxDocsPerFlush=" + maxDocsPerFlush);
        MAX_DOCS_PER_BUILDER = UnitUtils.parseInt(ApplicationSetup.getProperty("indexing.max.docs.per.builder", "0"));
        maxMemory = UnitUtils.parseLong(ApplicationSetup.getProperty("indexing.singlepass.max.postings.memory", "0"));
    }
}
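
Usage sketch (not part of the source above): the indexing.singlepass.* properties listed in the class Javadoc are read through ApplicationSetup.getProperty() by the overridden load_indexer_properties(), so they need to be set before the indexer is constructed. In the sketch below, the index path, the conventional "data" prefix, the property values, and the way the Collection instance coll is obtained are illustrative assumptions, not something this file defines. Because createDirectIndex(Collection) simply delegates to createInvertedIndex(Collection), either entry point produces the same single-pass build.

import org.terrier.indexing.Collection;
import org.terrier.structures.indexing.singlepass.BasicSinglePassIndexer;
import org.terrier.utility.ApplicationSetup;

// Sketch only: the Collection instance is assumed to be created elsewhere
// (e.g. by a collection factory); the property names come from the Javadoc above.
class SinglePassIndexingExample {
    static void buildIndex(Collection coll, String indexPath) {
        // commit a run at most every 50,000 documents (0 = no limit)
        ApplicationSetup.setProperty("indexing.singlepass.max.documents.flush", "50000");
        // cap the in-memory postings at roughly 256MB before a run is flushed (0 = no limit)
        ApplicationSetup.setProperty("indexing.singlepass.max.postings.memory", "256000000");

        // the properties above are picked up when the indexer initialises
        BasicSinglePassIndexer indexer = new BasicSinglePassIndexer(indexPath, "data");
        // phase 1: traverse the collection, building and flushing runs;
        // phase 2: merge the runs into the final inverted file and lexicon
        indexer.createInvertedIndex(coll);
    }
}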



