/*
 * Terrier - Terabyte Retriever
 * Webpage: http://terrier.org
 * Contact: terrier{a.}dcs.gla.ac.uk
 * University of Glasgow - School of Computing Science
 * http://www.gla.ac.uk
 *
 * The contents of this file are subject to the Mozilla Public License
 * Version 1.1 (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS"
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
 * the License for the specific language governing rights and limitations
 * under the License.
 *
 * The Original Code is ThreadedBatchIndexing.java.
 *
 * The Original Code is Copyright (C) 2004-2020 the University of Glasgow.
 * All Rights Reserved.
 *
 * Contributor(s):
 *   Craig Macdonald 
 */
package org.terrier.applications;

import java.util.List;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.BinaryOperator;
import java.util.function.Function;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.terrier.indexing.Collection;
import org.terrier.indexing.CollectionFactory;
import org.terrier.structures.Index;
import org.terrier.structures.IndexOnDisk;
import org.terrier.structures.IndexUtil;
import org.terrier.structures.merging.BlockStructureMerger;
import org.terrier.structures.merging.StructureMerger;
import org.terrier.utility.ApplicationSetup;
import org.terrier.utility.TagSet;
/** An implementation of BatchIndexing that uses Java 8 parallel streams to
 * increase indexing speed on multi-core machines.
 * @author Craig Macdonald
 * @since 4.2
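 * <p>A minimal usage sketch (the index path and prefix below are illustrative,
 * not prescribed by Terrier):
 * <pre>{@code
 * // classical (non-singlepass) indexing of collection.spec, limited to 4 threads
 * ThreadedBatchIndexing indexing = new ThreadedBatchIndexing("/path/to/index", "data", false, 4);
 * indexing.index();
 * }</pre>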
 */
public class ThreadedBatchIndexing extends BatchIndexing {
	
	/** The logger used */
	protected static Logger logger = LoggerFactory.getLogger(ThreadedBatchIndexing.class);
	
	final boolean singlePass;
	int maxThreads = -1;
	
	public ThreadedBatchIndexing(String _path, String _prefix, boolean _singlePass) {
		super(_path, _prefix);
		singlePass = _singlePass;
	}
	
	public ThreadedBatchIndexing(String _path, String _prefix, boolean _singlePass, int threads) {
		super(_path, _prefix);
		singlePass = _singlePass;
		this.maxThreads = threads;
	}
	
	/** Set the maximum number of threads to use. -1 means no limit (the common pool's parallelism is used). */
	public void setMaxThreads(int threads)
	{
		this.maxThreads = threads;
	}

	@Override
	public void index()
	{	
		try{
			final long starttime = System.currentTimeMillis();
			final AtomicInteger indexCounter = new AtomicInteger();
			final AtomicInteger mergeCounter = new AtomicInteger();			
			
			final int threadCount = this.maxThreads == -1 ? ForkJoinPool.commonPool().getParallelism() : this.maxThreads;
			logger.info("Started " + this.getClass().getSimpleName() + " with parallelism " + threadCount);
			if (singlePass)
			{
				int reservationFactor = Math.min(threadCount, 10);
				logger.warn("Multi-threaded singlepass indexing is experimental - caution advised due to threads competing for available memory! YMMV.");
				logger.info("Memory reserved was " + ApplicationSetup.MEMORY_THRESHOLD_SINGLEPASS);
				logger.info("Increasing reserved memory for singlepass by factor of "+ reservationFactor);
				ApplicationSetup.MEMORY_THRESHOLD_SINGLEPASS *= reservationFactor;
				logger.info("Memory reserved is now "+ApplicationSetup.MEMORY_THRESHOLD_SINGLEPASS);
			}
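			// Split the input (explicit collection files if provided, otherwise the
			// collection.spec entries) into at most threadCount partitions; each
			// partition is indexed independently below.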
			final List<List<String>> partitioned = collectionFiles.size() > 0
				? CollectionFactory.splitList(super.collectionFiles, threadCount)
				: CollectionFactory.splitCollectionSpecFileList(super.collectionSpec, threadCount);
			logger.info("Partitioned collection.spec into "+ partitioned.size() + " partitions");
			if (partitioned.size() == 1)
			{
				Collection c = loadCollection(partitioned.get(0));
				BatchIndexing indexing = singlePass 
						? new TRECIndexingSinglePass(path, prefix, c)
						: new TRECIndexing(path, prefix, c);
				indexing.setExternalParalllism(1);
				indexing.blocks = blocks;
				indexing.index();
				logger.info("Final index is at "+path+" " + prefix);
				return;
			}

			IndexOnDisk.setIndexLoadingProfileAsRetrieval(false);
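			// Map step: index each partition into its own temporary index,
			// named prefix_stream0, prefix_stream1, ..., and return that prefix.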
			Function<List<String>,String> indexer = new Function<List<String>,String>()
			{
				@Override
				public String apply(List<String> files) {
					String thisPrefix = prefix + "_stream"+indexCounter.getAndIncrement();
					Collection c = loadCollection(files);
					BatchIndexing indexing = singlePass 
							? new TRECIndexingSinglePass(path, thisPrefix, c)
							: new TRECIndexing(path, thisPrefix, c);
					indexing.setExternalParalllism(threadCount);
					indexing.blocks = blocks;
					indexing.index();
					return thisPrefix;
				}	
			};
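			// Reduce step: pairwise-merge two temporary indices into a new index
			// (prefix_mergeN), deleting the inputs; empty inputs are discarded.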
			BinaryOperator<String> merger = (String t, String u) -> {
				logger.debug("Reduce " + t+ " " + u);
				try {			
					if (t == null && u == null)
						return null;
					if (t == null)
						return u;
					if (u == null)
						return t;
					Index.setIndexLoadingProfileAsRetrieval(false);
					IndexOnDisk src1 = IndexOnDisk.createIndex(path, t);
					IndexOnDisk src2 = IndexOnDisk.createIndex(path, u);
					int doc1 = src1.getCollectionStatistics().getNumberOfDocuments();
					int doc2 = src2.getCollectionStatistics().getNumberOfDocuments();
					if (doc1 > 0 && doc2 == 0)
					{
						logger.warn("Unusually, index " + u + " did not contain any documents");
						IndexUtil.deleteIndex(path, u);
						return t;
					} else if (doc1 == 0 && doc2 > 0 ) {
						logger.warn("Unusually, index " + t + " did not contain any documents");
						IndexUtil.deleteIndex(path, t);
						return u;
					} else if (doc1 == 0 && doc2 == 0) {
						logger.warn("Very unusually, index " + t + " and index " + u + " did not contain any documents");
						IndexUtil.deleteIndex(path, t);
						IndexUtil.deleteIndex(path, u);
						return null;
					}
					
					String thisPrefix = prefix + "_merge"+mergeCounter.getAndIncrement();
					logger.debug("Target prefix for this merge is " + thisPrefix);
					IndexOnDisk newIndex = IndexOnDisk.createNewIndex(path, thisPrefix);
					if (blocks)
						new BlockStructureMerger(src1, src2, newIndex).mergeStructures();
					else
						new StructureMerger(src1, src2, newIndex).mergeStructures();
					
					src1.close();
					src2.close();
					newIndex.close();
					//TODO: could index deletion occur in parallel?
					IndexUtil.deleteIndex(path, t);
					IndexUtil.deleteIndex(path, u);
					logger.debug("New index from " + t+ " and "+ u + " is " + thisPrefix);
					return thisPrefix;
				} catch (Throwable e) {
					throw new RuntimeException(e);
				}
					
			};
			
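			// Run the map/reduce on the common pool, or on a dedicated pool when a
			// thread limit has been requested via setMaxThreads().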
			ForkJoinPool forkPool = this.maxThreads == -1 
					? ForkJoinPool.commonPool()
					: new ForkJoinPool(this.maxThreads);
			String tmpPrefix = forkPool.submit(() -> partitioned.parallelStream().map(indexer).reduce(merger).get()).get();
			if (tmpPrefix == null)
			{
				logger.warn("No index created -- all partitions were empty");
				return;
			}
			IndexUtil.renameIndex(path, tmpPrefix, path, prefix);
			logger.info("Parallel indexing completed after " 
				+ (System.currentTimeMillis() - starttime)/1000 + " seconds, using " 
				+ threadCount + " threads");
			logger.info("Final index is at "+path+" " + prefix);
		} catch (Throwable e) {
			logger.error("Problem occurred during parallel indexing", e);
		}
	}

}