
// org.terrier.applications.ThreadedBatchIndexing Maven / Gradle / Ivy
// The newest version!
/*
* Terrier - Terabyte Retriever
* Webpage: http://terrier.org
* Contact: terrier{a.}dcs.gla.ac.uk
* University of Glasgow - School of Computing Science
* http://www.gla.ac.uk
*
* The contents of this file are subject to the Mozilla Public License
* Version 1.1 (the "License"); you may not use this file except in
* compliance with the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS"
* basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
* the License for the specific language governing rights and limitations
* under the License.
*
* The Original Code is ThreadedBatchIndexing.java.
*
* The Original Code is Copyright (C) 2004-2020 the University of Glasgow.
* All Rights Reserved.
*
* Contributor(s):
* Craig Macdonald
*/
package org.terrier.applications;
import java.util.List;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.BinaryOperator;
import java.util.function.Function;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.terrier.indexing.Collection;
import org.terrier.indexing.CollectionFactory;
import org.terrier.structures.Index;
import org.terrier.structures.IndexOnDisk;
import org.terrier.structures.IndexUtil;
import org.terrier.structures.merging.BlockStructureMerger;
import org.terrier.structures.merging.StructureMerger;
import org.terrier.utility.ApplicationSetup;
import org.terrier.utility.TagSet;
/** An implementation of BatchIndexing that uses Java 8 parallel streams to
* increase indexing speed on multi-core machines.
* @author Craig Macdonald
* @since 4.2
*/
public class ThreadedBatchIndexing extends BatchIndexing {
/** The logger used */
protected static Logger logger = LoggerFactory.getLogger(ThreadedBatchIndexing.class);
final boolean singlePass;
int maxThreads = -1;
public ThreadedBatchIndexing(String _path, String _prefix, boolean _singlePass) {
super(_path, _prefix);
singlePass = _singlePass;
}
public ThreadedBatchIndexing(String _path, String _prefix, boolean _singlePass, int threads) {
super(_path, _prefix);
singlePass = _singlePass;
this.maxThreads = threads;
}
/** Define maximum number of threads in use. -1 for no limit. */
public void setMaxThreads(int threads)
{
this.maxThreads = threads;
}
@Override
public void index()
{
try{
final long starttime = System.currentTimeMillis();
final AtomicInteger indexCounter = new AtomicInteger();
final AtomicInteger mergeCounter = new AtomicInteger();
final int threadCount = this.maxThreads == -1 ? ForkJoinPool.commonPool().getParallelism() : this.maxThreads;
logger.info("Started " + this.getClass().getSimpleName() + " with parallelism " + threadCount);
if (singlePass)
{
int reservationFactor = Math.min(threadCount, 10);
logger.warn("Multi-threaded singlepass indexing is experimental - caution advised due to threads competing for available memory! YMMV.");
logger.info("Memory reserved was " + ApplicationSetup.MEMORY_THRESHOLD_SINGLEPASS);
logger.info("Increasing reserved memory for singlepass by factor of "+ reservationFactor);
ApplicationSetup.MEMORY_THRESHOLD_SINGLEPASS *= reservationFactor;
logger.info("Memory reserved is now "+ApplicationSetup.MEMORY_THRESHOLD_SINGLEPASS);
}
final List> partitioned = collectionFiles.size() > 0
? CollectionFactory.splitList(super.collectionFiles, threadCount)
: CollectionFactory.splitCollectionSpecFileList(super.collectionSpec, threadCount);
logger.info("Partitioned collection.spec into "+ partitioned.size() + " partitions");
if (partitioned.size() == 1)
{
Collection c = loadCollection(partitioned.get(0));
BatchIndexing indexing = singlePass
? new TRECIndexingSinglePass(path, prefix, c)
: new TRECIndexing(path, prefix, c);
indexing.setExternalParalllism(1);
indexing.blocks = blocks;
indexing.index();
logger.info("Final index is at "+path+" " + prefix);
return;
}
IndexOnDisk.setIndexLoadingProfileAsRetrieval(false);
Function,String> indexer = new Function,String>()
{
@Override
public String apply(List files) {
String thisPrefix = prefix + "_stream"+indexCounter.getAndIncrement();
Collection c = loadCollection(files);
BatchIndexing indexing = singlePass
? new TRECIndexingSinglePass(path, thisPrefix, c)
: new TRECIndexing(path, thisPrefix, c);
indexing.setExternalParalllism(threadCount);
indexing.blocks = blocks;
indexing.index();
return thisPrefix;
}
};
BinaryOperator merger = (String t, String u) -> {
logger.debug("Reduce " + t+ " " + u);
try {
if (t == null && u == null)
return null;
if (t == null)
return u;
if (u == null)
return t;
Index.setIndexLoadingProfileAsRetrieval(false);
IndexOnDisk src1 = IndexOnDisk.createIndex(path, t);
IndexOnDisk src2 = IndexOnDisk.createIndex(path, u);
int doc1 = src1.getCollectionStatistics().getNumberOfDocuments();
int doc2 = src2.getCollectionStatistics().getNumberOfDocuments();
if (doc1 > 0 && doc2 == 0)
{
logger.warn("Unusually, index " + u + " did not contain any documents");
IndexUtil.deleteIndex(path, u);
return t;
} else if (doc1 == 0 && doc2 > 0 ) {
logger.warn("Unusually, index " + t + " did not contain any documents");
IndexUtil.deleteIndex(path, t);
return u;
} else if (doc1 == 0 && doc2 == 0) {
logger.warn("Very unusually, index " + t + " and index " + u + " did not contain any documents");
IndexUtil.deleteIndex(path, t);
IndexUtil.deleteIndex(path, u);
return null;
}
String thisPrefix = prefix + "_merge"+mergeCounter.getAndIncrement();
logger.debug("Target prefix for this merge is " + thisPrefix);
IndexOnDisk newIndex = IndexOnDisk.createNewIndex(path, thisPrefix);
if (blocks)
new BlockStructureMerger(src1, src2, newIndex).mergeStructures();
else
new StructureMerger(src1, src2, newIndex).mergeStructures();
src1.close();
src2.close();
newIndex.close();
//TODO: could index deletion occur in parallel
IndexUtil.deleteIndex(path, t);
IndexUtil.deleteIndex(path, u);
logger.debug("New index from " + t+ " and "+ u + " is " + thisPrefix);
return thisPrefix;
} catch (Throwable e) {
throw new RuntimeException(e);
}
};
ForkJoinPool forkPool = this.maxThreads == -1
? ForkJoinPool.commonPool()
: new ForkJoinPool(this.maxThreads);
String tmpPrefix = forkPool.submit(() -> partitioned.parallelStream().map(indexer).reduce(merger).get()).get();
if (tmpPrefix == null)
{
logger.warn("No index created -- all partitions were empty");
return;
}
IndexUtil.renameIndex(path, tmpPrefix, path, prefix);
logger.info("Parallel indexing completed after "
+ (System.currentTimeMillis() - starttime)/1000 + " seconds, using "
+ threadCount + " threads");
logger.info("Final index is at "+path+" " + prefix);
} catch (Throwable e) {
logger.error("Problem occurred during parallel indexing", e);
}
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy