// eu.stratosphere.nephele.jobmanager.splitassigner.file.FileInputSplitList (Maven / Gradle / Ivy)
/***********************************************************************************************************************
* Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu)
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
* an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
* specific language governing permissions and limitations under the License.
**********************************************************************************************************************/
package eu.stratosphere.nephele.jobmanager.splitassigner.file;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.Queue;
import java.util.Set;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import eu.stratosphere.core.fs.FileInputSplit;
import eu.stratosphere.nephele.instance.AbstractInstance;
/**
* The file input split list stores the file input splits for an input vertex that are still expected to be consumed.
* Besides simply storing the splits, the file input split list also computes the distance all {@link AbstractInstance}
* objects which request a input split and its nearest storage location with respect to the underlying network topology.
* That way input splits are always given to consuming vertices in a way that data locality is preserved as well as
* possible.
*
* This class is not thread-safe.
*
*/
public final class FileInputSplitList {
/**
* The logging object which is used to report information and errors.
*/
private static final Log LOG = LogFactory.getLog(FileInputSplitList.class);
/**
* The set containing all the file input splits that still must be consumed.
*/
private Set masterSet = new HashSet();
/**
* The map caching the specific file input split lists for each {@link AbstractInstance}.
*/
private Map> instanceMap = new HashMap>();
/**
* This is an auxiliary class to store the minimum distance between a file input split's storage locations and an
* {@link AbstractInstance}.
*
*/
private final class QueueElem implements Comparable {
/**
* The file input split the distance applies to.
*/
final FileInputSplit inputSplit;
/**
* The minimum distance between the file input split's storage locations and the instance this object has been
* created for.
*/
final int distance;
/**
* Creates a new queue element.
*
* @param inputSplit
* the file input split to be stored
* @param distance
* the minimum distance between the stored input split's storage locations and the instance this object
* has been created for
*/
private QueueElem(final FileInputSplit inputSplit, final int distance) {
this.inputSplit = inputSplit;
this.distance = distance;
}
/**
* Returns the file input split stored within this object.
*
* @return the file input split
*/
private FileInputSplit getInputSplit() {
return this.inputSplit;
}
/**
* {@inheritDoc}
*/
@Override
public int compareTo(final QueueElem o) {
return (this.distance - o.distance);
}
}
/**
* Adds the given file input split to the set of file input splits to be consumed.
*
* @param fileInputSplit
* the file input split to be added
*/
synchronized void addSplit(final FileInputSplit fileInputSplit) {
this.masterSet.add(fileInputSplit);
}
/**
* Returns the next file input split to be consumed by the given instance. The returned input split is selected in a
* way that the distance between the split's storage location and the requesting {@link AbstractInstance} is as
* short as possible.
*
* @param instance
* the instance requesting the next file input split
* @return the next input split to be consumed by the given instance or null
if all input splits have
* already been consumed.
*/
synchronized FileInputSplit getNextInputSplit(final AbstractInstance instance) {
final Queue instanceSplitList = getInstanceSplitList(instance);
while (true) {
final QueueElem candidate = instanceSplitList.poll();
if (candidate == null) {
return null;
}
if (this.masterSet.remove(candidate.getInputSplit())) {
if (LOG.isInfoEnabled()) {
if (candidate.distance == 0) {
LOG.info(instance + " receives local file input split");
} else {
LOG.info(instance + " receives remote file input split (distance " + candidate.distance + ")");
}
}
return candidate.getInputSplit();
}
if (this.masterSet.isEmpty()) {
return null;
}
}
}
/**
* Returns a list of file input splits specifically ordered for the given {@link AbstractInstance}. When the list is
* initially created, it contains all the unconsumed file input splits at that point in time, ascendingly ordered by
* the minimum distance between the input splits' storage locations and the given {@link AbstractInstance}.
*
* @param instance
* the instance for which the file input split list has been computed
* @return the list of file input splits ordered specifically for the given instance
*/
private Queue getInstanceSplitList(final AbstractInstance instance) {
Queue instanceSplitList = this.instanceMap.get(instance);
if (instanceSplitList == null) {
// Create and populate instance specific split list
instanceSplitList = new PriorityQueue();
final Iterator it = this.masterSet.iterator();
while (it.hasNext()) {
final FileInputSplit split = it.next();
final String[] hostNames = split.getHostNames();
if (hostNames == null) {
instanceSplitList.add(new QueueElem(split, Integer.MAX_VALUE));
} else {
int minDistance = Integer.MAX_VALUE;
for (int i = 0; i < hostNames.length; ++i) {
final int distance = instance.getDistance(hostNames[i]);
if (LOG.isDebugEnabled()) {
LOG.debug("Distance between " + instance + " and " + hostNames[i] + " is " + distance);
}
if (distance < minDistance) {
minDistance = distance;
}
}
instanceSplitList.add(new QueueElem(split, minDistance));
}
}
this.instanceMap.put(instance, instanceSplitList);
}
return instanceSplitList;
}
}