All downloads are free. Search and download functionality uses the official Maven repository.

org.infinispan.distexec.mapreduce.MapReduceManagerImpl Maven / Gradle / Ivy

There is a newer version: 9.1.7.Final
Show newest version
package org.infinispan.distexec.mapreduce;

import org.infinispan.Cache;
import org.infinispan.atomic.Delta;
import org.infinispan.atomic.DeltaAware;
import org.infinispan.commands.read.MapCombineCommand;
import org.infinispan.commands.read.ReduceCommand;
import org.infinispan.commons.CacheException;
import org.infinispan.commons.marshall.AbstractExternalizer;
import org.infinispan.commons.util.CollectionFactory;
import org.infinispan.commons.util.Util;
import org.infinispan.configuration.cache.Configuration;
import org.infinispan.container.DataContainer;
import org.infinispan.container.entries.InternalCacheEntry;
import org.infinispan.context.Flag;
import org.infinispan.distexec.mapreduce.spi.MapReduceTaskLifecycleService;
import org.infinispan.distribution.DistributionManager;
import org.infinispan.factories.annotations.ComponentName;
import org.infinispan.factories.annotations.Inject;
import org.infinispan.filter.CollectionKeyFilter;
import org.infinispan.filter.CompositeKeyFilter;
import org.infinispan.filter.KeyFilter;
import org.infinispan.interceptors.locking.ClusteringDependentLogic;
import org.infinispan.persistence.manager.PersistenceManager;
import org.infinispan.persistence.PrimaryOwnerFilter;
import org.infinispan.persistence.spi.AdvancedCacheLoader;
import org.infinispan.persistence.spi.AdvancedCacheLoader.TaskContext;
import org.infinispan.manager.EmbeddedCacheManager;
import org.infinispan.marshall.core.Ids;
import org.infinispan.marshall.core.MarshalledEntry;
import org.infinispan.marshall.core.MarshalledValue;
import org.infinispan.remoting.transport.Address;
import org.infinispan.util.TimeService;
import org.infinispan.util.logging.Log;
import org.infinispan.util.logging.LogFactory;

import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectOutput;
import java.io.Serializable;
import java.util.*;
import java.util.Map.Entry;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.BiConsumer;

import static org.infinispan.factories.KnownComponentNames.ASYNC_TRANSPORT_EXECUTOR;

/**
 * Default implementation of {@link MapReduceManager}.
 * 

* * * This is an internal class, not intended to be used by clients. * @author Vladimir Blagojevic * @since 5.2 */ public class MapReduceManagerImpl implements MapReduceManager { private static final Log log = LogFactory.getLog(MapReduceManagerImpl.class); private static final boolean trace = log.isTraceEnabled(); private ClusteringDependentLogic cdl; private EmbeddedCacheManager cacheManager; private PersistenceManager persistenceManager; private ExecutorService executorService; private TimeService timeService; private int chunkSize; MapReduceManagerImpl() { } @Inject public void init(EmbeddedCacheManager cacheManager, PersistenceManager persistenceManager, @ComponentName(ASYNC_TRANSPORT_EXECUTOR) ExecutorService asyncTransportExecutor, ClusteringDependentLogic cdl, TimeService timeService, Configuration configuration) { this.cacheManager = cacheManager; this.persistenceManager = persistenceManager; this.cdl = cdl; this.executorService = asyncTransportExecutor; this.timeService = timeService; this.chunkSize = configuration.clustering().stateTransfer().chunkSize(); } @Override public ExecutorService getExecutorService() { return executorService; } @Override public Map> mapAndCombineForLocalReduction( MapCombineCommand mcc) throws InterruptedException { CollectableCollector collector = map(mcc); combine(mcc, collector); return collector.collectedValues(); } @Override public Set mapAndCombineForDistributedReduction( MapCombineCommand mcc) throws InterruptedException { try { return mapAndCombine(mcc); } catch (Exception e) { throw new CacheException(e); } } @Override public Map reduce(ReduceCommand reduceCommand) throws InterruptedException { final Map result = CollectionFactory.makeConcurrentMap(256); reduce(reduceCommand, result); return result; } @Override public void reduce(ReduceCommand reduceCommand, String resultCache) throws InterruptedException{ Cache cache = cacheManager.getCache(resultCache); reduce(reduceCommand, cache); } protected void 
reduce(ReduceCommand reduceCommand, final Map result) throws InterruptedException { final Set keys = reduceCommand.getKeys(); final String taskId = reduceCommand.getTaskId(); boolean noInputKeys = keys == null || keys.isEmpty(); if (noInputKeys) { //illegal state, raise exception throw new IllegalStateException("Reduce phase of MapReduceTask " + taskId + " on node " + cdl.getAddress() + " executed with empty input keys"); } else { final Reducer reducer = reduceCommand.getReducer(); final boolean sharedTmpCacheUsed = reduceCommand.isUseIntermediateSharedCache(); MapReduceTaskLifecycleService taskLifecycleService = MapReduceTaskLifecycleService.getInstance(); log.tracef("For m/r task %s invoking %s at %s", taskId, reduceCommand, cdl.getAddress()); long start = trace ? timeService.time() : 0; try { Cache, List> cache = cacheManager.getCache(reduceCommand.getCacheName()); taskLifecycleService.onPreExecute(reducer, cache); KeyFilter> filter = new IntermediateKeyFilter(taskId, !sharedTmpCacheUsed); //iterate all tmp cache entries in memory, do it in parallel DataContainer, List> dc = cache.getAdvancedCache().getDataContainer(); dc.executeTask(filter, new DataContainerTask, List>() { @Override public void accept(IntermediateKey k, InternalCacheEntry, List> v) { KOut key = k.getKey(); //resolve Iterable for iterated key stored in tmp cache Iterable value = getValue(v); if (value == null) { throw new IllegalStateException("Found invalid value in intermediate cache, for key " + key + " during reduce phase execution on " + cacheManager.getAddress() + " for M/R task " + taskId); } // and reduce it VOut reduced = reducer.reduce(key, value.iterator()); result.put(key, reduced); log.tracef("For m/r task %s reduced %s to %s at %s ", taskId, key, reduced, cdl.getAddress()); } }); } finally { if (trace) { log.tracef("Reduce for task %s took %s milliseconds", reduceCommand.getTaskId(), timeService.timeDuration(start, TimeUnit.MILLISECONDS)); } 
taskLifecycleService.onPostExecute(reducer); } } } @SuppressWarnings("unchecked") protected CollectableCollector map( MapCombineCommand mcc) throws InterruptedException { final Cache cache = cacheManager.getCache(mcc.getCacheName()); Set keys = mcc.getKeys(); int maxCSize = mcc.getMaxCollectorSize(); final Mapper mapper = mcc.getMapper(); final boolean inputKeysSpecified = keys != null && !keys.isEmpty(); // hook map function into lifecycle and execute it MapReduceTaskLifecycleService taskLifecycleService = MapReduceTaskLifecycleService.getInstance(); final CollectableCollector collector = new SynchronizedCollector( new DefaultCollector(mcc, maxCSize)); DataContainer dc = cache.getAdvancedCache().getDataContainer(); log.tracef("For m/r task %s invoking %s with input keys %s", mcc.getTaskId(), mcc, keys); long start = trace ? timeService.time() : 0; try { taskLifecycleService.onPreExecute(mapper, cache); //User specified input taks keys, most likely a short list of input keys (<10^3), iterate serially if (inputKeysSpecified) { for (KIn key : keys) { VIn value = cache.get(key); if (value != null) { mapper.map(key, value, collector); } } } else { // here we have to iterate all entries in memory, do it in parallel dc.executeTask(new PrimaryOwnerFilter(cdl), new DataContainerTask() { @Override public void accept(KIn key , InternalCacheEntry v) { VIn value = getValue(v); if (value != null) { mapper.map(key, value, collector); } } }); } // in case we have stores, we have to process key/values from there as well if (persistenceManager != null && !inputKeysSpecified) { KeyFilter keyFilter = new CompositeKeyFilter(new PrimaryOwnerFilter(cdl), new CollectionKeyFilter(dc.keySet())); persistenceManager.processOnAllStores(keyFilter, new MapReduceCacheLoaderTask(mapper, collector), true, false); } } finally { if (trace) { log.tracef("Map phase for task %s took %s milliseconds", mcc.getTaskId(), timeService.timeDuration(start, TimeUnit.MILLISECONDS)); } 
taskLifecycleService.onPostExecute(mapper); } return collector; } @SuppressWarnings("unchecked") protected Set mapAndCombine(final MapCombineCommand mcc) throws Exception { final Cache cache = cacheManager.getCache(mcc.getCacheName()); Set keys = mcc.getKeys(); int maxCSize = mcc.getMaxCollectorSize(); final Mapper mapper = mcc.getMapper(); final boolean inputKeysSpecified = keys != null && !keys.isEmpty(); // hook map function into lifecycle and execute it MapReduceTaskLifecycleService taskLifecycleService = MapReduceTaskLifecycleService.getInstance(); DataContainer dc = cache.getAdvancedCache().getDataContainer(); log.tracef("For m/r task %s invoking %s with input keys %s", mcc.getTaskId(), mcc, mcc.getKeys()); long start = trace ? timeService.time() : 0; final Set intermediateKeys = new HashSet(); try { taskLifecycleService.onPreExecute(mapper, cache); if (inputKeysSpecified) { DefaultCollector c = new DefaultCollector(mcc, maxCSize); for (KIn key : keys) { VIn value = cache.get(key); if (value != null) { mapper.map(key, value, c); } } combine(mcc, c); Set s = migrateIntermediateKeysAndValues(mcc, c.collectedValues()); intermediateKeys.addAll(s); } else { MapCombineTask task = new MapCombineTask(mcc, maxCSize); dc.executeTask(new PrimaryOwnerFilter(cdl), task); intermediateKeys.addAll(task.getMigratedIntermediateKeys()); //the last chunk of remaining keys/values to migrate Map> combinedValues = task.collectedValues(); Set lastOne = migrateIntermediateKeysAndValues(mcc, combinedValues); intermediateKeys.addAll(lastOne); } // in case we have stores, we have to process key/values from there as well if (persistenceManager != null && !inputKeysSpecified) { KeyFilter keyFilter = new CompositeKeyFilter(new PrimaryOwnerFilter(cdl), new CollectionKeyFilter(dc.keySet())); MapCombineTask task = new MapCombineTask(mcc, maxCSize); persistenceManager.processOnAllStores(keyFilter, task, true, false); intermediateKeys.addAll(task.getMigratedIntermediateKeys()); //the last chunk 
of remaining keys/values to migrate Map> combinedValues = task.collectedValues(); Set lastOne = migrateIntermediateKeysAndValues(mcc, combinedValues); intermediateKeys.addAll(lastOne); } } finally { if (trace) { log.tracef("Map phase for task %s took %s milliseconds", mcc.getTaskId(), timeService.timeDuration(start, TimeUnit.MILLISECONDS)); } taskLifecycleService.onPostExecute(mapper); } return intermediateKeys; } protected void combine(MapCombineCommand mcc, CollectableCollector c) { if (mcc.hasCombiner()) { Reducer combiner = mcc.getCombiner(); Cache cache = cacheManager.getCache(mcc.getCacheName()); log.tracef("For m/r task %s invoking combiner %s at %s", mcc.getTaskId(), mcc, cdl.getAddress()); MapReduceTaskLifecycleService taskLifecycleService = MapReduceTaskLifecycleService.getInstance(); long start = trace ? timeService.time() : 0; try { taskLifecycleService.onPreExecute(combiner, cache); for (Entry> e : c.collectedValues().entrySet()) { List mapped = e.getValue(); if (mapped.size() > 1) { VOut reduced = combiner.reduce(e.getKey(), mapped.iterator()); c.emitReduced(e.getKey(), reduced); } } } finally { if (trace) { log.tracef("Combine for task %s took %s milliseconds", mcc.getTaskId(), timeService.timeDuration(start, TimeUnit.MILLISECONDS)); } taskLifecycleService.onPostExecute(combiner); } } } private Set migrateIntermediateKeysAndValues( MapCombineCommand mcc, Map> collectedValues) { String taskId = mcc.getTaskId(); String tmpCacheName = mcc.getIntermediateCacheName(); Cache, DeltaList> tmpCache = cacheManager.getCache(tmpCacheName); if (tmpCache == null) { throw new IllegalStateException("Temporary cache for MapReduceTask " + taskId + " named " + tmpCacheName + " not found on " + cdl.getAddress()); } Set mapPhaseKeys = new HashSet(); DistributionManager dm = tmpCache.getAdvancedCache().getDistributionManager(); Map> keysToNodes = mapKeysToNodes(dm, taskId, collectedValues.keySet()); long start = log.isTraceEnabled() ? 
timeService.time() : 0; tmpCache = tmpCache.getAdvancedCache().withFlags(Flag.IGNORE_RETURN_VALUES); try { for (Entry> entry : keysToNodes.entrySet()) { List keysHashedToAddress = entry.getValue(); try { log.tracef("For m/r task %s migrating intermediate keys %s to %s", taskId, keysHashedToAddress, entry.getKey()); for (KOut key : keysHashedToAddress) { List values = collectedValues.get(key); int entryTransferCount = chunkSize; for (int i = 0; i < values.size(); i += entryTransferCount) { List chunk = values.subList(i, Math.min(values.size(), i + entryTransferCount)); DeltaList delta = new DeltaList(chunk); tmpCache.put(new IntermediateKey(taskId, key), delta); } mapPhaseKeys.add(key); } } catch (Exception e) { throw new CacheException("Could not move intermediate keys/values for M/R task " + taskId, e); } } } finally { if (trace) { log.tracef("Migrating keys for task %s took %s milliseconds (Migrated %s keys)", mcc.getTaskId(), timeService.timeDuration(start, TimeUnit.MILLISECONDS), mapPhaseKeys.size()); } } return mapPhaseKeys; } @Override public Map> mapKeysToNodes(DistributionManager dm, String taskId, Collection keysToMap) { Map> addressToKey = new HashMap>(); for (T key : keysToMap) { Address ownerOfKey = dm.getPrimaryLocation(new IntermediateKey(taskId, key)); List keysAtNode = addressToKey.get(ownerOfKey); if (keysAtNode == null) { keysAtNode = new ArrayList(); addressToKey.put(ownerOfKey, keysAtNode); } keysAtNode.add(key); } return addressToKey; } protected Set filterLocalPrimaryOwner(Set nodeLocalKeys, DistributionManager dm) { Set selectedKeys = new HashSet(); for (KIn key : nodeLocalKeys) { Address primaryLocation = dm != null ? 
dm.getPrimaryLocation(key) : cdl.getAddress(); if (primaryLocation != null && primaryLocation.equals(cdl.getAddress())) { selectedKeys.add(key); } } return selectedKeys; } private abstract class DataContainerTask implements BiConsumer> { @SuppressWarnings("unchecked") protected V getValue(InternalCacheEntry entry){ if (entry != null && !entry.isExpired(timeService.wallClockTime())) { Object value = entry.getValue(); if (value instanceof MarshalledValue) { value = ((MarshalledValue) value).get(); } return (V)value; } else { return null; } } } /** * This is the parallel staggered map/combine algorithm. Threads from the default fork/join pool * traverse container and store key/value pairs in parallel. As one of the threads hits the * maxCollectorSize threshold, it takes the snapshot of the current state of the collector and * invokes combine on it all while others threads continue to fill up collector up to the point * where the threshold is reached again. The thread that broke the collector threshold invokes * combine and the algorithm repeats. The benefit of staggered parallel map/combine is manyfold. * First, we never exhaust working memory of a node as we batch map/combine execution all while * traversal of key/value pairs is in progress. Second, such a staggered combine execution does * not cause underlying transport to be completely saturated by intermediate cache put commands; * intermediate key/value pairs of map/reduce algorithm are transferred across the cluster * smoothly as parallel traversal of container's key/value pairs is progress. 
* */ private final class MapCombineTask extends DataContainerTask implements AdvancedCacheLoader.CacheLoaderTask { private final MapCombineCommand mcc; private final Set intermediateKeys; private final int queueLimit; private final BlockingQueue> queue; public MapCombineTask(MapCombineCommand mcc, int maxCollectorSize) throws Exception { super(); this.queueLimit = Runtime.getRuntime().availableProcessors() * 2; this.queue = new ArrayBlockingQueue>(queueLimit + 1); this.mcc = mcc; this.intermediateKeys = Collections.synchronizedSet(new HashSet()); //fill up queue with collectors for (int i = 0; i < queueLimit; i++){ queue.put(new DefaultCollector(mcc, maxCollectorSize)); } } @Override public void accept(K key, InternalCacheEntry v) { V value = getValue(v); if (value != null) { try { executeMapWithCollector(key, value); } catch (InterruptedException e) { //reset signal Thread.currentThread().interrupt(); } } } @Override public void processEntry(MarshalledEntry marshalledEntry, TaskContext taskContext) throws InterruptedException { executeMapWithCollector(marshalledEntry.getKey(), getValue(marshalledEntry)); } @Override @SuppressWarnings("unchecked") protected V getValue(InternalCacheEntry entry){ if (entry != null) { Object value = entry.getValue(); if (value instanceof MarshalledValue) { value = ((MarshalledValue) value).get(); } return (V)value; } else { return null; } } private Set getMigratedIntermediateKeys() { return intermediateKeys; } private Map> collectedValues() { //combine all collectors from the queue into one DefaultCollector finalCollector = new DefaultCollector(mcc, Integer.MAX_VALUE); for (DefaultCollector collector : queue) { if (!collector.isEmpty()) { finalCollector.emit(collector.collectedValues()); collector.reset(); } } combine(mcc, finalCollector); return finalCollector.collectedValues(); } private void executeMapWithCollector(K key, V value) throws InterruptedException { DefaultCollector c = null; try { // grab collector C from the bounded 
queue c = queue.take(); //invoke mapper with collector C mcc.getMapper().map(key, value, c); migrate(c); } finally { queue.put(c); } } private void migrate(final DefaultCollector c) { // if overflow even after combine then migrate these keys/values if (c.isOverflown()) { Set migratedKeys = migrateIntermediateKeysAndValues(mcc, c.collectedValues()); intermediateKeys.addAll(migratedKeys); c.reset(); } } @SuppressWarnings("unchecked") private V getValue(MarshalledEntry marshalledEntry) { Object loadedValue = marshalledEntry.getValue(); if (loadedValue instanceof MarshalledValue) { return (V) ((MarshalledValue) loadedValue).get(); } else { return (V) loadedValue; } } } private static final class IntermediateKeyFilter implements KeyFilter> { private final String taskId; private final boolean acceptAll; public IntermediateKeyFilter(String taskId, boolean acceptAll) { if (taskId == null || taskId.isEmpty()) { throw new IllegalArgumentException("Invalid task Id " + taskId); } this.taskId = taskId; this.acceptAll = acceptAll; } @Override public boolean accept(IntermediateKey key) { if (acceptAll) { return true; } else { if (key != null) { return taskId.equals(key.getTaskId()); } else { return false; } } } } /** * @author Sanne Grinovero (C) 2011 Red Hat Inc. 
* @author Dan Berindei * @author William Burns * @author Vladimir Blagojevic */ private final class DefaultCollector implements CollectableCollector { private Map> store; private final AtomicInteger emitCount; private final int maxCollectorSize; private MapCombineCommand mcc; public DefaultCollector(MapCombineCommand mcc, int maxCollectorSize) { store = new HashMap>(1024, 0.75f); emitCount = new AtomicInteger(); this.maxCollectorSize = maxCollectorSize; this.mcc = mcc; } @Override public void emit(KOut key, VOut value) { List list = store.get(key); if (list == null) { list = new ArrayList(128); store.put(key, list); } list.add(value); emitCount.incrementAndGet(); if (isOverflown() && mcc.hasCombiner()) { combine(mcc, this); } } @Override public void emitReduced(KOut key, VOut value) { List list = store.get(key); int prevSize = list.size(); list.clear(); list.add(value); //we remove prevSize elements and replace it with one (the reduced value) emitCount.addAndGet(-prevSize + 1); } @Override public Map> collectedValues() { return store; } public void reset(){ store.clear(); emitCount.set(0); } public boolean isEmpty() { return store.isEmpty(); } public void emit(Map> combined) { for (Entry> e : combined.entrySet()) { KOut k = e.getKey(); List values = e.getValue(); for (VOut v : values) { emit(k, v); } } } public boolean isOverflown() { return emitCount.get() > maxCollectorSize; } } private interface CollectableCollector extends Collector{ Map> collectedValues(); void emitReduced(K key, V value); } private final class SynchronizedCollector implements CollectableCollector { private CollectableCollector delegate; public SynchronizedCollector(CollectableCollector delegate) { this.delegate = delegate; } @Override public synchronized void emit(KOut key, VOut value) { delegate.emit(key, value); } @Override public synchronized void emitReduced(KOut key, VOut value) { delegate.emitReduced(key, value); } @Override public synchronized Map> collectedValues() { return 
delegate.collectedValues(); } } private static class DeltaAwareList implements Iterable, DeltaAware { private final List list; public DeltaAwareList(List list) { this.list = list; } @Override public Delta delta() { return new DeltaList(list); } @Override public void commit() { list.clear(); } @Override public Iterator iterator(){ return list.iterator(); } @Override public String toString() { return "DeltaAwareList(" + list.size() + ")" + String.valueOf(list); } } private static class DeltaList implements Delta { private final List deltas; public DeltaList(List list) { deltas = new ArrayList(list); } @SuppressWarnings("unchecked") @Override public DeltaAware merge(DeltaAware d) { DeltaAwareList other = null; if (d instanceof DeltaAwareList) { other = (DeltaAwareList) d; other.list.addAll(deltas); } else { other = new DeltaAwareList(deltas); } return other; } } @SuppressWarnings("rawtypes") public static class DeltaListExternalizer extends AbstractExternalizer { private static final long serialVersionUID = 5859147782602054109L; @Override public void writeObject(ObjectOutput output, DeltaList list) throws IOException { output.writeObject(list.deltas); } @Override @SuppressWarnings("unchecked") public DeltaList readObject(ObjectInput input) throws IOException, ClassNotFoundException { return new DeltaList((List) input.readObject()); } @Override public Integer getId() { return Ids.DELTA_MAPREDUCE_LIST_ID; } @Override @SuppressWarnings("unchecked") public Set> getTypeClasses() { return Util.>asSet(DeltaList.class); } } @SuppressWarnings("rawtypes") public static class DeltaAwareListExternalizer extends AbstractExternalizer { private static final long serialVersionUID = -8956663669844107351L; @Override public void writeObject(ObjectOutput output, DeltaAwareList deltaAwareList) throws IOException { output.writeObject(deltaAwareList.list); } @Override @SuppressWarnings("unchecked") public DeltaAwareList readObject(ObjectInput input) throws IOException, 
ClassNotFoundException { return new DeltaAwareList((List) input.readObject()); } @Override public Integer getId() { return Ids.DELTA_AWARE_MAPREDUCE_LIST_ID; } @Override @SuppressWarnings("unchecked") public Set> getTypeClasses() { return Util.>asSet(DeltaAwareList.class); } } /** * IntermediateCompositeKey */ public static final class IntermediateKey implements Serializable { /** The serialVersionUID */ private static final long serialVersionUID = 4434717760740027918L; private final String taskId; private final V key; public IntermediateKey(String taskId, V key) { this.taskId = taskId; this.key = key; } public String getTaskId() { return taskId; } public V getKey(){ return key; } @Override public int hashCode() { final int prime = 31; int result = 1; result = prime * result + ((key == null) ? 0 : key.hashCode()); result = prime * result + ((taskId == null) ? 0 : taskId.hashCode()); return result; } @SuppressWarnings("unchecked") @Override public boolean equals(Object obj) { if (obj == null) { return false; } if (!(obj instanceof IntermediateKey)) { return false; } IntermediateKey other = (IntermediateKey) obj; if (key == null) { if (other.key != null) { return false; } } else if (!key.equals(other.key)) { return false; } if (taskId == null) { if (other.taskId != null) { return false; } } else if (!taskId.equals(other.taskId)) { return false; } return true; } @Override public String toString() { return "IntermediateCompositeKey [taskId=" + taskId + ", key=" + key + "]"; } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy