// Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance. Project price: only 1 $.
// You can buy this project and download/modify it as often as you want.
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.runtime.operators.sort;
import org.apache.flink.api.common.functions.GroupCombineFunction;
import org.apache.flink.api.common.functions.util.FunctionUtils;
import org.apache.flink.api.common.typeutils.TypeComparator;
import org.apache.flink.api.common.typeutils.TypeSerializer;
import org.apache.flink.api.common.typeutils.TypeSerializerFactory;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.core.memory.MemorySegment;
import org.apache.flink.runtime.io.disk.iomanager.BlockChannelWriter;
import org.apache.flink.runtime.io.disk.iomanager.ChannelWriterOutputView;
import org.apache.flink.runtime.io.disk.iomanager.FileIOChannel;
import org.apache.flink.runtime.io.disk.iomanager.IOManager;
import org.apache.flink.runtime.jobgraph.tasks.AbstractInvokable;
import org.apache.flink.runtime.memory.MemoryAllocationException;
import org.apache.flink.runtime.memory.MemoryManager;
import org.apache.flink.runtime.util.EmptyMutableObjectIterator;
import org.apache.flink.runtime.util.NonReusingKeyGroupedIterator;
import org.apache.flink.runtime.util.ReusingKeyGroupedIterator;
import org.apache.flink.util.Collector;
import org.apache.flink.util.MutableObjectIterator;
import org.apache.flink.util.TraversableOnceException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Queue;
/**
* The CombiningUnilateralSortMerger is part of a merge-sort implementation.
* Conceptually, a merge sort with combining works as follows:
* (1) Divide the unsorted list into n sublists of about 1/n the size. (2) Sort each sublist recursively by re-applying
* merge sort. (3) Combine all tuples with the same key within a sublist (4) Merge the two sublists back into one sorted
* list.
* Internally, the {@link CombiningUnilateralSortMerger} logic is factored into four threads (read, sort, combine,
* spill) which communicate through a set of blocking queues (forming a closed loop).
* Memory is allocated using the {@link MemoryManager} interface. Thus the component will most likely not exceed the
* user-provided memory limits.
*/
public class CombiningUnilateralSortMerger extends UnilateralSortMerger {
// ------------------------------------------------------------------------
// Constants & Fields
// ------------------------------------------------------------------------
/**
* Logging.
*/
private static final Logger LOG = LoggerFactory.getLogger(CombiningUnilateralSortMerger.class);
private final GroupCombineFunction combineStub; // the user code stub that does the combining
private Configuration udfConfig;
// ------------------------------------------------------------------------
// Constructor
// ------------------------------------------------------------------------
/**
*Creates a new sorter that reads the data from a given reader and provides an iterator returning that
* data in a sorted manner. The memory is divided among sort buffers, write buffers and read buffers
* automatically.
*
* @param combineStub The stub used to combine values with the same key.
* @param memoryManager The memory manager from which to allocate the memory.
* @param ioManager The I/O manager, which is used to write temporary files to disk.
* @param input The input that is sorted by this sorter.
* @param parentTask The parent task, which owns all resources used by this sorter.
* @param serializerFactory The type serializer.
* @param comparator The type comparator establishing the order relation.
* @param memoryFraction The fraction of memory dedicated to sorting, merging and I/O.
* @param maxNumFileHandles The maximum number of files to be merged at once.
* @param startSpillingFraction The faction of the buffers that have to be filled before the spilling thread
* actually begins spilling data to disk.
*
* @throws IOException Thrown, if an error occurs initializing the resources for external sorting.
* @throws MemoryAllocationException Thrown, if not enough memory can be obtained from the memory manager to
* perform the sort.
*/
public CombiningUnilateralSortMerger(GroupCombineFunction combineStub, MemoryManager memoryManager, IOManager ioManager,
MutableObjectIterator input, AbstractInvokable parentTask,
TypeSerializerFactory serializerFactory, TypeComparator comparator,
double memoryFraction, int maxNumFileHandles, float startSpillingFraction,
boolean handleLargeRecords, boolean objectReuseEnabled)
throws IOException, MemoryAllocationException
{
this(combineStub, memoryManager, ioManager, input, parentTask, serializerFactory, comparator,
memoryFraction, -1, maxNumFileHandles, startSpillingFraction, handleLargeRecords, objectReuseEnabled);
}
/**
* Creates a new sorter that reads the data from a given reader and provides an iterator returning that
* data in a sorted manner. The memory is divided among sort buffers, write buffers and read buffers
* automatically.
*
* @param combineStub The stub used to combine values with the same key.
* @param memoryManager The memory manager from which to allocate the memory.
* @param ioManager The I/O manager, which is used to write temporary files to disk.
* @param input The input that is sorted by this sorter.
* @param parentTask The parent task, which owns all resources used by this sorter.
* @param serializerFactory The type serializer.
* @param comparator The type comparator establishing the order relation.
* @param memoryFraction The fraction of memory dedicated to sorting, merging and I/O.
* @param numSortBuffers The number of distinct buffers to use creation of the initial runs.
* @param maxNumFileHandles The maximum number of files to be merged at once.
* @param startSpillingFraction The faction of the buffers that have to be filled before the spilling thread
* actually begins spilling data to disk.
*
* @throws IOException Thrown, if an error occurs initializing the resources for external sorting.
* @throws MemoryAllocationException Thrown, if not enough memory can be obtained from the memory manager to
* perform the sort.
*/
public CombiningUnilateralSortMerger(GroupCombineFunction combineStub, MemoryManager memoryManager, IOManager ioManager,
MutableObjectIterator input, AbstractInvokable parentTask,
TypeSerializerFactory serializerFactory, TypeComparator comparator,
double memoryFraction, int numSortBuffers, int maxNumFileHandles,
float startSpillingFraction, boolean handleLargeRecords, boolean objectReuseEnabled)
throws IOException, MemoryAllocationException
{
super(memoryManager, ioManager, input, parentTask, serializerFactory, comparator,
memoryFraction, numSortBuffers, maxNumFileHandles, startSpillingFraction, false,
handleLargeRecords, objectReuseEnabled);
this.combineStub = combineStub;
}
public void setUdfConfiguration(Configuration config) {
this.udfConfig = config;
}
// ------------------------------------------------------------------------
// Factory Methods
// ------------------------------------------------------------------------
@Override
protected ThreadBase getSpillingThread(ExceptionHandler exceptionHandler, CircularQueues queues,
AbstractInvokable parentTask, MemoryManager memoryManager, IOManager ioManager,
TypeSerializerFactory serializerFactory, TypeComparator comparator,
List sortReadMemory, List writeMemory, int maxFileHandles)
{
return new CombiningSpillingThread(exceptionHandler, queues, parentTask,
memoryManager, ioManager, serializerFactory.getSerializer(),
comparator, sortReadMemory, writeMemory, maxFileHandles, objectReuseEnabled);
}
// ------------------------------------------------------------------------
// Threads
// ------------------------------------------------------------------------
protected class CombiningSpillingThread extends SpillingThread {
private final TypeComparator comparator2;
private final boolean objectReuseEnabled;
public CombiningSpillingThread(ExceptionHandler exceptionHandler, CircularQueues queues,
AbstractInvokable parentTask, MemoryManager memManager, IOManager ioManager,
TypeSerializer serializer, TypeComparator comparator,
List sortReadMemory, List writeMemory, int maxNumFileHandles,
boolean objectReuseEnabled)
{
super(exceptionHandler, queues, parentTask, memManager, ioManager, serializer, comparator,
sortReadMemory, writeMemory, maxNumFileHandles);
this.comparator2 = comparator.duplicate();
this.objectReuseEnabled = objectReuseEnabled;
}
/**
* Entry point of the thread.
*/
public void go() throws IOException {
// ------------------- In-Memory Cache ------------------------
final Queue> cache = new ArrayDeque>();
CircularElement element;
boolean cacheOnly = false;
// fill cache
while (isRunning()) {
// take next element from queue
try {
element = this.queues.spill.take();
}
catch (InterruptedException iex) {
if (isRunning()) {
LOG.error("Sorting thread was interrupted (without being shut down) while grabbing a buffer. " +
"Retrying to grab buffer...");
continue;
} else {
return;
}
}
if (element == spillingMarker()) {
break;
}
else if (element == endMarker()) {
cacheOnly = true;
break;
}
cache.add(element);
}
// check whether the thread was canceled
if (!isRunning()) {
return;
}
// ------------------- In-Memory Merge ------------------------
if (cacheOnly) {
/* operates on in-memory segments only */
if (LOG.isDebugEnabled()) {
LOG.debug("Initiating in memory merge.");
}
List> iterators = new ArrayList>(cache.size());
// iterate buffers and collect a set of iterators
for (CircularElement cached : cache) {
iterators.add(cached.buffer.getIterator());
}
// release the remaining sort-buffers
if (LOG.isDebugEnabled()) {
LOG.debug("Releasing unused sort-buffer memory.");
}
disposeSortBuffers(true);
// set lazy iterator
MutableObjectIterator resIter = iterators.isEmpty() ? EmptyMutableObjectIterator.get() :
iterators.size() == 1 ? iterators.get(0) :
new MergeIterator(iterators, this.comparator);
setResultIterator(resIter);
return;
}
// ------------------- Spilling Phase ------------------------
final GroupCombineFunction combineStub = CombiningUnilateralSortMerger.this.combineStub;
// now that we are actually spilling, take the combiner, and open it
try {
Configuration conf = CombiningUnilateralSortMerger.this.udfConfig;
FunctionUtils.openFunction (combineStub, (conf == null ? new Configuration() : conf));
}
catch (Throwable t) {
throw new IOException("The user-defined combiner failed in its 'open()' method.", t);
}
final FileIOChannel.Enumerator enumerator = this.ioManager.createChannelEnumerator();
List channelIDs = new ArrayList();
// loop as long as the thread is marked alive and we do not see the final element
while (isRunning()) {
try {
element = takeNext(this.queues.spill, cache);
}
catch (InterruptedException iex) {
if (isRunning()) {
LOG.error("Sorting thread was interrupted (without being shut down) while grabbing a buffer. " +
"Retrying to grab buffer...");
continue;
} else {
return;
}
}
// check if we are still running
if (!isRunning()) {
return;
}
// check if this is the end-of-work buffer
if (element == endMarker()) {
break;
}
// open next channel
FileIOChannel.ID channel = enumerator.next();
registerChannelToBeRemovedAtShudown(channel);
if (LOG.isDebugEnabled()) {
LOG.debug("Creating temp file " + channel.toString() + '.');
}
// create writer
final BlockChannelWriter writer = this.ioManager.createBlockChannelWriter(channel);
registerOpenChannelToBeRemovedAtShudown(writer);
final ChannelWriterOutputView output = new ChannelWriterOutputView(writer, this.writeMemory,
this.memManager.getPageSize());
// write sort-buffer to channel
if (LOG.isDebugEnabled()) {
LOG.debug("Combining buffer " + element.id + '.');
}
// set up the combining helpers
final InMemorySorter buffer = element.buffer;
final CombineValueIterator iter = new CombineValueIterator(
buffer, this.serializer.createInstance(), this.objectReuseEnabled);
final WriterCollector collector = new WriterCollector(output, this.serializer);
int i = 0;
int stop = buffer.size() - 1;
try {
while (i < stop) {
int seqStart = i;
while (i < stop && 0 == buffer.compare(i, i + 1)) {
i++;
}
if (i == seqStart) {
// no duplicate key, no need to combine. simply copy
buffer.writeToOutput(output, seqStart, 1);
} else {
// get the iterator over the values
iter.set(seqStart, i);
// call the combiner to combine
combineStub.combine(iter, collector);
}
i++;
}
}
catch (Exception ex) {
throw new IOException("An error occurred in the combiner user code.", ex);
}
// write the last pair, if it has not yet been included in the last iteration
if (i == stop) {
buffer.writeToOutput(output, stop, 1);
}
// done combining and writing out
if (LOG.isDebugEnabled()) {
LOG.debug("Combined and spilled buffer " + element.id + ".");
}
output.close();
unregisterOpenChannelToBeRemovedAtShudown(writer);
channelIDs.add(new ChannelWithBlockCount(channel, output.getBlockCount()));
// pass empty sort-buffer to reading thread
element.buffer.reset();
this.queues.empty.add(element);
}
// done with the spilling
if (LOG.isDebugEnabled()) {
LOG.debug("Spilling done.");
LOG.debug("Releasing sort-buffer memory.");
}
// clear the sort buffers, but do not return the memory to the manager, as we use it for merging
disposeSortBuffers(false);
if (LOG.isDebugEnabled()) {
LOG.debug("Closing combiner user code.");
}
// close the user code
try {
FunctionUtils.closeFunction(combineStub);
}
catch (Throwable t) {
throw new IOException("The user-defined combiner failed in its 'close()' method.", t);
}
if (LOG.isDebugEnabled()) {
LOG.debug("User code closed.");
}
// ------------------- Merging Phase ------------------------
// merge channels until sufficient file handles are available
while (isRunning() && channelIDs.size() > this.maxFanIn) {
channelIDs = mergeChannelList(channelIDs, this.mergeReadMemory, this.writeMemory);
}
// from here on, we won't write again
this.memManager.release(this.writeMemory);
this.writeMemory.clear();
// check if we have spilled some data at all
if (channelIDs.isEmpty()) {
setResultIterator(EmptyMutableObjectIterator.get());
}
else {
if (LOG.isDebugEnabled()) {
LOG.debug("Beginning final merge.");
}
// allocate the memory for the final merging step
List> readBuffers = new ArrayList>(channelIDs.size());
// allocate the read memory and register it to be released
getSegmentsForReaders(readBuffers, this.mergeReadMemory, channelIDs.size());
// get the readers and register them to be released
final MergeIterator mergeIterator = getMergingIterator(
channelIDs, readBuffers, new ArrayList(channelIDs.size()), null);
// set the target for the user iterator
// if the final merge combines, create a combining iterator around the merge iterator,
// otherwise not
setResultIterator(mergeIterator);
}
// done
if (LOG.isDebugEnabled()) {
LOG.debug("Spilling and merging thread done.");
}
}
// ------------------ Combining & Merging Methods -----------------
/**
* Merges the sorted runs described by the given Channel IDs into a single sorted run. The merging process
* uses the given read and write buffers. During the merging process, the combiner is used to reduce the
* number of values with identical key.
*
* @param channelIDs The IDs of the runs' channels.
* @param readBuffers The buffers for the readers that read the sorted runs.
* @param writeBuffers The buffers for the writer that writes the merged channel.
* @return The ID of the channel that describes the merged run.
*/
@Override
protected ChannelWithBlockCount mergeChannels(List channelIDs, List> readBuffers,
List writeBuffers)
throws IOException
{
// the list with the readers, to be closed at shutdown
final List channelAccesses = new ArrayList(channelIDs.size());
// the list with the target iterators
final MergeIterator mergeIterator = getMergingIterator(channelIDs, readBuffers, channelAccesses, null);
// create a new channel writer
final FileIOChannel.ID mergedChannelID = this.ioManager.createChannel();
registerChannelToBeRemovedAtShudown(mergedChannelID);
final BlockChannelWriter writer = this.ioManager.createBlockChannelWriter(mergedChannelID);
registerOpenChannelToBeRemovedAtShudown(writer);
final ChannelWriterOutputView output = new ChannelWriterOutputView(writer, writeBuffers,
this.memManager.getPageSize());
final WriterCollector collector = new WriterCollector(output, this.serializer);
final GroupCombineFunction combineStub = CombiningUnilateralSortMerger.this.combineStub;
// combine and write to disk
try {
if (objectReuseEnabled) {
final ReusingKeyGroupedIterator groupedIter = new ReusingKeyGroupedIterator<>(
mergeIterator, this.serializer, this.comparator2);
while (groupedIter.nextKey()) {
combineStub.combine(groupedIter.getValues(), collector);
}
} else {
final NonReusingKeyGroupedIterator groupedIter = new NonReusingKeyGroupedIterator<>(
mergeIterator, this.comparator2);
while (groupedIter.nextKey()) {
combineStub.combine(groupedIter.getValues(), collector);
}
}
}
catch (Exception e) {
throw new IOException("An error occurred in the combiner user code.");
}
output.close(); //IS VERY IMPORTANT!!!!
final int numBlocksWritten = output.getBlockCount();
// register merged result to be removed at shutdown
unregisterOpenChannelToBeRemovedAtShudown(writer);
// remove the merged channel readers from the clear-at-shutdown list
for (int i = 0; i < channelAccesses.size(); i++) {
FileIOChannel access = channelAccesses.get(i);
access.closeAndDelete();
unregisterOpenChannelToBeRemovedAtShudown(access);
}
return new ChannelWithBlockCount(mergedChannelID, numBlocksWritten);
}
} // end spilling/merging thread
// ------------------------------------------------------------------------
/**
* This class implements an iterator over values from a sort buffer. The iterator returns the values of a given
* interval.
*/
private static final class CombineValueIterator implements Iterator, Iterable {
private final InMemorySorter buffer; // the buffer from which values are returned
private E recordReuse;
private final boolean objectReuseEnabled;
private int last; // the position of the last value to be returned
private int position; // the position of the next value to be returned
private boolean iteratorAvailable;
/**
* Creates an iterator over the values in a BufferSortable.
*
* @param buffer
* The buffer to get the values from.
*/
public CombineValueIterator(InMemorySorter buffer, E instance, boolean objectReuseEnabled) {
this.buffer = buffer;
this.recordReuse = instance;
this.objectReuseEnabled = objectReuseEnabled;
}
/**
* Sets the interval for the values that are to be returned by this iterator.
*
* @param first
* The position of the first value to be returned.
* @param last
* The position of the last value to be returned.
*/
public void set(int first, int last) {
this.last = last;
this.position = first;
this.iteratorAvailable = true;
}
@Override
public boolean hasNext() {
return this.position <= this.last;
}
@Override
public E next() {
if (this.position <= this.last) {
try {
E record;
if (objectReuseEnabled) {
record = this.buffer.getRecord(this.recordReuse, this.position);
} else {
record = this.buffer.getRecord(this.position);
}
this.position++;
return record;
}
catch (IOException ioex) {
LOG.error("Error retrieving a value from a buffer.", ioex);
throw new RuntimeException("Could not load the next value: " + ioex.getMessage(), ioex);
}
}
else {
throw new NoSuchElementException();
}
}
@Override
public void remove() {
throw new UnsupportedOperationException();
}
@Override
public Iterator iterator() {
if (iteratorAvailable) {
iteratorAvailable = false;
return this;
} else {
throw new TraversableOnceException();
}
}
}
// ------------------------------------------------------------------------
/**
* A simple collector that collects Key and Value and writes them into a given Writer.
*/
private static final class WriterCollector implements Collector {
private final ChannelWriterOutputView output; // the writer to write to
private final TypeSerializer serializer;
/**
* Creates a new writer collector that writes to the given writer.
*
* @param output The writer output view to write to.
*/
private WriterCollector(ChannelWriterOutputView output, TypeSerializer serializer) {
this.output = output;
this.serializer = serializer;
}
@Override
public void collect(E record) {
try {
this.serializer.serialize(record, this.output);
}
catch (IOException ioex) {
throw new RuntimeException("An error occurred forwarding the record to the writer.", ioex);
}
}
@Override
public void close() {}
}
}