// Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance. Project price: only $1.
// You can buy this project and download/modify it as often as you want.
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.runtime.operators.sort;
import org.apache.flink.api.common.typeutils.TypeComparator;
import org.apache.flink.api.common.typeutils.TypeSerializer;
import org.apache.flink.core.memory.MemorySegment;
import org.apache.flink.runtime.io.disk.ChannelBackendMutableObjectIterator;
import org.apache.flink.runtime.io.disk.iomanager.FileIOChannel;
import org.apache.flink.runtime.io.disk.iomanager.IOManager;
import org.apache.flink.util.MutableObjectIterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.atomic.AtomicBoolean;
/**
* A merging policy that merges sorted data files by running an external sort-merge.
*
* <p>Sorted files are collected one by one through {@code notifyNewSortedDataFile}. When
* {@code finishMerging} is called, the collected files are merged in rounds until their number
* no longer exceeds the merge fan-in (the smaller of {@code maxFileHandlesPerMerge} and the
* number of available read segments). Each merge reads its inputs through a
* {@code MergeIterator} built with the configured {@code TypeComparator} and writes the result
* to a new file created by the {@code SortedDataFileFactory}.
*
* <p>NOTE(review): the generic type arguments in this file appear to have been stripped
* (for example the {@code List>} declarations and the record type {@code T} used in
* {@code mergeToNewFile} without a visible declaration). They presumably need to be restored
* from the upstream version before this file compiles - confirm against the original source.
*/
public class RecordComparisonMerger implements SortedDataFileMerger {
private static final Logger LOG = LoggerFactory.getLogger(RecordComparisonMerger.class);
/** The sorted files known so far; appended to on notification and replaced by every merge round. */
private List> sortedDataFiles;
/** Creates the output file of each merge. */
protected final SortedDataFileFactory sortedDataFileFactory;
/** Stored at construction; not referenced by any method of this class. */
protected final IOManager ioManager;
/** Creates the reusable record instance when object reuse is enabled. */
protected final TypeSerializer typeSerializer;
/** Handed to the {@code MergeIterator} that combines the sorted inputs. */
protected final TypeComparator typeComparator;
/** Upper bound for the number of files merged in one merge operation. */
private final int maxFileHandlesPerMerge;
/** Whether merged records are read via {@code next(reuse)} (true) or {@code next()} (false). */
protected final boolean objectReuseEnabled;
/**
* Creates a merger that starts with an empty list of sorted data files.
*
* @param sortedDataFileFactory the factory used to create the files that merged runs are written to.
* @param ioManager the I/O manager; only stored by this class.
* @param typeSerializer the serializer used to create a reusable record instance.
* @param typeComparator the comparator passed to the merge iterator.
* @param maxFileHandlesPerMerge the maximum number of files merged in a single merge operation.
* @param objectReuseEnabled whether a record instance is reused while copying merged records.
*/
public RecordComparisonMerger(SortedDataFileFactory sortedDataFileFactory,
IOManager ioManager,
TypeSerializer typeSerializer,
TypeComparator typeComparator,
int maxFileHandlesPerMerge,
boolean objectReuseEnabled) {
this.sortedDataFileFactory = sortedDataFileFactory;
this.ioManager = ioManager;
this.typeSerializer = typeSerializer;
this.typeComparator = typeComparator;
this.maxFileHandlesPerMerge = maxFileHandlesPerMerge;
this.objectReuseEnabled = objectReuseEnabled;
this.sortedDataFiles = new ArrayList<>();
}
/**
* Merges the collected files in rounds until at most {@code maxFanIn} files remain or the
* alive flag is cleared.
*
* <p>NOTE(review): with a fan-in below 2 (fewer than two read segments or file handles), the
* computation in {@code mergeChannelList} divides by {@code Math.log(maxFanIn)} and by
* {@code maxFanIn - 1}, both of which are then zero or undefined - confirm that callers
* guarantee a fan-in of at least 2.
*
* @param writeMemory the buffers used by the writer of each merged file.
* @param mergeReadMemory the buffers shared by the readers; their count also caps the fan-in.
* @param channelDeleteRegistry the registry tracking open channels and channels to delete.
* @param aliveFlag merging stops between rounds once this flag is false.
* @throws IOException thrown if a merge round encounters an I/O problem.
*/
private void merge(List writeMemory,
List mergeReadMemory,
ChannelDeleteRegistry channelDeleteRegistry,
AtomicBoolean aliveFlag) throws IOException {
// a merge can neither open more files than allowed nor use more readers than read segments
int maxFanIn = Math.min(maxFileHandlesPerMerge, mergeReadMemory.size());
while (aliveFlag.get() && sortedDataFiles.size() > maxFanIn) {
// every round replaces the file list with the (shorter) list of merged files
sortedDataFiles = mergeChannelList(
sortedDataFiles, mergeReadMemory, writeMemory, channelDeleteRegistry, maxFanIn);
}
}
/**
* Returns an iterator over the merged records of all given channels and, if present, the
* large records.
*
* <p>The read memory is divided among the channels, one share per channel. The list of
* channels must not be empty: its size is used as a divisor when distributing the memory.
*
* @param channels the sorted files to merge.
* @param mergeReadMemory the buffers to distribute among the channel readers.
* @param largeRecords the iterator of large records, or {@code null} if there is none.
* @param channelDeleteRegistry the registry tracking open channels and channels to delete.
* @return an iterator over the merged records.
* @throws IOException thrown if the readers encounter an I/O problem.
*/
@Override
public MutableObjectIterator getMergingIterator(List> channels,
List mergeReadMemory,
MutableObjectIterator largeRecords,
ChannelDeleteRegistry channelDeleteRegistry) throws IOException {
List> segmentedReadMemory = distributeReadMemory(mergeReadMemory, channels.size());
// null: the caller does not need the list of reader channels opened for this merge
return getMergingIteratorWithSegmentedMemory(channels, segmentedReadMemory, null, largeRecords, channelDeleteRegistry);
}
/**
* Records a newly created sorted data file. No merging happens at this point; the memory,
* registry and alive-flag arguments are not used by this implementation.
*
* @param sortedDataFile the new sorted file to remember for the final merge.
*/
@Override
public void notifyNewSortedDataFile(SortedDataFile sortedDataFile,
List writeMemory,
List mergeReadMemory,
ChannelDeleteRegistry channelDeleteRegistry,
AtomicBoolean aliveFlag) throws IOException {
sortedDataFiles.add(sortedDataFile);
}
/**
* Merges the collected files down to at most the merge fan-in and returns the remaining files.
*
* <p>If {@code aliveFlag} is cleared while merging, the rounds stop early and the returned
* list may still hold more files than the fan-in.
*
* @param writeMemory the buffers used by the writer of each merged file.
* @param mergeReadMemory the buffers shared by the readers of a merge.
* @param channelDeleteRegistry the registry tracking open channels and channels to delete.
* @param aliveFlag merging stops between rounds once this flag is false.
* @return the list of sorted files left after merging.
* @throws IOException thrown if the readers or writers encounter an I/O problem.
*/
@Override
public List> finishMerging(List writeMemory,
List mergeReadMemory,
ChannelDeleteRegistry channelDeleteRegistry,
AtomicBoolean aliveFlag) throws IOException {
merge(writeMemory, mergeReadMemory, channelDeleteRegistry, aliveFlag);
return sortedDataFiles;
}
/**
* Returns an iterator that iterates over the merged result from all given channels.
*
* <p>A reader is created for every file. Each reader channel is registered with the registry
* both as an open channel and as a channel to be deleted.
*
* @param files the channels that are to be merged and returned.
* @param inputSegments the buffers to be used for reading. The list contains for each channel one
* list of input segments. The size of the inputSegments list must be equal to
* that of the files list.
* @param channelAccessed the list to store the channels opened during merging, or {@code null}
* if the caller does not need them.
* @param largeRecords the iterator of large records, or {@code null} if not present.
* @param channelDeleteRegistry the registry that manages the files to be closed and deleted.
*
* @return an iterator over the merged records of the input channels.
* @throws IOException thrown if the readers encounter an I/O problem.
*/
protected final MergeIterator getMergingIteratorWithSegmentedMemory(List> files,
List> inputSegments,
List channelAccessed,
MutableObjectIterator largeRecords,
ChannelDeleteRegistry channelDeleteRegistry) throws IOException {
// create one iterator per channel id
if (LOG.isDebugEnabled()) {
LOG.debug("Performing merge of " + files.size() + " sorted streams.");
}
// one slot per file plus one for the optional large-record iterator
final List> iterators = new ArrayList<>(files.size() + 1);
for (int i = 0; i < files.size(); i++) {
final List segsForChannel = inputSegments.get(i);
ChannelBackendMutableObjectIterator channelBackendMutableObjectIterator = files.get(i).createReader(segsForChannel);
if (channelAccessed != null) {
channelAccessed.add(channelBackendMutableObjectIterator.getReaderChannel());
}
// track the reader as open and mark its file for deletion
channelDeleteRegistry.registerOpenChannel(channelBackendMutableObjectIterator.getReaderChannel());
channelDeleteRegistry.registerChannelToBeDelete(channelBackendMutableObjectIterator.getReaderChannel().getChannelID());
iterators.add(channelBackendMutableObjectIterator);
}
// the large records take part in the merge as one additional input
if (largeRecords != null) {
iterators.add(largeRecords);
}
return new MergeIterator(iterators, this.typeComparator);
}
/**
* Merges the given sorted runs to a smaller number of sorted runs.
*
* <p>Only as many runs are merged as needed to reach the next lower power of
* {@code maxFanIn}; the remaining runs are passed through unchanged. Example: 10 files with a
* fan-in of 4 give a target of 4 files, reached by passing 2 files through and merging the
* other 8 in 2 merges of 4 files each.
*
* @param files The sorted runs that need to be merged.
* @param allReadBuffers The buffers to be used by the readers.
* @param writeBuffers The buffers to be used by the writers.
* @param channelDeleteRegistry The registry tracking open channels and channels to delete.
* @param maxFanIn The maximum number of runs merged in one merge operation; see the review
* note on {@code merge} regarding values below 2.
* @return A list of the unmerged runs followed by the newly merged runs.
* @throws IOException Thrown, if the readers or writers encountered an I/O problem.
*/
protected final List> mergeChannelList(List> files,
List allReadBuffers,
List writeBuffers,
ChannelDeleteRegistry channelDeleteRegistry,
int maxFanIn) throws IOException {
// A channel list with length maxFanIn^i can be merged to maxFanIn files in i-1 rounds where every merge
// is a full merge with maxFanIn input channels. A partial round includes merges with fewer than maxFanIn
// inputs. It is most efficient to perform the partial round first.
final double scale = Math.ceil(Math.log(files.size()) / Math.log(maxFanIn)) - 1;
final int numStart = files.size();
// target number of files after this round: the largest power of maxFanIn below numStart
final int numEnd = (int) Math.pow(maxFanIn, scale);
// each merge reduces the file count by at most (maxFanIn - 1)
final int numMerges = (int) Math.ceil((numStart - numEnd) / (double) (maxFanIn - 1));
final int numNotMerged = numEnd - numMerges;
final int numToMerge = numStart - numNotMerged;
// unmerged channel IDs are copied directly to the result list
final List> mergedFiles = new ArrayList<>(numEnd);
mergedFiles.addAll(files.subList(0, numNotMerged));
final int channelsToMergePerStep = (int) Math.ceil(numToMerge / (double) numMerges);
// allocate the memory for the merging step
// (the same partition of the read buffers is reused by every merge of this round)
final List> segmentedFileChannels = distributeReadMemory(allReadBuffers, channelsToMergePerStep);
final List> channelsToMergeThisStep = new ArrayList<>(channelsToMergePerStep);
int channelNum = numNotMerged;
while (channelNum < files.size()) {
channelsToMergeThisStep.clear();
// take the next group of files; the last group may be smaller than channelsToMergePerStep
for (int i = 0; i < channelsToMergePerStep && channelNum < files.size(); i++, channelNum++) {
channelsToMergeThisStep.add(files.get(channelNum));
}
mergedFiles.add(mergeToNewFile(channelsToMergeThisStep, segmentedFileChannels, writeBuffers, channelDeleteRegistry));
}
return mergedFiles;
}
/**
* Merges the given sorted runs into a single sorted run. The merging process
* uses the given read and write buffers.
*
* <p>After the merged run is written, the reader channels of the inputs are closed and deleted.
*
* @param files The sorted runs to merge.
* @param readBuffers The buffers for the readers that read the sorted runs, one list per run.
* @param writeBuffers The buffers for the writer that writes the merged channel.
* @param channelDeleteRegistry The registry tracking open channels and channels to delete.
* @return The sorted data file that holds the merged run.
* @throws IOException Thrown, if the readers or the writer encountered an I/O problem.
*/
protected SortedDataFile mergeToNewFile(List> files,
List> readBuffers,
List writeBuffers,
ChannelDeleteRegistry channelDeleteRegistry) throws IOException {
// the list with the readers, to be closed at shutdown
final List channelAccesses = new ArrayList<>(files.size());
// the list with the target iterators
final MergeIterator mergeIterator = getMergingIteratorWithSegmentedMemory(files, readBuffers,
channelAccesses, null, channelDeleteRegistry);
final SortedDataFile writer = sortedDataFileFactory.createFile(writeBuffers);
// track the output file so that the registry knows about it while it is being written
channelDeleteRegistry.registerChannelToBeDelete(writer.getChannelID());
channelDeleteRegistry.registerOpenChannel(writer.getWriteChannel());
// read the merged stream and write the data back
if (objectReuseEnabled) {
T rec = typeSerializer.createInstance();
while ((rec = mergeIterator.next(rec)) != null) {
writer.writeRecord(rec);
}
} else {
T rec;
while ((rec = mergeIterator.next()) != null) {
writer.writeRecord(rec);
}
}
writer.finishWriting();
// writing is done, so the write channel is no longer tracked as open; the registration of
// the file's channel id for deletion is not undone here
channelDeleteRegistry.unregisterOpenChannel(writer.getWriteChannel());
// the inputs are fully consumed: close and delete each reader channel, then stop tracking it as open
for (int i = 0; i < channelAccesses.size(); i++) {
FileIOChannel access = channelAccesses.get(i);
access.closeAndDelete();
channelDeleteRegistry.unregisterOpenChannel(access);
}
return writer;
}
/**
* Divides the given collection of memory buffers among {@code numChannels} sublists.
*
* <p>Every channel receives {@code memory.size() / numChannels} buffers; the first
* {@code memory.size() % numChannels} channels receive one buffer more. If there are fewer
* buffers than channels, the trailing channels receive an empty list.
*
* @param memory A list containing the memory buffers to be distributed. The buffers are not
* removed from this list.
* @param numChannels The number of channels for which to allocate buffers. Must not be zero,
* because it is used as the divisor of an integer division.
* @return A list with one sublist of buffers per channel, in channel order.
*/
protected final List> distributeReadMemory(List memory, int numChannels) {
List> target = new ArrayList<>(numChannels);
// determine the memory to use per channel and the number of buffers
final int numBuffers = memory.size();
final int buffersPerChannelLowerBound = numBuffers / numChannels;
final int numChannelsWithOneMore = numBuffers % numChannels;
final Iterator segments = memory.iterator();
// assign the buffers channel by channel; the leading channels get one segment more
for (int i = 0; i < numChannels; i++) {
int toAssign = (i < numChannelsWithOneMore ? buffersPerChannelLowerBound + 1 : buffersPerChannelLowerBound);
final ArrayList segs = new ArrayList<>(toAssign);
target.add(segs);
for (int j = 0; j < toAssign; ++j) {
segs.add(segments.next());
}
}
return target;
}
}