org.apache.hudi.sink.clustering.ClusteringOperator Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.sink.clustering;
import org.apache.hudi.adapter.MaskingOutputAdapter;
import org.apache.hudi.adapter.Utils;
import org.apache.hudi.client.FlinkTaskContextSupplier;
import org.apache.hudi.client.HoodieFlinkWriteClient;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.client.utils.ConcatenatingIterator;
import org.apache.hudi.common.config.HoodieStorageConfig;
import org.apache.hudi.common.model.ClusteringOperation;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.table.HoodieTableConfig;
import org.apache.hudi.common.table.log.HoodieFileSliceReader;
import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner;
import org.apache.hudi.common.util.CollectionUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.common.util.collection.CloseableMappingIterator;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.configuration.FlinkOptions;
import org.apache.hudi.configuration.OptionsResolver;
import org.apache.hudi.exception.HoodieClusteringException;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.hudi.io.IOUtils;
import org.apache.hudi.io.storage.HoodieAvroFileReader;
import org.apache.hudi.io.storage.HoodieFileReader;
import org.apache.hudi.io.storage.HoodieFileReaderFactory;
import org.apache.hudi.sink.bulk.BulkInsertWriterHelper;
import org.apache.hudi.sink.bulk.sort.SortOperatorGen;
import org.apache.hudi.sink.utils.NonThrownExecutor;
import org.apache.hudi.table.HoodieFlinkTable;
import org.apache.hudi.util.AvroSchemaConverter;
import org.apache.hudi.util.AvroToRowDataConverters;
import org.apache.hudi.util.FlinkWriteClients;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord;
import org.apache.flink.annotation.VisibleForTesting;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.metrics.Gauge;
import org.apache.flink.runtime.memory.MemoryManager;
import org.apache.flink.streaming.api.graph.StreamConfig;
import org.apache.flink.streaming.api.operators.BoundedOneInput;
import org.apache.flink.streaming.api.operators.OneInputStreamOperator;
import org.apache.flink.streaming.api.operators.Output;
import org.apache.flink.streaming.runtime.streamrecord.StreamRecord;
import org.apache.flink.streaming.runtime.tasks.StreamTask;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.data.binary.BinaryRowData;
import org.apache.flink.table.planner.codegen.sort.SortCodeGenerator;
import org.apache.flink.table.runtime.generated.NormalizedKeyComputer;
import org.apache.flink.table.runtime.generated.RecordComparator;
import org.apache.flink.table.runtime.operators.TableStreamOperator;
import org.apache.flink.table.runtime.operators.sort.BinaryExternalSorter;
import org.apache.flink.table.runtime.typeutils.AbstractRowDataSerializer;
import org.apache.flink.table.runtime.typeutils.BinaryRowDataSerializer;
import org.apache.flink.table.runtime.typeutils.RowDataSerializer;
import org.apache.flink.table.runtime.util.StreamRecordCollector;
import org.apache.flink.table.types.logical.RowType;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Properties;
import java.util.Spliterator;
import java.util.Spliterators;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;
/**
* Operator to execute the actual clustering task assigned by the clustering plan task.
* In order to execute scalable, the input should shuffle by the clustering event {@link ClusteringPlanEvent}.
*/
public class ClusteringOperator extends TableStreamOperator implements
OneInputStreamOperator, BoundedOneInput {
private static final Logger LOG = LoggerFactory.getLogger(ClusteringOperator.class);
private final Configuration conf;
private final RowType rowType;
private int taskID;
private transient HoodieWriteConfig writeConfig;
private transient HoodieFlinkTable> table;
private transient Schema schema;
private transient Schema readerSchema;
private transient int[] requiredPos;
private transient AvroToRowDataConverters.AvroToRowDataConverter avroToRowDataConverter;
private transient HoodieFlinkWriteClient writeClient;
private transient StreamRecordCollector collector;
private transient BinaryRowDataSerializer binarySerializer;
/**
* Whether to execute clustering asynchronously.
*/
private final boolean asyncClustering;
/**
* Whether the clustering sort is enabled.
*/
private final boolean sortClusteringEnabled;
/**
* Executor service to execute the clustering task.
*/
private transient NonThrownExecutor executor;
public ClusteringOperator(Configuration conf, RowType rowType) {
this.conf = conf;
this.rowType = BulkInsertWriterHelper.addMetadataFields(rowType, false);
this.asyncClustering = OptionsResolver.needsAsyncClustering(conf);
this.sortClusteringEnabled = OptionsResolver.sortClusteringEnabled(conf);
// override max parquet file size in conf
this.conf.setLong(HoodieStorageConfig.PARQUET_MAX_FILE_SIZE.key(),
this.conf.getLong(FlinkOptions.CLUSTERING_PLAN_STRATEGY_TARGET_FILE_MAX_BYTES));
// target size should larger than small file limit
this.conf.setLong(FlinkOptions.CLUSTERING_PLAN_STRATEGY_SMALL_FILE_LIMIT.key(),
Math.min(this.conf.getLong(FlinkOptions.CLUSTERING_PLAN_STRATEGY_TARGET_FILE_MAX_BYTES) / 1024 / 1024,
this.conf.getLong(FlinkOptions.CLUSTERING_PLAN_STRATEGY_SMALL_FILE_LIMIT)));
}
@Override
public void setup(StreamTask, ?> containingTask, StreamConfig config, Output> output) {
super.setup(containingTask, config, new MaskingOutputAdapter<>(output));
}
@Override
public void open() throws Exception {
super.open();
this.taskID = getRuntimeContext().getIndexOfThisSubtask();
this.writeConfig = FlinkWriteClients.getHoodieClientConfig(this.conf);
this.writeClient = FlinkWriteClients.createWriteClient(conf, getRuntimeContext());
this.table = writeClient.getHoodieTable();
this.schema = AvroSchemaConverter.convertToSchema(rowType);
this.readerSchema = this.schema;
this.requiredPos = getRequiredPositions();
this.avroToRowDataConverter = AvroToRowDataConverters.createRowConverter(rowType);
this.binarySerializer = new BinaryRowDataSerializer(rowType.getFieldCount());
if (this.asyncClustering) {
this.executor = NonThrownExecutor.builder(LOG).build();
}
this.collector = new StreamRecordCollector<>(output);
}
@Override
public void processElement(StreamRecord element) throws Exception {
final ClusteringPlanEvent event = element.getValue();
final String instantTime = event.getClusteringInstantTime();
final List clusteringOperations = event.getClusteringGroupInfo().getOperations();
if (this.asyncClustering) {
// executes the compaction task asynchronously to not block the checkpoint barrier propagate.
executor.execute(
() -> doClustering(instantTime, clusteringOperations),
(errMsg, t) -> collector.collect(new ClusteringCommitEvent(instantTime, getFileIds(clusteringOperations), taskID)),
"Execute clustering for instant %s from task %d", instantTime, taskID);
} else {
// executes the clustering task synchronously for batch mode.
LOG.info("Execute clustering for instant {} from task {}", instantTime, taskID);
doClustering(instantTime, clusteringOperations);
}
}
@Override
public void close() throws Exception {
if (null != this.executor) {
this.executor.close();
}
if (this.writeClient != null) {
this.writeClient.close();
this.writeClient = null;
}
}
/**
* End input action for batch source.
*/
public void endInput() {
// no operation
}
// -------------------------------------------------------------------------
// Utilities
// -------------------------------------------------------------------------
private void doClustering(String instantTime, List clusteringOperations) throws Exception {
BulkInsertWriterHelper writerHelper = new BulkInsertWriterHelper(this.conf, this.table, this.writeConfig,
instantTime, this.taskID, getRuntimeContext().getNumberOfParallelSubtasks(), getRuntimeContext().getAttemptNumber(),
this.rowType, true);
Iterator iterator;
if (clusteringOperations.stream().anyMatch(operation -> CollectionUtils.nonEmpty(operation.getDeltaFilePaths()))) {
// if there are log files, we read all records into memory for a file group and apply updates.
iterator = readRecordsForGroupWithLogs(clusteringOperations, instantTime);
} else {
// We want to optimize reading records for case there are no log files.
iterator = readRecordsForGroupBaseFiles(clusteringOperations);
}
if (this.sortClusteringEnabled) {
RowDataSerializer rowDataSerializer = new RowDataSerializer(rowType);
BinaryExternalSorter sorter = initSorter();
while (iterator.hasNext()) {
RowData rowData = iterator.next();
BinaryRowData binaryRowData = rowDataSerializer.toBinaryRow(rowData).copy();
sorter.write(binaryRowData);
}
BinaryRowData row = binarySerializer.createInstance();
while ((row = sorter.getIterator().next(row)) != null) {
writerHelper.write(row);
}
sorter.close();
} else {
while (iterator.hasNext()) {
writerHelper.write(iterator.next());
}
}
List writeStatuses = writerHelper.getWriteStatuses(this.taskID);
collector.collect(new ClusteringCommitEvent(instantTime, getFileIds(clusteringOperations), writeStatuses, this.taskID));
writerHelper.close();
}
/**
* Read records from baseFiles, apply updates and convert to Iterator.
*/
@SuppressWarnings("unchecked")
private Iterator readRecordsForGroupWithLogs(List clusteringOps, String instantTime) {
List> recordIterators = new ArrayList<>();
long maxMemoryPerCompaction = IOUtils.getMaxMemoryPerCompaction(new FlinkTaskContextSupplier(null), writeConfig);
LOG.info("MaxMemoryPerCompaction run as part of clustering => " + maxMemoryPerCompaction);
for (ClusteringOperation clusteringOp : clusteringOps) {
try {
Option baseFileReader = StringUtils.isNullOrEmpty(clusteringOp.getDataFilePath())
? Option.empty()
: Option.of(HoodieFileReaderFactory.getReaderFactory(table.getConfig().getRecordMerger().getRecordType()).getFileReader(table.getHadoopConf(), new Path(clusteringOp.getDataFilePath())));
HoodieMergedLogRecordScanner scanner = HoodieMergedLogRecordScanner.newBuilder()
.withFileSystem(table.getMetaClient().getFs())
.withBasePath(table.getMetaClient().getBasePath())
.withLogFilePaths(clusteringOp.getDeltaFilePaths())
.withReaderSchema(readerSchema)
.withLatestInstantTime(instantTime)
.withMaxMemorySizeInBytes(maxMemoryPerCompaction)
.withReadBlocksLazily(writeConfig.getCompactionLazyBlockReadEnabled())
.withReverseReader(writeConfig.getCompactionReverseLogReadEnabled())
.withBufferSize(writeConfig.getMaxDFSStreamBufferSize())
.withSpillableMapBasePath(writeConfig.getSpillableMapBasePath())
.withDiskMapType(writeConfig.getCommonConfig().getSpillableDiskMapType())
.withBitCaskDiskMapCompressionEnabled(writeConfig.getCommonConfig().isBitCaskDiskMapCompressionEnabled())
.withRecordMerger(writeConfig.getRecordMerger())
.build();
HoodieTableConfig tableConfig = table.getMetaClient().getTableConfig();
HoodieFileSliceReader extends IndexedRecord> hoodieFileSliceReader = HoodieFileSliceReader.getFileSliceReader(baseFileReader, scanner, readerSchema,
tableConfig.getProps(),
tableConfig.populateMetaFields() ? Option.empty() : Option.of(Pair.of(tableConfig.getRecordKeyFieldProp(),
tableConfig.getPartitionFieldProp())));
recordIterators.add(StreamSupport.stream(Spliterators.spliteratorUnknownSize(hoodieFileSliceReader, Spliterator.NONNULL), false).map(hoodieRecord -> {
try {
return this.transform(hoodieRecord.toIndexedRecord(readerSchema, new Properties()).get().getData());
} catch (IOException e) {
throw new HoodieIOException("Failed to read next record", e);
}
}).iterator());
} catch (IOException e) {
throw new HoodieClusteringException("Error reading input data for " + clusteringOp.getDataFilePath()
+ " and " + clusteringOp.getDeltaFilePaths(), e);
}
}
return new ConcatenatingIterator<>(recordIterators);
}
/**
* Read records from baseFiles and get iterator.
*/
private Iterator readRecordsForGroupBaseFiles(List clusteringOps) {
List> iteratorsForPartition = clusteringOps.stream().map(clusteringOp -> {
Iterable indexedRecords = () -> {
try {
HoodieFileReaderFactory fileReaderFactory = HoodieFileReaderFactory.getReaderFactory(table.getConfig().getRecordMerger().getRecordType());
HoodieAvroFileReader fileReader = (HoodieAvroFileReader) fileReaderFactory.getFileReader(table.getHadoopConf(), new Path(clusteringOp.getDataFilePath()));
return new CloseableMappingIterator<>(fileReader.getRecordIterator(readerSchema), HoodieRecord::getData);
} catch (IOException e) {
throw new HoodieClusteringException("Error reading input data for " + clusteringOp.getDataFilePath()
+ " and " + clusteringOp.getDeltaFilePaths(), e);
}
};
return StreamSupport.stream(indexedRecords.spliterator(), false).map(this::transform).iterator();
}).collect(Collectors.toList());
return new ConcatenatingIterator<>(iteratorsForPartition);
}
/**
* Transform IndexedRecord into HoodieRecord.
*/
private RowData transform(IndexedRecord indexedRecord) {
GenericRecord record = (GenericRecord) indexedRecord;
return (RowData) avroToRowDataConverter.convert(record);
}
private int[] getRequiredPositions() {
final List fieldNames = readerSchema.getFields().stream().map(Schema.Field::name).collect(Collectors.toList());
return schema.getFields().stream()
.map(field -> fieldNames.indexOf(field.name()))
.mapToInt(i -> i)
.toArray();
}
private BinaryExternalSorter initSorter() {
ClassLoader cl = getContainingTask().getUserCodeClassLoader();
NormalizedKeyComputer computer = createSortCodeGenerator().generateNormalizedKeyComputer("SortComputer").newInstance(cl);
RecordComparator comparator = createSortCodeGenerator().generateRecordComparator("SortComparator").newInstance(cl);
MemoryManager memManager = getContainingTask().getEnvironment().getMemoryManager();
BinaryExternalSorter sorter = Utils.getBinaryExternalSorter(
this.getContainingTask(),
memManager,
computeMemorySize(),
this.getContainingTask().getEnvironment().getIOManager(),
(AbstractRowDataSerializer) binarySerializer,
binarySerializer,
computer,
comparator,
this.conf);
sorter.startThreads();
// register the metrics.
getMetricGroup().gauge("memoryUsedSizeInBytes", (Gauge) sorter::getUsedMemoryInBytes);
getMetricGroup().gauge("numSpillFiles", (Gauge) sorter::getNumSpillFiles);
getMetricGroup().gauge("spillInBytes", (Gauge) sorter::getSpillInBytes);
return sorter;
}
private SortCodeGenerator createSortCodeGenerator() {
SortOperatorGen sortOperatorGen = new SortOperatorGen(rowType,
conf.getString(FlinkOptions.CLUSTERING_SORT_COLUMNS).split(","));
return sortOperatorGen.createSortCodeGenerator();
}
private String getFileIds(List clusteringOperations) {
return clusteringOperations.stream().map(ClusteringOperation::getFileId).collect(Collectors.joining(","));
}
@VisibleForTesting
public void setExecutor(NonThrownExecutor executor) {
this.executor = executor;
}
@VisibleForTesting
public void setOutput(Output> output) {
this.output = output;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy