// SPDX-FileCopyrightText: 2023 LakeSoul Contributors
//
// SPDX-License-Identifier: Apache-2.0
package org.apache.flink.lakesoul.sink.writer;
import org.apache.flink.api.connector.sink.Sink;
import org.apache.flink.api.connector.sink.SinkWriter;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.core.fs.Path;
import org.apache.flink.lakesoul.sink.LakeSoulMultiTablesSink;
import org.apache.flink.lakesoul.sink.state.LakeSoulMultiTableSinkCommittable;
import org.apache.flink.lakesoul.sink.state.LakeSoulWriterBucketState;
import org.apache.flink.lakesoul.tool.LakeSoulSinkOptions;
import org.apache.flink.lakesoul.types.TableSchemaIdentity;
import org.apache.flink.metrics.Counter;
import org.apache.flink.metrics.groups.SinkWriterMetricGroup;
import org.apache.flink.streaming.api.functions.sink.filesystem.BucketAssigner;
import org.apache.flink.streaming.api.functions.sink.filesystem.BucketWriter;
import org.apache.flink.streaming.api.functions.sink.filesystem.OutputFileConfig;
import org.apache.flink.streaming.api.functions.sink.filesystem.RollingPolicy;
import org.apache.flink.table.data.RowData;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.annotation.Nullable;
import java.io.IOException;
import java.util.*;
import static org.apache.flink.util.Preconditions.checkArgument;
import static org.apache.flink.util.Preconditions.checkNotNull;
/**
 * A {@link SinkWriter} implementation for {@link LakeSoulMultiTablesSink}.
 *
 * <p>It writes data to and manages the different active {@link LakeSoulWriterBucket buckets} in the
 * {@link LakeSoulMultiTablesSink}.
 *
 * @param <IN> The type of input elements.
 */
public abstract class AbstractLakeSoulMultiTableSinkWriter<IN>
implements SinkWriter<IN, LakeSoulMultiTableSinkCommittable, LakeSoulWriterBucketState>,
Sink.ProcessingTimeService.ProcessingTimeCallback {
private static final Logger LOG = LoggerFactory.getLogger(AbstractLakeSoulMultiTableSinkWriter.class);
private final int subTaskId;
private final LakeSoulWriterBucketFactory bucketFactory;
private final RollingPolicy<RowData, String> rollingPolicy;
private final Sink.ProcessingTimeService processingTimeService;
private final long bucketCheckInterval;
// --------------------------- runtime fields -----------------------------
private final BucketerContext bucketerContext;
private final Map<Tuple2<TableSchemaIdentity, String>, LakeSoulWriterBucket> activeBuckets;
private final OutputFileConfig outputFileConfig;
private final Counter recordsOutCounter;
private final Configuration conf;
public AbstractLakeSoulMultiTableSinkWriter(
int subTaskId,
final SinkWriterMetricGroup metricGroup,
final LakeSoulWriterBucketFactory bucketFactory,
final RollingPolicy<RowData, String> rollingPolicy,
final OutputFileConfig outputFileConfig,
final Sink.ProcessingTimeService processingTimeService,
final long bucketCheckInterval,
final Configuration conf) {
this.subTaskId = subTaskId;
this.bucketFactory = checkNotNull(bucketFactory);
this.rollingPolicy = checkNotNull(rollingPolicy);
this.outputFileConfig = checkNotNull(outputFileConfig);
this.activeBuckets = new HashMap<>();
this.bucketerContext = new BucketerContext();
this.recordsOutCounter =
checkNotNull(metricGroup).getIOMetricGroup().getNumRecordsOutCounter();
this.processingTimeService = checkNotNull(processingTimeService);
checkArgument(
bucketCheckInterval > 0,
"Bucket checking interval for processing time should be positive.");
this.bucketCheckInterval = bucketCheckInterval;
this.conf = conf;
}
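/**
 * Restores the writer from previously checkpointed bucket states: each {@link LakeSoulWriterBucketState}
 * is turned back into a bucket (merged with an already active bucket if one exists for the same table
 * identity and bucket id), and the periodic bucket-inspection timer is registered.
 */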
public void initializeState(List<LakeSoulWriterBucketState> bucketStates) throws IOException {
checkNotNull(bucketStates, "The retrieved state was null.");
LOG.info("initializeState size {}", bucketStates.size());
for (LakeSoulWriterBucketState state : bucketStates) {
String bucketId = state.getBucketId();
LOG.info("initializeState restoring state: {}", state);
TableSchemaIdentity identity = state.getIdentity();
TableSchemaWriterCreator creator = getOrCreateTableSchemaWriterCreator(identity);
LakeSoulWriterBucket restoredBucket =
bucketFactory.restoreBucket(
subTaskId,
state.getIdentity(),
creator.createBucketWriter(),
rollingPolicy,
state,
outputFileConfig);
updateActiveBucketId(identity, bucketId, restoredBucket);
}
registerNextBucketInspectionTimer();
}
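/**
 * Merges a restored bucket into an already active bucket with the same table identity and bucket id,
 * or registers it as a new active bucket.
 */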
private void updateActiveBucketId(TableSchemaIdentity tableId, String bucketId, LakeSoulWriterBucket restoredBucket)
throws IOException {
final LakeSoulWriterBucket bucket = activeBuckets.get(Tuple2.of(tableId, bucketId));
if (bucket != null) {
bucket.merge(restoredBucket);
} else {
activeBuckets.put(Tuple2.of(tableId, bucketId), restoredBucket);
}
}
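/**
 * Returns the {@link TableSchemaWriterCreator} for the given table identity, creating it if it does
 * not exist yet.
 */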
protected abstract TableSchemaWriterCreator getOrCreateTableSchemaWriterCreator(TableSchemaIdentity identity);
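/**
 * Extracts one or more (table schema identity, row) pairs from an incoming element, allowing a single
 * input record to be routed to one or more target tables.
 */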
protected abstract List<Tuple2<TableSchemaIdentity, RowData>> extractTableSchemaAndRowData(IN element) throws Exception;
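/**
 * Returns the DML timestamp (in milliseconds) carried by the element; the default implementation
 * returns {@link Long#MAX_VALUE}.
 */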
protected long getDataDmlTsMs(IN element) {
return Long.MAX_VALUE;
}
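/**
 * Writes a single element: updates the bucketer context with the current timestamps, extracts the
 * per-table rows, assigns each row to a bucket via the table's {@link BucketAssigner}, writes it,
 * and increments the records-out counter. Null elements are ignored.
 */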
@Override
public void write(IN element, Context context) throws IOException {
if (element == null) {
return;
}
// setting the values in the bucketer context
bucketerContext.update(
context.timestamp(),
context.currentWatermark(),
processingTimeService.getCurrentProcessingTime());
List<Tuple2<TableSchemaIdentity, RowData>> schemaAndRowDatas;
long dataDmlTsMs = getDataDmlTsMs(element);
try {
schemaAndRowDatas = extractTableSchemaAndRowData(element);
} catch (Exception e) {
throw new IOException(e);
}
for (Tuple2<TableSchemaIdentity, RowData> schemaAndRowData : schemaAndRowDatas) {
TableSchemaIdentity identity = schemaAndRowData.f0;
RowData rowData = schemaAndRowData.f1;
TableSchemaWriterCreator creator = getOrCreateTableSchemaWriterCreator(identity);
final String bucketId = creator.bucketAssigner.getBucketId(rowData, bucketerContext);
final LakeSoulWriterBucket bucket = getOrCreateBucketForBucketId(identity, bucketId, creator);
bucket.write(rowData, processingTimeService.getCurrentProcessingTime(), dataDmlTsMs);
recordsOutCounter.inc();
}
}
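/**
 * Prepares committables from all active buckets before a commit. Buckets that are no longer active
 * are removed from the active set instead of contributing committables.
 */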
@Override
public List<LakeSoulMultiTableSinkCommittable> prepareCommit(boolean flush) throws IOException {
List<LakeSoulMultiTableSinkCommittable> committables = new ArrayList<>();
String dmlType = this.conf.getString(LakeSoulSinkOptions.DMLTYPE);
// Every time before we prepare commit, we first check and remove the inactive
// buckets. Checking the activeness right before pre-committing avoids re-creating
// the bucket every time if the bucket uses OnCheckpointRollingPolicy.
Iterator<Map.Entry<Tuple2<TableSchemaIdentity, String>, LakeSoulWriterBucket>> activeBucketIt =
activeBuckets.entrySet().iterator();
while (activeBucketIt.hasNext()) {
Map.Entry<Tuple2<TableSchemaIdentity, String>, LakeSoulWriterBucket> entry = activeBucketIt.next();
if (!entry.getValue().isActive()) {
activeBucketIt.remove();
} else {
committables.addAll(entry.getValue().prepareCommit(flush, dmlType));
}
}
return committables;
}
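/**
 * Snapshots the state of every active bucket into a list of {@link LakeSoulWriterBucketState}
 * for checkpointing.
 */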
@Override
public List<LakeSoulWriterBucketState> snapshotState(long checkpointId) throws IOException {
List<LakeSoulWriterBucketState> states = new ArrayList<>();
for (LakeSoulWriterBucket bucket : activeBuckets.values()) {
LakeSoulWriterBucketState state = bucket.snapshotState();
LOG.info("snapshotState: {}", state);
states.add(state);
}
LOG.info("snapshotState size: {}", states.size());
return states;
}
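/**
 * Looks up the active bucket for the given (table identity, bucket id) pair, creating a new bucket
 * under the table location when none exists yet.
 */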
private LakeSoulWriterBucket getOrCreateBucketForBucketId(
TableSchemaIdentity identity,
String bucketId,
TableSchemaWriterCreator creator) throws IOException {
LakeSoulWriterBucket bucket = activeBuckets.get(Tuple2.of(identity, bucketId));
if (bucket == null) {
final Path bucketPath = assembleBucketPath(creator.tableLocation, bucketId);
BucketWriter<RowData, String> bucketWriter = creator.createBucketWriter();
bucket =
bucketFactory.getNewBucket(
subTaskId,
creator.identity,
bucketId, bucketPath, bucketWriter, rollingPolicy, outputFileConfig);
activeBuckets.put(Tuple2.of(identity, bucketId), bucket);
LOG.info("Create new bucket {}, {}, {}",
identity, bucketId, bucketPath);
}
return bucket;
}
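/** Disposes the in-progress part files of all active buckets. */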
@Override
public void close() {
if (activeBuckets != null) {
activeBuckets.values().forEach(LakeSoulWriterBucket::disposePartFile);
}
}
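/** Resolves the bucket path under the base path; an empty bucket id maps to the base path itself. */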
private Path assembleBucketPath(Path basePath, String bucketId) {
if ("".equals(bucketId)) {
return basePath;
}
return new Path(basePath, bucketId);
}
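/**
 * Periodic processing-time callback: notifies every active bucket of the current time and then
 * re-registers the next inspection timer.
 */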
@Override
public void onProcessingTime(long time) throws IOException {
for (LakeSoulWriterBucket bucket : activeBuckets.values()) {
bucket.onProcessingTime(time);
}
registerNextBucketInspectionTimer();
}
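/** Schedules the next bucket inspection {@code bucketCheckInterval} milliseconds after the current processing time. */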
private void registerNextBucketInspectionTimer() {
final long nextInspectionTime =
processingTimeService.getCurrentProcessingTime() + bucketCheckInterval;
processingTimeService.registerProcessingTimer(nextInspectionTime, this);
}
/**
* The {@link BucketAssigner.Context} exposed to the {@link BucketAssigner#getBucketId(Object,
* BucketAssigner.Context)} whenever a new incoming element arrives.
*/
private static final class BucketerContext implements BucketAssigner.Context {
@Nullable
private Long elementTimestamp;
private long currentWatermark;
private long currentProcessingTime;
private BucketerContext() {
this.elementTimestamp = null;
this.currentWatermark = Long.MIN_VALUE;
this.currentProcessingTime = Long.MIN_VALUE;
}
void update(@Nullable Long elementTimestamp, long watermark, long currentProcessingTime) {
this.elementTimestamp = elementTimestamp;
this.currentWatermark = watermark;
this.currentProcessingTime = currentProcessingTime;
}
@Override
public long currentProcessingTime() {
return currentProcessingTime;
}
@Override
public long currentWatermark() {
return currentWatermark;
}
@Override
@Nullable
public Long timestamp() {
return elementTimestamp;
}
}
}