Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance. Project price only 1 $
You can buy this project and download/modify it how often you want.
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.sink;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordMerger;
import org.apache.hudi.common.model.WriteOperationType;
import org.apache.hudi.common.util.HoodieRecordUtils;
import org.apache.hudi.common.util.ObjectSizeCalculator;
import org.apache.hudi.common.util.ValidationUtils;
import org.apache.hudi.configuration.FlinkOptions;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.metrics.FlinkStreamWriteMetrics;
import org.apache.hudi.sink.common.AbstractStreamWriteFunction;
import org.apache.hudi.sink.event.WriteMetadataEvent;
import org.apache.hudi.table.action.commit.FlinkWriteHelper;
import org.apache.hudi.util.StreamerUtil;
import org.apache.flink.annotation.VisibleForTesting;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.metrics.MetricGroup;
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.util.Collector;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Random;
import java.util.function.BiFunction;
/**
* Sink function to write the data to the underneath filesystem.
*
*
Work Flow
*
*
The function firstly buffers the data as a batch of {@link HoodieRecord}s,
* It flushes(write) the records batch when the batch size exceeds the configured size {@link FlinkOptions#WRITE_BATCH_SIZE}
* or the total buffer size exceeds the configured size {@link FlinkOptions#WRITE_TASK_MAX_SIZE}
* or a Flink checkpoint starts. After a batch has been written successfully,
* the function notifies its operator coordinator {@link StreamWriteOperatorCoordinator} to mark a successful write.
*
*
The Semantics
*
*
The task implements exactly-once semantics by buffering the data between checkpoints. The operator coordinator
* starts a new instant on the timeline when a checkpoint triggers, the coordinator checkpoints always
* start before its operator, so when this function starts a checkpoint, a REQUESTED instant already exists.
*
*
The function process thread blocks data buffering after the checkpoint thread finishes flushing the existing data buffer until
* the current checkpoint succeed and the coordinator starts a new instant. Any error triggers the job failure during the metadata committing,
* when the job recovers from a failure, the write function re-send the write metadata to the coordinator to see if these metadata
* can re-commit, thus if unexpected error happens during the instant committing, the coordinator would retry to commit when the job
* recovers.
*
*
Fault Tolerance
*
*
The operator coordinator checks and commits the last instant then starts a new one after a checkpoint finished successfully.
* It rolls back any inflight instant before it starts a new instant, this means one hoodie instant only span one checkpoint,
* the write function blocks data buffer flushing for the configured checkpoint timeout
* before it throws exception, any checkpoint failure would finally trigger the job failure.
*
*
Note: The function task requires the input stream be shuffled by the file IDs.
*
* @param Type of the input record
* @see StreamWriteOperatorCoordinator
*/
public class StreamWriteFunction extends AbstractStreamWriteFunction {
private static final long serialVersionUID = 1L;
private static final Logger LOG = LoggerFactory.getLogger(StreamWriteFunction.class);
/**
* Write buffer as buckets for a checkpoint. The key is bucket ID.
*/
private transient Map buckets;
protected transient BiFunction, String, List> writeFunction;
private transient HoodieRecordMerger recordMerger;
/**
* Total size tracer.
*/
private transient TotalSizeTracer tracer;
/**
* Metrics for flink stream write.
*/
protected transient FlinkStreamWriteMetrics writeMetrics;
/**
* Constructs a StreamingSinkFunction.
*
* @param config The config options
*/
public StreamWriteFunction(Configuration config) {
super(config);
}
@Override
public void open(Configuration parameters) throws IOException {
this.tracer = new TotalSizeTracer(this.config);
initBuffer();
initWriteFunction();
initMergeClass();
registerMetrics();
}
@Override
public void snapshotState() {
// Based on the fact that the coordinator starts the checkpoint first,
// it would check the validity.
// wait for the buffer data flush out and request a new instant
flushRemaining(false);
}
@Override
public void processElement(I value, ProcessFunction.Context ctx, Collector