// org.apache.flink.connector.base.source.hybrid.HybridSource Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.connector.base.source.hybrid;
import org.apache.flink.annotation.PublicEvolving;
import org.apache.flink.api.common.ExecutionConfig;
import org.apache.flink.api.connector.source.Boundedness;
import org.apache.flink.api.connector.source.Source;
import org.apache.flink.api.connector.source.SourceReader;
import org.apache.flink.api.connector.source.SourceReaderContext;
import org.apache.flink.api.connector.source.SplitEnumerator;
import org.apache.flink.api.connector.source.SplitEnumeratorContext;
import org.apache.flink.api.java.ClosureCleaner;
import org.apache.flink.core.io.SimpleVersionedSerializer;
import org.apache.flink.util.Preconditions;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
/**
 * Hybrid source that switches underlying sources based on configured source chain.
 *
 * <p>A simple example with FileSource and KafkaSource with fixed Kafka start position:
 *
 * <pre>{@code
 * FileSource<String> fileSource =
 *     FileSource.forRecordStreamFormat(new TextLineFormat(), Path.fromLocalFile(testDir)).build();
 * KafkaSource<String> kafkaSource =
 *     KafkaSource.<String>builder()
 *         .setBootstrapServers("localhost:9092")
 *         .setGroupId("MyGroup")
 *         .setTopics(Arrays.asList("quickstart-events"))
 *         .setDeserializer(
 *             KafkaRecordDeserializer.valueOnly(StringDeserializer.class))
 *         .setStartingOffsets(OffsetsInitializer.earliest())
 *         .build();
 * HybridSource<String> hybridSource =
 *     HybridSource.builder(fileSource)
 *         .addSource(kafkaSource)
 *         .build();
 * }</pre>
 *
 * <p>A more complex example with Kafka start position derived from previous source:
 *
 * <pre>{@code
 * HybridSource<String> hybridSource =
 *     HybridSource.<String, StaticFileSplitEnumerator>builder(fileSource)
 *         .addSource(
 *             switchContext -> {
 *               StaticFileSplitEnumerator previousEnumerator =
 *                   switchContext.getPreviousEnumerator();
 *               // how to get timestamp depends on specific enumerator
 *               long timestamp = previousEnumerator.getEndTimestamp();
 *               OffsetsInitializer offsets =
 *                   OffsetsInitializer.timestamp(timestamp);
 *               KafkaSource<String> kafkaSource =
 *                   KafkaSource.<String>builder()
 *                       .setBootstrapServers("localhost:9092")
 *                       .setGroupId("MyGroup")
 *                       .setTopics(Arrays.asList("quickstart-events"))
 *                       .setDeserializer(
 *                           KafkaRecordDeserializer.valueOnly(StringDeserializer.class))
 *                       .setStartingOffsets(offsets)
 *                       .build();
 *               return kafkaSource;
 *             },
 *             Boundedness.CONTINUOUS_UNBOUNDED)
 *         .build();
 * }</pre>
 */
@PublicEvolving
public class HybridSource implements Source {
private final List sources;
/** Protected for subclass, use {@link #builder(Source)} to construct source. */
protected HybridSource(List sources) {
Preconditions.checkArgument(!sources.isEmpty());
this.sources = sources;
}
/** Builder for {@link HybridSource}. */
public static HybridSourceBuilder builder(
Source firstSource) {
HybridSourceBuilder builder = new HybridSourceBuilder<>();
return builder.addSource(firstSource);
}
@Override
public Boundedness getBoundedness() {
return sources.get(sources.size() - 1).boundedness;
}
@Override
public SourceReader createReader(SourceReaderContext readerContext)
throws Exception {
return new HybridSourceReader(readerContext);
}
@Override
public SplitEnumerator createEnumerator(
SplitEnumeratorContext enumContext) {
return new HybridSourceSplitEnumerator(enumContext, sources, 0, null);
}
@Override
public SplitEnumerator restoreEnumerator(
SplitEnumeratorContext enumContext,
HybridSourceEnumeratorState checkpoint)
throws Exception {
return new HybridSourceSplitEnumerator(
enumContext, sources, checkpoint.getCurrentSourceIndex(), checkpoint);
}
@Override
public SimpleVersionedSerializer getSplitSerializer() {
return new HybridSourceSplitSerializer();
}
@Override
public SimpleVersionedSerializer
getEnumeratorCheckpointSerializer() {
return new HybridSourceEnumeratorStateSerializer();
}
/**
* Context provided to source factory.
*
* To derive a start position at switch time, the source can be initialized from context of
* the previous enumerator. A specific enumerator implementation may carry state such as an end
* timestamp, that can be used to derive the start position of the next source.
*
*
Currently only the previous enumerator is exposed. The context interface allows for
* backward compatible extension, i.e. additional information about the previous source can be
* supplied in the future.
*/
public interface SourceSwitchContext {
EnumT getPreviousEnumerator();
}
/**
* Factory for underlying sources of {@link HybridSource}.
*
* This factory permits building of a source at graph construction time or deferred at switch
* time. Provides the ability to set a start position in any way a specific source allows.
* Future convenience could be built on top of it, for example a default implementation that
* recognizes optional interfaces to transfer position in a universal format.
*
*
Called when the current enumerator has finished. The previous source's final state can
* thus be used to construct the next source, as required for dynamic position transfer at time
* of switching.
*
*
If start position is known at job submission time, the source can be constructed in the
* entry point and simply wrapped into the factory, providing the benefit of validation during
* submission.
*/
@FunctionalInterface
public interface SourceFactory<
T, SourceT extends Source, FromEnumT extends SplitEnumerator>
extends Serializable {
SourceT create(SourceSwitchContext context);
}
private static class PassthroughSourceFactory<
T, SourceT extends Source, FromEnumT extends SplitEnumerator>
implements SourceFactory {
private final SourceT source;
private PassthroughSourceFactory(SourceT source) {
this.source = source;
}
@Override
public SourceT create(SourceSwitchContext context) {
return source;
}
}
/** Entry for list of underlying sources. */
static class SourceListEntry implements Serializable {
protected final SourceFactory factory;
protected final Boundedness boundedness;
private SourceListEntry(SourceFactory factory, Boundedness boundedness) {
this.factory = Preconditions.checkNotNull(factory);
this.boundedness = Preconditions.checkNotNull(boundedness);
}
static SourceListEntry of(SourceFactory configurer, Boundedness boundedness) {
return new SourceListEntry(configurer, boundedness);
}
}
/** Builder for HybridSource. */
public static class HybridSourceBuilder
implements Serializable {
private final List sources;
public HybridSourceBuilder() {
sources = new ArrayList<>();
}
/** Add pre-configured source (without switch time modification). */
public >
HybridSourceBuilder addSource(NextSourceT source) {
return addSource(new PassthroughSourceFactory<>(source), source.getBoundedness());
}
/** Add source with deferred instantiation based on previous enumerator. */
public >
HybridSourceBuilder addSource(
SourceFactory sourceFactory,
Boundedness boundedness) {
if (!sources.isEmpty()) {
Preconditions.checkArgument(
Boundedness.BOUNDED.equals(sources.get(sources.size() - 1).boundedness),
"All sources except the final source need to be bounded.");
}
ClosureCleaner.clean(
sourceFactory, ExecutionConfig.ClosureCleanerLevel.RECURSIVE, true);
sources.add(SourceListEntry.of(sourceFactory, boundedness));
return (HybridSourceBuilder) this;
}
/** Build the source. */
public HybridSource build() {
return new HybridSource(sources);
}
}
}