/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.beam.sdk.io.sparkreceiver;

import static org.apache.beam.sdk.util.Preconditions.checkStateNotNull;
import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkArgument;

import com.google.auto.value.AutoValue;
import org.apache.beam.sdk.transforms.Impulse;
import org.apache.beam.sdk.transforms.PTransform;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.SerializableFunction;
import org.apache.beam.sdk.values.PBegin;
import org.apache.beam.sdk.values.PCollection;
import org.apache.spark.streaming.receiver.Receiver;
import org.checkerframework.checker.nullness.qual.Nullable;
import org.joda.time.Instant;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Streaming sources for Spark {@link Receiver}.
 *
 * <h3>Reading using {@link SparkReceiverIO}</h3>
 *
 * <p>You will need to pass a {@link ReceiverBuilder}, which is responsible for instantiating new
 * {@link Receiver} objects.
 *
 * <p>The {@link Receiver} that will be used should implement the {@link HasOffset} interface. You
 * will need to pass {@code getOffsetFn}, which is a {@link SerializableFunction} that defines how
 * to get a {@code Long} offset from a {@code V} record.
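 *
 * <p>For instance, a minimal sketch of such a receiver (a hypothetical {@code CustomReceiver}
 * emitting {@code String} records; assumes {@link HasOffset} declares {@code setStartOffset(Long)}
 * and {@code getEndOffset()}) could look like this:
 *
 * <pre>{@code
 * // Hypothetical receiver, for illustration only.
 * public class CustomReceiver extends Receiver<String> implements HasOffset {
 *
 *   private Long startOffset = 0L;
 *
 *   public CustomReceiver() {
 *     super(StorageLevel.MEMORY_AND_DISK_2());
 *   }
 *
 *   public void setStartOffset(Long startOffset) {
 *     this.startOffset = startOffset == null ? 0L : startOffset;
 *   }
 *
 *   public Long getEndOffset() {
 *     return Long.MAX_VALUE; // Unbounded source: no known end offset.
 *   }
 *
 *   public void onStart() {
 *     // Start a thread that reads from the source beginning at startOffset
 *     // and calls store(record) for each new record.
 *   }
 *
 *   public void onStop() {
 *     // Stop the reading thread and release resources.
 *   }
 * }
 * }</pre>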
 *
 * <p>Optionally, you can pass {@code timestampFn}, which is a {@link SerializableFunction} that
 * defines how to get an {@code Instant} timestamp from a {@code V} record. You can also pass
 * {@code startOffset}, which is the inclusive start offset from which reading should start.
 *
 * <p>Optionally, you can pass {@code pullFrequencySec}, which is the delay in seconds between
 * polls for new records, and {@code startPollTimeoutSec}, which is the delay in seconds before
 * polling starts.
 *
 * <p>Example of {@link SparkReceiverIO#read()} usage:
 *
 * <pre>{@code
 * Pipeline p = ...; // Create pipeline.
 *
 * // Create a ReceiverBuilder for CustomReceiver
 * ReceiverBuilder<String, CustomReceiver> receiverBuilder =
 *     new ReceiverBuilder<>(CustomReceiver.class).withConstructorArgs();
 *
 * // Read from CustomReceiver
 * p.apply("Spark Receiver Read",
 *     SparkReceiverIO.<String>read()
 *         .withGetOffsetFn(Long::valueOf)
 *         .withTimestampFn(Instant::parse)
 *         .withPullFrequencySec(1L)
 *         .withStartPollTimeoutSec(2L)
 *         .withStartOffset(10L)
 *         .withSparkReceiverBuilder(receiverBuilder));
 * }</pre>
 */
public class SparkReceiverIO {

  private static final Logger LOG = LoggerFactory.getLogger(SparkReceiverIO.class);

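  /**
   * Creates an uninitialized {@link Read} transform. Configure it with the {@code with...} setter
   * methods before applying.
   */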
  public static <V> Read<V> read() {
    return new AutoValue_SparkReceiverIO_Read.Builder<V>().build();
  }

  /** A {@link PTransform} to read from Spark {@link Receiver}. */
  @AutoValue
  @AutoValue.CopyAnnotations
  public abstract static class Read<V> extends PTransform<PBegin, PCollection<V>> {

    abstract @Nullable ReceiverBuilder<V, ? extends Receiver<V>> getSparkReceiverBuilder();

    abstract @Nullable SerializableFunction<V, Long> getGetOffsetFn();

    abstract @Nullable SerializableFunction<V, Instant> getTimestampFn();

    abstract @Nullable Long getPullFrequencySec();

    abstract @Nullable Long getStartPollTimeoutSec();

    abstract @Nullable Long getStartOffset();

    abstract Builder<V> toBuilder();

    @AutoValue.Builder
    abstract static class Builder<V> {

      abstract Builder<V> setSparkReceiverBuilder(
          ReceiverBuilder<V, ? extends Receiver<V>> sparkReceiverBuilder);

      abstract Builder<V> setGetOffsetFn(SerializableFunction<V, Long> getOffsetFn);

      abstract Builder<V> setTimestampFn(SerializableFunction<V, Instant> timestampFn);

      abstract Builder<V> setPullFrequencySec(Long pullFrequencySec);

      abstract Builder<V> setStartPollTimeoutSec(Long startPollTimeoutSec);

      abstract Builder<V> setStartOffset(Long startOffset);

      abstract Read<V> build();
    }

    /** Sets the {@link ReceiverBuilder} used to instantiate the custom Spark {@link Receiver}. */
    public Read<V> withSparkReceiverBuilder(
        ReceiverBuilder<V, ? extends Receiver<V>> sparkReceiverBuilder) {
      checkArgument(sparkReceiverBuilder != null, "Spark receiver builder cannot be null");
      return toBuilder().setSparkReceiverBuilder(sparkReceiverBuilder).build();
    }

    /** A function to get the offset of a record, from which the {@link Receiver} can be started. */
    public Read<V> withGetOffsetFn(SerializableFunction<V, Long> getOffsetFn) {
      checkArgument(getOffsetFn != null, "Get offset function cannot be null");
      return toBuilder().setGetOffsetFn(getOffsetFn).build();
    }

    /** A function to calculate the timestamp for a record. */
    public Read<V> withTimestampFn(SerializableFunction<V, Instant> timestampFn) {
      checkArgument(timestampFn != null, "Timestamp function cannot be null");
      return toBuilder().setTimestampFn(timestampFn).build();
    }

    /** Delay in seconds between polls for new records. */
    public Read<V> withPullFrequencySec(Long pullFrequencySec) {
      checkArgument(pullFrequencySec != null, "Pull frequency cannot be null");
      return toBuilder().setPullFrequencySec(pullFrequencySec).build();
    }

    /** Waiting time in seconds after the {@link Receiver} starts, required to prepare for polling. */
    public Read<V> withStartPollTimeoutSec(Long startPollTimeoutSec) {
      checkArgument(startPollTimeoutSec != null, "Start poll timeout cannot be null");
      return toBuilder().setStartPollTimeoutSec(startPollTimeoutSec).build();
    }

    /** Inclusive start offset from which reading should start. */
    public Read<V> withStartOffset(Long startOffset) {
      checkArgument(startOffset != null, "Start offset cannot be null");
      return toBuilder().setStartOffset(startOffset).build();
    }

    @Override
    public PCollection<V> expand(PBegin input) {
      validateTransform();
      return input.apply(new ReadFromSparkReceiverViaSdf<>(this));
    }

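    /** Validates that the required {@link ReceiverBuilder} and {@code getOffsetFn} are set. */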
    public void validateTransform() {
      ReceiverBuilder<V, ? extends Receiver<V>> sparkReceiverBuilder = getSparkReceiverBuilder();
      checkStateNotNull(sparkReceiverBuilder, "withSparkReceiverBuilder() is required");
      checkStateNotNull(getGetOffsetFn(), "withGetOffsetFn() is required");
    }
  }

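  /**
   * Expands to an {@link Impulse} followed by a {@link ParDo} running {@code
   * ReadFromSparkReceiverWithOffsetDoFn}, which starts the configured {@link Receiver} and emits
   * the records it stores.
   */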
  static class ReadFromSparkReceiverViaSdf<V> extends PTransform<PBegin, PCollection<V>> {

    private final Read<V> sparkReceiverRead;

    ReadFromSparkReceiverViaSdf(Read<V> sparkReceiverRead) {
      this.sparkReceiverRead = sparkReceiverRead;
    }

    @Override
    public PCollection<V> expand(PBegin input) {
      final ReceiverBuilder<V, ? extends Receiver<V>> sparkReceiverBuilder =
          sparkReceiverRead.getSparkReceiverBuilder();
      checkStateNotNull(sparkReceiverBuilder, "withSparkReceiverBuilder() is required");
      if (!HasOffset.class.isAssignableFrom(sparkReceiverBuilder.getSparkReceiverClass())) {
        throw new UnsupportedOperationException(
            String.format(
                "Given Spark Receiver class %s does not implement the HasOffset interface,"
                    + " so it is not supported.",
                sparkReceiverBuilder.getSparkReceiverClass().getName()));
      } else {
        LOG.info("{} started reading", ReadFromSparkReceiverWithOffsetDoFn.class.getSimpleName());
        return input
            .apply(Impulse.create())
            .apply(ParDo.of(new ReadFromSparkReceiverWithOffsetDoFn<>(sparkReceiverRead)));
        // TODO: Split data from SparkReceiver into multiple workers
      }
    }
  }
}