/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.streaming.connectors.cassandra;
import org.apache.flink.annotation.PublicEvolving;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.common.typeutils.TypeSerializer;
import org.apache.flink.api.dag.Transformation;
import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.api.java.typeutils.PojoTypeInfo;
import org.apache.flink.api.java.typeutils.RowTypeInfo;
import org.apache.flink.api.java.typeutils.TupleTypeInfo;
import org.apache.flink.api.scala.typeutils.CaseClassTypeInfo;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSink;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.transformations.LegacySinkTransformation;
import org.apache.flink.streaming.runtime.operators.CheckpointCommitter;
import org.apache.flink.types.Row;
import com.datastax.driver.core.Cluster;
import java.time.Duration;
import scala.Product;
/**
* This class wraps different Cassandra sink implementations to provide a common interface for all
* of them.
*
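* <p>A minimal usage sketch (the host, keyspace, and table name are illustrative
* assumptions, not prescribed by this class):
*
* <pre>{@code
* DataStream<Tuple2<String, Long>> wordCounts = ...; // hypothetical upstream stream
* CassandraSink.addSink(wordCounts)
*         .setQuery("INSERT INTO example.wordcount (word, count) VALUES (?, ?);")
*         .setHost("127.0.0.1")
*         .build();
* }</pre>
*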
* @param <IN> input type
*/
public class CassandraSink<IN> {
private final boolean useDataStreamSink;
private DataStreamSink<IN> sink1;
private SingleOutputStreamOperator<IN> sink2;
private CassandraSink(DataStreamSink<IN> sink) {
sink1 = sink;
useDataStreamSink = true;
}
private CassandraSink(SingleOutputStreamOperator<IN> sink) {
sink2 = sink;
useDataStreamSink = false;
}
private LegacySinkTransformation<IN> getSinkTransformation() {
return sink1.getTransformation();
}
private Transformation<IN> getTransformation() {
return sink2.getTransformation();
}
/**
* Sets the name of this sink. This name is used by the visualization and logging during
* runtime.
*
* @return The named sink.
*/
public CassandraSink<IN> name(String name) {
if (useDataStreamSink) {
getSinkTransformation().setName(name);
} else {
getTransformation().setName(name);
}
return this;
}
/**
* Sets an ID for this operator.
*
* <p>The specified ID is used to assign the same operator ID across job submissions (for
* example when starting a job from a savepoint).
*
* <p><strong>Important:</strong> this ID needs to be unique per transformation and job.
* Otherwise, job submission will fail.
*
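* <p>For example (a sketch; the uid string is an arbitrary illustrative choice):
*
* <pre>{@code
* CassandraSink.addSink(stream)
*         .setQuery("...")
*         .setHost("...")
*         .build()
*         .uid("cassandra-sink"); // stable operator ID across savepoints
* }</pre>
*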
* @param uid The unique user-specified ID of this transformation.
* @return The operator with the specified ID.
*/
@PublicEvolving
public CassandraSink<IN> uid(String uid) {
if (useDataStreamSink) {
getSinkTransformation().setUid(uid);
} else {
getTransformation().setUid(uid);
}
return this;
}
/**
* Sets a user-provided hash for this operator. This will be used AS IS to create the
* JobVertexID.
*
* <p>The user-provided hash is an alternative to the generated hashes, which is considered
* when identifying an operator through the default hash mechanics fails (e.g. because of
* changes between Flink versions).
*
* <p><strong>Important:</strong> this should be used as a workaround or for troubleshooting.
* The provided hash needs to be unique per transformation and job. Otherwise, job submission
* will fail. Furthermore, you cannot assign user-specified hashes to intermediate nodes in an
* operator chain and trying so will make your job fail.
*
* <p>A use case for this is in migration between Flink versions or changing the jobs in a way
* that changes the automatically generated hashes. In this case, providing the previous hashes
* directly through this method (e.g. obtained from old logs) can help to reestablish a lost
* mapping from states to their target operator.
*
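* <p>For example (a sketch; the hash below is a made-up placeholder, in practice it would
* be recovered from old logs or savepoint tooling):
*
* <pre>{@code
* cassandraSink.setUidHash("aabbccddeeff00112233445566778899");
* }</pre>
*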
* @param uidHash The user provided hash for this operator. This will become the JobVertexID,
* which is shown in the logs and web ui.
* @return The operator with the user provided hash.
*/
@PublicEvolving
public CassandraSink<IN> setUidHash(String uidHash) {
if (useDataStreamSink) {
getSinkTransformation().setUidHash(uidHash);
} else {
getTransformation().setUidHash(uidHash);
}
return this;
}
/**
* Sets the parallelism for this sink. The degree must be higher than zero.
*
* @param parallelism The parallelism for this sink.
* @return The sink with set parallelism.
*/
public CassandraSink<IN> setParallelism(int parallelism) {
if (useDataStreamSink) {
sink1.setParallelism(parallelism);
} else {
sink2.setParallelism(parallelism);
}
return this;
}
/**
* Turns off chaining for this operator so thread co-location will not be used as an
* optimization.
*
* <p>Chaining can be turned off for the whole job by {@link
* org.apache.flink.streaming.api.environment.StreamExecutionEnvironment#disableOperatorChaining()},
* however this is not advised for performance considerations.
*
* @return The sink with chaining disabled
*/
public CassandraSink<IN> disableChaining() {
if (useDataStreamSink) {
sink1.disableChaining();
} else {
sink2.disableChaining();
}
return this;
}
/**
* Sets the slot sharing group of this operation. Parallel instances of operations that are in
* the same slot sharing group will be co-located in the same TaskManager slot, if possible.
*
* <p>Operations inherit the slot sharing group of input operations if all input operations
* are in the same slot sharing group and no slot sharing group was explicitly specified.
*
* <p>Initially an operation is in the default slot sharing group. An operation can be put
* into the default group explicitly by setting the slot sharing group to {@code "default"}.
*
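* <p>For example (a sketch; the group name is an illustrative assumption):
*
* <pre>{@code
* cassandraSink.slotSharingGroup("cassandra-writes"); // separate the sink from other operators
* }</pre>
*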
* @param slotSharingGroup The slot sharing group name.
* @return The sink with the specified slot sharing group.
*/
public CassandraSink<IN> slotSharingGroup(String slotSharingGroup) {
if (useDataStreamSink) {
getSinkTransformation().setSlotSharingGroup(slotSharingGroup);
} else {
getTransformation().setSlotSharingGroup(slotSharingGroup);
}
return this;
}
/**
* Writes a DataStream into a Cassandra database.
*
* @param input input DataStream
* @param <IN> input type
* @return CassandraSinkBuilder, to further configure the sink
*/
public static <IN> CassandraSinkBuilder<IN> addSink(
org.apache.flink.streaming.api.scala.DataStream<IN> input) {
return addSink(input.javaStream());
}
/**
* Writes a DataStream into a Cassandra database.
*
* @param input input DataStream
* @param <IN> input type
* @return CassandraSinkBuilder, to further configure the sink
*/
public static <IN> CassandraSinkBuilder<IN> addSink(DataStream<IN> input) {
TypeInformation<IN> typeInfo = input.getType();
if (typeInfo instanceof TupleTypeInfo) {
DataStream<Tuple> tupleInput = (DataStream<Tuple>) input;
return (CassandraSinkBuilder<IN>)
new CassandraTupleSinkBuilder<>(
tupleInput,
tupleInput.getType(),
tupleInput
.getType()
.createSerializer(
tupleInput.getExecutionEnvironment().getConfig()));
}
if (typeInfo instanceof RowTypeInfo) {
DataStream<Row> rowInput = (DataStream<Row>) input;
return (CassandraSinkBuilder<IN>)
new CassandraRowSinkBuilder(
rowInput,
rowInput.getType(),
rowInput.getType()
.createSerializer(
rowInput.getExecutionEnvironment().getConfig()));
}
if (typeInfo instanceof PojoTypeInfo) {
return new CassandraPojoSinkBuilder<>(
input,
input.getType(),
input.getType().createSerializer(input.getExecutionEnvironment().getConfig()));
}
if (typeInfo instanceof CaseClassTypeInfo) {
DataStream<Product> productInput = (DataStream<Product>) input;
return (CassandraSinkBuilder<IN>)
new CassandraScalaProductSinkBuilder<>(
productInput,
productInput.getType(),
productInput
.getType()
.createSerializer(input.getExecutionEnvironment().getConfig()));
}
throw new IllegalArgumentException(
"No support for the type of the given DataStream: " + input.getType());
}
/**
* Builder for a {@link CassandraSink}.
*
* @param <IN> input type
*/
public abstract static class CassandraSinkBuilder<IN> {
protected final DataStream<IN> input;
protected final TypeSerializer<IN> serializer;
protected final TypeInformation<IN> typeInfo;
protected final CassandraSinkBaseConfig.Builder configBuilder;
protected ClusterBuilder builder;
protected String keyspace;
protected MapperOptions mapperOptions;
protected String query;
protected CheckpointCommitter committer;
protected boolean isWriteAheadLogEnabled;
protected CassandraFailureHandler failureHandler;
public CassandraSinkBuilder(
DataStream<IN> input, TypeInformation<IN> typeInfo, TypeSerializer<IN> serializer) {
this.input = input;
this.typeInfo = typeInfo;
this.serializer = serializer;
this.configBuilder = CassandraSinkBaseConfig.newBuilder();
}
/**
* Sets the query that is to be executed for every record.
*
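* <p>For example (a sketch; keyspace and table are illustrative), with one bind parameter
* per field of the incoming record, in order:
*
* <pre>{@code
* builder.setQuery("INSERT INTO example.wordcount (word, count) VALUES (?, ?);");
* }</pre>
*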
* @param query query to use
* @return this builder
*/
public CassandraSinkBuilder<IN> setQuery(String query) {
this.query = query;
return this;
}
/**
* Sets the keyspace to be used. Note that a default keyspace is only accepted for POJO
* input streams; the other builders reject it in their sanity checks.
*
* @param keyspace keyspace to use
* @return this builder
*/
public CassandraSinkBuilder<IN> setDefaultKeyspace(String keyspace) {
this.keyspace = keyspace;
return this;
}
/**
* Sets the cassandra host to connect to, using the default port 9042.
*
* @param host host to connect to
* @return this builder
*/
public CassandraSinkBuilder<IN> setHost(String host) {
return setHost(host, 9042);
}
/**
* Sets the cassandra host/port to connect to.
*
* @param host host to connect to
* @param port port to connect to
* @return this builder
*/
public CassandraSinkBuilder<IN> setHost(final String host, final int port) {
if (this.builder != null) {
throw new IllegalArgumentException(
"Builder was already set. You must use either setHost() or setClusterBuilder().");
}
this.builder =
new ClusterBuilder() {
@Override
protected Cluster buildCluster(Cluster.Builder builder) {
return builder.addContactPoint(host).withPort(port).build();
}
};
return this;
}
/**
* Sets the ClusterBuilder for this sink. A ClusterBuilder is used to configure the
* connection to cassandra.
*
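* <p>For example (a sketch mirroring what {@link #setHost(String, int)} does internally;
* the contact point is an illustrative assumption):
*
* <pre>{@code
* builder.setClusterBuilder(
*         new ClusterBuilder() {
*             protected Cluster buildCluster(Cluster.Builder builder) {
*                 return builder.addContactPoint("127.0.0.1").withPort(9042).build();
*             }
*         });
* }</pre>
*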
* @param builder ClusterBuilder to configure the connection to cassandra
* @return this builder
*/
public CassandraSinkBuilder<IN> setClusterBuilder(ClusterBuilder builder) {
if (this.builder != null) {
throw new IllegalArgumentException(
"Builder was already set. You must use either setHost() or setClusterBuilder().");
}
this.builder = builder;
return this;
}
/**
* Enables the write-ahead log, which allows exactly-once processing for non-deterministic
* algorithms that use idempotent updates.
*
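* <p>For example (a sketch; query and host are illustrative assumptions, and the input
* must be a tuple or row stream for a write-ahead sink to be available):
*
* <pre>{@code
* CassandraSink.addSink(tupleStream)
*         .setQuery("INSERT INTO example.wordcount (word, count) VALUES (?, ?);")
*         .setHost("127.0.0.1")
*         .enableWriteAheadLog()
*         .build();
* }</pre>
*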
* @return this builder
*/
public CassandraSinkBuilder<IN> enableWriteAheadLog() {
this.isWriteAheadLogEnabled = true;
return this;
}
/**
* Enables the write-ahead log, which allows exactly-once processing for non-deterministic
* algorithms that use idempotent updates.
*
* @param committer CheckpointCommitter that stores information about completed checkpoints
* in an external resource. By default this information is stored within a separate
* table within Cassandra.
* @return this builder
*/
public CassandraSinkBuilder<IN> enableWriteAheadLog(CheckpointCommitter committer) {
this.isWriteAheadLogEnabled = true;
this.committer = committer;
return this;
}
/**
* Sets the mapper options for this sink. The mapper options are used to configure the
* DataStax {@link com.datastax.driver.mapping.Mapper} when writing POJOs.
*
* <p>This call has no effect if the input {@link DataStream} for this sink does not contain
* POJOs.
*
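* <p>For example (a sketch; whether to save null fields is a per-job choice):
*
* <pre>{@code
* builder.setMapperOptions(() -> new Mapper.Option[] {Mapper.Option.saveNullFields(true)});
* }</pre>
*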
* @param options MapperOptions that returns an array of options used to configure the
* DataStax mapper.
* @return this builder
*/
public CassandraSinkBuilder<IN> setMapperOptions(MapperOptions options) {
this.mapperOptions = options;
return this;
}
/**
* Sets the failure handler for this sink. The failure handler is used to provide custom
* error handling.
*
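* <p>For example (a sketch; assumes a lambda is acceptable for {@link
* CassandraFailureHandler} and that dropping failed records suits the job):
*
* <pre>{@code
* builder.setFailureHandler(failure -> {
*     // illustrative: record the failure and continue instead of failing the job
*     System.err.println("Cassandra write failed: " + failure);
* });
* }</pre>
*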
* @param failureHandler CassandraFailureHandler that handles any Throwable error.
* @return this builder
*/
public CassandraSinkBuilder<IN> setFailureHandler(CassandraFailureHandler failureHandler) {
this.failureHandler = failureHandler;
return this;
}
/**
* Sets the maximum allowed number of concurrent requests for this sink.
*
* <p>This call has no effect if {@link CassandraSinkBuilder#enableWriteAheadLog()} is
* called.
*
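* <p>For example (a sketch; the limit and timeout are illustrative, not recommendations):
*
* <pre>{@code
* builder.setMaxConcurrentRequests(500, Duration.ofSeconds(30));
* }</pre>
*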
* @param maxConcurrentRequests maximum number of concurrent requests allowed
* @param timeout timeout duration when acquiring a permit to execute
* @return this builder
*/
public CassandraSinkBuilder<IN> setMaxConcurrentRequests(
int maxConcurrentRequests, Duration timeout) {
this.configBuilder.setMaxConcurrentRequests(maxConcurrentRequests);
this.configBuilder.setMaxConcurrentRequestsTimeout(timeout);
return this;
}
/**
* Sets the maximum allowed number of concurrent requests for this sink.
*
* <p>This call has no effect if {@link CassandraSinkBuilder#enableWriteAheadLog()} is
* called.
*
* @param maxConcurrentRequests maximum number of concurrent requests allowed
* @return this builder
*/
public CassandraSinkBuilder<IN> setMaxConcurrentRequests(int maxConcurrentRequests) {
this.configBuilder.setMaxConcurrentRequests(maxConcurrentRequests);
return this;
}
/**
* Enables ignoring null values: null values are treated as unset, which avoids writing
* null fields and creating tombstones.
*
* <p>This call has no effect if {@link CassandraSinkBuilder#enableWriteAheadLog()} is
* called.
*
* @return this builder
*/
public CassandraSinkBuilder<IN> enableIgnoreNullFields() {
this.configBuilder.setIgnoreNullFields(true);
return this;
}
/**
* Finalizes the configuration of this sink.
*
* @return finalized sink
* @throws Exception if the sanity checks fail or the sink cannot be created
*/
public CassandraSink<IN> build() throws Exception {
sanityCheck();
if (failureHandler == null) {
failureHandler = new NoOpCassandraFailureHandler();
}
return isWriteAheadLogEnabled ? createWriteAheadSink() : createSink();
}
protected abstract CassandraSink<IN> createSink() throws Exception;
protected abstract CassandraSink<IN> createWriteAheadSink() throws Exception;
protected void sanityCheck() {
if (builder == null) {
throw new IllegalArgumentException(
"Cassandra host information must be supplied using either setHost() or setClusterBuilder().");
}
}
}
/**
* Builder for a {@link CassandraTupleSink}.
*
* @param <IN> input type
*/
public static class CassandraTupleSinkBuilder<IN extends Tuple>
extends CassandraSinkBuilder<IN> {
public CassandraTupleSinkBuilder(
DataStream<IN> input, TypeInformation<IN> typeInfo, TypeSerializer<IN> serializer) {
super(input, typeInfo, serializer);
}
@Override
protected void sanityCheck() {
super.sanityCheck();
if (query == null || query.length() == 0) {
throw new IllegalArgumentException("Query must not be null or empty.");
}
if (keyspace != null) {
throw new IllegalArgumentException(
"Specifying a default keyspace is only allowed when using a Pojo-Stream as input.");
}
}
@Override
public CassandraSink<IN> createSink() throws Exception {
final CassandraTupleSink<IN> sink =
new CassandraTupleSink<>(query, builder, configBuilder.build(), failureHandler);
return new CassandraSink<>(input.addSink(sink).name("Cassandra Sink"));
}
@Override
protected CassandraSink<IN> createWriteAheadSink() throws Exception {
return committer == null
? new CassandraSink<>(
input.transform(
"Cassandra Sink",
null,
new CassandraTupleWriteAheadSink<>(
query,
serializer,
builder,
new CassandraCommitter(builder))))
: new CassandraSink<>(
input.transform(
"Cassandra Sink",
null,
new CassandraTupleWriteAheadSink<>(
query, serializer, builder, committer)));
}
}
/** Builder for a {@link CassandraRowSink}. */
public static class CassandraRowSinkBuilder extends CassandraSinkBuilder<Row> {
public CassandraRowSinkBuilder(
DataStream<Row> input,
TypeInformation<Row> typeInfo,
TypeSerializer<Row> serializer) {
super(input, typeInfo, serializer);
}
@Override
protected void sanityCheck() {
super.sanityCheck();
if (query == null || query.length() == 0) {
throw new IllegalArgumentException("Query must not be null or empty.");
}
if (keyspace != null) {
throw new IllegalArgumentException(
"Specifying a default keyspace is only allowed when using a Pojo-Stream as input.");
}
}
@Override
protected CassandraSink<Row> createSink() throws Exception {
final CassandraRowSink sink =
new CassandraRowSink(
typeInfo.getArity(),
query,
builder,
configBuilder.build(),
failureHandler);
return new CassandraSink<>(input.addSink(sink).name("Cassandra Sink"));
}
@Override
protected CassandraSink<Row> createWriteAheadSink() throws Exception {
return committer == null
? new CassandraSink<>(
input.transform(
"Cassandra Sink",
null,
new CassandraRowWriteAheadSink(
query,
serializer,
builder,
new CassandraCommitter(builder))))
: new CassandraSink<>(
input.transform(
"Cassandra Sink",
null,
new CassandraRowWriteAheadSink(
query, serializer, builder, committer)));
}
}
/**
* Builder for a {@link CassandraPojoSink}.
*
* @param <IN> input type
*/
public static class CassandraPojoSinkBuilder<IN> extends CassandraSinkBuilder<IN> {
public CassandraPojoSinkBuilder(
DataStream<IN> input, TypeInformation<IN> typeInfo, TypeSerializer<IN> serializer) {
super(input, typeInfo, serializer);
}
@Override
protected void sanityCheck() {
super.sanityCheck();
if (query != null) {
throw new IllegalArgumentException(
"Specifying a query is not allowed when using a Pojo-Stream as input.");
}
}
@Override
public CassandraSink<IN> createSink() throws Exception {
final CassandraPojoSink<IN> sink =
new CassandraPojoSink<>(
typeInfo.getTypeClass(),
builder,
mapperOptions,
keyspace,
configBuilder.build(),
failureHandler);
return new CassandraSink<>(input.addSink(sink).name("Cassandra Sink"));
}
@Override
protected CassandraSink<IN> createWriteAheadSink() throws Exception {
throw new IllegalArgumentException(
"Exactly-once guarantees can only be provided for tuple types.");
}
}
/**
* Builder for a {@link CassandraScalaProductSink}.
*
* @param <IN> input type
*/
public static class CassandraScalaProductSinkBuilder<IN extends Product>
extends CassandraSinkBuilder<IN> {
public CassandraScalaProductSinkBuilder(
DataStream<IN> input, TypeInformation<IN> typeInfo, TypeSerializer<IN> serializer) {
super(input, typeInfo, serializer);
}
@Override
protected void sanityCheck() {
super.sanityCheck();
if (query == null || query.length() == 0) {
throw new IllegalArgumentException("Query must not be null or empty.");
}
if (keyspace != null) {
throw new IllegalArgumentException(
"Specifying a default keyspace is only allowed when using a Pojo-Stream as input.");
}
}
@Override
public CassandraSink<IN> createSink() throws Exception {
final CassandraScalaProductSink<IN> sink =
new CassandraScalaProductSink<>(
query, builder, configBuilder.build(), failureHandler);
return new CassandraSink<>(input.addSink(sink).name("Cassandra Sink"));
}
@Override
protected CassandraSink<IN> createWriteAheadSink() throws Exception {
throw new IllegalArgumentException(
"Exactly-once guarantees can only be provided for flink tuple types.");
}
}
}