org.apache.kafka.streams.StreamsBuilder Maven / Gradle / Ivy
Show all versions of org.apache.servicemix.bundles.kafka-streams
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.kafka.streams;
import org.apache.kafka.common.utils.Bytes;
import org.apache.kafka.streams.errors.TopologyException;
import org.apache.kafka.streams.kstream.Consumed;
import org.apache.kafka.streams.kstream.GlobalKTable;
import org.apache.kafka.streams.kstream.KGroupedStream;
import org.apache.kafka.streams.kstream.KGroupedTable;
import org.apache.kafka.streams.kstream.KStream;
import org.apache.kafka.streams.kstream.KTable;
import org.apache.kafka.streams.kstream.Materialized;
import org.apache.kafka.streams.kstream.Transformer;
import org.apache.kafka.streams.kstream.ValueTransformer;
import org.apache.kafka.streams.kstream.internals.ConsumedInternal;
import org.apache.kafka.streams.kstream.internals.InternalStreamsBuilder;
import org.apache.kafka.streams.kstream.internals.MaterializedInternal;
import org.apache.kafka.streams.processor.Processor;
import org.apache.kafka.streams.processor.ProcessorSupplier;
import org.apache.kafka.streams.processor.StateStore;
import org.apache.kafka.streams.processor.TimestampExtractor;
import org.apache.kafka.streams.processor.internals.InternalTopologyBuilder;
import org.apache.kafka.streams.processor.internals.ProcessorNode;
import org.apache.kafka.streams.processor.internals.SourceNode;
import org.apache.kafka.streams.state.KeyValueStore;
import org.apache.kafka.streams.state.StoreBuilder;
import java.util.Collection;
import java.util.Collections;
import java.util.Objects;
import java.util.Properties;
import java.util.regex.Pattern;
/**
* {@code StreamsBuilder} provide the high-level Kafka Streams DSL to specify a Kafka Streams topology.
*
* @see Topology
* @see KStream
* @see KTable
* @see GlobalKTable
*/
public class StreamsBuilder {
/** The actual topology that is constructed by this StreamsBuilder. */
private final Topology topology = new Topology();
/** The topology's internal builder. */
final InternalTopologyBuilder internalTopologyBuilder = topology.internalTopologyBuilder;
private final InternalStreamsBuilder internalStreamsBuilder = new InternalStreamsBuilder(internalTopologyBuilder);
/**
* Create a {@link KStream} from the specified topic.
* The default {@code "auto.offset.reset"} strategy, default {@link TimestampExtractor}, and default key and value
* deserializers as specified in the {@link StreamsConfig config} are used.
*
* If multiple topics are specified there is no ordering guarantee for records from different topics.
*
* Note that the specified input topic must be partitioned by key.
* If this is not the case it is the user's responsibility to repartition the data before any key based operation
* (like aggregation or join) is applied to the returned {@link KStream}.
*
* @param topic the topic name; cannot be {@code null}
* @return a {@link KStream} for the specified topic
*/
public synchronized KStream stream(final String topic) {
return stream(Collections.singleton(topic));
}
/**
* Create a {@link KStream} from the specified topic.
* The {@code "auto.offset.reset"} strategy, {@link TimestampExtractor}, key and value deserializers
* are defined by the options in {@link Consumed} are used.
*
* Note that the specified input topic must be partitioned by key.
* If this is not the case it is the user's responsibility to repartition the data before any key based operation
* (like aggregation or join) is applied to the returned {@link KStream}.
*
* @param topic the topic names; cannot be {@code null}
* @param consumed the instance of {@link Consumed} used to define optional parameters
* @return a {@link KStream} for the specified topic
*/
public synchronized KStream stream(final String topic,
final Consumed consumed) {
return stream(Collections.singleton(topic), consumed);
}
/**
* Create a {@link KStream} from the specified topics.
* The default {@code "auto.offset.reset"} strategy, default {@link TimestampExtractor}, and default key and value
* deserializers as specified in the {@link StreamsConfig config} are used.
*
* If multiple topics are specified there is no ordering guarantee for records from different topics.
*
* Note that the specified input topics must be partitioned by key.
* If this is not the case it is the user's responsibility to repartition the data before any key based operation
* (like aggregation or join) is applied to the returned {@link KStream}.
*
* @param topics the topic names; must contain at least one topic name
* @return a {@link KStream} for the specified topics
*/
public synchronized KStream stream(final Collection topics) {
return stream(topics, Consumed.with(null, null, null, null));
}
/**
* Create a {@link KStream} from the specified topics.
* The {@code "auto.offset.reset"} strategy, {@link TimestampExtractor}, key and value deserializers
* are defined by the options in {@link Consumed} are used.
*
* If multiple topics are specified there is no ordering guarantee for records from different topics.
*
* Note that the specified input topics must be partitioned by key.
* If this is not the case it is the user's responsibility to repartition the data before any key based operation
* (like aggregation or join) is applied to the returned {@link KStream}.
*
* @param topics the topic names; must contain at least one topic name
* @param consumed the instance of {@link Consumed} used to define optional parameters
* @return a {@link KStream} for the specified topics
*/
public synchronized KStream stream(final Collection topics,
final Consumed consumed) {
Objects.requireNonNull(topics, "topics can't be null");
Objects.requireNonNull(consumed, "consumed can't be null");
return internalStreamsBuilder.stream(topics, new ConsumedInternal<>(consumed));
}
/**
* Create a {@link KStream} from the specified topic pattern.
* The default {@code "auto.offset.reset"} strategy, default {@link TimestampExtractor}, and default key and value
* deserializers as specified in the {@link StreamsConfig config} are used.
*
* If multiple topics are matched by the specified pattern, the created {@link KStream} will read data from all of
* them and there is no ordering guarantee between records from different topics. This also means that the work
* will not be parallelized for multiple topics, and the number of tasks will scale with the maximum partition
* count of any matching topic rather than the total number of partitions across all topics.
*
* Note that the specified input topics must be partitioned by key.
* If this is not the case it is the user's responsibility to repartition the data before any key based operation
* (like aggregation or join) is applied to the returned {@link KStream}.
*
* @param topicPattern the pattern to match for topic names
* @return a {@link KStream} for topics matching the regex pattern.
*/
public synchronized KStream stream(final Pattern topicPattern) {
return stream(topicPattern, Consumed.with(null, null));
}
/**
* Create a {@link KStream} from the specified topic pattern.
* The {@code "auto.offset.reset"} strategy, {@link TimestampExtractor}, key and value deserializers
* are defined by the options in {@link Consumed} are used.
*
* If multiple topics are matched by the specified pattern, the created {@link KStream} will read data from all of
* them and there is no ordering guarantee between records from different topics. This also means that the work
* will not be parallelized for multiple topics, and the number of tasks will scale with the maximum partition
* count of any matching topic rather than the total number of partitions across all topics.
*
* Note that the specified input topics must be partitioned by key.
* If this is not the case it is the user's responsibility to repartition the data before any key based operation
* (like aggregation or join) is applied to the returned {@link KStream}.
*
* @param topicPattern the pattern to match for topic names
* @param consumed the instance of {@link Consumed} used to define optional parameters
* @return a {@link KStream} for topics matching the regex pattern.
*/
public synchronized KStream stream(final Pattern topicPattern,
final Consumed consumed) {
Objects.requireNonNull(topicPattern, "topicPattern can't be null");
Objects.requireNonNull(consumed, "consumed can't be null");
return internalStreamsBuilder.stream(topicPattern, new ConsumedInternal<>(consumed));
}
/**
* Create a {@link KTable} for the specified topic.
* The {@code "auto.offset.reset"} strategy, {@link TimestampExtractor}, key and value deserializers
* are defined by the options in {@link Consumed} are used.
* Input {@link KeyValue records} with {@code null} key will be dropped.
*
* Note that the specified input topic must be partitioned by key.
* If this is not the case the returned {@link KTable} will be corrupted.
*
* The resulting {@link KTable} will be materialized in a local {@link KeyValueStore} using the given
* {@code Materialized} instance.
* An internal changelog topic is created by default. Because the source topic can
* be used for recovery, you can avoid creating the changelog topic by setting
* the {@code "topology.optimization"} to {@code "all"} in the {@link StreamsConfig}.
*
* You should only specify serdes in the {@link Consumed} instance as these will also be used to overwrite the
* serdes in {@link Materialized}, i.e.,
*
{@code
* streamBuilder.table(topic, Consumed.with(Serde.String(), Serde.String()), Materialized.as(storeName))
* }
*
* To query the local {@link org.apache.kafka.streams.state.ReadOnlyKeyValueStore} it must be obtained via
* {@link KafkaStreams#store(StoreQueryParameters) KafkaStreams#store(...)}:
* {@code
* KafkaStreams streams = ...
* ReadOnlyKeyValueStore> localStore = streams.store(queryableStoreName, QueryableStoreTypes.>timestampedKeyValueStore());
* K key = "some-key";
* ValueAndTimestamp valueForKey = localStore.get(key); // key must be local (application state is shared over all running Kafka Streams instances)
* }
* For non-local keys, a custom RPC mechanism must be implemented using {@link KafkaStreams#allMetadata()} to
* query the value of the key on a parallel running instance of your Kafka Streams application.
*
* @param topic the topic name; cannot be {@code null}
* @param consumed the instance of {@link Consumed} used to define optional parameters; cannot be {@code null}
* @param materialized the instance of {@link Materialized} used to materialize a state store; cannot be {@code null}
* @return a {@link KTable} for the specified topic
*/
public synchronized KTable table(final String topic,
final Consumed consumed,
final Materialized> materialized) {
Objects.requireNonNull(topic, "topic can't be null");
Objects.requireNonNull(consumed, "consumed can't be null");
Objects.requireNonNull(materialized, "materialized can't be null");
final ConsumedInternal consumedInternal = new ConsumedInternal<>(consumed);
materialized.withKeySerde(consumedInternal.keySerde()).withValueSerde(consumedInternal.valueSerde());
final MaterializedInternal> materializedInternal =
new MaterializedInternal<>(materialized, internalStreamsBuilder, topic + "-");
return internalStreamsBuilder.table(topic, consumedInternal, materializedInternal);
}
/**
* Create a {@link KTable} for the specified topic.
* The default {@code "auto.offset.reset"} strategy and default key and value deserializers as specified in the
* {@link StreamsConfig config} are used.
* Input {@link KeyValue records} with {@code null} key will be dropped.
*
* Note that the specified input topics must be partitioned by key.
* If this is not the case the returned {@link KTable} will be corrupted.
*
* The resulting {@link KTable} will be materialized in a local {@link KeyValueStore} with an internal
* store name. Note that store name may not be queryable through Interactive Queries.
* An internal changelog topic is created by default. Because the source topic can
* be used for recovery, you can avoid creating the changelog topic by setting
* the {@code "topology.optimization"} to {@code "all"} in the {@link StreamsConfig}.
*
* @param topic the topic name; cannot be {@code null}
* @return a {@link KTable} for the specified topic
*/
public synchronized KTable table(final String topic) {
return table(topic, new ConsumedInternal<>());
}
/**
* Create a {@link KTable} for the specified topic.
* The {@code "auto.offset.reset"} strategy, {@link TimestampExtractor}, key and value deserializers
* are defined by the options in {@link Consumed} are used.
* Input {@link KeyValue records} with {@code null} key will be dropped.
*
* Note that the specified input topics must be partitioned by key.
* If this is not the case the returned {@link KTable} will be corrupted.
*
* The resulting {@link KTable} will be materialized in a local {@link KeyValueStore} with an internal
* store name. Note that store name may not be queryable through Interactive Queries.
* An internal changelog topic is created by default. Because the source topic can
* be used for recovery, you can avoid creating the changelog topic by setting
* the {@code "topology.optimization"} to {@code "all"} in the {@link StreamsConfig}.
*
* @param topic the topic name; cannot be {@code null}
* @param consumed the instance of {@link Consumed} used to define optional parameters; cannot be {@code null}
* @return a {@link KTable} for the specified topic
*/
public synchronized KTable table(final String topic,
final Consumed consumed) {
Objects.requireNonNull(topic, "topic can't be null");
Objects.requireNonNull(consumed, "consumed can't be null");
final ConsumedInternal consumedInternal = new ConsumedInternal<>(consumed);
final MaterializedInternal> materializedInternal =
new MaterializedInternal<>(
Materialized.with(consumedInternal.keySerde(), consumedInternal.valueSerde()),
internalStreamsBuilder,
topic + "-");
return internalStreamsBuilder.table(topic, consumedInternal, materializedInternal);
}
/**
* Create a {@link KTable} for the specified topic.
* The default {@code "auto.offset.reset"} strategy as specified in the {@link StreamsConfig config} are used.
* Key and value deserializers as defined by the options in {@link Materialized} are used.
* Input {@link KeyValue records} with {@code null} key will be dropped.
*
* Note that the specified input topics must be partitioned by key.
* If this is not the case the returned {@link KTable} will be corrupted.
*
* The resulting {@link KTable} will be materialized in a local {@link KeyValueStore} using the {@link Materialized} instance.
* An internal changelog topic is created by default. Because the source topic can
* be used for recovery, you can avoid creating the changelog topic by setting
* the {@code "topology.optimization"} to {@code "all"} in the {@link StreamsConfig}.
*
* @param topic the topic name; cannot be {@code null}
* @param materialized the instance of {@link Materialized} used to materialize a state store; cannot be {@code null}
* @return a {@link KTable} for the specified topic
*/
public synchronized KTable table(final String topic,
final Materialized> materialized) {
Objects.requireNonNull(topic, "topic can't be null");
Objects.requireNonNull(materialized, "materialized can't be null");
final MaterializedInternal> materializedInternal =
new MaterializedInternal<>(materialized, internalStreamsBuilder, topic + "-");
final ConsumedInternal consumedInternal =
new ConsumedInternal<>(Consumed.with(materializedInternal.keySerde(), materializedInternal.valueSerde()));
return internalStreamsBuilder.table(topic, consumedInternal, materializedInternal);
}
/**
* Create a {@link GlobalKTable} for the specified topic.
* Input {@link KeyValue records} with {@code null} key will be dropped.
*
* The resulting {@link GlobalKTable} will be materialized in a local {@link KeyValueStore} with an internal
* store name. Note that store name may not be queryable through Interactive Queries.
* No internal changelog topic is created since the original input topic can be used for recovery (cf.
* methods of {@link KGroupedStream} and {@link KGroupedTable} that return a {@link KTable}).
*
* Note that {@link GlobalKTable} always applies {@code "auto.offset.reset"} strategy {@code "earliest"}
* regardless of the specified value in {@link StreamsConfig} or {@link Consumed}.
*
* @param topic the topic name; cannot be {@code null}
* @param consumed the instance of {@link Consumed} used to define optional parameters
* @return a {@link GlobalKTable} for the specified topic
*/
public synchronized GlobalKTable globalTable(final String topic,
final Consumed consumed) {
Objects.requireNonNull(topic, "topic can't be null");
Objects.requireNonNull(consumed, "consumed can't be null");
final ConsumedInternal consumedInternal = new ConsumedInternal<>(consumed);
final MaterializedInternal> materializedInternal =
new MaterializedInternal<>(
Materialized.with(consumedInternal.keySerde(), consumedInternal.valueSerde()),
internalStreamsBuilder, topic + "-");
return internalStreamsBuilder.globalTable(topic, consumedInternal, materializedInternal);
}
/**
* Create a {@link GlobalKTable} for the specified topic.
* The default key and value deserializers as specified in the {@link StreamsConfig config} are used.
* Input {@link KeyValue records} with {@code null} key will be dropped.
*
* The resulting {@link GlobalKTable} will be materialized in a local {@link KeyValueStore} with an internal
* store name. Note that store name may not be queryable through Interactive Queries.
* No internal changelog topic is created since the original input topic can be used for recovery (cf.
* methods of {@link KGroupedStream} and {@link KGroupedTable} that return a {@link KTable}).
*
* Note that {@link GlobalKTable} always applies {@code "auto.offset.reset"} strategy {@code "earliest"}
* regardless of the specified value in {@link StreamsConfig}.
*
* @param topic the topic name; cannot be {@code null}
* @return a {@link GlobalKTable} for the specified topic
*/
public synchronized GlobalKTable globalTable(final String topic) {
return globalTable(topic, Consumed.with(null, null));
}
/**
* Create a {@link GlobalKTable} for the specified topic.
*
* Input {@link KeyValue} pairs with {@code null} key will be dropped.
*
* The resulting {@link GlobalKTable} will be materialized in a local {@link KeyValueStore} configured with
* the provided instance of {@link Materialized}.
* However, no internal changelog topic is created since the original input topic can be used for recovery (cf.
* methods of {@link KGroupedStream} and {@link KGroupedTable} that return a {@link KTable}).
*
* You should only specify serdes in the {@link Consumed} instance as these will also be used to overwrite the
* serdes in {@link Materialized}, i.e.,
*
{@code
* streamBuilder.globalTable(topic, Consumed.with(Serde.String(), Serde.String()), Materialized.as(storeName))
* }
*
* To query the local {@link org.apache.kafka.streams.state.ReadOnlyKeyValueStore} it must be obtained via
* {@link KafkaStreams#store(StoreQueryParameters) KafkaStreams#store(...)}:
* {@code
* KafkaStreams streams = ...
* ReadOnlyKeyValueStore> localStore = streams.store(queryableStoreName, QueryableStoreTypes.>timestampedKeyValueStore());
* K key = "some-key";
* ValueAndTimestamp valueForKey = localStore.get(key);
* }
* Note that {@link GlobalKTable} always applies {@code "auto.offset.reset"} strategy {@code "earliest"}
* regardless of the specified value in {@link StreamsConfig} or {@link Consumed}.
*
* @param topic the topic name; cannot be {@code null}
* @param consumed the instance of {@link Consumed} used to define optional parameters; can't be {@code null}
* @param materialized the instance of {@link Materialized} used to materialize a state store; cannot be {@code null}
* @return a {@link GlobalKTable} for the specified topic
*/
public synchronized GlobalKTable globalTable(final String topic,
final Consumed consumed,
final Materialized> materialized) {
Objects.requireNonNull(topic, "topic can't be null");
Objects.requireNonNull(consumed, "consumed can't be null");
Objects.requireNonNull(materialized, "materialized can't be null");
final ConsumedInternal consumedInternal = new ConsumedInternal<>(consumed);
// always use the serdes from consumed
materialized.withKeySerde(consumedInternal.keySerde()).withValueSerde(consumedInternal.valueSerde());
final MaterializedInternal> materializedInternal =
new MaterializedInternal<>(materialized, internalStreamsBuilder, topic + "-");
return internalStreamsBuilder.globalTable(topic, consumedInternal, materializedInternal);
}
/**
* Create a {@link GlobalKTable} for the specified topic.
*
* Input {@link KeyValue} pairs with {@code null} key will be dropped.
*
* The resulting {@link GlobalKTable} will be materialized in a local {@link KeyValueStore} configured with
* the provided instance of {@link Materialized}.
* However, no internal changelog topic is created since the original input topic can be used for recovery (cf.
* methods of {@link KGroupedStream} and {@link KGroupedTable} that return a {@link KTable}).
*
* To query the local {@link org.apache.kafka.streams.state.ReadOnlyKeyValueStore} it must be obtained via
* {@link KafkaStreams#store(StoreQueryParameters) KafkaStreams#store(...)}:
*
{@code
* KafkaStreams streams = ...
* ReadOnlyKeyValueStore> localStore = streams.store(queryableStoreName, QueryableStoreTypes.>timestampedKeyValueStore());
* K key = "some-key";
* ValueAndTimestamp valueForKey = localStore.get(key);
* }
* Note that {@link GlobalKTable} always applies {@code "auto.offset.reset"} strategy {@code "earliest"}
* regardless of the specified value in {@link StreamsConfig}.
*
* @param topic the topic name; cannot be {@code null}
* @param materialized the instance of {@link Materialized} used to materialize a state store; cannot be {@code null}
* @return a {@link GlobalKTable} for the specified topic
*/
public synchronized GlobalKTable globalTable(final String topic,
final Materialized> materialized) {
Objects.requireNonNull(topic, "topic can't be null");
Objects.requireNonNull(materialized, "materialized can't be null");
final MaterializedInternal> materializedInternal =
new MaterializedInternal<>(materialized, internalStreamsBuilder, topic + "-");
return internalStreamsBuilder.globalTable(topic,
new ConsumedInternal<>(Consumed.with(materializedInternal.keySerde(),
materializedInternal.valueSerde())),
materializedInternal);
}
/**
* Adds a state store to the underlying {@link Topology}.
*
* It is required to connect state stores to {@link Processor Processors}, {@link Transformer Transformers},
* or {@link ValueTransformer ValueTransformers} before they can be used.
*
* @param builder the builder used to obtain this state store {@link StateStore} instance
* @return itself
* @throws TopologyException if state store supplier is already added
*/
public synchronized StreamsBuilder addStateStore(final StoreBuilder> builder) {
Objects.requireNonNull(builder, "builder can't be null");
internalStreamsBuilder.addStateStore(builder);
return this;
}
/**
* @deprecated use {@link #addGlobalStore(StoreBuilder, String, Consumed, ProcessorSupplier)} instead
*/
@Deprecated
public synchronized StreamsBuilder addGlobalStore(final StoreBuilder> storeBuilder,
final String topic,
final String sourceName,
final Consumed consumed,
final String processorName,
final ProcessorSupplier stateUpdateSupplier) {
Objects.requireNonNull(storeBuilder, "storeBuilder can't be null");
Objects.requireNonNull(consumed, "consumed can't be null");
internalStreamsBuilder.addGlobalStore(
storeBuilder,
sourceName,
topic,
new ConsumedInternal<>(consumed),
processorName,
stateUpdateSupplier
);
return this;
}
/**
* Adds a global {@link StateStore} to the topology.
* The {@link StateStore} sources its data from all partitions of the provided input topic.
* There will be exactly one instance of this {@link StateStore} per Kafka Streams instance.
*
* A {@link SourceNode} with the provided sourceName will be added to consume the data arriving from the partitions
* of the input topic.
*
* The provided {@link ProcessorSupplier} will be used to create an {@link ProcessorNode} that will receive all
* records forwarded from the {@link SourceNode}. NOTE: you should not use the {@code Processor} to insert transformed records into
* the global state store. This store uses the source topic as changelog and during restore will insert records directly
* from the source.
* This {@link ProcessorNode} should be used to keep the {@link StateStore} up-to-date.
* The default {@link TimestampExtractor} as specified in the {@link StreamsConfig config} is used.
*
* It is not required to connect a global store to {@link Processor Processors}, {@link Transformer Transformers},
* or {@link ValueTransformer ValueTransformer}; those have read-only access to all global stores by default.
*
* @param storeBuilder user defined {@link StoreBuilder}; can't be {@code null}
* @param topic the topic to source the data from
* @param consumed the instance of {@link Consumed} used to define optional parameters; can't be {@code null}
* @param stateUpdateSupplier the instance of {@link ProcessorSupplier}
* @return itself
* @throws TopologyException if the processor of state is already registered
*/
public synchronized StreamsBuilder addGlobalStore(final StoreBuilder> storeBuilder,
final String topic,
final Consumed consumed,
final ProcessorSupplier stateUpdateSupplier) {
Objects.requireNonNull(storeBuilder, "storeBuilder can't be null");
Objects.requireNonNull(consumed, "consumed can't be null");
internalStreamsBuilder.addGlobalStore(
storeBuilder,
topic,
new ConsumedInternal<>(consumed),
stateUpdateSupplier
);
return this;
}
/**
* Returns the {@link Topology} that represents the specified processing logic.
* Note that using this method means no optimizations are performed.
*
* @return the {@link Topology} that represents the specified processing logic
*/
public synchronized Topology build() {
return build(null);
}
/**
* Returns the {@link Topology} that represents the specified processing logic and accepts
* a {@link Properties} instance used to indicate whether to optimize topology or not.
*
* @param props the {@link Properties} used for building possibly optimized topology
* @return the {@link Topology} that represents the specified processing logic
*/
public synchronized Topology build(final Properties props) {
internalStreamsBuilder.buildAndOptimizeTopology(props);
return topology;
}
}