All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.hazelcast.jet.pipeline.Sources Maven / Gradle / Ivy

There is a newer version: 4.5.4
Show newest version
/*
 * Copyright (c) 2008-2018, Hazelcast, Inc. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.hazelcast.jet.pipeline;

import com.hazelcast.cache.journal.EventJournalCacheEvent;
import com.hazelcast.client.config.ClientConfig;
import com.hazelcast.jet.Util;
import com.hazelcast.jet.core.ProcessorMetaSupplier;
import com.hazelcast.jet.core.WatermarkGenerationParams;
import com.hazelcast.jet.function.DistributedBiFunction;
import com.hazelcast.jet.function.DistributedFunction;
import com.hazelcast.jet.function.DistributedPredicate;
import com.hazelcast.jet.impl.pipeline.transform.BatchSourceTransform;
import com.hazelcast.jet.impl.pipeline.transform.StreamSourceTransform;
import com.hazelcast.map.journal.EventJournalMapEvent;
import com.hazelcast.projection.Projection;
import com.hazelcast.projection.Projections;
import com.hazelcast.query.Predicate;

import javax.annotation.Nonnull;
import java.io.File;
import java.nio.charset.Charset;
import java.util.Map;
import java.util.Map.Entry;
import java.util.function.Function;

import static com.hazelcast.jet.Util.cacheEventToEntry;
import static com.hazelcast.jet.Util.cachePutEvents;
import static com.hazelcast.jet.Util.mapEventToEntry;
import static com.hazelcast.jet.Util.mapPutEvents;
import static com.hazelcast.jet.core.processor.SourceProcessors.readCacheP;
import static com.hazelcast.jet.core.processor.SourceProcessors.readFilesP;
import static com.hazelcast.jet.core.processor.SourceProcessors.readListP;
import static com.hazelcast.jet.core.processor.SourceProcessors.readMapP;
import static com.hazelcast.jet.core.processor.SourceProcessors.readRemoteCacheP;
import static com.hazelcast.jet.core.processor.SourceProcessors.readRemoteListP;
import static com.hazelcast.jet.core.processor.SourceProcessors.readRemoteMapP;
import static com.hazelcast.jet.core.processor.SourceProcessors.streamCacheP;
import static com.hazelcast.jet.core.processor.SourceProcessors.streamFilesP;
import static com.hazelcast.jet.core.processor.SourceProcessors.streamMapP;
import static com.hazelcast.jet.core.processor.SourceProcessors.streamRemoteCacheP;
import static com.hazelcast.jet.core.processor.SourceProcessors.streamRemoteMapP;
import static com.hazelcast.jet.core.processor.SourceProcessors.streamSocketP;
import static java.nio.charset.StandardCharsets.UTF_8;

/**
 * Contains factory methods for various types of pipeline sources. To start
 * building a pipeline, pass a source to {@link Pipeline#drawFrom(BatchSource)}
 * and you will obtain the initial {@link BatchStage}. You can then
 * attach further stages to it.
 * 

* The same pipeline may contain more than one source, each starting its * own branch. The branches may be merged with multiple-input transforms * such as co-group and hash-join. *

* The default local parallelism for sources in this class is 1 or 2, check the * documentation of individual methods. */ public final class Sources { private static final String GLOB_WILDCARD = "*"; private Sources() { } /** * Returns a bounded (batch) source constructed directly from the given * Core API processor meta-supplier. * * @param sourceName user-friendly source name * @param metaSupplier the processor meta-supplier */ @Nonnull public static BatchSource batchFromProcessor( @Nonnull String sourceName, @Nonnull ProcessorMetaSupplier metaSupplier ) { return new BatchSourceTransform<>(sourceName, metaSupplier); } /** * Returns an unbounded (event stream) source that will use the supplied * function to create processor meta-suppliers as required by the Core API. * Jet will call the function you supply with the watermark generation * parameters and it must return a meta-supplier of processors that will * act according to these parameters and emit the watermark items as they * specify. *

* If you are implementing a custom source processor, be sure to check out * the {@link com.hazelcast.jet.core.WatermarkSourceUtil} class that will * help you correctly implement watermark item emission. * * @param sourceName user-friendly source name * @param metaSupplierFn factory of processor meta-suppliers */ @Nonnull public static StreamSource streamFromProcessorWithWatermarks( @Nonnull String sourceName, @Nonnull Function, ProcessorMetaSupplier> metaSupplierFn ) { return new StreamSourceTransform<>(sourceName, metaSupplierFn, true); } /** * Returns an unbounded (event stream) source constructed directly from the given * Core API processor meta-supplier. * * @param sourceName user-friendly source name * @param metaSupplier the processor meta-supplier */ @Nonnull public static StreamSource streamFromProcessor( @Nonnull String sourceName, @Nonnull ProcessorMetaSupplier metaSupplier ) { return new StreamSourceTransform<>(sourceName, w -> metaSupplier, false); } /** * Returns a source that fetches entries from a local Hazelcast {@code IMap} * with the specified name and emits them as {@code Map.Entry}. It leverages * data locality by making each of the underlying processors fetch only those * entries that are stored on the member where it is running. *

* The source does not save any state to snapshot. If the job is restarted, * it will re-emit all entries. *

* If the {@code IMap} is modified while being read, or if there is a * cluster topology change (triggering data migration), the source may * miss and/or duplicate some entries. *

* The default local parallelism for this processor is 2 (or 1 if just 1 * CPU is available). */ @Nonnull public static BatchSource> map(@Nonnull String mapName) { return batchFromProcessor("mapSource(" + mapName + ')', readMapP(mapName)); } /** * Returns a source that fetches entries from a local Hazelcast {@code * IMap} with the specified name. By supplying a {@code predicate} and * {@code projection} here instead of in separate {@code map/filter} * transforms you allow the source to apply these functions early, before * generating any output, with the potential of significantly reducing * data traffic. If your data is stored in the IMDG using the * portable serialization format, there are additional optimizations * available when using {@link * com.hazelcast.projection.Projections#singleAttribute(String) * Projections.singleAttribute()} and {@link * com.hazelcast.projection.Projections#multiAttribute(String...) * Projections.multiAttribute()}) to create your projection instance and * using the {@link com.hazelcast.jet.GenericPredicates} factory or * {@link com.hazelcast.query.PredicateBuilder PredicateBuilder} to create * the predicate. In this case Jet can test the predicate and apply the * projection without deserializing the whole object. *

* The source leverages data locality by making each of the underlying * processors fetch only those entries that are stored on the member where * it is running. *

* The source does not save any state to snapshot. If the job is restarted, * it will re-emit all entries. *

* If the {@code IMap} is modified while being read, or if there is a * cluster topology change (triggering data migration), the source may * miss and/or duplicate some entries. *

* The default local parallelism for this processor is 2 (or 1 if just 1 * CPU is available). * *

Predicate/projection class requirements

* * The classes implementing {@code predicate} and {@code projection} need * to be available on the cluster's classpath, or loaded using * Hazelcast User Code Deployment. It's not enough to add them to * job classpath in {@link com.hazelcast.jet.config.JobConfig}. Same is * true for the class of the objects stored in the map itself. If you * cannot fulfill these conditions, use {@link #map(String)} and add a * subsequent {@link GeneralStage#map map} or {@link GeneralStage#filter * filter} stage. * * @param mapName the name of the map * @param predicate the predicate to filter the events. If you want to specify just the * projection, use {@link * com.hazelcast.jet.GenericPredicates#alwaysTrue()} as a pass-through * predicate * @param projection the projection to map the events. If the projection returns a {@code * null} for an item, that item will be filtered out. If you want to * specify just the predicate, use {@link Projections#identity()}. * @param type of emitted item */ @Nonnull public static BatchSource map( @Nonnull String mapName, @Nonnull Predicate predicate, @Nonnull Projection, T> projection ) { return batchFromProcessor("mapSource(" + mapName + ')', readMapP(mapName, predicate, projection)); } /** * Convenience for {@link #map(String, Predicate, Projection)} * which uses a {@link DistributedFunction} as the projection function. */ @Nonnull public static BatchSource map( @Nonnull String mapName, @Nonnull Predicate predicate, @Nonnull DistributedFunction, T> projectionFn ) { return batchFromProcessor("mapSource(" + mapName + ')', readMapP(mapName, predicate, projectionFn)); } /** * Returns a source that will stream {@link EventJournalMapEvent}s of the * Hazelcast {@code IMap} with the specified name. By supplying a {@code * predicate} and {@code projection} here instead of in separate {@code * map/filter} transforms you allow the source to apply these functions * early, before generating any output, with the potential of significantly * reducing data traffic. *

* The source leverages data locality by making each of the underlying * processors fetch only those entries that are stored on the member where * it is running. *

* To use an {@code IMap} as a streaming source, you must {@link * com.hazelcast.config.EventJournalConfig configure the event journal} * for it. The journal has fixed capacity and will drop events if it * overflows. *

* The source saves the journal offset to the snapshot. If the job * restarts, it starts emitting from the saved offset with an * exactly-once guarantee (unless the journal has overflowed). *

* The default local parallelism for this processor is 2 (or 1 if just 1 * CPU is available). * *

Predicate/projection class requirements

* * The classes implementing {@code predicateFn} and {@code projectionFn} * need to be available on the cluster's classpath, or loaded using * Hazelcast User Code Deployment. It's not enough to add them to * job classpath in {@link com.hazelcast.jet.config.JobConfig}. Same is * true for the class of the objects stored in the map itself. If you * cannot fulfill these conditions, use {@link #mapJournal(String, * JournalInitialPosition)} and add a subsequent {@link GeneralStage#map * map} or {@link GeneralStage#filter filter} stage. * * @param mapName the name of the map * @param predicateFn the predicate to filter the events. If you want to specify just the * projection, use {@link Util#mapPutEvents} to pass only {@link * com.hazelcast.core.EntryEventType#ADDED ADDED} and {@link * com.hazelcast.core.EntryEventType#UPDATED UPDATED} events. * @param projectionFn the projection to map the events. If the projection returns a {@code * null} for an item, that item will be filtered out. You may use {@link * Util#mapEventToEntry()} to extract just the key and the new value. * @param initialPos describes which event to start receiving from * @param type of emitted item */ @Nonnull public static StreamSource mapJournal( @Nonnull String mapName, @Nonnull DistributedPredicate> predicateFn, @Nonnull DistributedFunction, T> projectionFn, @Nonnull JournalInitialPosition initialPos ) { return streamFromProcessorWithWatermarks("mapJournalSource(" + mapName + ')', w -> streamMapP(mapName, predicateFn, projectionFn, initialPos, w)); } /** * Convenience for {@link #mapJournal(String, DistributedPredicate, * DistributedFunction, JournalInitialPosition)} * which will pass only {@link com.hazelcast.core.EntryEventType#ADDED * ADDED} and {@link com.hazelcast.core.EntryEventType#UPDATED UPDATED} * events and will project the event's key and new value into a {@code * Map.Entry}. */ @Nonnull public static StreamSource> mapJournal( @Nonnull String mapName, @Nonnull JournalInitialPosition initialPos ) { return mapJournal(mapName, mapPutEvents(), mapEventToEntry(), initialPos); } /** * Returns a source that fetches entries from the Hazelcast {@code IMap} * with the specified name in a remote cluster identified by the supplied * {@code ClientConfig} and emits them as {@code Map.Entry}. *

* The source does not save any state to snapshot. If the job is restarted, * it will re-emit all entries. *

* If the {@code IMap} is modified while being read, or if there is a * cluster topology change (triggering data migration), the source may * miss and/or duplicate some entries. *

* The default local parallelism for this processor is 1. */ @Nonnull public static BatchSource> remoteMap( @Nonnull String mapName, @Nonnull ClientConfig clientConfig ) { return batchFromProcessor("remoteMapSource(" + mapName + ')', readRemoteMapP(mapName, clientConfig)); } /** * Returns a source that fetches entries from a remote Hazelcast {@code * IMap} with the specified name in a remote cluster identified by the * supplied {@code ClientConfig}. By supplying a {@code predicate} and * {@code projection} here instead of in separate {@code map/filter} * transforms you allow the source to apply these functions early, before * generating any output, with the potential of significantly reducing * data traffic. If your data is stored in the IMDG using the * portable serialization format, there are additional optimizations * available when using {@link * com.hazelcast.projection.Projections#singleAttribute(String) * Projections.singleAttribute()} and {@link * com.hazelcast.projection.Projections#multiAttribute(String...) * Projections.multiAttribute()}) to create your projection instance and * using the {@link com.hazelcast.jet.GenericPredicates} factory or * {@link com.hazelcast.query.PredicateBuilder PredicateBuilder} to create * the predicate. In this case Jet can test the predicate and apply the * projection without deserializing the whole object. *

* The source does not save any state to snapshot. If the job is restarted, * it will re-emit all entries. *

* If the {@code IMap} is modified while being read, or if there is a * cluster topology change (triggering data migration), the source may * miss and/or duplicate some entries. *

* The default local parallelism for this processor is 1. * *

Predicate/projection class requirements

* * The classes implementing {@code predicate} and {@code projection} need * to be available on the remote cluster's classpath, or loaded using * Hazelcast User Code Deployment. It's not enough to add them to * job classpath in {@link com.hazelcast.jet.config.JobConfig}. Same is * true for the class of the objects stored in the map itself. If you * cannot fulfill these conditions, use {@link #remoteMap(String, * ClientConfig)} and add a subsequent {@link GeneralStage#map map} or * {@link GeneralStage#filter filter} stage. * * @param mapName the name of the map * @param predicate the predicate to filter the events. If you want to specify just the * projection, use {@link * com.hazelcast.jet.GenericPredicates#alwaysTrue()} as a pass-through * predicate * @param projection the projection to map the events. If the projection returns a {@code * null} for an item, that item will be filtered out. If you want to * specify just the predicate, use {@link Projections#identity()}. * @param type of emitted item */ @Nonnull public static BatchSource remoteMap( @Nonnull String mapName, @Nonnull ClientConfig clientConfig, @Nonnull Predicate predicate, @Nonnull Projection, T> projection ) { return batchFromProcessor("remoteMapSource(" + mapName + ')', readRemoteMapP(mapName, clientConfig, predicate, projection)); } /** * Convenience for {@link #remoteMap(String, ClientConfig, Predicate, Projection)} * which use a {@link DistributedFunction} as the projection function. */ @Nonnull public static BatchSource remoteMap( @Nonnull String mapName, @Nonnull ClientConfig clientConfig, @Nonnull Predicate predicate, @Nonnull DistributedFunction, T> projectionFn ) { return batchFromProcessor("remoteMapSource(" + mapName + ')', readRemoteMapP(mapName, clientConfig, predicate, projectionFn)); } /** * Returns a source that will stream the {@link EventJournalMapEvent} * events of the Hazelcast {@code IMap} with the specified name from a * remote cluster. By supplying a {@code predicate} and {@code projection} * here instead of in separate {@code map/filter} transforms you allow the * source to apply these functions early, before generating any output, * with the potential of significantly reducing data traffic. *

* To use an {@code IMap} as a streaming source, you must {@link * com.hazelcast.config.EventJournalConfig configure the event journal} * for it. The journal has fixed capacity and will drop events if it * overflows. *

* The source saves the journal offset to the snapshot. If the job * restarts, it starts emitting from the saved offset with an * exactly-once guarantee (unless the journal has overflowed). *

* The default local parallelism for this processor is 1. * *

Predicate/projection class requirements

* * The classes implementing {@code predicateFn} and {@code projectionFn} * need to be available on the remote cluster's classpath, or loaded using * Hazelcast User Code Deployment. It's not enough to add them to * job classpath in {@link com.hazelcast.jet.config.JobConfig}. Same is * true for the class of the objects stored in the map itself. If you * cannot fulfill these conditions, use {@link #remoteMapJournal(String, * ClientConfig, JournalInitialPosition)} and add a subsequent {@link * GeneralStage#map map} or {@link GeneralStage#filter filter} stage. * * @param mapName the name of the map * @param clientConfig configuration for the client to connect to the remote cluster * @param predicateFn the predicate to filter the events. You may use {@link Util#mapPutEvents} * to pass only {@link com.hazelcast.core.EntryEventType#ADDED * ADDED} and {@link com.hazelcast.core.EntryEventType#UPDATED UPDATED} * events. * @param projectionFn the projection to map the events. If the projection returns a {@code * null} for an item, that item will be filtered out. You may use {@link * Util#mapEventToEntry()} to extract just the key and the new value. * @param initialPos describes which event to start receiving from * @param type of key * @param type of value * @param type of emitted item */ @Nonnull public static StreamSource remoteMapJournal( @Nonnull String mapName, @Nonnull ClientConfig clientConfig, @Nonnull DistributedPredicate> predicateFn, @Nonnull DistributedFunction, T> projectionFn, @Nonnull JournalInitialPosition initialPos ) { return streamFromProcessorWithWatermarks("remoteMapJournalSource(" + mapName + ')', w -> streamRemoteMapP(mapName, clientConfig, predicateFn, projectionFn, initialPos, w)); } /** * Convenience for {@link #remoteMapJournal(String, ClientConfig, * DistributedPredicate, DistributedFunction, JournalInitialPosition)} * which will pass only {@link com.hazelcast.core.EntryEventType#ADDED ADDED} * and {@link com.hazelcast.core.EntryEventType#UPDATED UPDATED} events and will * project the event's key and new value into a {@code Map.Entry}. */ @Nonnull public static StreamSource> remoteMapJournal( @Nonnull String mapName, @Nonnull ClientConfig clientConfig, @Nonnull JournalInitialPosition initialPos ) { return remoteMapJournal(mapName, clientConfig, mapPutEvents(), mapEventToEntry(), initialPos); } /** * Returns a source that fetches entries from the Hazelcast {@code ICache} * with the specified name and emits them as {@code Map.Entry}. It * leverages data locality by making each of the underlying processors * fetch only those entries that are stored on the member where it is * running. *

* The source does not save any state to snapshot. If the job is restarted, * it will re-emit all entries. *

* If the {@code ICache} is modified while being read, or if there is a * cluster topology change (triggering data migration), the source may * miss and/or duplicate some entries. *

* The default local parallelism for this processor is 2 (or 1 if just 1 * CPU is available). */ @Nonnull public static BatchSource> cache(@Nonnull String cacheName) { return batchFromProcessor("cacheSource(" + cacheName + ')', readCacheP(cacheName)); } /** * Returns a source that will stream the {@link EventJournalCacheEvent} * events of the Hazelcast {@code ICache} with the specified name. By * supplying a {@code predicate} and {@code projection} here instead of * in separate {@code map/filter} transforms you allow the source to apply * these functions early, before generating any output, with the potential * of significantly reducing data traffic. *

* The source leverages data locality by making each of the underlying * processors fetch only those entries that are stored on the member where * it is running. *

* To use an {@code ICache} as a streaming source, you must {@link * com.hazelcast.config.EventJournalConfig configure the event journal} * for it. The journal has fixed capacity and will drop events if it * overflows. *

* The source saves the journal offset to the snapshot. If the job * restarts, it starts emitting from the saved offset with an * exactly-once guarantee (unless the journal has overflowed). *

* The default local parallelism for this processor is 2 (or 1 if just 1 * CPU is available). * *

Predicate/projection class requirements

* * The classes implementing {@code predicateFn} and {@code projectionFn} * need to be available on the cluster's classpath, or loaded using * Hazelcast User Code Deployment. It's not enough to add them to * job classpath in {@link com.hazelcast.jet.config.JobConfig}. Same is * true for the class of the objects stored in the cache itself. If you * cannot fulfill these conditions, use {@link #cacheJournal(String, * JournalInitialPosition)} and add a subsequent {@link GeneralStage#map * map} or {@link GeneralStage#filter filter} stage. * * @param cacheName the name of the cache * @param predicateFn the predicate to filter the events. You may use {@link * Util#cachePutEvents()} to pass only {@link * com.hazelcast.cache.CacheEventType#CREATED CREATED} and {@link * com.hazelcast.cache.CacheEventType#UPDATED UPDATED} events. * @param projectionFn the projection to map the events. If the projection returns a {@code * null} for an item, that item will be filtered out. You may use {@link * Util#cacheEventToEntry()} to extract just the key and the new value. * @param initialPos describes which event to start receiving from * @param type of emitted item */ @Nonnull public static StreamSource cacheJournal( @Nonnull String cacheName, @Nonnull DistributedPredicate> predicateFn, @Nonnull DistributedFunction, T> projectionFn, @Nonnull JournalInitialPosition initialPos ) { return streamFromProcessorWithWatermarks("cacheJournalSource(" + cacheName + ')', w -> streamCacheP(cacheName, predicateFn, projectionFn, initialPos, w) ); } /** * Convenience for {@link #cacheJournal(String, DistributedPredicate, * DistributedFunction, JournalInitialPosition)} * which will pass only {@link com.hazelcast.cache.CacheEventType#CREATED * CREATED} and {@link com.hazelcast.cache.CacheEventType#UPDATED UPDATED} * events and will project the event's key and new value into a {@code * Map.Entry}. */ @Nonnull public static StreamSource> cacheJournal( @Nonnull String cacheName, @Nonnull JournalInitialPosition initialPos ) { return cacheJournal(cacheName, cachePutEvents(), cacheEventToEntry(), initialPos); } /** * Returns a source that fetches entries from the Hazelcast {@code ICache} * with the specified name in a remote cluster identified by the supplied * {@code ClientConfig} and emits them as {@code Map.Entry}. *

* The source does not save any state to snapshot. If the job is restarted, * it will re-emit all entries. *

* If the {@code ICache} is modified while being read, or if there is a * cluster topology change (triggering data migration), the source may * miss and/or duplicate some entries. *

* The default local parallelism for this processor is 1. */ @Nonnull public static BatchSource> remoteCache( @Nonnull String cacheName, @Nonnull ClientConfig clientConfig ) { return batchFromProcessor( "remoteCacheSource(" + cacheName + ')', readRemoteCacheP(cacheName, clientConfig) ); } /** * Returns a source that will stream the {@link EventJournalCacheEvent} * events of the Hazelcast {@code ICache} with the specified name from a * remote cluster. By supplying a {@code predicate} and {@code projection} * here instead of in separate {@code map/filter} transforms you allow the * source to apply these functions early, before generating any output, * with the potential of significantly reducing data traffic. *

* To use an {@code ICache} as a streaming source, you must {@link * com.hazelcast.config.EventJournalConfig configure the event journal} * for it. The journal has fixed capacity and will drop events if it * overflows. *

* The source saves the journal offset to the snapshot. If the job * restarts, it starts emitting from the saved offset with an * exactly-once guarantee (unless the journal has overflowed). *

* The default local parallelism for this processor is 1. * *

Predicate/projection class requirements

* * The classes implementing {@code predicateFn} and {@code projectionFn} * need to be available on the cluster's classpath, or loaded using * Hazelcast User Code Deployment. It's not enough to add them to * job classpath in {@link com.hazelcast.jet.config.JobConfig}. Same is * true for the class of the objects stored in the cache itself. If you * cannot fulfill these conditions, use {@link #remoteCacheJournal(String, * ClientConfig, JournalInitialPosition)} and add a subsequent {@link * GeneralStage#map map} or {@link GeneralStage#filter filter} stage. * * @param cacheName the name of the cache * @param clientConfig configuration for the client to connect to the remote cluster * @param predicateFn the predicate to filter the events. You may use {@link * Util#cachePutEvents()} to pass only {@link * com.hazelcast.cache.CacheEventType#CREATED CREATED} and {@link * com.hazelcast.cache.CacheEventType#UPDATED UPDATED} events. * @param projectionFn the projection to map the events. If the projection returns a {@code * null} for an item, that item will be filtered out. You may use {@link * Util#cacheEventToEntry()} to extract just the key and the new value. * @param initialPos describes which event to start receiving from * @param type of emitted item */ @Nonnull public static StreamSource remoteCacheJournal( @Nonnull String cacheName, @Nonnull ClientConfig clientConfig, @Nonnull DistributedPredicate> predicateFn, @Nonnull DistributedFunction, T> projectionFn, @Nonnull JournalInitialPosition initialPos ) { return streamFromProcessorWithWatermarks("remoteCacheJournalSource(" + cacheName + ')', w -> streamRemoteCacheP(cacheName, clientConfig, predicateFn, projectionFn, initialPos, w)); } /** * Convenience for {@link #remoteCacheJournal(String, ClientConfig, * DistributedPredicate, DistributedFunction, JournalInitialPosition)} * which will pass only * {@link com.hazelcast.cache.CacheEventType#CREATED CREATED} * and {@link com.hazelcast.cache.CacheEventType#UPDATED UPDATED} * events and will project the event's key and new value * into a {@code Map.Entry}. */ @Nonnull public static StreamSource> remoteCacheJournal( @Nonnull String cacheName, @Nonnull ClientConfig clientConfig, @Nonnull JournalInitialPosition initialPos ) { return remoteCacheJournal(cacheName, clientConfig, cachePutEvents(), cacheEventToEntry(), initialPos); } /** * Returns a source that emits items retrieved from a Hazelcast {@code * IList}. All elements are emitted on a single member — the one * where the entire list is stored by the IMDG. *

* The source does not save any state to snapshot. If the job is restarted, * it will re-emit all entries. *

* The default local parallelism for this processor is 1. */ @Nonnull public static BatchSource list(@Nonnull String listName) { return batchFromProcessor("listSource(" + listName + ')', readListP(listName)); } /** * Returns a source that emits items retrieved from a Hazelcast {@code * IList} in a remote cluster identified by the supplied {@code * ClientConfig}. All elements are emitted on a single member. *

* The source does not save any state to snapshot. If the job is restarted, * it will re-emit all entries. *

* The default local parallelism for this processor is 1. */ @Nonnull public static BatchSource remoteList(@Nonnull String listName, @Nonnull ClientConfig clientConfig) { return batchFromProcessor("remoteListSource(" + listName + ')', readRemoteListP(listName, clientConfig)); } /** * Returns a source which connects to the specified socket and emits lines * of text received from it. It decodes the text using the supplied {@code * charset}. *

* Each underlying processor opens its own TCP connection, so there will be * {@code clusterSize * localParallelism} open connections to the server. *

* The source completes when the server closes the socket. It never attempts * to reconnect. Any {@code IOException} will cause the job to fail. *

* The source does not save any state to snapshot. On job restart, it will * emit whichever items the server sends. The implementation uses * non-blocking API, the processor is cooperative. *

* The default local parallelism for this processor is 1. */ @Nonnull public static StreamSource socket( @Nonnull String host, int port, @Nonnull Charset charset ) { return streamFromProcessor( "socketSource(" + host + ':' + port + ')', streamSocketP(host, port, charset) ); } /** * Convenience for {@link #socket socket(host, port, charset)} with * UTF-8 as the charset. * * @param host the hostname to connect to * @param port the port to connect to */ @Nonnull public static StreamSource socket(@Nonnull String host, int port) { return socket(host, port, UTF_8); } /** * A source that emits lines from files in a directory (but not its * subdirectories. The files must not change while being read; if they do, * the behavior is unspecified. *

* To be useful, the source should be configured to read data local to each * member. For example, if the pathname resolves to a shared network * filesystem visible by multiple members, they will emit duplicate data. *

* The source does not save any state to snapshot. If the job is restarted, * it will re-emit all entries. *

* Any {@code IOException} will cause the job to fail. *

* The default local parallelism for this processor is 2 (or 1 if just 1 * CPU is available). * * @param directory parent directory of the files * @param charset charset to use to decode the files * @param glob the globbing mask, see {@link * java.nio.file.FileSystem#getPathMatcher(String) getPathMatcher()}. * Use {@code "*"} for all files. * @param mapOutputFn function to create output items. Parameters are * {@code fileName} and {@code line}. */ @Nonnull public static BatchSource files( @Nonnull String directory, @Nonnull Charset charset, @Nonnull String glob, @Nonnull DistributedBiFunction mapOutputFn ) { return batchFromProcessor("filesSource(" + new File(directory, glob) + ')', readFilesP(directory, charset, glob, mapOutputFn)); } /** * Convenience for {@link #files(String, Charset, String, DistributedBiFunction) * the full version of readFiles} which uses UTF-8 encoding, matches all * the files in the directory and emits lines of text in the files. */ @Nonnull public static BatchSource files(@Nonnull String directory) { return files(directory, UTF_8, GLOB_WILDCARD, (file, line) -> line); } /** * A source that emits a stream of lines of text coming from files in * the watched directory (but not its subdirectories). It will emit only * new contents added after startup: both new files and new content * appended to existing ones. *

* To be useful, the source should be configured to read data local to each * member. For example, if the pathname resolves to a shared network * filesystem visible by multiple members, they will emit duplicate data. *

* If, during the scanning phase, the source observes a file that doesn't * end with a newline, it will assume that there is a line just being * written. This line won't appear in its output. *

* The source completes when the directory is deleted. However, in order * to delete the directory, all files in it must be deleted and if you * delete a file that is currently being read from, the job may encounter * an {@code IOException}. The directory must be deleted on all nodes. *

* Any {@code IOException} will cause the job to fail. *

* The source does not save any state to snapshot. If the job is restarted, * lines added after the restart will be emitted, which gives at-most-once * behavior. *

* The default local parallelism for this processor is 2 (or 1 if just 1 * CPU is available). * *

Limitation on Windows

* On Windows the {@code WatchService} is not notified of appended lines * until the file is closed. If the file-writing process keeps the file * open while appending, the processor may fail to observe the changes. * It will be notified if any process tries to open that file, such as * looking at the file in Explorer. This holds for Windows 10 with the NTFS * file system and might change in future. You are advised to do your own * testing on your target Windows platform. * *

Use the latest JRE

* The underlying JDK API ({@link java.nio.file.WatchService}) has a * history of unreliability and this source may experience infinite * blocking, missed, or duplicate events as a result. Such problems may be * resolved by upgrading the JRE to the latest version. * * @param watchedDirectory pathname to the source directory * @param charset charset to use to decode the files * @param glob the globbing mask, see {@link * java.nio.file.FileSystem#getPathMatcher(String) getPathMatcher()}. * Use {@code "*"} for all files. */ @Nonnull public static StreamSource fileWatcher( @Nonnull String watchedDirectory, @Nonnull Charset charset, @Nonnull String glob ) { return streamFromProcessor("fileWatcherSource(" + watchedDirectory + '/' + glob + ')', streamFilesP(watchedDirectory, charset, glob, (file, line) -> line) ); } /** * Convenience for {@link #fileWatcher(String, Charset, String) * streamFiles(watchedDirectory, UTF_8, "*")}. */ @Nonnull public static StreamSource fileWatcher(@Nonnull String watchedDirectory) { return fileWatcher(watchedDirectory, UTF_8, GLOB_WILDCARD); } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy