
// com.hazelcast.jet.pipeline.file.FileSourceBuilder
/*
* Copyright (c) 2008-2024, Hazelcast, Inc. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.hazelcast.jet.pipeline.file;
import com.hazelcast.jet.JetException;
import com.hazelcast.jet.core.ProcessorMetaSupplier;
import com.hazelcast.jet.pipeline.BatchSource;
import com.hazelcast.jet.pipeline.Sources;
import com.hazelcast.jet.pipeline.file.impl.FileSourceConfiguration;
import com.hazelcast.jet.pipeline.file.impl.FileSourceFactory;
import com.hazelcast.jet.pipeline.file.impl.LocalFileSourceFactory;
import javax.annotation.Nonnull;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.ServiceLoader;
import static com.hazelcast.jet.pipeline.file.WildcardMatcher.hasWildcard;
import static java.util.Objects.requireNonNull;
/**
 * A unified builder object for various kinds of file sources.
 * <p>
 * To create an instance, use {@link FileSources#files(String)}.
 *
 * @param <T> the type of items a source using this file format will emit
 * @since Jet 4.4
 */
public class FileSourceBuilder<T> {

    /**
     * URL scheme prefixes that mark a path as belonging to a remote,
     * Hadoop-backed filesystem rather than the local one.
     */
    private static final List<String> HADOOP_PREFIXES = List.of(
            "s3a://",   // Amazon S3
            "hdfs://",  // HDFS
            "wasbs://", // Azure Cloud Storage
            "adl://",   // Azure Data Lake Gen 1
            "abfs://",  // Azure Data Lake Gen 2
            "gs://"     // Google Cloud Storage
    );

    // Arbitrary key-value options passed through to the underlying source
    // implementation, see option(String, String).
    private final Map<String, String> options = new HashMap<>();

    private final String path;
    private String glob = "*";
    private FileFormat<T> format;
    private boolean useHadoop;
    private boolean sharedFileSystem;
    private boolean ignoreFileNotFound;

    /**
     * Creates a builder for the given directory path. Use
     * {@link FileSources#files(String)} to obtain an instance.
     *
     * @param path an absolute local path or a remote path with one of the
     *             {@linkplain #HADOOP_PREFIXES Hadoop prefixes}
     * @throws IllegalArgumentException if the path contains a wildcard or is
     *                                  a non-absolute local path
     */
    FileSourceBuilder(@Nonnull String path) {
        this.path = requireNonNull(path, "path must not be null");
        if (hasWildcard(path)) {
            throw new IllegalArgumentException("Provided path must not contain any wildcard characters, path: " + path);
        }
        // Remote URLs such as "s3a://bucket/dir" are not valid local paths, so
        // the absolute-path requirement applies only to plain filesystem paths.
        if (!(hasHadoopPrefix(path) || Paths.get(path).isAbsolute())) {
            throw new IllegalArgumentException("Provided path must be absolute, path: " + path);
        }
    }

    /**
     * Sets a glob pattern to filter the files in the specified directory. The
     * default value is '*', matching all files in the directory.
     *
     * @param glob glob pattern
     * @return this builder
     */
    @Nonnull
    public FileSourceBuilder<T> glob(@Nonnull String glob) {
        this.glob = requireNonNull(glob, "glob must not be null");
        return this;
    }

    /**
     * Set the file format for the source. See {@link FileFormat} for available
     * formats and factory methods.
     * <p>
     * It's not possible to implement a custom format.
     *
     * @param <T_NEW>    the type of items emitted with the new format
     * @param fileFormat the format to use
     * @return this builder, retyped to emit {@code T_NEW}
     */
    @Nonnull
    public <T_NEW> FileSourceBuilder<T_NEW> format(@Nonnull FileFormat<T_NEW> fileFormat) {
        // The cast is safe: `format` is the only field typed by T and it is
        // immediately overwritten with the new format below.
        @SuppressWarnings("unchecked")
        FileSourceBuilder<T_NEW> newThis = (FileSourceBuilder<T_NEW>) this;
        newThis.format = fileFormat;
        return newThis;
    }

    /**
     * Specifies that Jet should use Apache Hadoop for files from the local
     * filesystem. Otherwise, local files are read by Jet directly. One
     * advantage of Hadoop is that it can provide better parallelization when
     * the number of files is smaller than the total parallelism of the
     * pipeline source.
     * <p>
     * Default value is {@code false}.
     *
     * @param useHadoop if Hadoop should be used for reading local filesystem
     * @return this builder
     */
    @Nonnull
    public FileSourceBuilder<T> useHadoopForLocalFiles(boolean useHadoop) {
        this.useHadoop = useHadoop;
        return this;
    }

    /**
     * If {@code sharedFileSystem} is {@code true}, Jet will assume all members
     * see the same files. They will split the work so that each member will
     * read a part of the files. If {@code sharedFileSystem} is {@code false},
     * each member will read all files in the directory, assuming that other
     * members see different files.
     * <p>
     * This option applies only for the local filesystem when {@linkplain
     * #useHadoopForLocalFiles(boolean) Hadoop is not used} and when the
     * directory doesn't contain a prefix for a remote file system. Distributed
     * filesystems are always assumed to be shared.
     * <p>
     * If you start all the members on a single machine (such as for
     * development), set this property to {@code true}. If you have multiple
     * machines with multiple members each and the directory is not a shared
     * storage, it's not possible to configure the file reader correctly - use
     * only one member per machine.
     * <p>
     * Default value is {@code false}.
     *
     * @param sharedFileSystem whether all members see the same files
     * @return this builder
     */
    @Nonnull
    public FileSourceBuilder<T> sharedFileSystem(boolean sharedFileSystem) {
        this.sharedFileSystem = sharedFileSystem;
        return this;
    }

    /**
     * Set to true to ignore no matching files in the directory specified by
     * {@code path}.
     * <p>
     * When there is no file matching the glob specified by
     * {@link #glob(String)} (or the default glob) Jet throws an exception by
     * default. This might be problematic in some cases, where the directory
     * is empty. To override this behaviour set this to true.
     * <p>
     * If set to true and there are no files in the directory the source will
     * produce 0 items.
     * <p>
     * Default value is {@code false}.
     *
     * @param ignoreFileNotFound true if no files in the specified directory should be accepted
     * @return this builder
     */
    @Nonnull
    public FileSourceBuilder<T> ignoreFileNotFound(boolean ignoreFileNotFound) {
        this.ignoreFileNotFound = ignoreFileNotFound;
        return this;
    }

    /**
     * Specifies an arbitrary option for the underlying source. If you are
     * looking for a missing option, check out the {@link FileFormat} class
     * you're using, it offers parsing-related options.
     *
     * @param key   option name, must not be null
     * @param value option value, must not be null
     * @return this builder
     */
    @Nonnull
    public FileSourceBuilder<T> option(String key, String value) {
        requireNonNull(key, "key must not be null");
        requireNonNull(value, "value must not be null");
        options.put(key, value);
        return this;
    }

    /**
     * Builds a {@link BatchSource} based on the current state of the builder.
     */
    @Nonnull
    public BatchSource<T> build() {
        ProcessorMetaSupplier metaSupplier = buildMetaSupplier();
        // The source name mirrors the configuration; note the closing ')'.
        return Sources.batchFromProcessor(
                "files(path=" + path + ", glob=" + glob + ", hadoop=" + shouldUseHadoop() + ')',
                metaSupplier);
    }

    /**
     * Builds a {@link ProcessorMetaSupplier} based on the current state of the
     * builder. Use for integration with the Core API.
     * <p>
     * This method is a part of Core API and has lower backward-compatibility
     * guarantees (we can change it in minor version).
     *
     * @return the meta-supplier for the configured file source
     * @throws IllegalStateException if {@code path} or {@code format} is not set
     * @throws JetException if Hadoop is required but no (or more than one)
     *                      {@link FileSourceFactory} is found on the classpath
     */
    @Nonnull
    public ProcessorMetaSupplier buildMetaSupplier() {
        if (path == null) {
            throw new IllegalStateException("Parameter 'path' is required");
        }
        if (format == null) {
            throw new IllegalStateException("Parameter 'format' is required");
        }
        FileSourceConfiguration<T> fsc = new FileSourceConfiguration<>(
                path, glob, format, sharedFileSystem, ignoreFileNotFound, options
        );
        if (shouldUseHadoop()) {
            ServiceLoader<FileSourceFactory> loader = ServiceLoader.load(FileSourceFactory.class);
            // Only one implementation is expected to be present on classpath
            Iterator<FileSourceFactory> iterator = loader.iterator();
            if (!iterator.hasNext()) {
                throw new JetException("No suitable FileSourceFactory found. " +
                        "Do you have Jet's Hadoop module on classpath?");
            }
            FileSourceFactory fileSourceFactory = iterator.next();
            if (iterator.hasNext()) {
                throw new JetException("Multiple FileSourceFactory implementations found");
            }
            return fileSourceFactory.create(fsc);
        }
        return new LocalFileSourceFactory().create(fsc);
    }

    /**
     * Hadoop is used either on explicit request or whenever the path points
     * to a remote filesystem (has a Hadoop URL prefix).
     */
    private boolean shouldUseHadoop() {
        return useHadoop || hasHadoopPrefix(path);
    }

    /**
     * Checks if the given path starts with one of the defined Hadoop
     * prefixes:
     * "s3a://", // Amazon S3
     * "hdfs://", // HDFS
     * "wasbs://", // Azure Cloud Storage
     * "adl://", // Azure Data Lake Gen 1
     * "abfs://", // Azure Data Lake Gen 2
     * "gs://" // Google Cloud Storage
     *
     * see {@link #HADOOP_PREFIXES}
     *
     * @param path the path to check
     * @return true if the path starts with a Hadoop prefix
     */
    public static boolean hasHadoopPrefix(String path) {
        return HADOOP_PREFIXES.stream().anyMatch(path::startsWith);
    }
}