
// com.hazelcast.jet.pipeline.file.FileSourceBuilder
/*
* Copyright (c) 2008-2024, Hazelcast, Inc. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.hazelcast.jet.pipeline.file;
import com.hazelcast.jet.JetException;
import com.hazelcast.jet.core.ProcessorMetaSupplier;
import com.hazelcast.jet.pipeline.BatchSource;
import com.hazelcast.jet.pipeline.Sources;
import com.hazelcast.jet.pipeline.file.impl.FileSourceConfiguration;
import com.hazelcast.jet.pipeline.file.impl.FileSourceFactory;
import com.hazelcast.jet.pipeline.file.impl.LocalFileSourceFactory;
import javax.annotation.Nonnull;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.ServiceLoader;
import static com.hazelcast.jet.pipeline.file.WildcardMatcher.hasWildcard;
import static java.util.Objects.requireNonNull;
/**
 * A unified builder object for various kinds of file sources.
 * <p>
 * To create an instance, use {@link FileSources#files(String)}.
 *
 * @param <T> the type of items a source using this file format will emit
 * @since Jet 4.4
 */
public class FileSourceBuilder<T> {

    /**
     * URL scheme prefixes that mark a path as belonging to a remote,
     * Hadoop-backed filesystem rather than the local one.
     */
    private static final List<String> HADOOP_PREFIXES = List.of(
            "s3a://",   // Amazon S3
            "hdfs://",  // HDFS
            "wasbs://", // Azure Cloud Storage
            "adl://",   // Azure Data Lake Gen 1
            "abfs://",  // Azure Data Lake Gen 2
            "gs://"     // Google Cloud Storage
    );

    // Arbitrary key-value options passed through to the underlying source
    // implementation, see option(String, String).
    private final Map<String, String> options = new HashMap<>();

    private final String path;
    private String glob = "*";
    private FileFormat<T> format;
    private boolean useHadoop;
    private boolean sharedFileSystem;
    private boolean ignoreFileNotFound;

    /**
     * Creates a builder for the given directory path. Use
     * {@link FileSources#files(String)} to obtain an instance.
     *
     * @param path an absolute local path or a remote path with one of the
     *             {@linkplain #HADOOP_PREFIXES Hadoop prefixes}
     * @throws IllegalArgumentException if the path contains a wildcard or is
     *                                  a non-absolute local path
     */
    FileSourceBuilder(@Nonnull String path) {
        this.path = requireNonNull(path, "path must not be null");
        if (hasWildcard(path)) {
            throw new IllegalArgumentException("Provided path must not contain any wildcard characters, path: " + path);
        }
        // Remote URLs such as "s3a://bucket/dir" are not valid local paths, so
        // the absolute-path requirement applies only to plain filesystem paths.
        if (!(hasHadoopPrefix(path) || Paths.get(path).isAbsolute())) {
            throw new IllegalArgumentException("Provided path must be absolute, path: " + path);
        }
    }

    /**
     * Sets a glob pattern to filter the files in the specified directory. The
     * default value is '*', matching all files in the directory.
     *
     * @param glob glob pattern
     * @return this builder
     */
    @Nonnull
    public FileSourceBuilder<T> glob(@Nonnull String glob) {
        this.glob = requireNonNull(glob, "glob must not be null");
        return this;
    }

    /**
     * Set the file format for the source. See {@link FileFormat} for available
     * formats and factory methods.
     * <p>
     * It's not possible to implement a custom format.
     *
     * @param <T_NEW>    the type of items emitted with the new format
     * @param fileFormat the format to use
     * @return this builder, retyped to emit {@code T_NEW}
     */
    @Nonnull
    public <T_NEW> FileSourceBuilder<T_NEW> format(@Nonnull FileFormat<T_NEW> fileFormat) {
        // The cast is safe: `format` is the only field typed by T and it is
        // immediately overwritten with the new format below.
        @SuppressWarnings("unchecked")
        FileSourceBuilder<T_NEW> newThis = (FileSourceBuilder<T_NEW>) this;
        newThis.format = fileFormat;
        return newThis;
    }

    /**
     * Specifies that Jet should use Apache Hadoop for files from the local
     * filesystem. Otherwise, local files are read by Jet directly. One
     * advantage of Hadoop is that it can provide better parallelization when
     * the number of files is smaller than the total parallelism of the
     * pipeline source.
     * <p>
     * Default value is {@code false}.
     *
     * @param useHadoop if Hadoop should be used for reading local filesystem
     * @return this builder
     */
    @Nonnull
    public FileSourceBuilder<T> useHadoopForLocalFiles(boolean useHadoop) {
        this.useHadoop = useHadoop;
        return this;
    }

    /**
     * If {@code sharedFileSystem} is {@code true}, Jet will assume all members
     * see the same files. They will split the work so that each member will
     * read a part of the files. If {@code sharedFileSystem} is {@code false},
     * each member will read all files in the directory, assuming that other
     * members see different files.
     * <p>
     * This option applies only for the local filesystem when {@linkplain
     * #useHadoopForLocalFiles(boolean) Hadoop is not used} and when the
     * directory doesn't contain a prefix for a remote file system. Distributed
     * filesystems are always assumed to be shared.
     * <p>
     * If you start all the members on a single machine (such as for
     * development), set this property to {@code true}. If you have multiple
     * machines with multiple members each and the directory is not a shared
     * storage, it's not possible to configure the file reader correctly - use
     * only one member per machine.
     * <p>
     * Default value is {@code false}.
     *
     * @param sharedFileSystem whether all members see the same files
     * @return this builder
     */
    @Nonnull
    public FileSourceBuilder<T> sharedFileSystem(boolean sharedFileSystem) {
        this.sharedFileSystem = sharedFileSystem;
        return this;
    }

    /**
     * Set to true to ignore no matching files in the directory specified by
     * {@code path}.
     * <p>
     * When there is no file matching the glob specified by
     * {@link #glob(String)} (or the default glob) Jet throws an exception by
     * default. This might be problematic in some cases, where the directory
     * is empty. To override this behaviour set this to true.
     * <p>
     * If set to true and there are no files in the directory the source will
     * produce 0 items.
     * <p>
     * Default value is {@code false}.
     *
     * @param ignoreFileNotFound true if no files in the specified directory should be accepted
     * @return this builder
     */
    @Nonnull
    public FileSourceBuilder<T> ignoreFileNotFound(boolean ignoreFileNotFound) {
        this.ignoreFileNotFound = ignoreFileNotFound;
        return this;
    }

    /**
     * Specifies an arbitrary option for the underlying source. If you are
     * looking for a missing option, check out the {@link FileFormat} class
     * you're using, it offers parsing-related options.
     *
     * @param key   option name, must not be null
     * @param value option value, must not be null
     * @return this builder
     */
    @Nonnull
    public FileSourceBuilder<T> option(String key, String value) {
        requireNonNull(key, "key must not be null");
        requireNonNull(value, "value must not be null");
        options.put(key, value);
        return this;
    }

    /**
     * Builds a {@link BatchSource} based on the current state of the builder.
     */
    @Nonnull
    public BatchSource<T> build() {
        ProcessorMetaSupplier metaSupplier = buildMetaSupplier();
        // The source name mirrors the configuration; note the closing ')'.
        return Sources.batchFromProcessor(
                "files(path=" + path + ", glob=" + glob + ", hadoop=" + shouldUseHadoop() + ')',
                metaSupplier);
    }

    /**
     * Builds a {@link ProcessorMetaSupplier} based on the current state of the
     * builder. Use for integration with the Core API.
     * <p>
     * This method is a part of Core API and has lower backward-compatibility
     * guarantees (we can change it in minor version).
     *
     * @return the meta-supplier for the configured file source
     * @throws IllegalStateException if {@code path} or {@code format} is not set
     * @throws JetException if Hadoop is required but no (or more than one)
     *                      {@link FileSourceFactory} is found on the classpath
     */
    @Nonnull
    public ProcessorMetaSupplier buildMetaSupplier() {
        if (path == null) {
            throw new IllegalStateException("Parameter 'path' is required");
        }
        if (format == null) {
            throw new IllegalStateException("Parameter 'format' is required");
        }
        FileSourceConfiguration<T> fsc = new FileSourceConfiguration<>(
                path, glob, format, sharedFileSystem, ignoreFileNotFound, options
        );
        if (shouldUseHadoop()) {
            ServiceLoader<FileSourceFactory> loader = ServiceLoader.load(FileSourceFactory.class);
            // Only one implementation is expected to be present on classpath
            Iterator<FileSourceFactory> iterator = loader.iterator();
            if (!iterator.hasNext()) {
                throw new JetException("No suitable FileSourceFactory found. " +
                        "Do you have Jet's Hadoop module on classpath?");
            }
            FileSourceFactory fileSourceFactory = iterator.next();
            if (iterator.hasNext()) {
                throw new JetException("Multiple FileSourceFactory implementations found");
            }
            return fileSourceFactory.create(fsc);
        }
        return new LocalFileSourceFactory().create(fsc);
    }

    /**
     * Hadoop is used either on explicit request or whenever the path points
     * to a remote filesystem (has a Hadoop URL prefix).
     */
    private boolean shouldUseHadoop() {
        return useHadoop || hasHadoopPrefix(path);
    }

    /**
     * Checks if the given path starts with one of the defined Hadoop
     * prefixes:
     * "s3a://", // Amazon S3
     * "hdfs://", // HDFS
     * "wasbs://", // Azure Cloud Storage
     * "adl://", // Azure Data Lake Gen 1
     * "abfs://", // Azure Data Lake Gen 2
     * "gs://" // Google Cloud Storage
     *
     * see {@link #HADOOP_PREFIXES}
     *
     * @param path the path to check
     * @return true if the path starts with a Hadoop prefix
     */
    public static boolean hasHadoopPrefix(String path) {
        return HADOOP_PREFIXES.stream().anyMatch(path::startsWith);
    }
}