/*
* Copyright 2024 Hazelcast Inc.
*
* Licensed under the Hazelcast Community License (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://hazelcast.com/hazelcast-community-license
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.hazelcast.jet.hadoop;

import com.hazelcast.function.BiFunctionEx;
import com.hazelcast.function.ConsumerEx;
import com.hazelcast.jet.Util;
import com.hazelcast.jet.core.Processor;
import com.hazelcast.jet.hadoop.impl.SerializableConfiguration;
import com.hazelcast.jet.pipeline.BatchSource;
import com.hazelcast.jet.pipeline.Sources;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;
import javax.annotation.Nonnull;
import java.util.Map.Entry;

import static com.hazelcast.jet.hadoop.HadoopProcessors.readHadoopP;

/**
* Contains factory methods for Apache Hadoop sources.
*
* @since Jet 3.0
*/
public final class HadoopSources {
/**
* With the new HDFS API, some of the {@link RecordReader}s return the same
* key/value instances for each record, for example {@link LineRecordReader}.
* If this property is set to {@code true}, the source makes a copy of each
* object after applying the {@code projectionFn}. For readers which create
* a new instance for each record, the source can be configured not to copy
* the objects, improving performance.
* <p>
* Also, if you are using a projection function which doesn't refer to any
* mutable state of the key or value, it makes sense to set this property
* to {@code false} to avoid unnecessary copying.
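* <p>
* For example, a projection that converts the Hadoop {@code Text} value
* into a {@code String} returns a new, immutable object for each record,
* so copying can safely be disabled. A minimal sketch, assuming the input
* format produces {@code LongWritable}/{@code Text} pairs:
*
* <pre>{@code
* BiFunctionEx<LongWritable, Text, String> projectionFn =
*         (key, value) -> value.toString();
* }</pre>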
*
* <p>
* The source copies the objects by serializing and de-serializing them. The
* objects should be either {@link Writable} or serializable in a way which
* Jet can serialize/de-serialize.
* <p>
* Here is how you can configure the source. The default and always safe
* value is {@code true}:
*
* <pre>{@code
* Configuration conf = new Configuration();
* conf.setBoolean(HadoopSources.COPY_ON_READ, false);
* BatchSource<Entry<K, V>> source = HadoopSources.inputFormat(conf);
* }</pre>
*/
public static final String COPY_ON_READ = "jet.source.copyonread";
/**
* When reading files from the local file system using Hadoop, each
* processor reads files from its own local file system. If the local file
* system is shared between members, e.g. an NFS-mounted file system, you
* should set this property to {@code true}.
* <p>
* Here is how you can configure the source. The default value is
* {@code false}:
*
* <pre>{@code
* Configuration conf = new Configuration();
* conf.setBoolean(HadoopSources.SHARED_LOCAL_FS, true);
* BatchSource<Entry<K, V>> source = HadoopSources.inputFormat(conf);
* }</pre>
*
* @since Jet 4.4
*/
public static final String SHARED_LOCAL_FS = "jet.source.sharedlocalfs";
/**
* If set to {@code true}, the source ignores input files that do not
* exist instead of failing the job.
*
* @since Jet 4.4
*/
public static final String IGNORE_FILE_NOT_FOUND = "jet.source.ignorefilenotfound";
private HadoopSources() {
}
/**
* Returns a source that reads records from Apache Hadoop HDFS and emits
* the results of transforming each record (a key-value pair) with the
* supplied projection function.
* <p>
* This source splits and balances the input data among Jet {@linkplain
* Processor processors}, doing its best to achieve data locality. To this
* end the Jet cluster topology should be aligned with Hadoop's &mdash; on
* each Hadoop member there should be a Jet member.
* <p>
* The processor will use either the new or the old MapReduce API based on
* the key which stores the {@code InputFormat} configuration. If it's
* stored under {@value MRJobConfig#INPUT_FORMAT_CLASS_ATTR}, the new API
* will be used. Otherwise, the old API will be used. If you get the
* configuration from {@link Job#getConfiguration()}, the new API will be
* used. Please see {@link #COPY_ON_READ} if you are using the new API.
* <p>
* The default local parallelism for this processor is 2 (or fewer if
* fewer CPUs are available).
* <p>
* This source does not save any state to the snapshot. If the job is
* restarted, all entries will be emitted again.
*
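* <p>
* A minimal usage sketch, assuming a text input format; the path and the
* {@code TextInputFormat}/{@code FileInputFormat} setup are illustrative:
*
* <pre>{@code
* Job job = Job.getInstance();
* job.setInputFormatClass(TextInputFormat.class);
* FileInputFormat.addInputPath(job, new Path("hdfs://namenode:8020/input"));
* BatchSource<String> source = HadoopSources.inputFormat(
*         job.getConfiguration(), (LongWritable key, Text value) -> value.toString());
* }</pre>
*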
* @param <K> key type of the records
* @param <V> value type of the records
* @param <E> the type of the emitted value
* @param configuration JobConf for reading files with the appropriate
* input format and path
* @param projectionFn function to create output objects from key and value.
* If the projection returns a {@code null} for an item, that item
* will be filtered out
*/
@Nonnull
public static <K, V, E> BatchSource<E> inputFormat(
@Nonnull Configuration configuration,
@Nonnull BiFunctionEx<K, V, E> projectionFn
) {
return Sources.batchFromProcessor("hdfsSource",
readHadoopP(SerializableConfiguration.asSerializable(configuration), projectionFn));
}
/**
* Returns a source that reads records from Apache Hadoop HDFS and emits
* the results of transforming each record (a key-value pair) with the
* supplied projection function.
* <p>
* This source splits and balances the input data among Jet {@linkplain
* Processor processors}, doing its best to achieve data locality. To this
* end the Jet cluster topology should be aligned with Hadoop's &mdash; on
* each Hadoop member there should be a Jet member.
* <p>
* The {@code configureFn} is used to configure the MR job. The function is
* run on the coordinator node of the Jet job, which avoids the need to
* contact the server from the machine where the job is submitted.
* <p>
* The new MapReduce API will be used.
* <p>
* The default local parallelism for this processor is 2 (or fewer if
* fewer CPUs are available).
* <p>
* This source does not save any state to the snapshot. If the job is
* restarted, all entries will be emitted again.
*
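* <p>
* A minimal usage sketch, assuming a text input format; the configuration
* keys and the path are illustrative:
*
* <pre>{@code
* BatchSource<String> source = HadoopSources.inputFormat(conf -> {
*     conf.set(MRJobConfig.INPUT_FORMAT_CLASS_ATTR, TextInputFormat.class.getName());
*     conf.set(FileInputFormat.INPUT_DIR, "hdfs://namenode:8020/input");
* }, (LongWritable key, Text value) -> value.toString());
* }</pre>
*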
* @param <K> key type of the records
* @param <V> value type of the records
* @param <E> the type of the emitted value
* @param configureFn function to configure the MR job
* @param projectionFn function to create output objects from key and value.
* If the projection returns a {@code null} for an item, that item
* will be filtered out
*/
@Nonnull
public static <K, V, E> BatchSource<E> inputFormat(
@Nonnull ConsumerEx<Configuration> configureFn,
@Nonnull BiFunctionEx<K, V, E> projectionFn
) {
return Sources.batchFromProcessor("readHadoop", readHadoopP(null, configureFn, projectionFn));
}
/**
* Convenience for {@link #inputFormat(Configuration, BiFunctionEx)}
* with {@link java.util.Map.Entry} as its output type.
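* <p>
* A short pipeline sketch, assuming a text input format (the {@code job}
* variable is illustrative):
*
* <pre>{@code
* Pipeline p = Pipeline.create();
* p.readFrom(HadoopSources.<LongWritable, Text>inputFormat(job.getConfiguration()))
*  .map(e -> e.getValue().toString())
*  .writeTo(Sinks.logger());
* }</pre>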
*/
@Nonnull
public static <K, V> BatchSource<Entry<K, V>> inputFormat(@Nonnull Configuration jobConf) {
return inputFormat(jobConf, (BiFunctionEx<K, V, Entry<K, V>>) Util::entry);
}
}