/*
* Copyright 2024 Hazelcast Inc.
*
* Licensed under the Hazelcast Community License (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://hazelcast.com/hazelcast-community-license
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.hazelcast.jet.hadoop;

import com.hazelcast.function.BiFunctionEx;
import com.hazelcast.function.ConsumerEx;
import com.hazelcast.jet.Util;
import com.hazelcast.jet.core.Processor;
import com.hazelcast.jet.hadoop.impl.SerializableConfiguration;
import com.hazelcast.jet.pipeline.BatchSource;
import com.hazelcast.jet.pipeline.Sources;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;
import javax.annotation.Nonnull;
import java.util.Map.Entry;

import static com.hazelcast.jet.hadoop.HadoopProcessors.readHadoopP;

/**
* Contains factory methods for Apache Hadoop sources.
*
* @since Jet 3.0
*/
public final class HadoopSources {
/**
* With the new HDFS API, some of the {@link RecordReader}s return the same
* key/value instances for each record, for example {@link LineRecordReader}.
* If this property is set to {@code true}, the source makes a copy of each
* object after applying the {@code projectionFn}. For readers which create
* a new instance for each record, the source can be configured not to copy
* the objects, improving performance.
* <p>
* Also, if you are using a projection function which doesn't refer to any
* mutable state of the key or value, it makes sense to set this property
* to {@code false} to avoid unnecessary copying.
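* <p>
* For example, a projection that converts the Hadoop {@code Text} value
* into a {@code String} returns a new, immutable object for each record,
* so copying can safely be disabled. A minimal sketch, assuming the input
* format produces {@code LongWritable}/{@code Text} pairs:
*
* <pre>{@code
* BiFunctionEx<LongWritable, Text, String> projectionFn =
*         (key, value) -> value.toString();
* }</pre>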
*
* <p>
* The source copies the objects by serializing and de-serializing them. The
* objects should be either {@link Writable} or serializable in a way which
* Jet can serialize/de-serialize.
* <p>
* Here is how you can configure the source. The default and always safe
* value is {@code true}:
*
* <pre>{@code
* Configuration conf = new Configuration();
* conf.setBoolean(HadoopSources.COPY_ON_READ, false);
* BatchSource<Entry<K, V>> source = HadoopSources.inputFormat(conf);
* }</pre>
*/
public static final String COPY_ON_READ = "jet.source.copyonread";
/**
* When reading files from the local file system using Hadoop, each
* processor reads files from its own local file system. If the local file
* system is shared between members, e.g. an NFS-mounted file system, you
* should set this property to {@code true}.
* <p>
* Here is how you can configure the source. The default value is
* {@code false}:
*
* <pre>{@code
* Configuration conf = new Configuration();
* conf.setBoolean(HadoopSources.SHARED_LOCAL_FS, true);
* BatchSource<Entry<K, V>> source = HadoopSources.inputFormat(conf);
* }</pre>
*
* @since Jet 4.4
*/
public static final String SHARED_LOCAL_FS = "jet.source.sharedlocalfs";
/**
* If set to {@code true}, the source ignores input files that do not
* exist instead of failing the job.
*
* @since Jet 4.4
*/
public static final String IGNORE_FILE_NOT_FOUND = "jet.source.ignorefilenotfound";
private HadoopSources() {
}
/**
* Returns a source that reads records from Apache Hadoop HDFS and emits
* the results of transforming each record (a key-value pair) with the
* supplied projection function.
* <p>
* This source splits and balances the input data among Jet {@linkplain
* Processor processors}, doing its best to achieve data locality. To this
* end the Jet cluster topology should be aligned with Hadoop's &mdash; on
* each Hadoop member there should be a Jet member.
* <p>
* The processor will use either the new or the old MapReduce API based on
* the key which stores the {@code InputFormat} configuration. If it's
* stored under {@value MRJobConfig#INPUT_FORMAT_CLASS_ATTR}, the new API
* will be used. Otherwise, the old API will be used. If you get the
* configuration from {@link Job#getConfiguration()}, the new API will be
* used. Please see {@link #COPY_ON_READ} if you are using the new API.
* <p>
* The default local parallelism for this processor is 2 (or fewer if
* fewer CPUs are available).
* <p>
* This source does not save any state to the snapshot. If the job is
* restarted, all entries will be emitted again.
*
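* <p>
* A minimal usage sketch, assuming a text input format; the path and the
* {@code TextInputFormat}/{@code FileInputFormat} setup are illustrative:
*
* <pre>{@code
* Job job = Job.getInstance();
* job.setInputFormatClass(TextInputFormat.class);
* FileInputFormat.addInputPath(job, new Path("hdfs://namenode:8020/input"));
* BatchSource<String> source = HadoopSources.inputFormat(
*         job.getConfiguration(), (LongWritable key, Text value) -> value.toString());
* }</pre>
*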
* @param <K> key type of the records
* @param <V> value type of the records
* @param <E> the type of the emitted value
* @param configuration JobConf for reading files with the appropriate
* input format and path
* @param projectionFn function to create output objects from key and value.
* If the projection returns a {@code null} for an item, that item
* will be filtered out
*/
@Nonnull
public static <K, V, E> BatchSource<E> inputFormat(
@Nonnull Configuration configuration,
@Nonnull BiFunctionEx<K, V, E> projectionFn
) {
return Sources.batchFromProcessor("hdfsSource",
readHadoopP(SerializableConfiguration.asSerializable(configuration), projectionFn));
}
/**
* Returns a source that reads records from Apache Hadoop HDFS and emits
* the results of transforming each record (a key-value pair) with the
* supplied projection function.
* <p>
* This source splits and balances the input data among Jet {@linkplain
* Processor processors}, doing its best to achieve data locality. To this
* end the Jet cluster topology should be aligned with Hadoop's &mdash; on
* each Hadoop member there should be a Jet member.
* <p>
* The {@code configureFn} is used to configure the MR job. The function is
* run on the coordinator node of the Jet job, which avoids the need to
* contact the server from the machine where the job is submitted.
* <p>
* The new MapReduce API will be used.
* <p>
* The default local parallelism for this processor is 2 (or fewer if
* fewer CPUs are available).
* <p>
* This source does not save any state to the snapshot. If the job is
* restarted, all entries will be emitted again.
*
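* <p>
* A minimal usage sketch, assuming a text input format; the configuration
* keys and the path are illustrative:
*
* <pre>{@code
* BatchSource<String> source = HadoopSources.inputFormat(conf -> {
*     conf.set(MRJobConfig.INPUT_FORMAT_CLASS_ATTR, TextInputFormat.class.getName());
*     conf.set(FileInputFormat.INPUT_DIR, "hdfs://namenode:8020/input");
* }, (LongWritable key, Text value) -> value.toString());
* }</pre>
*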
* @param <K> key type of the records
* @param <V> value type of the records
* @param <E> the type of the emitted value
* @param configureFn function to configure the MR job
* @param projectionFn function to create output objects from key and value.
* If the projection returns a {@code null} for an item, that item
* will be filtered out
*/
@Nonnull
public static <K, V, E> BatchSource<E> inputFormat(
@Nonnull ConsumerEx<Configuration> configureFn,
@Nonnull BiFunctionEx<K, V, E> projectionFn
) {
return Sources.batchFromProcessor("readHadoop", readHadoopP(null, configureFn, projectionFn));
}
/**
* Convenience for {@link #inputFormat(Configuration, BiFunctionEx)}
* with {@link java.util.Map.Entry} as its output type.
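* <p>
* A short pipeline sketch, assuming a text input format (the {@code job}
* variable is illustrative):
*
* <pre>{@code
* Pipeline p = Pipeline.create();
* p.readFrom(HadoopSources.<LongWritable, Text>inputFormat(job.getConfiguration()))
*  .map(e -> e.getValue().toString())
*  .writeTo(Sinks.logger());
* }</pre>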
*/
@Nonnull
public static <K, V> BatchSource<Entry<K, V>> inputFormat(@Nonnull Configuration jobConf) {
return inputFormat(jobConf, (BiFunctionEx<K, V, Entry<K, V>>) Util::entry);
}
}