All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.hazelcast.jet.hadoop.HadoopSinks Maven / Gradle / Ivy

The newest version!
/*
 * Copyright 2024 Hazelcast Inc.
 *
 * Licensed under the Hazelcast Community License (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://hazelcast.com/hazelcast-community-license
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.hazelcast.jet.hadoop;

import com.hazelcast.function.FunctionEx;
import com.hazelcast.jet.core.Processor;
import com.hazelcast.jet.pipeline.Sink;
import com.hazelcast.jet.pipeline.Sinks;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.MRJobConfig;

import javax.annotation.Nonnull;
import java.util.Map.Entry;

/**
 * Factories of Apache Hadoop sinks.
 *
 * @since Jet 3.0
 */
public final class HadoopSinks {

    private HadoopSinks() {
    }

    /**
     * Returns a sink that writes to Apache Hadoop HDFS. It transforms each
     * received item to a key-value pair using the two supplied mapping
     * functions. The type of the key and the value must conform to the
     * expectations of the output format specified in the {@code
     * configuration}.
     * 

* The sink creates a number of files in the output path, identified by the * cluster member UUID and the {@link Processor} index. Unlike MapReduce, * the data in the files is not sorted by key. *

* The supplied {@code Configuration} must specify an {@code OutputFormat} * class with a path. *

* The processor will use either the new or the old MapReduce API based on * the key which stores the {@code OutputFormat} configuration. If it's * stored under {@value MRJobConfig#OUTPUT_FORMAT_CLASS_ATTR}}, the new API * will be used. Otherwise, the old API will be used. If you get the * configuration from {@link Job#getConfiguration()}, the new API will be * used. *

* No state is saved to snapshot for this sink. After the job is restarted, * the files will be overwritten. If the cluster members change, some files * will be overwritten and some not - we don't clean the directory before * the execution starts. *

* The default local parallelism for this processor is 2 (or less if less CPUs * are available). * * @param configuration {@code Configuration} used for output format configuration * @param extractKeyF mapper to map a key to another key * @param extractValueF mapper to map a value to another value * @param stream item type * @param type of key to write to HDFS * @param type of value to write to HDFS */ @Nonnull public static Sink outputFormat( @Nonnull Configuration configuration, @Nonnull FunctionEx extractKeyF, @Nonnull FunctionEx extractValueF ) { return Sinks.fromProcessor("hdfsSink", HadoopProcessors.writeHadoopP(configuration, extractKeyF, extractValueF)); } /** * Convenience for {@link #outputFormat(Configuration, FunctionEx, * FunctionEx)} which expects {@code Map.Entry} as * input and extracts its key and value parts to be written to HDFS. */ @Nonnull public static Sink> outputFormat(@Nonnull Configuration configuration) { return outputFormat(configuration, Entry::getKey, Entry::getValue); } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy