com.hazelcast.jet.avro.AvroSinks Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of hazelcast-jet-avro Show documentation
Avro support for Hazelcast Jet
There is a newer version: 5.5.0
/*
 * Copyright 2021 Hazelcast Inc.
 *
 * Licensed under the Hazelcast Community License (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://hazelcast.com/hazelcast-community-license
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.hazelcast.jet.avro;

import com.hazelcast.function.SupplierEx;
import com.hazelcast.jet.pipeline.Sink;
import com.hazelcast.jet.pipeline.Sinks;
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.IndexedRecord;
import org.apache.avro.io.DatumWriter;
import org.apache.avro.reflect.ReflectDatumWriter;
import org.apache.avro.specific.SpecificDatumWriter;
import org.apache.avro.specific.SpecificRecord;

import javax.annotation.Nonnull;

/**
 * Contains factory methods for Apache Avro sinks.
 *
 * @since Jet 3.0
 */
public final class AvroSinks {

    private AvroSinks() {
    }

    /**
     * Returns a sink that that writes the items it receives to Apache Avro
     * files. Each processor will write to its own file whose name is equal to
     * the processor's global index (an integer unique to each processor of the
     * vertex), but a single pathname is used to resolve the containing
     * directory of all files, on all cluster members. The sink always
     * overwrites the files.
     * 
     * The sink creates a {@link DataFileWriter} for each processor using the
     * supplied {@code datumWriterSupplier} with the given {@link Schema}.
     * 

     * No state is saved to snapshot for this sink. After the job is restarted,
     * the items will be missing since files will be overwritten.
     * 
     * The default local parallelism for this sink is 1.
     *
     * @param directoryName directory to create the files in. Will be created
     *                      if it doesn't exist. Must be the same on all members.
     * @param schema the record schema
     * @param datumWriterSupplier the record writer supplier
     * @param  the type of the record
     */
    @Nonnull
    public static  Sink files(
            @Nonnull String directoryName,
            @Nonnull Schema schema,
            @Nonnull SupplierEx> datumWriterSupplier
    ) {

        return Sinks.fromProcessor("avroFilesSink(" + directoryName + ')',
                AvroProcessors.writeFilesP(directoryName, schema, datumWriterSupplier));
    }

    /**
     * Convenience for {@link #files(String, Schema,
     * SupplierEx)} which uses either {@link SpecificDatumWriter} or
     * {@link ReflectDatumWriter} depending on the supplied {@code recordClass}.
     */
    @Nonnull
    public static  Sink files(
            @Nonnull String directoryName,
            @Nonnull Class recordClass,
            @Nonnull Schema schema
    ) {
        return files(directoryName, schema, () -> SpecificRecord.class.isAssignableFrom(recordClass) ?
                new SpecificDatumWriter<>(recordClass) : new ReflectDatumWriter<>(recordClass));
    }

    /**
     * Convenience for {@link #files(String, Schema,
     * SupplierEx)} which uses {@link GenericDatumWriter}.
     */
    @Nonnull
    public static Sink files(
            @Nonnull String directoryName,
            @Nonnull Schema schema
    ) {
        return files(directoryName, schema, GenericDatumWriter::new);
    }
}