All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.adgear.anoa.tools.runnable.AvroConcatenator Maven / Gradle / Ivy

Go to download

Additional functionality complementing the anoa-core module, requiring additional upstream dependencies such as jackson-databind and various jackson dataformats.

There is a newer version: 3.1.2
Show newest version
package com.adgear.anoa.tools.runnable;

import org.apache.avro.Schema;
import org.apache.avro.file.DataFileStream;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;
import org.jooq.lambda.Unchecked;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.UncheckedIOException;
import java.util.List;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;

/**
 * A Runnable for concatenating Avro record batches from multiple input streams.
 *
 * <p>The schema embedded in the first input stream is used both to read every subsequent input
 * and to write the output. The input streams passed to the constructor are not closed by
 * {@link #run()}; they remain owned by the caller. The output writer is closed at the end of
 * {@link #run()}, which (per Avro's {@code DataFileWriter}) also closes the output stream.
 */
public class AvroConcatenator implements Runnable {

  protected final List<InputStream> inputStreams;
  protected final OutputStream outputStream;

  /**
   * @param inputStreams Avro batch file input streams. The first stream provides the {@link
   *                     org.apache.avro.Schema} used during processing.
   * @param outputStream Output stream to Avro batch file.
   * @throws IllegalArgumentException if {@code inputStreams} is empty.
   */
  public AvroConcatenator(List<InputStream> inputStreams, OutputStream outputStream) {
    if (inputStreams.isEmpty()) {
      throw new IllegalArgumentException("Requires at least 1 input stream.");
    }
    this.inputStreams = inputStreams;
    this.outputStream = outputStream;
  }

  /**
   * Copies every record block from each input stream, in list order, into a single Avro batch
   * written to the output stream.
   *
   * @throws UncheckedIOException if reading any input or writing the output fails.
   */
  @Override
  public void run() {
    try {
      // The first stream is opened without a reader schema so that its embedded writer schema
      // can be discovered and reused for everything else.
      DataFileStream<GenericRecord> first = new DataFileStream<>(
          inputStreams.get(0), new GenericDatumReader<GenericRecord>());
      Schema schema = first.getSchema();
      Function<InputStream, DataFileStream<GenericRecord>> builder = Unchecked.function(
          stream -> new DataFileStream<>(stream, new GenericDatumReader<GenericRecord>(schema)));

      try (DataFileWriter<GenericRecord> writer = new DataFileWriter<GenericRecord>(
          new GenericDatumWriter<GenericRecord>(schema))
          .create(schema, outputStream)) {
        // 'true' asks Avro to recompress the copied blocks with this writer's codec.
        writer.appendAllFrom(first, true);
        // forEachOrdered: concatenation order must follow the order of the input list.
        inputStreams.stream().skip(1).sequential()
            .map(builder)
            .forEachOrdered(Unchecked.consumer(dfs -> writer.appendAllFrom(dfs, true)));
      }
    } catch (IOException e) {
      throw new UncheckedIOException(e);
    }
  }

  /**
   * Command-line entry point: concatenates the Avro batch files named by {@code args}, in order,
   * and writes the result to standard output.
   *
   * @param args paths of the Avro batch files to concatenate; at least one is required.
   * @throws IllegalArgumentException if a path does not exist or is a directory, or if no path
   *                                  is given.
   */
  public static void main(String[] args) {
    List<InputStream> inputs = Stream.of(args)
        .map(File::new)
        .map(AvroConcatenator::requireExistingFile)
        .map(Unchecked.function(FileInputStream::new))
        .collect(Collectors.toList());
    try {
      new AvroConcatenator(inputs, System.out).run();
    } finally {
      // run() leaves the inputs open, and main opened them, so main releases them.
      closeAll(inputs);
    }
  }

  /** Returns {@code f} unchanged after checking that it exists and is not a directory. */
  private static File requireExistingFile(File f) {
    if (!f.exists()) {
      throw new IllegalArgumentException("File '" + f + "' does not exist.");
    }
    if (f.isDirectory()) {
      throw new IllegalArgumentException("File '" + f + "' is a directory.");
    }
    return f;
  }

  /** Closes every stream in {@code streams}, continuing past individual close failures. */
  private static void closeAll(List<InputStream> streams) {
    for (InputStream stream : streams) {
      try {
        stream.close();
      } catch (IOException ignored) {
        // Best effort: these are read-only inputs, so a failed close cannot lose data and
        // must not mask an exception already propagating from run().
      }
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy