All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.avro.file.DataFileWriter Maven / Gradle / Ivy

The newest version!
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.avro.file;

import java.io.BufferedOutputStream;
import java.io.ByteArrayOutputStream;
import java.io.Closeable;
import java.io.File;
import java.io.FilterOutputStream;
import java.io.Flushable;
import java.io.IOException;
import java.io.OutputStream;
import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.HashMap;
import java.util.Map;
import java.util.UUID;

import org.apache.avro.AvroRuntimeException;
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileStream.DataBlock;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.io.BinaryEncoder;
import org.apache.avro.io.DatumWriter;
import org.apache.avro.io.EncoderFactory;

/** Stores in a file a sequence of data conforming to a schema.  The schema is
 * stored in the file with the data.  Each datum in a file is of the same
 * schema.  Data is written with a {@link DatumWriter}.  Data is grouped into
 * blocks.  A synchronization marker is written between blocks, so that
 * files may be split.  Blocks may be compressed.  Extensible metadata is
 * stored at the end of the file.  Files may be appended to.
 * @see DataFileReader
 */
public class DataFileWriter implements Closeable, Flushable {
  private Schema schema;
  private DatumWriter dout;

  private OutputStream underlyingStream;

  private BufferedFileOutputStream out;
  private BinaryEncoder vout;

  private final Map meta = new HashMap();

  private long blockCount;                       // # entries in current block

  private NonCopyingByteArrayOutputStream buffer;
  private BinaryEncoder bufOut;

  private byte[] sync;                          // 16 random bytes
  private int syncInterval = DataFileConstants.DEFAULT_SYNC_INTERVAL;

  private boolean isOpen;
  private Codec codec;

  private boolean flushOnEveryBlock = true;

  /** Construct a writer, not yet open. */
  public DataFileWriter(DatumWriter dout) {
    this.dout = dout;
  }

  private void assertOpen() {
    if (!isOpen) throw new AvroRuntimeException("not open");
  }
  private void assertNotOpen() {
    if (isOpen) throw new AvroRuntimeException("already open");
  }

  /**
   * Configures this writer to use the given codec.
   * May not be reset after writes have begun.
   */
  public DataFileWriter setCodec(CodecFactory c) {
    assertNotOpen();
    this.codec = c.createInstance();
    setMetaInternal(DataFileConstants.CODEC, codec.getName());
    return this;
  }

  /**
   * Set the synchronization interval for this file, in bytes.
   * Valid values range from 32 to 2^30
   * Suggested values are between 2K and 2M
   *
   * The stream is flushed by default at the end of each synchronization
   * interval.
   *
   * If {@linkplain #setFlushOnEveryBlock(boolean)} is
   * called with param set to false, then the block may not be flushed to the
   * stream after the sync marker is written. In this case,
   * the {@linkplain #flush()} must be called to flush the stream.
   *
   * Invalid values throw IllegalArgumentException
   *
   * @param syncInterval
   *   the approximate number of uncompressed bytes to write in each block
   * @return
   *   this DataFileWriter
   */
  public DataFileWriter setSyncInterval(int syncInterval) {
    if (syncInterval < 32 || syncInterval > (1 << 30)) {
      throw new IllegalArgumentException("Invalid syncInterval value: " + syncInterval);
    }
    this.syncInterval = syncInterval;
    return this;
  }

  /** Open a new file for data matching a schema with a random sync. */
  public DataFileWriter create(Schema schema, File file) throws IOException {
    return create(schema, new SyncableFileOutputStream(file), null);
  }

  /** Open a new file for data matching a schema with a random sync. */
  public DataFileWriter create(Schema schema, OutputStream outs)
    throws IOException {
    return create(schema, outs, null);
  }

  /** Open a new file for data matching a schema with an explicit sync. */
  public DataFileWriter create(Schema schema, OutputStream outs, byte[] sync)
    throws IOException {
    assertNotOpen();

    this.schema = schema;
    setMetaInternal(DataFileConstants.SCHEMA, schema.toString());
    if (sync == null ) {
      this.sync = generateSync();
    } else if (sync.length == 16) {
      this.sync = sync;
    } else {
      throw new IOException("sync must be exactly 16 bytes");
    }

    init(outs);

    vout.writeFixed(DataFileConstants.MAGIC);           // write magic

    vout.writeMapStart();                         // write metadata
    vout.setItemCount(meta.size());
    for (Map.Entry entry : meta.entrySet()) {
      vout.startItem();
      vout.writeString(entry.getKey());
      vout.writeBytes(entry.getValue());
    }
    vout.writeMapEnd();
    vout.writeFixed(this.sync); // write initial sync
    vout.flush(); //vout may be buffered, flush before writing to out
    return this;
  }

  /**
   * Set whether this writer should flush the block to the stream every time
   * a sync marker is written. By default, the writer will flush the buffer
   * each time a sync marker is written (if the block size limit is reached
   * or the {@linkplain #sync()} is called.
   * @param flushOnEveryBlock - If set to false, this writer will not flush
   *                          the block to the stream until {@linkplain
   *                          #flush()} is explicitly called.
   */
  public void setFlushOnEveryBlock(boolean flushOnEveryBlock) {
    this.flushOnEveryBlock = flushOnEveryBlock;
  }

  /**
   * @return - true if this writer flushes the block to the stream every time
   * a sync marker is written. Else returns false.
   */
  public boolean isFlushOnEveryBlock() {
    return this.flushOnEveryBlock;
  }

  /** Open a writer appending to an existing file. */
  public DataFileWriter appendTo(File file) throws IOException {
    return appendTo(new SeekableFileInput(file),
                    new SyncableFileOutputStream(file, true));
  }

  /** Open a writer appending to an existing file.
   * @param in reading the existing file.
   * @param out positioned at the end of the existing file.
   */
  public DataFileWriter appendTo(SeekableInput in, OutputStream out)
    throws IOException {
    assertNotOpen();
    DataFileReader reader =
      new DataFileReader(in, new GenericDatumReader());
    this.schema = reader.getSchema();
    this.sync = reader.getHeader().sync;
    this.meta.putAll(reader.getHeader().meta);
    byte[] codecBytes = this.meta.get(DataFileConstants.CODEC);
    if (codecBytes != null) {
      String strCodec = new String(codecBytes, "UTF-8");
      this.codec = CodecFactory.fromString(strCodec).createInstance();
    } else {
      this.codec = CodecFactory.nullCodec().createInstance();
    }
    reader.close();

    init(out);

    return this;
  }

  private void init(OutputStream outs) throws IOException {
    this.underlyingStream = outs;
    this.out = new BufferedFileOutputStream(outs);
    EncoderFactory efactory = new EncoderFactory();
    this.vout = efactory.binaryEncoder(out, null);
    dout.setSchema(schema);
    buffer = new NonCopyingByteArrayOutputStream(
        Math.min((int)(syncInterval * 1.25), Integer.MAX_VALUE/2 -1));
    this.bufOut = efactory.binaryEncoder(buffer, null);
    if (this.codec == null) {
      this.codec = CodecFactory.nullCodec().createInstance();
    }
    this.isOpen = true;
  }

  private static byte[] generateSync() {
    try {
      MessageDigest digester = MessageDigest.getInstance("MD5");
      long time = System.currentTimeMillis();
      digester.update((UUID.randomUUID()+"@"+time).getBytes());
      return digester.digest();
    } catch (NoSuchAlgorithmException e) {
      throw new RuntimeException(e);
    }
  }

  private DataFileWriter setMetaInternal(String key, byte[] value) {
    assertNotOpen();
    meta.put(key, value);
    return this;
  }

  private DataFileWriter setMetaInternal(String key, String value) {
    try {
      return setMetaInternal(key, value.getBytes("UTF-8"));
    } catch (UnsupportedEncodingException e) {
      throw new RuntimeException(e);
    }
  }

  /** Set a metadata property. */
  public DataFileWriter setMeta(String key, byte[] value) {
    if (isReservedMeta(key)) {
      throw new AvroRuntimeException("Cannot set reserved meta key: " + key);
    }
    return setMetaInternal(key, value);
  }

  public static boolean isReservedMeta(String key) {
    return key.startsWith("avro.");
  }

  /** Set a metadata property. */
  public DataFileWriter setMeta(String key, String value) {
    try {
      return setMeta(key, value.getBytes("UTF-8"));
    } catch (UnsupportedEncodingException e) {
      throw new RuntimeException(e);
    }
  }
  /** Set a metadata property. */
  public DataFileWriter setMeta(String key, long value) {
    return setMeta(key, Long.toString(value));
  }

  /** Thrown by {@link #append(Object)} when an exception occurs while writing a
   * datum to the buffer.  When this is thrown, the file is unaltered and may
   * continue to be appended to. */
  public static class AppendWriteException extends RuntimeException {
    public AppendWriteException(Exception e) { super(e); }
  }

  /** Append a datum to the file.
   * @see AppendWriteException
   */
  public void append(D datum) throws IOException {
    assertOpen();
    int usedBuffer = bufferInUse();
    try {
      dout.write(datum, bufOut);
    } catch (IOException e) {
      resetBufferTo(usedBuffer);
      throw new AppendWriteException(e);
    } catch (RuntimeException re) {
      resetBufferTo(usedBuffer);
      throw new AppendWriteException(re);
    }
    blockCount++;
    writeIfBlockFull();
  }

  // if there is an error encoding, flush the encoder and then
  // reset the buffer position to contain size bytes, discarding the rest.
  // Otherwise the file will be corrupt with a partial record.
  private void resetBufferTo(int size) throws IOException {
    bufOut.flush();
    byte[] data = buffer.toByteArray();
    buffer.reset();
    buffer.write(data, 0, size);
  }

  /** Expert: Append a pre-encoded datum to the file.  No validation is
   * performed to check that the encoding conforms to the file's schema.
   * Appending non-conforming data may result in an unreadable file. */
  public void appendEncoded(ByteBuffer datum) throws IOException {
    assertOpen();
    bufOut.writeFixed(datum);
    blockCount++;
    writeIfBlockFull();
  }

  private int bufferInUse() {
    return (buffer.size() + bufOut.bytesBuffered());
  }

  private void writeIfBlockFull() throws IOException {
    if (bufferInUse() >= syncInterval)
      writeBlock();
  }

  /**
   * Appends data from another file.  otherFile must have the same schema.
   * Data blocks will be copied without de-serializing data.  If the codecs
   * of the two files are compatible, data blocks are copied directly without
   * decompression.  If the codecs are not compatible, blocks from otherFile
   * are uncompressed and then compressed using this file's codec.
   * 

* If the recompress flag is set all blocks are decompressed and then compressed * using this file's codec. This is useful when the two files have compatible * compression codecs but different codec options. For example, one might * append a file compressed with deflate at compression level 1 to a file with * deflate at compression level 7. If recompress is false, blocks * will be copied without changing the compression level. If true, they will * be converted to the new compression level. * @param otherFile * @param recompress * @throws IOException */ public void appendAllFrom(DataFileStream otherFile, boolean recompress) throws IOException { assertOpen(); // make sure other file has same schema Schema otherSchema = otherFile.getSchema(); if (!this.schema.equals(otherSchema)) { throw new IOException("Schema from file " + otherFile + " does not match"); } // flush anything written so far writeBlock(); Codec otherCodec = otherFile.resolveCodec(); DataBlock nextBlockRaw = null; if (codec.equals(otherCodec) && !recompress) { // copy raw bytes while(otherFile.hasNextBlock()) { nextBlockRaw = otherFile.nextRawBlock(nextBlockRaw); nextBlockRaw.writeBlockTo(vout, sync); } } else { while(otherFile.hasNextBlock()) { nextBlockRaw = otherFile.nextRawBlock(nextBlockRaw); nextBlockRaw.decompressUsing(otherCodec); nextBlockRaw.compressUsing(codec); nextBlockRaw.writeBlockTo(vout, sync); } } } private void writeBlock() throws IOException { if (blockCount > 0) { bufOut.flush(); ByteBuffer uncompressed = buffer.getByteArrayAsByteBuffer(); DataBlock block = new DataBlock(uncompressed, blockCount); block.setFlushOnWrite(flushOnEveryBlock); block.compressUsing(codec); block.writeBlockTo(vout, sync); buffer.reset(); blockCount = 0; } } /** Return the current position as a value that may be passed to {@link * DataFileReader#seek(long)}. Forces the end of the current block, * emitting a synchronization marker. By default, this will also flush the * block to the stream. 
* * If {@linkplain #setFlushOnEveryBlock(boolean)} is * called with param set to false, then this method may not flush * the block. In this case, the {@linkplain #flush()} must be called to * flush the stream. */ public long sync() throws IOException { assertOpen(); writeBlock(); return out.tell(); } /** Calls {@linkplain #sync()} and then flushes the current state of the * file. */ @Override public void flush() throws IOException { sync(); vout.flush(); } /** * If this writer was instantiated using a File or using an * {@linkplain Syncable} instance, this method flushes all buffers for this * writer to disk. In other cases, this method behaves exactly * like {@linkplain #flush()}. * * @throws IOException */ public void fSync() throws IOException { flush(); if (underlyingStream instanceof Syncable) { ((Syncable) underlyingStream).sync(); } } /** Flush and close the file. */ @Override public void close() throws IOException { if (isOpen) { flush(); out.close(); isOpen = false; } } private class BufferedFileOutputStream extends BufferedOutputStream { private long position; // start of buffer private class PositionFilter extends FilterOutputStream { public PositionFilter(OutputStream out) throws IOException { super(out); } @Override public void write(byte[] b, int off, int len) throws IOException { out.write(b, off, len); position += len; // update on write } } public BufferedFileOutputStream(OutputStream out) throws IOException { super(null); this.out = new PositionFilter(out); } public long tell() { return position+count; } } private static class NonCopyingByteArrayOutputStream extends ByteArrayOutputStream { NonCopyingByteArrayOutputStream(int initialSize) { super(initialSize); } ByteBuffer getByteArrayAsByteBuffer() { return ByteBuffer.wrap(buf, 0, count); } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy