org.apache.avro.file.DataFileWriter Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of spark-core Show documentation
Show all versions of spark-core Show documentation
Shaded version of Apache Spark 2.x.x for Presto
The newest version!
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.avro.file;
import java.io.BufferedOutputStream;
import java.io.ByteArrayOutputStream;
import java.io.Closeable;
import java.io.File;
import java.io.FilterOutputStream;
import java.io.Flushable;
import java.io.IOException;
import java.io.OutputStream;
import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.HashMap;
import java.util.Map;
import java.util.UUID;
import org.apache.avro.AvroRuntimeException;
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileStream.DataBlock;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.io.BinaryEncoder;
import org.apache.avro.io.DatumWriter;
import org.apache.avro.io.EncoderFactory;
/** Stores in a file a sequence of data conforming to a schema. The schema is
* stored in the file with the data. Each datum in a file is of the same
* schema. Data is written with a {@link DatumWriter}. Data is grouped into
* blocks. A synchronization marker is written between blocks, so that
* files may be split. Blocks may be compressed. Extensible metadata is
* stored at the end of the file. Files may be appended to.
* @see DataFileReader
*/
public class DataFileWriter implements Closeable, Flushable {
private Schema schema;
private DatumWriter dout;
private OutputStream underlyingStream;
private BufferedFileOutputStream out;
private BinaryEncoder vout;
private final Map meta = new HashMap();
private long blockCount; // # entries in current block
private NonCopyingByteArrayOutputStream buffer;
private BinaryEncoder bufOut;
private byte[] sync; // 16 random bytes
private int syncInterval = DataFileConstants.DEFAULT_SYNC_INTERVAL;
private boolean isOpen;
private Codec codec;
private boolean flushOnEveryBlock = true;
/** Construct a writer, not yet open. */
public DataFileWriter(DatumWriter dout) {
this.dout = dout;
}
private void assertOpen() {
if (!isOpen) throw new AvroRuntimeException("not open");
}
private void assertNotOpen() {
if (isOpen) throw new AvroRuntimeException("already open");
}
/**
* Configures this writer to use the given codec.
* May not be reset after writes have begun.
*/
public DataFileWriter setCodec(CodecFactory c) {
assertNotOpen();
this.codec = c.createInstance();
setMetaInternal(DataFileConstants.CODEC, codec.getName());
return this;
}
/**
* Set the synchronization interval for this file, in bytes.
* Valid values range from 32 to 2^30
* Suggested values are between 2K and 2M
*
* The stream is flushed by default at the end of each synchronization
* interval.
*
* If {@linkplain #setFlushOnEveryBlock(boolean)} is
* called with param set to false, then the block may not be flushed to the
* stream after the sync marker is written. In this case,
* the {@linkplain #flush()} must be called to flush the stream.
*
* Invalid values throw IllegalArgumentException
*
* @param syncInterval
* the approximate number of uncompressed bytes to write in each block
* @return
* this DataFileWriter
*/
public DataFileWriter setSyncInterval(int syncInterval) {
if (syncInterval < 32 || syncInterval > (1 << 30)) {
throw new IllegalArgumentException("Invalid syncInterval value: " + syncInterval);
}
this.syncInterval = syncInterval;
return this;
}
/** Open a new file for data matching a schema with a random sync. */
public DataFileWriter create(Schema schema, File file) throws IOException {
return create(schema, new SyncableFileOutputStream(file), null);
}
/** Open a new file for data matching a schema with a random sync. */
public DataFileWriter create(Schema schema, OutputStream outs)
throws IOException {
return create(schema, outs, null);
}
/** Open a new file for data matching a schema with an explicit sync. */
public DataFileWriter create(Schema schema, OutputStream outs, byte[] sync)
throws IOException {
assertNotOpen();
this.schema = schema;
setMetaInternal(DataFileConstants.SCHEMA, schema.toString());
if (sync == null ) {
this.sync = generateSync();
} else if (sync.length == 16) {
this.sync = sync;
} else {
throw new IOException("sync must be exactly 16 bytes");
}
init(outs);
vout.writeFixed(DataFileConstants.MAGIC); // write magic
vout.writeMapStart(); // write metadata
vout.setItemCount(meta.size());
for (Map.Entry entry : meta.entrySet()) {
vout.startItem();
vout.writeString(entry.getKey());
vout.writeBytes(entry.getValue());
}
vout.writeMapEnd();
vout.writeFixed(this.sync); // write initial sync
vout.flush(); //vout may be buffered, flush before writing to out
return this;
}
/**
* Set whether this writer should flush the block to the stream every time
* a sync marker is written. By default, the writer will flush the buffer
* each time a sync marker is written (if the block size limit is reached
* or the {@linkplain #sync()} is called.
* @param flushOnEveryBlock - If set to false, this writer will not flush
* the block to the stream until {@linkplain
* #flush()} is explicitly called.
*/
public void setFlushOnEveryBlock(boolean flushOnEveryBlock) {
this.flushOnEveryBlock = flushOnEveryBlock;
}
/**
* @return - true if this writer flushes the block to the stream every time
* a sync marker is written. Else returns false.
*/
public boolean isFlushOnEveryBlock() {
return this.flushOnEveryBlock;
}
/** Open a writer appending to an existing file. */
public DataFileWriter appendTo(File file) throws IOException {
return appendTo(new SeekableFileInput(file),
new SyncableFileOutputStream(file, true));
}
/** Open a writer appending to an existing file.
* @param in reading the existing file.
* @param out positioned at the end of the existing file.
*/
public DataFileWriter appendTo(SeekableInput in, OutputStream out)
throws IOException {
assertNotOpen();
DataFileReader reader =
new DataFileReader(in, new GenericDatumReader());
this.schema = reader.getSchema();
this.sync = reader.getHeader().sync;
this.meta.putAll(reader.getHeader().meta);
byte[] codecBytes = this.meta.get(DataFileConstants.CODEC);
if (codecBytes != null) {
String strCodec = new String(codecBytes, "UTF-8");
this.codec = CodecFactory.fromString(strCodec).createInstance();
} else {
this.codec = CodecFactory.nullCodec().createInstance();
}
reader.close();
init(out);
return this;
}
private void init(OutputStream outs) throws IOException {
this.underlyingStream = outs;
this.out = new BufferedFileOutputStream(outs);
EncoderFactory efactory = new EncoderFactory();
this.vout = efactory.binaryEncoder(out, null);
dout.setSchema(schema);
buffer = new NonCopyingByteArrayOutputStream(
Math.min((int)(syncInterval * 1.25), Integer.MAX_VALUE/2 -1));
this.bufOut = efactory.binaryEncoder(buffer, null);
if (this.codec == null) {
this.codec = CodecFactory.nullCodec().createInstance();
}
this.isOpen = true;
}
private static byte[] generateSync() {
try {
MessageDigest digester = MessageDigest.getInstance("MD5");
long time = System.currentTimeMillis();
digester.update((UUID.randomUUID()+"@"+time).getBytes());
return digester.digest();
} catch (NoSuchAlgorithmException e) {
throw new RuntimeException(e);
}
}
private DataFileWriter setMetaInternal(String key, byte[] value) {
assertNotOpen();
meta.put(key, value);
return this;
}
private DataFileWriter setMetaInternal(String key, String value) {
try {
return setMetaInternal(key, value.getBytes("UTF-8"));
} catch (UnsupportedEncodingException e) {
throw new RuntimeException(e);
}
}
/** Set a metadata property. */
public DataFileWriter setMeta(String key, byte[] value) {
if (isReservedMeta(key)) {
throw new AvroRuntimeException("Cannot set reserved meta key: " + key);
}
return setMetaInternal(key, value);
}
public static boolean isReservedMeta(String key) {
return key.startsWith("avro.");
}
/** Set a metadata property. */
public DataFileWriter setMeta(String key, String value) {
try {
return setMeta(key, value.getBytes("UTF-8"));
} catch (UnsupportedEncodingException e) {
throw new RuntimeException(e);
}
}
/** Set a metadata property. */
public DataFileWriter setMeta(String key, long value) {
return setMeta(key, Long.toString(value));
}
/** Thrown by {@link #append(Object)} when an exception occurs while writing a
* datum to the buffer. When this is thrown, the file is unaltered and may
* continue to be appended to. */
public static class AppendWriteException extends RuntimeException {
public AppendWriteException(Exception e) { super(e); }
}
/** Append a datum to the file.
* @see AppendWriteException
*/
public void append(D datum) throws IOException {
assertOpen();
int usedBuffer = bufferInUse();
try {
dout.write(datum, bufOut);
} catch (IOException e) {
resetBufferTo(usedBuffer);
throw new AppendWriteException(e);
} catch (RuntimeException re) {
resetBufferTo(usedBuffer);
throw new AppendWriteException(re);
}
blockCount++;
writeIfBlockFull();
}
// if there is an error encoding, flush the encoder and then
// reset the buffer position to contain size bytes, discarding the rest.
// Otherwise the file will be corrupt with a partial record.
private void resetBufferTo(int size) throws IOException {
bufOut.flush();
byte[] data = buffer.toByteArray();
buffer.reset();
buffer.write(data, 0, size);
}
/** Expert: Append a pre-encoded datum to the file. No validation is
* performed to check that the encoding conforms to the file's schema.
* Appending non-conforming data may result in an unreadable file. */
public void appendEncoded(ByteBuffer datum) throws IOException {
assertOpen();
bufOut.writeFixed(datum);
blockCount++;
writeIfBlockFull();
}
private int bufferInUse() {
return (buffer.size() + bufOut.bytesBuffered());
}
private void writeIfBlockFull() throws IOException {
if (bufferInUse() >= syncInterval)
writeBlock();
}
/**
* Appends data from another file. otherFile must have the same schema.
* Data blocks will be copied without de-serializing data. If the codecs
* of the two files are compatible, data blocks are copied directly without
* decompression. If the codecs are not compatible, blocks from otherFile
* are uncompressed and then compressed using this file's codec.
*
* If the recompress flag is set all blocks are decompressed and then compressed
* using this file's codec. This is useful when the two files have compatible
* compression codecs but different codec options. For example, one might
* append a file compressed with deflate at compression level 1 to a file with
* deflate at compression level 7. If recompress is false, blocks
* will be copied without changing the compression level. If true, they will
* be converted to the new compression level.
* @param otherFile
* @param recompress
* @throws IOException
*/
public void appendAllFrom(DataFileStream otherFile, boolean recompress) throws IOException {
assertOpen();
// make sure other file has same schema
Schema otherSchema = otherFile.getSchema();
if (!this.schema.equals(otherSchema)) {
throw new IOException("Schema from file " + otherFile + " does not match");
}
// flush anything written so far
writeBlock();
Codec otherCodec = otherFile.resolveCodec();
DataBlock nextBlockRaw = null;
if (codec.equals(otherCodec) && !recompress) {
// copy raw bytes
while(otherFile.hasNextBlock()) {
nextBlockRaw = otherFile.nextRawBlock(nextBlockRaw);
nextBlockRaw.writeBlockTo(vout, sync);
}
} else {
while(otherFile.hasNextBlock()) {
nextBlockRaw = otherFile.nextRawBlock(nextBlockRaw);
nextBlockRaw.decompressUsing(otherCodec);
nextBlockRaw.compressUsing(codec);
nextBlockRaw.writeBlockTo(vout, sync);
}
}
}
private void writeBlock() throws IOException {
if (blockCount > 0) {
bufOut.flush();
ByteBuffer uncompressed = buffer.getByteArrayAsByteBuffer();
DataBlock block = new DataBlock(uncompressed, blockCount);
block.setFlushOnWrite(flushOnEveryBlock);
block.compressUsing(codec);
block.writeBlockTo(vout, sync);
buffer.reset();
blockCount = 0;
}
}
/** Return the current position as a value that may be passed to {@link
* DataFileReader#seek(long)}. Forces the end of the current block,
* emitting a synchronization marker. By default, this will also flush the
* block to the stream.
*
* If {@linkplain #setFlushOnEveryBlock(boolean)} is
* called with param set to false, then this method may not flush
* the block. In this case, the {@linkplain #flush()} must be called to
* flush the stream.
*/
public long sync() throws IOException {
assertOpen();
writeBlock();
return out.tell();
}
/** Calls {@linkplain #sync()} and then flushes the current state of the
* file.
*/
@Override
public void flush() throws IOException {
sync();
vout.flush();
}
/**
* If this writer was instantiated using a File or using an
* {@linkplain Syncable} instance, this method flushes all buffers for this
* writer to disk. In other cases, this method behaves exactly
* like {@linkplain #flush()}.
*
* @throws IOException
*/
public void fSync() throws IOException {
flush();
if (underlyingStream instanceof Syncable) {
((Syncable) underlyingStream).sync();
}
}
/** Flush and close the file. */
@Override
public void close() throws IOException {
if (isOpen) {
flush();
out.close();
isOpen = false;
}
}
private class BufferedFileOutputStream extends BufferedOutputStream {
private long position; // start of buffer
private class PositionFilter extends FilterOutputStream {
public PositionFilter(OutputStream out) throws IOException { super(out); }
@Override
public void write(byte[] b, int off, int len) throws IOException {
out.write(b, off, len);
position += len; // update on write
}
}
public BufferedFileOutputStream(OutputStream out) throws IOException {
super(null);
this.out = new PositionFilter(out);
}
public long tell() { return position+count; }
}
private static class NonCopyingByteArrayOutputStream extends ByteArrayOutputStream {
NonCopyingByteArrayOutputStream(int initialSize) {
super(initialSize);
}
ByteBuffer getByteArrayAsByteBuffer() {
return ByteBuffer.wrap(buf, 0, count);
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy