/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     https://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.avro.file;

import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.io.Closeable;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;

import org.apache.avro.AvroRuntimeException;
import org.apache.avro.InvalidAvroMagicException;
import org.apache.avro.Schema;
import org.apache.avro.io.BinaryEncoder;
import org.apache.avro.io.DecoderFactory;
import org.apache.avro.io.BinaryDecoder;
import org.apache.avro.io.DatumReader;

/**
 * Streaming access to files written by {@link DataFileWriter}. Use
 * {@link DataFileReader} for file-based input.
 *
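 * <p>A minimal usage sketch for reading generic records from any
 * {@code InputStream} (types from {@code org.apache.avro.generic}; the stream
 * {@code in} is assumed to be positioned at the start of an Avro data file):
 *
 * <pre>{@code
 * DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
 * try (DataFileStream<GenericRecord> records = new DataFileStream<>(in, datumReader)) {
 *   System.out.println("schema: " + records.getSchema());
 *   for (GenericRecord record : records) {
 *     System.out.println(record);
 *   }
 * }
 * }</pre>
 *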
 * @see DataFileWriter
 */
public class DataFileStream<D> implements Iterator<D>, Iterable<D>, Closeable {

  /**
   * A handle that can be used to reopen a DataFile without re-reading the header
   * of the stream.
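   *
   * <p>A sketch of reusing a header to reopen a file without re-parsing it
   * (this assumes the {@code DataFileReader(SeekableInput, DatumReader, Header)}
   * constructor of current Avro releases; {@code seekableInput} is
   * illustrative):
   *
   * <pre>{@code
   * DataFileStream.Header header = stream.getHeader();
   * try (DataFileReader<GenericRecord> reader =
   *     new DataFileReader<>(seekableInput, new GenericDatumReader<>(), header)) {
   *   // iterates as usual, skipping the header parse
   * }
   * }</pre>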
   */
  public static final class Header {
    Schema schema;
    Map<String, byte[]> meta = new HashMap<>();
    private transient List<String> metaKeyList = new ArrayList<>();
    byte[] sync = new byte[DataFileConstants.SYNC_SIZE];

    private Header() {
    }
  }

  private DatumReader<D> reader;
  private long blockSize;
  private boolean availableBlock = false;
  private Header header;

  /** Decoder on raw input stream. (Used for metadata.) */
  BinaryDecoder vin;
  /**
   * Secondary decoder, for datums. (Different from vin for block segments.)
   */
  BinaryDecoder datumIn = null;

  ByteBuffer blockBuffer;
  long blockCount; // # entries in block
  long blockRemaining; // # entries remaining in block
  byte[] syncBuffer = new byte[DataFileConstants.SYNC_SIZE];
  private Codec codec;

  /**
   * Construct a reader for an input stream. For file-based input, use
   * {@link DataFileReader}. This will buffer; wrapping with a
   * {@link java.io.BufferedInputStream} is not necessary.
   */
  public DataFileStream(InputStream in, DatumReader<D> reader) throws IOException {
    this.reader = reader;
    initialize(in);
  }

  /**
   * Create an uninitialized DataFileStream.
   */
  protected DataFileStream(DatumReader<D> reader) throws IOException {
    this.reader = reader;
  }

  /** Initialize the stream by reading from its head. */
  void initialize(InputStream in) throws IOException {
    this.header = new Header();
    this.vin = DecoderFactory.get().binaryDecoder(in, vin);
    byte[] magic = new byte[DataFileConstants.MAGIC.length];
    try {
      vin.readFixed(magic); // read magic
    } catch (IOException e) {
      throw new IOException("Not an Avro data file.", e);
    }
    if (!Arrays.equals(DataFileConstants.MAGIC, magic))
      throw new InvalidAvroMagicException("Not an Avro data file.");

    long l = vin.readMapStart(); // read metadata
    if (l > 0) {
      do {
        for (long i = 0; i < l; i++) {
          String key = vin.readString(null).toString();
          ByteBuffer value = vin.readBytes(null);
          byte[] bb = new byte[value.remaining()];
          value.get(bb);
          header.meta.put(key, bb);
          header.metaKeyList.add(key);
        }
      } while ((l = vin.mapNext()) != 0);
    }
    vin.readFixed(header.sync); // read sync

    // finalize the header
    header.metaKeyList = Collections.unmodifiableList(header.metaKeyList);
    header.schema = new Schema.Parser().setValidate(false).setValidateDefaults(false)
        .parse(getMetaString(DataFileConstants.SCHEMA));
    this.codec = resolveCodec();
    reader.setSchema(header.schema);
  }

  /** Initialize the stream without reading from it. */
  void initialize(InputStream in, Header header) throws IOException {
    this.header = header;
    this.codec = resolveCodec();
    reader.setSchema(header.schema);
  }

  Codec resolveCodec() {
    String codecStr = getMetaString(DataFileConstants.CODEC);
    if (codecStr != null) {
      return CodecFactory.fromString(codecStr).createInstance();
    } else {
      return CodecFactory.nullCodec().createInstance();
    }
  }

  /**
   * A handle that can be used to reopen this stream without rereading the head.
   */
  public Header getHeader() {
    return header;
  }

  /** Return the schema used in this file. */
  public Schema getSchema() {
    return header.schema;
  }

  /** Return the list of keys in the metadata. */
  public List<String> getMetaKeys() {
    return header.metaKeyList;
  }

  /** Return the raw bytes of a metadata property, or null if the key is absent. */
  public byte[] getMeta(String key) {
    return header.meta.get(key);
  }

  /**
   * Return the value of a metadata property as a UTF-8 string, or null if the
   * key is absent.
   *
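   * <p>For example, the compression codec name is stored under the reserved
   * key consulted by {@code resolveCodec()}:
   *
   * <pre>{@code
   * String codecName = stream.getMetaString(DataFileConstants.CODEC); // e.g. "null", "deflate"
   * }</pre>
   */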
  public String getMetaString(String key) {
    byte[] value = getMeta(key);
    if (value == null) {
      return null;
    }
    return new String(value, StandardCharsets.UTF_8);
  }

  /** Return the value of a metadata property as a long. */
  public long getMetaLong(String key) {
    return Long.parseLong(getMetaString(key));
  }

  /**
   * Returns an iterator over entries in this file. Note that this iterator is
   * shared with other users of the file: it does not contain a separate pointer
   * into the file.
   */
  @Override
  public Iterator<D> iterator() {
    return this;
  }

  private DataBlock block = null;

  /** True if more entries remain in this file. */
  @Override
  public boolean hasNext() {
    try {
      if (blockRemaining == 0) {
        // check that the previous block was finished
        if (null != datumIn) {
          boolean atEnd = datumIn.isEnd();
          if (!atEnd) {
            throw new IOException("Block read partially, the data may be corrupt");
          }
        }
        if (hasNextBlock()) {
          block = nextRawBlock(block);
          block.decompressUsing(codec);
          blockBuffer = block.getAsByteBuffer();
          datumIn = DecoderFactory.get().binaryDecoder(blockBuffer.array(),
              blockBuffer.arrayOffset() + blockBuffer.position(), blockBuffer.remaining(), datumIn);
        }
      }
      return blockRemaining != 0;
    } catch (EOFException e) { // at EOF
      return false;
    } catch (IOException e) {
      throw new AvroRuntimeException(e);
    }
  }

  /**
   * Read the next datum in the file.
   *
   * @throws NoSuchElementException if no more remain in the file.
   */
  @Override
  public D next() {
    try {
      return next(null);
    } catch (IOException e) {
      throw new AvroRuntimeException(e);
    }
  }

  /**
   * Read the next datum from the file.
   *
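   * <p>A sketch of the reuse pattern, which can cut per-record allocation
   * ({@code stream} is illustrative):
   *
   * <pre>{@code
   * GenericRecord record = null;
   * while (stream.hasNext()) {
   *   record = stream.next(record); // the prior instance's storage may be reused
   * }
   * }</pre>
   *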
   * @param reuse an instance to reuse.
   * @throws NoSuchElementException if no more remain in the file.
   */
  public D next(D reuse) throws IOException {
    if (!hasNext())
      throw new NoSuchElementException();
    D result = reader.read(reuse, datumIn);
    if (0 == --blockRemaining) {
      blockFinished();
    }
    return result;
  }

  /**
   * Expert: Return the next block in the file, as binary-encoded data.
   *
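   * <p>A sketch that tallies the decoded size of every remaining block
   * ({@code stream} is illustrative; handing out a block buffer abandons
   * per-datum iteration of that block):
   *
   * <pre>{@code
   * long totalBytes = 0;
   * while (stream.hasNext()) {
   *   totalBytes += stream.nextBlock().remaining();
   * }
   * }</pre>
   */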
  public ByteBuffer nextBlock() throws IOException {
    if (!hasNext())
      throw new NoSuchElementException();
    if (blockRemaining != blockCount)
      throw new IllegalStateException("Not at block start.");
    blockRemaining = 0;
    datumIn = null;
    return blockBuffer;
  }

  /** Expert: Return the count of items in the current block. */
  public long getBlockCount() {
    return blockCount;
  }

  /** Expert: Return the size in bytes (uncompressed) of the current block. */
  public long getBlockSize() {
    return blockSize;
  }

  protected void blockFinished() throws IOException {
    // nothing for the stream impl
  }

  boolean hasNextBlock() {
    try {
      if (availableBlock)
        return true;
      if (vin.isEnd())
        return false;
      blockRemaining = vin.readLong(); // read block count
      blockSize = vin.readLong(); // read block size
      if (blockSize > Integer.MAX_VALUE || blockSize < 0) {
        throw new IOException("Block size invalid or too large for this " + "implementation: " + blockSize);
      }
      blockCount = blockRemaining;
      availableBlock = true;
      return true;
    } catch (EOFException eof) {
      return false;
    } catch (IOException e) {
      throw new AvroRuntimeException(e);
    }
  }

  DataBlock nextRawBlock(DataBlock reuse) throws IOException {
    if (!hasNextBlock()) {
      throw new NoSuchElementException();
    }
    if (reuse == null || reuse.data.length < (int) blockSize) {
      reuse = new DataBlock(blockRemaining, (int) blockSize);
    } else {
      reuse.numEntries = blockRemaining;
      reuse.blockSize = (int) blockSize;
    }
    // throws if it can't read the size requested
    vin.readFixed(reuse.data, 0, reuse.blockSize);
    vin.readFixed(syncBuffer);
    availableBlock = false;
    if (!Arrays.equals(syncBuffer, header.sync))
      throw new IOException("Invalid sync!");
    return reuse;
  }

  /** Not supported. */
  @Override
  public void remove() {
    throw new UnsupportedOperationException();
  }

  /** Close this reader. */
  @Override
  public void close() throws IOException {
    vin.inputStream().close();
  }

  static class DataBlock {
    private byte[] data;
    private long numEntries;
    private int blockSize;
    private int offset = 0;
    private boolean flushOnWrite = true;

    private DataBlock(long numEntries, int blockSize) {
      this.data = new byte[blockSize];
      this.numEntries = numEntries;
      this.blockSize = blockSize;
    }

    DataBlock(ByteBuffer block, long numEntries) {
      this.data = block.array();
      this.blockSize = block.remaining();
      this.offset = block.arrayOffset() + block.position();
      this.numEntries = numEntries;
    }

    byte[] getData() {
      return data;
    }

    long getNumEntries() {
      return numEntries;
    }

    int getBlockSize() {
      return blockSize;
    }

    boolean isFlushOnWrite() {
      return flushOnWrite;
    }

    void setFlushOnWrite(boolean flushOnWrite) {
      this.flushOnWrite = flushOnWrite;
    }

    ByteBuffer getAsByteBuffer() {
      return ByteBuffer.wrap(data, offset, blockSize);
    }

    void decompressUsing(Codec c) throws IOException {
      ByteBuffer result = c.decompress(getAsByteBuffer());
      data = result.array();
      blockSize = result.remaining();
    }

    void compressUsing(Codec c) throws IOException {
      ByteBuffer result = c.compress(getAsByteBuffer());
      data = result.array();
      blockSize = result.remaining();
    }

    void writeBlockTo(BinaryEncoder e, byte[] sync) throws IOException {
      e.writeLong(this.numEntries);
      e.writeLong(this.blockSize);
      e.writeFixed(this.data, offset, this.blockSize);
      e.writeFixed(sync);
      if (flushOnWrite) {
        e.flush();
      }
    }

  }
}



