All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.avro.io.BinaryData Maven / Gradle / Ivy

The newest version!
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.avro.io;

import java.io.IOException;

import org.apache.avro.Schema;
import org.apache.avro.Schema.Field;
import org.apache.avro.AvroRuntimeException;
import org.apache.avro.generic.GenericDatumReader;

/** Utilities for binary-encoded data. */
public class BinaryData {

  private BinaryData() {}                      // private: every member is static, so no instances are needed

  /** Holds the two reusable decoders used by a comparison, one per operand. */
  private static class Decoders {
    private final BinaryDecoder d1;
    private final BinaryDecoder d2;

    /** Creates both decoders over empty, zero-length buffers. */
    public Decoders() {
      d1 = new BinaryDecoder(new byte[0], 0, 0);
      d2 = new BinaryDecoder(new byte[0], 0, 0);
    }

    /** Hands the first (buffer, offset, length) triple to {@code d1.setBuf}
     * and the second triple to {@code d2.setBuf}. */
    public void set(byte[] firstBuf, int firstOff, int firstLen,
                    byte[] secondBuf, int secondOff, int secondLen) {
      d1.setBuf(firstBuf, firstOff, firstLen);
      d2.setBuf(secondBuf, secondOff, secondLen);
    }

    /** Calls {@code clearBuf()} on both decoders. */
    public void clear() {
      d1.clearBuf();
      d2.clearBuf();
    }
  }

  /** Per-thread {@link Decoders} pair, so that comparisons can reuse decoder
   * instances without sharing them between threads.  The type argument is
   * required: with a raw {@code ThreadLocal}, {@code get()} returns
   * {@code Object} and cannot be assigned to a {@code Decoders} variable. */
  private static final ThreadLocal<Decoders> DECODERS
    = new ThreadLocal<Decoders>() {
    @Override protected Decoders initialValue() { return new Decoders(); }
  };

  /** Compare binary encoded data, where each datum is taken to extend from
   * its start offset to the end of its array.  Returns zero if equal, a
   * positive value if the first datum is greater, and a negative value if it
   * is less.  Order is consistent with that of {@link
   * org.apache.avro.generic.GenericData#compare(Object, Object, Schema)}.
   * @param b1 array holding the first datum
   * @param s1 offset in {@code b1} at which the first datum starts
   * @param b2 array holding the second datum
   * @param s2 offset in {@code b2} at which the second datum starts
   * @param schema schema both datums were written with */
  public static int compare(byte[] b1, int s1,
                            byte[] b2, int s2,
                            Schema schema) {
    final int remaining1 = b1.length - s1;
    final int remaining2 = b2.length - s2;
    return compare(b1, s1, remaining1, b2, s2, remaining2, schema);
  }

  /** Compare binary encoded data held in the given array regions.  Returns
   * zero if equal, a positive value if the first datum is greater, and a
   * negative value if it is less.  Order is consistent with that of {@link
   * org.apache.avro.generic.GenericData#compare(Object, Object, Schema)}.
   * @param b1 array holding the first datum
   * @param s1 offset in {@code b1} at which the first datum starts
   * @param l1 number of bytes of {@code b1} available to the first datum
   * @param b2 array holding the second datum
   * @param s2 offset in {@code b2} at which the second datum starts
   * @param l2 number of bytes of {@code b2} available to the second datum
   * @param schema schema both datums were written with
   * @throws AvroRuntimeException wrapping any IOException raised while decoding */
  public static int compare(byte[] b1, int s1, int l1,
                            byte[] b2, int s2, int l2,
                            Schema schema) {
    final Decoders pair = DECODERS.get();
    pair.set(b1, s1, l1, b2, s2, l2);
    try {
      return compare(pair, schema);
    } catch (IOException ioe) {
      throw new AvroRuntimeException(ioe);
    } finally {
      // reset the per-thread decoders whether or not the comparison succeeded
      pair.clear();
    }
  }

  /** Compare the next datum of each decoder according to {@code schema},
   * advancing both decoders past the bytes that were read.  Returns zero if
   * the datums are equal, a positive value if the first is greater and a
   * negative value if it is less.  Record fields whose order is IGNORE are
   * skipped on both sides, and the result is negated for fields whose order
   * is DESCENDING.
   * @throws AvroRuntimeException for map schemas and for unexpected schema types
   * @throws IOException if reading from either decoder fails */
  private static int compare(Decoders d, Schema schema) throws IOException {
    Decoder d1 = d.d1; Decoder d2 = d.d2;
    switch (schema.getType()) {
    case RECORD: {
      for (Field field : schema.getFields()) {
        if (field.order() == Field.Order.IGNORE) {
          // ignored fields don't influence the order but must still be consumed
          GenericDatumReader.skip(field.schema(), d1);
          GenericDatumReader.skip(field.schema(), d2);
          continue;
        }
        int c = compare(d, field.schema());
        if (c != 0)
          return (field.order() != Field.Order.DESCENDING) ? c : -c;
      }
      return 0;
    }
    case ENUM: case INT: {
      int i1 = d1.readInt();
      int i2 = d2.readInt();
      return i1 == i2 ? 0 : (i1 > i2 ? 1 : -1);
    }
    case LONG: {
      long l1 = d1.readLong();
      long l2 = d2.readLong();
      return l1 == l2 ? 0 : (l1 > l2 ? 1 : -1);
    }
    case ARRAY: {
      long i = 0;                                 // position in array
      long r1 = 0, r2 = 0;                        // remaining in current block
      long l1 = 0, l2 = 0;                        // total array length
      while (true) {
        if (r1 == 0) {                            // refill blocks(s)
          r1 = d1.readLong();
          // a negative count is followed by one more long, which is read and discarded
          if (r1 < 0) { r1 = -r1; d1.readLong(); }
          l1 += r1;
        }
        if (r2 == 0) {
          r2 = d2.readLong();
          if (r2 < 0) { r2 = -r2; d2.readLong(); }
          l2 += r2;
        }
        if (r1 == 0 || r2 == 0)                   // empty block: done
          return (l1 == l2) ? 0 : ((l1 > l2) ? 1 : -1);
        long l = Math.min(l1, l2);
        while (i < l) {                           // compare to end of block
          int c = compare(d, schema.getElementType());
          if (c != 0) return c;
          i++; r1--; r2--;
        }
      }
    }
    case MAP:
      throw new AvroRuntimeException("Can't compare maps!");
    case UNION: {
      int i1 = d1.readInt();
      int i2 = d2.readInt();
      if (i1 == i2) {
        return compare(d, schema.getTypes().get(i1));
      } else {
        return i1 - i2;                           // different branches: order by branch index
      }
    }
    case FIXED: {
      int size = schema.getFixedSize();
      int c = compareBytes(d.d1.getBuf(), d.d1.getPos(), size,
                           d.d2.getBuf(), d.d2.getPos(), size);
      d.d1.skipFixed(size);
      d.d2.skipFixed(size);
      return c;
    }
    case STRING: case BYTES: {
      int l1 = d1.readInt();
      int l2 = d2.readInt();
      int c = compareBytes(d.d1.getBuf(), d.d1.getPos(), l1,
                           d.d2.getBuf(), d.d2.getPos(), l2);
      d.d1.skipFixed(l1);
      d.d2.skipFixed(l2);
      return c;
    }
    case FLOAT:
      // Float.compare gives a total order: with == and >, a NaN operand made
      // both compare(a, b) and compare(b, a) report "less".
      // NOTE(review): this also orders -0.0 before 0.0; presumably that matches
      // GenericData.compare on boxed values -- confirm.
      return Float.compare(d1.readFloat(), d2.readFloat());
    case DOUBLE:
      // same reasoning as FLOAT
      return Double.compare(d1.readDouble(), d2.readDouble());
    case BOOLEAN: {
      boolean b1 = d1.readBoolean();
      boolean b2 = d2.readBoolean();
      return (b1 == b2) ? 0 : (b1 ? 1 : -1);      // true sorts after false
    }
    case NULL:
      return 0;
    default:
      throw new AvroRuntimeException("Unexpected schema to compare!");
    }
  }

  /** Lexicographically compare two byte ranges, treating each byte as an
   * unsigned value.  Returns zero if equal, a positive value if the first
   * range is greater, and a negative value if it is less.  When one range is
   * a prefix of the other, the result is the difference of the lengths.
   * @param b1 array holding the first range
   * @param s1 start offset of the first range
   * @param l1 length of the first range
   * @param b2 array holding the second range
   * @param s2 start offset of the second range
   * @param l2 length of the second range */
  public static int compareBytes(byte[] b1, int s1, int l1,
                                 byte[] b2, int s2, int l2) {
    final int limit1 = s1 + l1;
    final int limit2 = s2 + l2;
    int p = s1;
    int q = s2;
    while (p < limit1 && q < limit2) {
      final int diff = (b1[p++] & 0xff) - (b2[q++] & 0xff);
      if (diff != 0) {
        return diff;
      }
    }
    return l1 - l2;
  }

  /** Holds the reusable decoder used while hashing binary-encoded data. */
  private static class HashData {
    private final BinaryDecoder decoder;

    /** Creates the decoder over an empty, zero-length buffer. */
    public HashData() {
      decoder = new BinaryDecoder(new byte[0], 0, 0);
    }

    /** Hands the given (buffer, offset, length) triple to the decoder's
     * {@code setBuf}. */
    public void set(byte[] bytes, int start, int len) {
      decoder.setBuf(bytes, start, len);
    }
  }

  /** Per-thread {@link HashData}, so that hashing can reuse a decoder
   * instance without sharing it between threads.  The type argument is
   * required: with a raw {@code ThreadLocal}, {@code get()} returns
   * {@code Object} and cannot be assigned to a {@code HashData} variable. */
  private static final ThreadLocal<HashData> HASH_DATA
    = new ThreadLocal<HashData>() {
    @Override protected HashData initialValue() { return new HashData(); }
  };

  /** Hash binary encoded data. Consistent with {@link
   * org.apache.avro.generic.GenericData#hashCode(Object, Schema)}.
   * @param bytes array holding the encoded datum
   * @param start offset in {@code bytes} at which the datum starts
   * @param length number of bytes available to the datum
   * @param schema schema the datum was written with
   * @return the hash code computed from the encoded datum
   * @throws AvroRuntimeException wrapping any IOException raised while
   * decoding; also thrown for map schemas and unexpected schema types */
  public static int hashCode(byte[] bytes, int start, int length,
                             Schema schema) {
    HashData data = HASH_DATA.get();
    data.set(bytes, start, length);
    try {
      return hashCode(data, schema);
    } catch (IOException e) {
      throw new AvroRuntimeException(e);
    } finally {
      // Mirror compare(): reset the per-thread decoder once done, so the
      // thread-local does not stay pointed at the caller's array
      // (presumably clearBuf() drops the buffer reference -- confirm).
      data.decoder.clearBuf();
    }
  }

  /** Hash the next datum read from {@code data}'s decoder according to
   * {@code schema}, advancing the decoder past it.  Records and arrays
   * combine their parts as {@code 31 * hash + partHash}, starting from 1;
   * record fields whose order is IGNORE are skipped and do not contribute.
   * @throws AvroRuntimeException for map schemas and for unexpected schema types
   * @throws IOException if reading from the decoder fails */
  private static int hashCode(HashData data, Schema schema)
    throws IOException {
    final Decoder in = data.decoder;
    switch (schema.getType()) {
    case RECORD: {
      int result = 1;
      for (Field f : schema.getFields()) {
        if (f.order() != Field.Order.IGNORE) {
          result = 31 * result + hashCode(data, f.schema());
        } else {
          GenericDatumReader.skip(f.schema(), in);
        }
      }
      return result;
    }
    case ENUM: case INT:
      return in.readInt();
    case FLOAT: {
      final float value = in.readFloat();
      return Float.floatToIntBits(value);
    }
    case LONG: {
      final long value = in.readLong();
      return (int) (value ^ (value >>> 32));      // fold high word into low word
    }
    case DOUBLE: {
      final long rawBits = Double.doubleToLongBits(in.readDouble());
      return (int) (rawBits ^ (rawBits >>> 32));
    }
    case ARRAY: {
      final Schema itemSchema = schema.getElementType();
      int result = 1;
      long blockSize = in.readArrayStart();
      while (blockSize != 0) {                    // a zero-sized block ends the array
        for (long k = 0; k < blockSize; k++) {
          result = 31 * result + hashCode(data, itemSchema);
        }
        blockSize = in.arrayNext();
      }
      return result;
    }
    case MAP:
      throw new AvroRuntimeException("Can't hashCode maps!");
    case UNION:
      // the branch index read from the data selects the schema to hash with
      return hashCode(data, schema.getTypes().get(in.readInt()));
    case FIXED: {
      final int size = schema.getFixedSize();
      return hashBytes(1, data, size, false);
    }
    case STRING: {
      final int byteCount = in.readInt();
      return hashBytes(0, data, byteCount, false);
    }
    case BYTES: {
      final int byteCount = in.readInt();
      return hashBytes(1, data, byteCount, true);
    }
    case BOOLEAN: {
      if (in.readBoolean()) {
        return 1231;
      }
      return 1237;
    }
    case NULL:
      return 0;
    default:
      throw new AvroRuntimeException("Unexpected schema to hashCode!");
    }
  }

  /** Hash {@code len} bytes starting at the decoder's current position,
   * folding each signed byte in as {@code 31 * hash + b} starting from
   * {@code init}, then skip the decoder past those bytes.
   * @param init initial hash value
   * @param data holder of the decoder whose buffer is hashed
   * @param len number of bytes to hash
   * @param rev if true, visit the bytes from last to first
   * @throws IOException if skipping the bytes fails */
  private static int hashBytes(int init, HashData data, int len, boolean rev)
    throws IOException {
    final byte[] buf = data.decoder.getBuf();
    final int from = data.decoder.getPos();
    final int to = from + len;
    int hash = init;
    if (rev) {
      int i = to - 1;
      while (i >= from) {
        hash = 31 * hash + buf[i];
        i--;
      }
    } else {
      int i = from;
      while (i < to) {
        hash = 31 * hash + buf[i];
        i++;
      }
    }
    data.decoder.skipFixed(len);
    return hash;
  }

  /** Skip a binary-encoded long, returning the position after it.  Bytes
   * are consumed up to and including the first one whose high bit (0x80) is
   * clear.
   * @param bytes array holding the encoded long
   * @param start position of the long's first byte
   * @return the index just past the long's last byte */
  public static int skipLong(byte[] bytes, int start) {
    int next = start;
    // every byte except the last carries the 0x80 continuation flag
    while ((bytes[next++] & 0x80) != 0) {
      // keep scanning
    }
    return next;
  }

  /** Encode a boolean as a single byte (1 for true, 0 for false) at the
   * given position of the byte array.  Will throw IndexOutOfBounds if the
   * position is not valid.
   * @param b the value to encode
   * @param buf the destination array
   * @param pos the index in {@code buf} to write at
   * @return The number of bytes written to the buffer, 1.
   */
  public static int encodeBoolean(boolean b, byte[] buf, int pos) {
    final int encoded = b ? 1 : 0;
    buf[pos] = (byte) encoded;
    return 1;
  }

  /** Encode an integer to the byte array at the given position, using
   * zig-zag sign folding followed by a variable-length encoding of seven
   * bits per byte, low-order group first.  Will throw IndexOutOfBounds if it
   * overflows. Users should ensure that there are at least 5 bytes left in
   * the buffer before calling this method.
   * @param n the value to encode
   * @param buf the destination array
   * @param pos the index in {@code buf} of the first byte to write
   * @return The number of bytes written to the buffer, between 1 and 5.
   */
  public static int encodeInt(int n, byte[] buf, int pos) {
    // move sign to low-order bit, and flip others if negative
    int zigzag = (n << 1) ^ (n >> 31);
    int cursor = pos;
    // while more than seven significant bits remain, emit the low seven
    // with the 0x80 continuation flag set
    while ((zigzag & ~0x7F) != 0) {
      buf[cursor++] = (byte) ((zigzag & 0x7F) | 0x80);
      zigzag >>>= 7;
    }
    buf[cursor++] = (byte) zigzag;                // final byte, flag clear
    return cursor - pos;
  }

  /** Encode a long to the byte array at the given position, using zig-zag
   * sign folding followed by a variable-length encoding of seven bits per
   * byte, low-order group first.  Will throw IndexOutOfBounds if it
   * overflows. Users should ensure that there are at least 10 bytes left in
   * the buffer before calling this method.
   * @param n the value to encode
   * @param buf the destination array
   * @param pos the index in {@code buf} of the first byte to write
   * @return The number of bytes written to the buffer, between 1 and 10.
   */
  public static int encodeLong(long n, byte[] buf, int pos) {
    // move sign to low-order bit, and flip others if negative
    long zigzag = (n << 1) ^ (n >> 63);
    int cursor = pos;
    // while more than seven significant bits remain, emit the low seven
    // with the 0x80 continuation flag set
    while ((zigzag & ~0x7FL) != 0) {
      buf[cursor++] = (byte) ((zigzag & 0x7F) | 0x80);
      zigzag >>>= 7;
    }
    buf[cursor++] = (byte) zigzag;                // final byte, flag clear
    return cursor - pos;
  }

  /** Encode a float to the byte array at the given position as the four
   * bytes of its raw IEEE 754 bit pattern, least significant byte first.
   * Will throw IndexOutOfBounds if it overflows. Users should ensure that
   * there are at least 4 bytes left in the buffer before calling this method.
   * @param f the value to encode
   * @param buf the destination array
   * @param pos the index in {@code buf} of the first byte to write
   * @return Returns the number of bytes written to the buffer, 4.
   */
  public static int encodeFloat(float f, byte[] buf, int pos) {
    final int raw = Float.floatToRawIntBits(f);
    // the narrowing cast keeps only the low eight bits of each shifted value
    buf[pos]     = (byte) raw;
    buf[pos + 1] = (byte) (raw >>> 8);
    buf[pos + 2] = (byte) (raw >>> 16);
    buf[pos + 3] = (byte) (raw >>> 24);
    return 4;
  }

  /** Encode a double to the byte array at the given position as the eight
   * bytes of its raw IEEE 754 bit pattern, least significant byte first.
   * Will throw IndexOutOfBounds if it overflows. Users should ensure that
   * there are at least 8 bytes left in the buffer before calling this method.
   * @param d the value to encode
   * @param buf the destination array
   * @param pos the index in {@code buf} of the first byte to write
   * @return Returns the number of bytes written to the buffer, 8.
   */
  public static int encodeDouble(double d, byte[] buf, int pos) {
    final long raw = Double.doubleToRawLongBits(d);
    final int lowWord = (int) raw;                // bits 0..31
    final int highWord = (int) (raw >>> 32);      // bits 32..63
    // The stores are deliberately interleaved rather than sequential; the
    // order below is kept exactly as originally tuned.
    buf[pos]     = (byte) lowWord;
    buf[pos + 4] = (byte) highWord;
    buf[pos + 5] = (byte) (highWord >>> 8);
    buf[pos + 1] = (byte) (lowWord >>> 8);
    buf[pos + 2] = (byte) (lowWord >>> 16);
    buf[pos + 6] = (byte) (highWord >>> 16);
    buf[pos + 7] = (byte) (highWord >>> 24);
    buf[pos + 3] = (byte) (lowWord >>> 24);
    return 8;
  }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy