All Downloads are FREE. Search and download functionalities are using the official Maven repository.

parquet.schema.Types Maven / Gradle / Ivy

There is a newer version: 1.6.0
Show newest version
package parquet.schema;

import java.util.ArrayList;
import java.util.List;

import parquet.Preconditions;
import parquet.schema.PrimitiveType.PrimitiveTypeName;
import parquet.schema.Type.ID;

/**
 * This class provides fluent builders that produce Parquet schema Types.
 * 

* The most basic use is to build primitive types: *

 *   Types.required(INT64).named("id");
 *   Types.optional(INT32).named("number");
 * 
*

* The {@link #required(PrimitiveTypeName)} factory method produces a primitive * type builder, and the {@link PrimitiveBuilder#named(String)} builds the * {@link PrimitiveType}. Between {@code required} and {@code named}, other * builder methods can be used to add type annotations or other type metadata: *

 *   Types.required(BINARY).as(UTF8).named("username");
 *   Types.optional(FIXED_LEN_BYTE_ARRAY).length(20).named("sha1");
 * 
*

* Optional types are built using {@link #optional(PrimitiveTypeName)} to get * the builder. *

* Groups are built similarly, using {@code requiredGroup()} (or the optional * version) to return a group builder. Group builders provide {@code required} * and {@code optional} to add primitive types, which return primitive builders * like the versions above. *

 *   // This produces:
 *   // required group User {
 *   //   required int64 id;
 *   //   optional binary email (UTF8);
 *   // }
 *   Types.requiredGroup()
 *            .required(INT64).named("id")
 *            .required(BINARY).as(UTF8).named("email")
 *        .named("User")
 * 
*

* When {@code required} is called on a group builder, the builder it returns * will add the type to the parent group when it is built and {@code named} will * return its parent group builder (instead of the type) so more fields can be * added. *

* Sub-groups can be created using {@code requiredGroup()} to get a group * builder that will create the group type, add it to the parent builder, and * return the parent builder for more fields. *

 *   // required group User {
 *   //   required int64 id;
 *   //   optional binary email (UTF8);
 *   //   optional group address {
 *   //     required binary street (UTF8);
 *   //     required int32 zipcode;
 *   //   }
 *   // }
 *   Types.requiredGroup()
 *            .required(INT64).named("id")
 *            .required(BINARY).as(UTF8).named("email")
 *            .optionalGroup()
 *                .required(BINARY).as(UTF8).named("street")
 *                .required(INT32).named("zipcode")
 *            .named("address")
 *        .named("User")
 * 
*

* Message types are built using {@link #buildMessage()} and function just like * group builders. *

 *   // message User {
 *   //   required int64 id;
 *   //   optional binary email (UTF8);
 *   //   optional group address {
 *   //     required binary street (UTF8);
 *   //     required int32 zipcode;
 *   //   }
 *   // }
 *   Types.buildMessage()
 *            .required(INT64).named("id")
 *            .required(BINARY).as(UTF8).named("email")
 *            .optionalGroup()
 *                .required(BINARY).as(UTF8).named("street")
 *                .required(INT32).named("zipcode")
 *            .named("address")
 *        .named("User")
 * 
*

* These builders enforce consistency checks based on the specifications in * the parquet-format documentation. For example, if DECIMAL is used to annotate * a FIXED_LEN_BYTE_ARRAY that is not long enough for its maximum precision, * these builders will throw an IllegalArgumentException: *

 *   // throws IllegalArgumentException with message:
 *   // "FIXED(4) is not long enough to store 10 digits"
 *   Types.required(FIXED_LEN_BYTE_ARRAY).length(4)
 *        .as(DECIMAL).precision(10)
 *        .named("badDecimal");
 * 
*/ public class Types { private static final int NOT_SET = 0; /** * A base builder for {@link Type} objects. * * @param

The type that this builder will return from * {@link #named(String)} when the type is built. */ public abstract static class Builder { protected final P parent; protected final Class returnClass; protected Type.Repetition repetition = null; protected OriginalType originalType = null; protected Type.ID id = null; private boolean repetitionAlreadySet = false; /** * Construct a type builder that returns a "parent" object when the builder * is finished. The {@code parent} will be returned by * {@link #named(String)} so that builders can be chained. * * @param parent a non-null object to return from {@link #named(String)} */ protected Builder(P parent) { Preconditions.checkNotNull(parent, "Parent cannot be null"); this.parent = parent; this.returnClass = null; } /** * Construct a type builder that returns the {@link Type} that was built * when the builder is finished. The {@code returnClass} must be the * expected {@code Type} class. * * @param returnClass a {@code Type} to return from {@link #named(String)} */ protected Builder(Class

returnClass) { Preconditions.checkArgument(Type.class.isAssignableFrom(returnClass), "The requested return class must extend Type"); this.returnClass = returnClass; this.parent = null; } protected abstract T self(); protected final T repetition(Type.Repetition repetition) { Preconditions.checkArgument(!repetitionAlreadySet, "Repetition has already been set"); Preconditions.checkNotNull(repetition, "Repetition cannot be null"); this.repetition = repetition; this.repetitionAlreadySet = true; return self(); } /** * Adds a type annotation ({@link OriginalType}) to the type being built. *

* Type annotations are used to extend the types that parquet can store, by * specifying how the primitive types should be interpreted. This keeps the * set of primitive types to a minimum and reuses parquet's efficient * encodings. For example, strings are stored as byte arrays (binary) with * a UTF8 annotation. * * @param type an {@code OriginalType} * @return this builder for method chaining */ public T as(OriginalType type) { this.originalType = type; return self(); } /** * adds an id annotation to the type being built. *

* ids are used to capture the original id when converting from models using ids (thrift, protobufs) * * @param id the id of the field * @return this builder for method chaining */ public T id(int id) { this.id = new ID(id); return self(); } abstract protected Type build(String name); /** * Builds a {@link Type} and returns the parent builder, if given, or the * {@code Type} that was built. If returning a parent object that is a * GroupBuilder, the constructed type will be added to it as a field. *

* Note: Any configuration for this type builder should be done * before calling this method. * * @param name a name for the constructed type * @return the parent {@code GroupBuilder} or the constructed {@code Type} */ public P named(String name) { Preconditions.checkNotNull(name, "Name is required"); Preconditions.checkNotNull(repetition, "Repetition is required"); Type type = build(name); if (parent != null) { // if the parent is a GroupBuilder, add type to it if (GroupBuilder.class.isAssignableFrom(parent.getClass())) { GroupBuilder.class.cast(parent).addField(type); } return parent; } else { // no parent indicates that the Type object should be returned // the constructor check guarantees that returnClass is a Type return returnClass.cast(type); } } } /** * A builder for {@link PrimitiveType} objects. * * @param

The type that this builder will return from * {@link #named(String)} when the type is built. */ public static class PrimitiveBuilder

extends Builder, P> { private static final long MAX_PRECISION_INT32 = maxPrecision(4); private static final long MAX_PRECISION_INT64 = maxPrecision(8); private final PrimitiveTypeName primitiveType; private int length = NOT_SET; private int precision = NOT_SET; private int scale = NOT_SET; private PrimitiveBuilder(P parent, PrimitiveTypeName type) { super(parent); this.primitiveType = type; } private PrimitiveBuilder(Class

returnType, PrimitiveTypeName type) { super(returnType); this.primitiveType = type; } @Override protected PrimitiveBuilder

self() { return this; } /** * Adds the length for a FIXED_LEN_BYTE_ARRAY. * * @param length an int length * @return this builder for method chaining */ public PrimitiveBuilder

length(int length) { this.length = length; return this; } /** * Adds the precision for a DECIMAL. *

* This value is required for decimals and must be less than or equal to * the maximum number of base-10 digits in the underlying type. A 4-byte * fixed, for example, can store up to 9 base-10 digits. * * @param precision an int precision value for the DECIMAL * @return this builder for method chaining */ public PrimitiveBuilder

precision(int precision) { this.precision = precision; return this; } /** * Adds the scale for a DECIMAL. *

* This value must be less than the maximum precision of the type and must * be a positive number. If not set, the default scale is 0. *

* The scale specifies the number of digits of the underlying unscaled * that are to the right of the decimal point. The decimal interpretation * of values in this column is: {@code value*10^(-scale)}. * * @param scale an int scale value for the DECIMAL * @return this builder for method chaining */ public PrimitiveBuilder

scale(int scale) { this.scale = scale; return this; } @Override protected PrimitiveType build(String name) { if (PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY == primitiveType) { Preconditions.checkArgument(length > 0, "Invalid FIXED_LEN_BYTE_ARRAY length: " + length); } DecimalMetadata meta = decimalMetadata(); // validate type annotations and required metadata if (originalType != null) { switch (originalType) { case UTF8: case JSON: case BSON: Preconditions.checkState( primitiveType == PrimitiveTypeName.BINARY, originalType.toString() + " can only annotate binary fields"); break; case DECIMAL: Preconditions.checkState( (primitiveType == PrimitiveTypeName.INT32) || (primitiveType == PrimitiveTypeName.INT64) || (primitiveType == PrimitiveTypeName.BINARY) || (primitiveType == PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY), "DECIMAL can only annotate INT32, INT64, BINARY, and FIXED" ); if (primitiveType == PrimitiveTypeName.INT32) { Preconditions.checkState( meta.getPrecision() <= MAX_PRECISION_INT32, "INT32 cannot store " + meta.getPrecision() + " digits " + "(max " + MAX_PRECISION_INT32 + ")"); } else if (primitiveType == PrimitiveTypeName.INT64) { Preconditions.checkState( meta.getPrecision() <= MAX_PRECISION_INT64, "INT64 cannot store " + meta.getPrecision() + " digits " + "(max " + MAX_PRECISION_INT64 + ")"); } else if (primitiveType == PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) { Preconditions.checkState( meta.getPrecision() <= maxPrecision(length), "FIXED(" + length + ") cannot store " + meta.getPrecision() + " digits (max " + maxPrecision(length) + ")"); } break; case DATE: case TIME_MILLIS: case UINT_8: case UINT_16: case UINT_32: case INT_8: case INT_16: case INT_32: Preconditions.checkState(primitiveType == PrimitiveTypeName.INT32, originalType.toString() + " can only annotate INT32"); break; case TIMESTAMP_MILLIS: case UINT_64: case INT_64: Preconditions.checkState(primitiveType == PrimitiveTypeName.INT64, originalType.toString() + " can only annotate INT64"); break; case INTERVAL: Preconditions.checkState( (primitiveType == PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) && (length == 12), "INTERVAL can only annotate FIXED_LEN_BYTE_ARRAY(12)"); break; case ENUM: Preconditions.checkState( primitiveType == PrimitiveTypeName.BINARY, "ENUM can only annotate binary fields"); break; default: throw new IllegalStateException(originalType + " can not be applied to a primitive type"); } } return new PrimitiveType(repetition, primitiveType, length, name, originalType, meta, id); } private static long maxPrecision(int numBytes) { return Math.round( // convert double to long Math.floor(Math.log10( // number of base-10 digits Math.pow(2, 8 * numBytes - 1) - 1) // max value stored in numBytes ) ); } protected DecimalMetadata decimalMetadata() { DecimalMetadata meta = null; if (OriginalType.DECIMAL == originalType) { Preconditions.checkArgument(precision > 0, "Invalid DECIMAL precision: " + precision); Preconditions.checkArgument(scale >= 0, "Invalid DECIMAL scale: " + scale); Preconditions.checkArgument(scale <= precision, "Invalid DECIMAL scale: cannot be greater than precision"); meta = new DecimalMetadata(precision, scale); } return meta; } } /** * A builder for {@link GroupType} objects. * * @param

The type that this builder will return from * {@link #named(String)} when the type is built. */ public static class GroupBuilder

extends Builder, P> { protected final List fields; private GroupBuilder(P parent) { super(parent); this.fields = new ArrayList(); } private GroupBuilder(Class

returnType) { super(returnType); this.fields = new ArrayList(); } @Override protected GroupBuilder

self() { return this; } public PrimitiveBuilder> primitive( PrimitiveTypeName type, Type.Repetition repetition) { return new PrimitiveBuilder>(this, type) .repetition(repetition); } /** * Returns a {@link PrimitiveBuilder} for the required primitive type * {@code type}. * * @param type a {@link PrimitiveTypeName} * @return a primitive builder for {@code type} that will return this * builder for additional fields. */ public PrimitiveBuilder> required( PrimitiveTypeName type) { return new PrimitiveBuilder>(this, type) .repetition(Type.Repetition.REQUIRED); } /** * Returns a {@link PrimitiveBuilder} for the optional primitive type * {@code type}. * * @param type a {@link PrimitiveTypeName} * @return a primitive builder for {@code type} that will return this * builder for additional fields. */ public PrimitiveBuilder> optional( PrimitiveTypeName type) { return new PrimitiveBuilder>(this, type) .repetition(Type.Repetition.OPTIONAL); } /** * Returns a {@link PrimitiveBuilder} for the repeated primitive type * {@code type}. * * @param type a {@link PrimitiveTypeName} * @return a primitive builder for {@code type} that will return this * builder for additional fields. */ public PrimitiveBuilder> repeated( PrimitiveTypeName type) { return new PrimitiveBuilder>(this, type) .repetition(Type.Repetition.REPEATED); } public GroupBuilder> group(Type.Repetition repetition) { return new GroupBuilder>(this) .repetition(repetition); } /** * Returns a {@link GroupBuilder} to build a required sub-group. * * @return a group builder that will return this builder for additional * fields. */ public GroupBuilder> requiredGroup() { return new GroupBuilder>(this) .repetition(Type.Repetition.REQUIRED); } /** * Returns a {@link GroupBuilder} to build an optional sub-group. * * @return a group builder that will return this builder for additional * fields. */ public GroupBuilder> optionalGroup() { return new GroupBuilder>(this) .repetition(Type.Repetition.OPTIONAL); } /** * Returns a {@link GroupBuilder} to build a repeated sub-group. * * @return a group builder that will return this builder for additional * fields. */ public GroupBuilder> repeatedGroup() { return new GroupBuilder>(this) .repetition(Type.Repetition.REPEATED); } /** * Adds {@code type} as a sub-field to the group configured by this builder. * * @return this builder for additional fields. */ public GroupBuilder

addField(Type type) { fields.add(type); return this; } /** * Adds {@code types} as sub-fields of the group configured by this builder. * * @return this builder for additional fields. */ public GroupBuilder

addFields(Type... types) { for (Type type : types) { fields.add(type); } return this; } @Override protected GroupType build(String name) { Preconditions.checkState(!fields.isEmpty(), "Cannot build an empty group"); return new GroupType(repetition, name, originalType, fields, id); } } public static class MessageTypeBuilder extends GroupBuilder { private MessageTypeBuilder() { super(MessageType.class); repetition(Type.Repetition.REQUIRED); } /** * Builds and returns the {@link MessageType} configured by this builder. *

* Note: All primitive types and sub-groups should be added before * calling this method. * * @param name a name for the constructed type * @return the final {@code MessageType} configured by this builder. */ @Override public MessageType named(String name) { Preconditions.checkNotNull(name, "Name is required"); return new MessageType(name, fields); } } /** * Returns a builder to construct a {@link MessageType}. * * @return a {@link MessageTypeBuilder} */ public static MessageTypeBuilder buildMessage() { return new MessageTypeBuilder(); } public static GroupBuilder buildGroup( Type.Repetition repetition) { return new GroupBuilder(GroupType.class).repetition(repetition); } /** * Returns a builder to construct a required {@link GroupType}. * * @return a {@link GroupBuilder} */ public static GroupBuilder requiredGroup() { return new GroupBuilder(GroupType.class) .repetition(Type.Repetition.REQUIRED); } /** * Returns a builder to construct an optional {@link GroupType}. * * @return a {@link GroupBuilder} */ public static GroupBuilder optionalGroup() { return new GroupBuilder(GroupType.class) .repetition(Type.Repetition.OPTIONAL); } /** * Returns a builder to construct a repeated {@link GroupType}. * * @return a {@link GroupBuilder} */ public static GroupBuilder repeatedGroup() { return new GroupBuilder(GroupType.class) .repetition(Type.Repetition.REPEATED); } public static PrimitiveBuilder primitive( PrimitiveTypeName type, Type.Repetition repetition) { return new PrimitiveBuilder(PrimitiveType.class, type) .repetition(repetition); } /** * Returns a builder to construct a required {@link PrimitiveType}. * * @param type a {@link PrimitiveTypeName} for the constructed type * @return a {@link PrimitiveBuilder} */ public static PrimitiveBuilder required( PrimitiveTypeName type) { return new PrimitiveBuilder(PrimitiveType.class, type) .repetition(Type.Repetition.REQUIRED); } /** * Returns a builder to construct an optional {@link PrimitiveType}. * * @param type a {@link PrimitiveTypeName} for the constructed type * @return a {@link PrimitiveBuilder} */ public static PrimitiveBuilder optional( PrimitiveTypeName type) { return new PrimitiveBuilder(PrimitiveType.class, type) .repetition(Type.Repetition.OPTIONAL); } /** * Returns a builder to construct a repeated {@link PrimitiveType}. * * @param type a {@link PrimitiveTypeName} for the constructed type * @return a {@link PrimitiveBuilder} */ public static PrimitiveBuilder repeated( PrimitiveTypeName type) { return new PrimitiveBuilder(PrimitiveType.class, type) .repetition(Type.Repetition.REPEATED); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy