com.amazonaws.athena.connector.lambda.data.BlockUtils Maven / Gradle / Ivy

Go to download
package com.amazonaws.athena.connector.lambda.data;

/*-
 * #%L
 * Amazon Athena Query Federation SDK
 * %%
 * Copyright (C) 2019 Amazon Web Services
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */

import org.apache.arrow.memory.ArrowBuf;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.util.VisibleForTesting;
import org.apache.arrow.vector.BigIntVector;
import org.apache.arrow.vector.BitVector;
import org.apache.arrow.vector.DateDayVector;
import org.apache.arrow.vector.DateMilliVector;
import org.apache.arrow.vector.DecimalVector;
import org.apache.arrow.vector.FieldVector;
import org.apache.arrow.vector.Float4Vector;
import org.apache.arrow.vector.Float8Vector;
import org.apache.arrow.vector.IntVector;
import org.apache.arrow.vector.SmallIntVector;
import org.apache.arrow.vector.TimeStampMicroTZVector;
import org.apache.arrow.vector.TimeStampMilliTZVector;
import org.apache.arrow.vector.TinyIntVector;
import org.apache.arrow.vector.UInt1Vector;
import org.apache.arrow.vector.UInt2Vector;
import org.apache.arrow.vector.UInt4Vector;
import org.apache.arrow.vector.UInt8Vector;
import org.apache.arrow.vector.VarBinaryVector;
import org.apache.arrow.vector.VarCharVector;
import org.apache.arrow.vector.complex.ListVector;
import org.apache.arrow.vector.complex.MapVector;
import org.apache.arrow.vector.complex.StructVector;
import org.apache.arrow.vector.complex.impl.UnionListWriter;
import org.apache.arrow.vector.complex.reader.FieldReader;
import org.apache.arrow.vector.complex.writer.BaseWriter.MapWriter;
import org.apache.arrow.vector.complex.writer.BaseWriter.StructWriter;
import org.apache.arrow.vector.complex.writer.BigIntWriter;
import org.apache.arrow.vector.complex.writer.BitWriter;
import org.apache.arrow.vector.complex.writer.DateDayWriter;
import org.apache.arrow.vector.complex.writer.DateMilliWriter;
import org.apache.arrow.vector.complex.writer.DecimalWriter;
import org.apache.arrow.vector.complex.writer.FieldWriter;
import org.apache.arrow.vector.complex.writer.Float4Writer;
import org.apache.arrow.vector.complex.writer.Float8Writer;
import org.apache.arrow.vector.complex.writer.IntWriter;
import org.apache.arrow.vector.complex.writer.SmallIntWriter;
import org.apache.arrow.vector.complex.writer.TimeStampMicroTZWriter;
import org.apache.arrow.vector.complex.writer.TimeStampMilliTZWriter;
import org.apache.arrow.vector.complex.writer.TinyIntWriter;
import org.apache.arrow.vector.complex.writer.UInt1Writer;
import org.apache.arrow.vector.complex.writer.UInt2Writer;
import org.apache.arrow.vector.complex.writer.UInt4Writer;
import org.apache.arrow.vector.complex.writer.UInt8Writer;
import org.apache.arrow.vector.complex.writer.VarBinaryWriter;
import org.apache.arrow.vector.complex.writer.VarCharWriter;
import org.apache.arrow.vector.types.Types;
import org.apache.arrow.vector.types.pojo.ArrowType;
import org.apache.arrow.vector.types.pojo.Field;
import org.apache.arrow.vector.types.pojo.Schema;
import org.apache.arrow.vector.util.Text;
import org.apache.commons.codec.Charsets;

import java.math.BigDecimal;
import java.math.RoundingMode;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.ZoneId;
import java.time.ZonedDateTime;
import java.util.Arrays;
import java.util.Collection;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

/**
 * This utility class abstracts many facets of reading and writing values into Apache Arrow's FieldReader and FieldVector
 * objects.
 *
 * @note This class encourages a row wise approach to writing results. These interfaces are often viewed as simpler than
 * the would be columnar equivalents. Even though many of the systems that we've integrated with using this SDK do not
 * themselves support columnar access patterns, there is value in offering a a variant of these mechanisms that provide
 * the skeleton for columnar writing/reading of results.
 * 
 * The current SDK version takes the approach that experts can drop into 'native' Apache Arrow mode and simply not use
 * this abstraction. This approach of making common things easy and still enabling access to a 'power user' mode is
 * one we'd like to stick with but we'd also like to make it easier for customers that can/want a more columnar
 * experience/performance to be able to do so more easily.
 * 

 * In general the abstractions provided by this utility class also come with a performance hit when compared with native,
 * columnar, Apache Arrow access patterns. The performance overhead primarily results from Object overhead related to boxing
 * and/or type conversion. The second source of overhead is the constant lookup and branching of field types, vectors, readers,
 * etc.. Some of this second category of overhead can be mitigated by being mindful of how you use this class but a more
 * ideal solution would be to offer an interface that steers you in a better direction.
 * 

 * An issue has been opened to track the creation of a columnar variant of this utility:
 * https://github.com/awslabs/aws-athena-query-federation/issues/1
 */
public class BlockUtils
{
    public static final ZoneId UTC_ZONE_ID = ZoneId.of("UTC");

    /**
     * Creates a new Block with a single column and populated with the provided values.
     *
     * @param allocator The BlockAllocator to use when creating the Block.
     * @param columnName The name of the single column in the Block's Schema.
     * @param type The Apache Arrow Type of the column.
     * @param values The values to write to the new Block. Each value will be its own row.
     * @return The newly created Block with a single column Schema at populated with the provided values.
     */
    public static Block newBlock(BlockAllocator allocator, String columnName, ArrowType type, Object... values)
    {
        return newBlock(allocator, columnName, type, Arrays.asList(values));
    }

    /**
     * Creates a new Block with a single column and populated with the provided values.
     *
     * @param allocator The BlockAllocator to use when creating the Block.
     * @param columnName The name of the single column in the Block's Schema.
     * @param type The Apache Arrow Type of the column.
     * @param values The values to write to the new Block. Each value will be its own row.
     * @return The newly created Block with a single column Schema at populated with the provided values.
     */
    public static Block newBlock(BlockAllocator allocator, String columnName, ArrowType type, Collection