// com.amazonaws.athena.connector.lambda.data.Block — from artifact aws-athena-federation-sdk (Maven / Gradle / Ivy)
package com.amazonaws.athena.connector.lambda.data;
/*-
* #%L
* Amazon Athena Query Federation SDK
* %%
* Copyright (C) 2019 Amazon Web Services
* %%
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* #L%
*/
import com.amazonaws.athena.connector.lambda.domain.predicate.ConstraintEvaluator;
import com.google.common.base.MoreObjects;
import org.apache.arrow.vector.FieldVector;
import org.apache.arrow.vector.VectorLoader;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.arrow.vector.VectorUnloader;
import org.apache.arrow.vector.complex.reader.FieldReader;
import org.apache.arrow.vector.ipc.message.ArrowRecordBatch;
import org.apache.arrow.vector.types.Types;
import org.apache.arrow.vector.types.pojo.Field;
import org.apache.arrow.vector.types.pojo.Schema;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.beans.Transient;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import static com.amazonaws.athena.connector.lambda.data.BlockUtils.fieldToString;
import static java.util.Objects.requireNonNull;
/**
* This class is used to provide a convenient interface for working (reading/writing) Apache Arrow Batches. As such
* this class is mostly a holder for an Apache Arrow Schema and the associated VectorSchema (used for read/write).
* The class also includes helper functions for easily loading/unloading data in the form of Arrow Batches.
*
* @note While using this class as a holder to encapsulate nuances of Apache Arrow can simplify your programming model
* and make it easier to get started, using setValue(...), setComplexValue(...), and any of the related helpers to
* write data to the Apache Arrow structures is less performant than using Apache Arrow's native interfaces. If your usecase
* and source data can be read in a columnar fashion you can achieve significantly (50% - 200%) better performance by
* avoiding setValue(...) and setComplexValue(...). In our testing conversion to Apache Arrow was not a significant
* bottleneck and instead represented extra latency which could be hidden through parallelism and pipelining. This is why
* we opted to offer these convenience methods.
*
* Remember to always close your Block(s) when you are done with them. If you are using a BlockAllocator it is still
* recommended that you close() Blocks explicitly wherever possible vs. depending on BlockAllocator.close() to free
* resources. Closing Blocks earlier will reduce peak memory demands and reduce the chance that you exhaust your Apache
* Arrow memory pool.
*/
public class Block
        extends SchemaAware
        implements AutoCloseable
{
    private static final Logger logger = LoggerFactory.getLogger(Block.class);

    //Used to identify which BlockAllocator owns the underlying memory resources used in this Block, for debugging
    //purposes only. Not included in equality or hashcode.
    private final String allocatorId;

    //The schema of the block.
    private final Schema schema;

    //The VectorSchemaRoot which can be used to read/write values to/from the underlying Apache Arrow buffers that
    //back the Arrow Batch of rows.
    private final VectorSchemaRoot vectorSchema;

    //Used to constrain writes to the block; by default we use an emptyEvaluator that allows all writes.
    //Note that we will _NOT_ close this ConstraintEvaluator because we may not own it and the emptyEvaluator
    //has no resources that could leak.
    private ConstraintEvaluator constraintEvaluator = ConstraintEvaluator.emptyEvaluator();

    /**
     * Used by a BlockAllocator to construct a block by setting the key values that a Block 'holds'. Most of the meaningful
     * construction actually takes place within the BlockAllocator that calls this constructor.
     *
     * @param allocatorId Identifier of the BlockAllocator that owns the Block's memory resources.
     * @param schema The schema of the data that can be read/written to the provided VectorSchema.
     * @param vectorSchema Used to read/write values from the Apache Arrow memory buffers owned by this object.
     * @throws NullPointerException If any argument is null.
     */
    public Block(String allocatorId, Schema schema, VectorSchemaRoot vectorSchema)
    {
        requireNonNull(allocatorId, "allocatorId is null");
        requireNonNull(schema, "schema is null");
        requireNonNull(vectorSchema, "vectorSchema is null");
        this.allocatorId = allocatorId;
        this.schema = schema;
        this.vectorSchema = vectorSchema;
    }

    /**
     * Used to constrain writes to the Block.
     *
     * @param constraintEvaluator The ConstraintEvaluator to use when checking if we should allow a value to be
     * written to the Block.
     * @note Setting the ConstraintEvaluator to null disables constraints (an empty, allow-all evaluator is used).
     */
    public void constrain(ConstraintEvaluator constraintEvaluator)
    {
        this.constraintEvaluator = (constraintEvaluator != null) ? constraintEvaluator : ConstraintEvaluator.emptyEvaluator();
    }

    /**
     * Returns the ConstraintEvaluator used by the block.
     */
    public ConstraintEvaluator getConstraintEvaluator()
    {
        return constraintEvaluator;
    }

    /**
     * Returns the identifier of the BlockAllocator that owns this Block's memory resources.
     */
    public String getAllocatorId()
    {
        return allocatorId;
    }

    /**
     * Returns the Schema describing the fields of this Block.
     */
    public Schema getSchema()
    {
        return schema;
    }

    /**
     * Writes the provided value to the specified field on the specified row. This method does _not_ update the
     * row count on the underlying Apache Arrow VectorSchema. You must call setRowCount(...) to ensure the values
     * you have written are considered 'valid rows' and thus available when you attempt to serialize this Block. This
     * method relies on BlockUtils' field conversion/coercion logic to convert the provided value into a type that
     * matches Apache Arrow's supported serialization format. For more details on coercion please see @BlockUtils
     *
     * @param fieldName The name of the field you wish to write to.
     * @param row The row number to write to. Note that Apache Arrow Blocks begin with row 0 just like a typical array.
     * @param value The value you wish to write.
     * @return True if the value was written to the Block, False if the value was not written due to failing a constraint.
     * @note This method will throw an NPE if you call it with a non-existent field. You can use offerValue(...)
     * to ignore non-existent fields. This can be useful when you are writing results and want to avoid checking
     * if a field has been requested. One such example is when a query projects only a subset of columns and your
     * underlying data store is not columnar.
     */
    public boolean setValue(String fieldName, int row, Object value)
    {
        if (constraintEvaluator.apply(fieldName, value)) {
            BlockUtils.setValue(getFieldVector(fieldName), row, value);
            return true;
        }
        return false;
    }

    /**
     * Attempts to write the provided value to the specified field on the specified row. This method does _not_ update the
     * row count on the underlying Apache Arrow VectorSchema. You must call setRowCount(...) to ensure the values
     * you have written are considered 'valid rows' and thus available when you attempt to serialize this Block. This
     * method relies on BlockUtils' field conversion/coercion logic to convert the provided value into a type that
     * matches Apache Arrow's supported serialization format. For more details on coercion please see @BlockUtils
     *
     * @param fieldName The name of the field you wish to write to.
     * @param row The row number to write to. Note that Apache Arrow Blocks begin with row 0 just like a typical array.
     * @param value The value you wish to write.
     * @return True if the value satisfied the constraints (even if the field is missing from the Block and nothing
     * was actually written), False if the value was not written due to failing a constraint.
     * @note This method will take no action if the provided fieldName is not a valid field in this Block's Schema.
     * In such cases the method will still return true as long as the constraints were satisfied.
     */
    public boolean offerValue(String fieldName, int row, Object value)
    {
        if (constraintEvaluator.apply(fieldName, value)) {
            FieldVector vector = getFieldVector(fieldName);
            if (vector != null) {
                BlockUtils.setValue(vector, row, value);
            }
            return true;
        }
        return false;
    }

    /**
     * Writes the provided complex value (e.g. List, Struct) for the given field name and row, using the supplied
     * FieldResolver to extract child values. This method does _not_ update the row count on the underlying Apache
     * Arrow VectorSchema; you must call setRowCount(...) for the written values to be considered valid rows.
     *
     * @param fieldName The name of the field you wish to write to.
     * @param row The row number to write to. Note that Apache Arrow Blocks begin with row 0 just like a typical array.
     * @param fieldResolver Used to resolve child field values from the provided value.
     * @param value The value you wish to write.
     * @return Always true; complex values are not checked against this Block's ConstraintEvaluator.
     * @note This method will throw an NPE if you call it with a non-existent field. You can use offerComplexValue(...)
     * to ignore non-existent fields. This can be useful when you are writing results and want to avoid checking
     * if a field has been requested. One such example is when a query projects only a subset of columns and your
     * underlying data store is not columnar.
     */
    public boolean setComplexValue(String fieldName, int row, FieldResolver fieldResolver, Object value)
    {
        FieldVector vector = getFieldVector(fieldName);
        BlockUtils.setComplexValue(vector, row, fieldResolver, value);
        return true;
    }

    /**
     * Attempts to write the provided complex value (e.g. List, Struct) for the given field name and row, using the
     * supplied FieldResolver to extract child values. If the Block's schema does not contain such a field, this
     * method does nothing. This method does _not_ update the row count on the underlying Apache Arrow VectorSchema;
     * you must call setRowCount(...) for the written values to be considered valid rows.
     *
     * @param fieldName The name of the field you wish to write to.
     * @param row The row number to write to. Note that Apache Arrow Blocks begin with row 0 just like a typical array.
     * @param fieldResolver Used to resolve child field values from the provided value.
     * @param value The value you wish to write.
     * @return Always true, even if the field is missing from the Block and nothing was actually written; complex
     * values are not checked against this Block's ConstraintEvaluator.
     */
    public boolean offerComplexValue(String fieldName, int row, FieldResolver fieldResolver, Object value)
    {
        FieldVector vector = getFieldVector(fieldName);
        if (vector != null) {
            BlockUtils.setComplexValue(vector, row, fieldResolver, value);
        }
        return true;
    }

    /**
     * Provides access to the Apache Arrow Vector Schema when direct access to Apache Arrow is required.
     *
     * @return The Apache Arrow Vector Schema.
     */
    protected VectorSchemaRoot getVectorSchema()
    {
        return vectorSchema;
    }

    /**
     * Sets the valid row count on the underlying Apache Arrow Vector Schema.
     *
     * @param rowCount The row count to set.
     * @note If you do not set this value then the block may not serialize correctly (too few rows) or rows may
     * not be readable.
     */
    public void setRowCount(int rowCount)
    {
        vectorSchema.setRowCount(rowCount);
    }

    /**
     * Returns the current row count as set by calling setRowCount(...)
     *
     * @return The current valid row count for the Apache Arrow Vector Schema.
     */
    public int getRowCount()
    {
        return vectorSchema.getRowCount();
    }

    /**
     * Provides access to the Apache Arrow FieldReader for the given field name.
     *
     * @param fieldName The name of the field to retrieve.
     * @return The FieldReader that can be used to read values from the Block for the specified field.
     * @note This method throws NPE if the requested field name is not a valid field name in the block's Schema.
     * Additionally, for accessing nested fields you must request the parent field and then call reader(String fieldName)
     * on the parent FieldReader. You can find some examples of how to use Apache Arrow for complex/nested types in
     * the UnitTest for this class or BlockUtils.java.
     */
    public FieldReader getFieldReader(String fieldName)
    {
        return vectorSchema.getVector(fieldName).getReader();
    }

    /**
     * Provides access to the Apache Arrow FieldVector which can be used to write values for the given field name.
     *
     * @param fieldName The name of the field to retrieve.
     * @return The FieldVector that can be used to read values from the Block for the specified field or NULL if the field
     * is not in this Block's Schema.
     * @note Additionally, for accessing nested fields you must request the parent field and then call the appropriate
     * method (based on type) to get the child field's FieldVector. You can find some examples of how to use Apache Arrow
     * for complex/nested types in the UnitTest for this class or BlockUtils.java.
     */
    public FieldVector getFieldVector(String fieldName)
    {
        return vectorSchema.getVector(fieldName);
    }

    /**
     * Provides access to the list of all top-level FieldReaders in this Block.
     *
     * @return List containing the top-level FieldReaders for this block.
     */
    public List<FieldReader> getFieldReaders()
    {
        List<FieldReader> readers = new ArrayList<>();
        for (FieldVector next : vectorSchema.getFieldVectors()) {
            readers.add(next.getReader());
        }
        return readers;
    }

    /**
     * Calculates the current used size in 'bytes' for all Apache Arrow Buffers that comprise the row data for
     * this Block.
     *
     * @return The used bytes of row data in this Block.
     * @note This value is likely smaller than the actual memory held by this Block as it only counts the 'used' portion
     * of the pre-allocated Apache Arrow Buffers. It is generally safer to think about this value as the size of the Block
     * if you serialize it and thus is useful for controlling the size of the Block responses sent to Athena.
     */
    @Transient
    public long getSize()
    {
        long size = 0;
        for (FieldVector next : vectorSchema.getFieldVectors()) {
            size += next.getBufferSize();
        }
        return size;
    }

    /**
     * Provides access to the list of all top-level FieldVectors in this Block.
     *
     * @return List containing the top-level FieldVectors for this block.
     */
    public List<FieldVector> getFieldVectors()
    {
        return vectorSchema.getFieldVectors();
    }

    /**
     * Used to unload the Apache Arrow data in this Block in preparation for Serialization.
     *
     * @return An ArrowRecordBatch containing all row data in this Block for use in serializing the Block.
     */
    public ArrowRecordBatch getRecordBatch()
    {
        VectorUnloader vectorUnloader = new VectorUnloader(vectorSchema);
        return vectorUnloader.getRecordBatch();
    }

    /**
     * Used to load Apache Arrow data into this Block after it has been deserialized.
     *
     * @param batch An ArrowRecordBatch containing all row data you'd like to load into this Block.
     * @note The batch is closed after being loaded to avoid memory leaks or data corruption since the buffers
     * associated with the batch are now owned by this Block. Closing the batch essentially decrements the reference
     * count in the Arrow Allocator.
     */
    public void loadRecordBatch(ArrowRecordBatch batch)
    {
        VectorLoader vectorLoader = new VectorLoader(vectorSchema);
        vectorLoader.load(batch);
        batch.close();
    }

    /**
     * Frees all Apache Arrow Buffers and resources associated with this block.
     *
     * @throws Exception
     */
    @Override
    public void close()
            throws Exception
    {
        this.vectorSchema.close();
    }

    @Override
    protected Schema internalGetSchema()
    {
        return schema;
    }

    /**
     * Provides some basic equality checking for a Block. This method has some drawbacks in that it isn't a deep equality
     * and will not work for some large complex blocks. At present this method is useful for testing purposes but may be
     * refactored in a future release.
     *
     * @note Custom schema metadata is intentionally not compared here even though it participates in hashCode(); this
     * inconsistency is preserved for backwards compatibility.
     */
    @Override
    public boolean equals(Object o)
    {
        if (this == o) {
            return true;
        }
        if (o == null || getClass() != o.getClass()) {
            return false;
        }
        Block that = (Block) o;
        if (this.schema.getFields().size() != that.schema.getFields().size()) {
            return false;
        }
        if (this.vectorSchema.getRowCount() != that.vectorSchema.getRowCount()) {
            return false;
        }
        try {
            for (Field next : this.schema.getFields()) {
                FieldReader thisReader = vectorSchema.getVector(next.getName()).getReader();
                FieldReader thatReader = that.vectorSchema.getVector(next.getName()).getReader();
                for (int i = 0; i < this.vectorSchema.getRowCount(); i++) {
                    thisReader.setPosition(i);
                    thatReader.setPosition(i);
                    if (ArrowTypeComparator.compare(thisReader, thisReader.readObject(), thatReader.readObject()) != 0) {
                        return false;
                    }
                }
            }
        }
        catch (IllegalArgumentException ex) {
            //Propagate: this happens when the comparator doesn't support the type and should not be
            //silently treated as 'not equal' by the broader RuntimeException handler below.
            throw ex;
        }
        catch (RuntimeException ex) {
            //There are many differences which can cause an exception, easier to handle them this way
            logger.warn("equals: ", ex);
            return false;
        }
        return true;
    }

    /**
     * Provides some basic equality checking for a Block ignoring ordering. This method has some drawbacks in that it
     * isn't a deep equality and will not work for some large complex blocks. At present this method is useful for testing
     * purposes but may be refactored in a future release.
     */
    public boolean equalsAsSet(Object o)
    {
        if (this == o) {
            return true;
        }
        if (o == null || getClass() != o.getClass()) {
            return false;
        }
        Block that = (Block) o;
        if (this.schema.getFields().size() != that.schema.getFields().size()) {
            return false;
        }
        if (this.vectorSchema.getRowCount() != that.vectorSchema.getRowCount()) {
            return false;
        }
        try {
            for (Field next : this.schema.getFields()) {
                FieldReader thisReader = vectorSchema.getVector(next.getName()).getReader();
                FieldReader thatReader = that.vectorSchema.getVector(next.getName()).getReader();
                for (int i = 0; i < this.vectorSchema.getRowCount(); i++) {
                    thisReader.setPosition(i);
                    Object val = thisReader.readObject();
                    boolean matched = false;
                    for (int j = 0; j < that.vectorSchema.getRowCount(); j++) {
                        thatReader.setPosition(j);
                        if (ArrowTypeComparator.compare(thatReader, val, thatReader.readObject()) == 0) {
                            matched = true;
                            //Stop scanning once a match is found; continuing cannot change the result.
                            break;
                        }
                    }
                    if (!matched) {
                        return false;
                    }
                }
            }
        }
        catch (RuntimeException ex) {
            //There are many differences which can cause an exception, easier to handle them this way
            return false;
        }
        return true;
    }

    /**
     * Provides some basic hashcode capabilities for the Block. This method has some drawbacks in that it is difficult
     * to maintain as we add new types and becomes error prone and slow if misused. This challenge is compounded
     * when understanding the right/wrong ways to use this are not easy to convey.
     */
    @Override
    public int hashCode()
    {
        int hashcode = 0;
        for (Map.Entry<String, String> next : this.schema.getCustomMetadata().entrySet()) {
            hashcode = hashcode + Objects.hashCode(next);
        }
        for (Field next : this.schema.getFields()) {
            FieldReader thisReader = vectorSchema.getVector(next.getName()).getReader();
            for (int i = 0; i < this.vectorSchema.getRowCount(); i++) {
                thisReader.setPosition(i);
                hashcode = 31 * hashcode + Objects.hashCode(thisReader.readObject());
            }
        }
        return hashcode;
    }

    /**
     * Renders this Block's row count and (at most) the first 10 rows of each field for debugging.
     */
    @Override
    public String toString()
    {
        MoreObjects.ToStringHelper helper = MoreObjects.toStringHelper(this);
        helper.add("rows", getRowCount());
        int rowsToPrint = Math.min(10, this.vectorSchema.getRowCount());
        for (Field next : this.schema.getFields()) {
            FieldReader thisReader = vectorSchema.getVector(next.getName()).getReader();
            List<String> values = new ArrayList<>();
            int originalReaderPosition = thisReader.getPosition();
            for (int i = 0; i < rowsToPrint; i++) {
                thisReader.setPosition(i);
                values.add(fieldToString(thisReader));
            }
            // reset the position for other reads
            thisReader.setPosition(originalReaderPosition);
            helper.add(next.getName(), values);
        }
        return helper.toString();
    }
}