
// org.apache.arrow.adapter.jdbc.JdbcToArrowConfig (Maven / Gradle / Ivy)
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.arrow.adapter.jdbc;
import java.math.RoundingMode;
import java.util.Calendar;
import java.util.Map;
import java.util.function.Function;
import org.apache.arrow.adapter.jdbc.consumer.JdbcConsumer;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.util.Preconditions;
import org.apache.arrow.vector.FieldVector;
import org.apache.arrow.vector.types.pojo.ArrowType;
/**
 * This class configures the JDBC-to-Arrow conversion process.
 *
 * <p>The allocator is used to construct the {@link org.apache.arrow.vector.VectorSchemaRoot},
 * and the calendar is used to define the time zone of any
 * {@link org.apache.arrow.vector.types.pojo.ArrowType.Timestamp}
 * fields that are created during the conversion. The allocator must not be {@code null}.
 * The calendar is not validated by this class and may be {@code null}
 * (see {@link #getCalendar()}).
 *
 * <p>If the {@code includeMetadata} flag is set, the Arrow field metadata will contain information
 * from the corresponding {@link java.sql.ResultSetMetaData} that was used to create the
 * {@link org.apache.arrow.vector.types.pojo.FieldType} of the corresponding
 * {@link org.apache.arrow.vector.FieldVector}.
 *
 * <p>If there are any {@link java.sql.Types#ARRAY} fields in the {@link java.sql.ResultSet}, the
 * corresponding {@link JdbcFieldInfo} for the array's contents must be defined here. Unfortunately,
 * the sub-type information cannot be retrieved from all JDBC implementations (H2, for example,
 * returns {@link java.sql.Types#NULL} for the array sub-type), so it must be configured here. The
 * column index or name can be used to map to a {@link JdbcFieldInfo}, and that will be used for
 * the conversion.
 */
public final class JdbcToArrowConfig {
public static final int DEFAULT_TARGET_BATCH_SIZE = 1024;
public static final int NO_LIMIT_BATCH_SIZE = -1;
private final Calendar calendar;
private final BufferAllocator allocator;
private final boolean includeMetadata;
private final boolean reuseVectorSchemaRoot;
private final Map arraySubTypesByColumnIndex;
private final Map arraySubTypesByColumnName;
private final Map explicitTypesByColumnIndex;
private final Map explicitTypesByColumnName;
private final Map schemaMetadata;
private final Map> columnMetadataByColumnIndex;
private final RoundingMode bigDecimalRoundingMode;
/**
* The maximum rowCount to read each time when partially convert data.
* Default value is 1024 and -1 means disable partial read.
* default is -1 which means disable partial read.
* Note that this flag only useful for {@link JdbcToArrow#sqlToArrowVectorIterator}
* 1) if targetBatchSize != -1, it will convert full data into multiple vectors
* with valueCount no more than targetBatchSize.
* 2) if targetBatchSize == -1, it will convert full data into a single vector in {@link ArrowVectorIterator}
*
*/
private final int targetBatchSize;
private final Function jdbcToArrowTypeConverter;
private final JdbcConsumerFactory jdbcConsumerGetter;
/**
* Constructs a new configuration from the provided allocator and calendar. The allocator
* is used when constructing the Arrow vectors from the ResultSet, and the calendar is used to define
* Arrow Timestamp fields, and to read time-based fields from the JDBC ResultSet
.
*
* @param allocator The memory allocator to construct the Arrow vectors with.
* @param calendar The calendar to use when constructing Timestamp fields and reading time-based results.
*/
JdbcToArrowConfig(BufferAllocator allocator, Calendar calendar) {
this(allocator, calendar,
/* include metadata */ false,
/* reuse vector schema root */ false,
/* array sub-types by column index */ null,
/* array sub-types by column name */ null,
DEFAULT_TARGET_BATCH_SIZE, null, null);
}
JdbcToArrowConfig(
BufferAllocator allocator,
Calendar calendar,
boolean includeMetadata,
boolean reuseVectorSchemaRoot,
Map arraySubTypesByColumnIndex,
Map arraySubTypesByColumnName,
int targetBatchSize,
Function jdbcToArrowTypeConverter) {
this(allocator, calendar, includeMetadata, reuseVectorSchemaRoot, arraySubTypesByColumnIndex,
arraySubTypesByColumnName, targetBatchSize, jdbcToArrowTypeConverter, null);
}
/**
* Constructs a new configuration from the provided allocator and calendar. The allocator
* is used when constructing the Arrow vectors from the ResultSet, and the calendar is used to define
* Arrow Timestamp fields, and to read time-based fields from the JDBC ResultSet
.
*
* @param allocator The memory allocator to construct the Arrow vectors with.
* @param calendar The calendar to use when constructing Timestamp fields and reading time-based results.
* @param includeMetadata Whether to include JDBC field metadata in the Arrow Schema Field metadata.
* @param reuseVectorSchemaRoot Whether to reuse the vector schema root for each data load.
* @param arraySubTypesByColumnIndex The type of the JDBC array at the column index (1-based).
* @param arraySubTypesByColumnName The type of the JDBC array at the column name.
* @param targetBatchSize The target batch size to be used in preallcation of the resulting vectors.
* @param jdbcToArrowTypeConverter The function that maps JDBC field type information to arrow type. If set to null,
* the default mapping will be used, which is defined as:
*
* - CHAR --> ArrowType.Utf8
* - NCHAR --> ArrowType.Utf8
* - VARCHAR --> ArrowType.Utf8
* - NVARCHAR --> ArrowType.Utf8
* - LONGVARCHAR --> ArrowType.Utf8
* - LONGNVARCHAR --> ArrowType.Utf8
* - NUMERIC --> ArrowType.Decimal(precision, scale)
* - DECIMAL --> ArrowType.Decimal(precision, scale)
* - BIT --> ArrowType.Bool
* - TINYINT --> ArrowType.Int(8, signed)
* - SMALLINT --> ArrowType.Int(16, signed)
* - INTEGER --> ArrowType.Int(32, signed)
* - BIGINT --> ArrowType.Int(64, signed)
* - REAL --> ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE)
* - FLOAT --> ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE)
* - DOUBLE --> ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE)
* - BINARY --> ArrowType.Binary
* - VARBINARY --> ArrowType.Binary
* - LONGVARBINARY --> ArrowType.Binary
* - DATE --> ArrowType.Date(DateUnit.DAY)
* - TIME --> ArrowType.Time(TimeUnit.MILLISECOND, 32)
* - TIMESTAMP --> ArrowType.Timestamp(TimeUnit.MILLISECOND, calendar timezone)
* - CLOB --> ArrowType.Utf8
* - BLOB --> ArrowType.Binary
* - ARRAY --> ArrowType.List
* - STRUCT --> ArrowType.Struct
* - NULL --> ArrowType.Null
*
* @param bigDecimalRoundingMode The java.math.RoundingMode to be used in coercion of a BigDecimal from a
* ResultSet having a scale which does not match that of the target vector. Use null
* (default value) to require strict scale matching.
*/
JdbcToArrowConfig(
BufferAllocator allocator,
Calendar calendar,
boolean includeMetadata,
boolean reuseVectorSchemaRoot,
Map arraySubTypesByColumnIndex,
Map arraySubTypesByColumnName,
int targetBatchSize,
Function jdbcToArrowTypeConverter,
RoundingMode bigDecimalRoundingMode) {
this(
allocator,
calendar,
includeMetadata,
reuseVectorSchemaRoot,
arraySubTypesByColumnIndex,
arraySubTypesByColumnName,
targetBatchSize,
jdbcToArrowTypeConverter,
null,
null,
null,
null,
bigDecimalRoundingMode);
}
JdbcToArrowConfig(
BufferAllocator allocator,
Calendar calendar,
boolean includeMetadata,
boolean reuseVectorSchemaRoot,
Map arraySubTypesByColumnIndex,
Map arraySubTypesByColumnName,
int targetBatchSize,
Function jdbcToArrowTypeConverter,
Map explicitTypesByColumnIndex,
Map explicitTypesByColumnName,
Map schemaMetadata,
Map> columnMetadataByColumnIndex,
RoundingMode bigDecimalRoundingMode) {
this(
allocator,
calendar,
includeMetadata,
reuseVectorSchemaRoot,
arraySubTypesByColumnIndex,
arraySubTypesByColumnName,
targetBatchSize,
jdbcToArrowTypeConverter,
null,
explicitTypesByColumnIndex,
explicitTypesByColumnName,
schemaMetadata,
columnMetadataByColumnIndex,
bigDecimalRoundingMode);
}
JdbcToArrowConfig(
BufferAllocator allocator,
Calendar calendar,
boolean includeMetadata,
boolean reuseVectorSchemaRoot,
Map arraySubTypesByColumnIndex,
Map arraySubTypesByColumnName,
int targetBatchSize,
Function jdbcToArrowTypeConverter,
JdbcConsumerFactory jdbcConsumerGetter,
Map explicitTypesByColumnIndex,
Map explicitTypesByColumnName,
Map schemaMetadata,
Map> columnMetadataByColumnIndex,
RoundingMode bigDecimalRoundingMode) {
Preconditions.checkNotNull(allocator, "Memory allocator cannot be null");
this.allocator = allocator;
this.calendar = calendar;
this.includeMetadata = includeMetadata;
this.reuseVectorSchemaRoot = reuseVectorSchemaRoot;
this.arraySubTypesByColumnIndex = arraySubTypesByColumnIndex;
this.arraySubTypesByColumnName = arraySubTypesByColumnName;
this.targetBatchSize = targetBatchSize;
this.explicitTypesByColumnIndex = explicitTypesByColumnIndex;
this.explicitTypesByColumnName = explicitTypesByColumnName;
this.schemaMetadata = schemaMetadata;
this.columnMetadataByColumnIndex = columnMetadataByColumnIndex;
this.bigDecimalRoundingMode = bigDecimalRoundingMode;
// set up type converter
this.jdbcToArrowTypeConverter = jdbcToArrowTypeConverter != null ? jdbcToArrowTypeConverter :
(jdbcFieldInfo) -> JdbcToArrowUtils.getArrowTypeFromJdbcType(jdbcFieldInfo, calendar);
this.jdbcConsumerGetter = jdbcConsumerGetter != null ? jdbcConsumerGetter : JdbcToArrowUtils::getConsumer;
}
/**
* The calendar to use when defining Arrow Timestamp fields
* and retrieving {@link java.sql.Date}, {@link java.sql.Time}, or {@link java.sql.Timestamp}
* data types from the {@link java.sql.ResultSet}, or null
if not converting.
*
* @return the calendar.
*/
public Calendar getCalendar() {
return calendar;
}
/**
* The Arrow memory allocator.
*
* @return the allocator.
*/
public BufferAllocator getAllocator() {
return allocator;
}
/**
* Whether to include JDBC ResultSet field metadata in the Arrow Schema field metadata.
*
* @return true
to include field metadata, false
to exclude it.
*/
public boolean shouldIncludeMetadata() {
return includeMetadata;
}
/**
* Get the target batch size for partial read.
*/
public int getTargetBatchSize() {
return targetBatchSize;
}
/**
* Get whether it is allowed to reuse the vector schema root.
*/
public boolean isReuseVectorSchemaRoot() {
return reuseVectorSchemaRoot;
}
/**
* Gets the mapping between JDBC type information to Arrow type.
*/
public Function getJdbcToArrowTypeConverter() {
return jdbcToArrowTypeConverter;
}
/**
* Gets the JDBC consumer getter.
*/
public JdbcConsumerFactory getJdbcConsumerGetter() {
return jdbcConsumerGetter;
}
/**
* Returns the array sub-type {@link JdbcFieldInfo} defined for the provided column index.
*
* @param index The {@link java.sql.ResultSetMetaData} column index of an {@link java.sql.Types#ARRAY} type.
* @return The {@link JdbcFieldInfo} for that array's sub-type, or null
if not defined.
*/
public JdbcFieldInfo getArraySubTypeByColumnIndex(int index) {
if (arraySubTypesByColumnIndex == null) {
return null;
} else {
return arraySubTypesByColumnIndex.get(index);
}
}
/**
* Returns the array sub-type {@link JdbcFieldInfo} defined for the provided column name.
*
* @param name The {@link java.sql.ResultSetMetaData} column name of an {@link java.sql.Types#ARRAY} type.
* @return The {@link JdbcFieldInfo} for that array's sub-type, or null
if not defined.
*/
public JdbcFieldInfo getArraySubTypeByColumnName(String name) {
if (arraySubTypesByColumnName == null) {
return null;
} else {
return arraySubTypesByColumnName.get(name);
}
}
/**
* Returns the type {@link JdbcFieldInfo} explicitly defined for the provided column index.
*
* @param index The {@link java.sql.ResultSetMetaData} column index to evaluate for explicit type mapping.
* @return The {@link JdbcFieldInfo} defined for the column, or null
if not defined.
*/
public JdbcFieldInfo getExplicitTypeByColumnIndex(int index) {
if (explicitTypesByColumnIndex == null) {
return null;
} else {
return explicitTypesByColumnIndex.get(index);
}
}
/**
* Returns the type {@link JdbcFieldInfo} explicitly defined for the provided column name.
*
* @param name The {@link java.sql.ResultSetMetaData} column name to evaluate for explicit type mapping.
* @return The {@link JdbcFieldInfo} defined for the column, or null
if not defined.
*/
public JdbcFieldInfo getExplicitTypeByColumnName(String name) {
if (explicitTypesByColumnName == null) {
return null;
} else {
return explicitTypesByColumnName.get(name);
}
}
/**
* Return schema level metadata or null if not provided.
*/
public Map getSchemaMetadata() {
return schemaMetadata;
}
/**
* Return metadata from columnIndex->meta map on per field basis
* or null if not provided.
*/
public Map> getColumnMetadataByColumnIndex() {
return columnMetadataByColumnIndex;
}
public RoundingMode getBigDecimalRoundingMode() {
return bigDecimalRoundingMode;
}
/**
* Interface for a function that gets a JDBC consumer for the given values.
*/
@FunctionalInterface
public interface JdbcConsumerFactory {
JdbcConsumer apply(ArrowType arrowType, int columnIndex, boolean nullable, FieldVector vector,
JdbcToArrowConfig config);
}
}