org.apache.arrow.adapter.jdbc.JdbcToArrowConfig Maven / Gradle / Ivy

Go to download
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.arrow.adapter.jdbc;

import java.math.RoundingMode;
import java.util.Calendar;
import java.util.Map;
import java.util.function.Function;

import org.apache.arrow.adapter.jdbc.consumer.JdbcConsumer;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.util.Preconditions;
import org.apache.arrow.vector.FieldVector;
import org.apache.arrow.vector.types.pojo.ArrowType;

/**
 * This class configures the JDBC-to-Arrow conversion process.
 *
 * <p>The allocator is used to construct the {@link org.apache.arrow.vector.VectorSchemaRoot},
 * and the calendar is used to define the time zone of any
 * {@link org.apache.arrow.vector.types.pojo.ArrowType.Timestamp}
 * fields that are created during the conversion.  The allocator may not be null; the calendar
 * may be null (see {@link #getCalendar()}).
 *
 * <p>If the includeMetadata flag is set, the Arrow field metadata will contain information
 * from the corresponding {@link java.sql.ResultSetMetaData} that was used to create the
 * {@link org.apache.arrow.vector.types.pojo.FieldType} of the corresponding
 * {@link org.apache.arrow.vector.FieldVector}.
 *
 * <p>If there are any {@link java.sql.Types#ARRAY} fields in the {@link java.sql.ResultSet}, the corresponding
 * {@link JdbcFieldInfo} for the array's contents must be defined here.  Unfortunately, the sub-type
 * information cannot be retrieved from all JDBC implementations (H2, for example, returns
 * {@link java.sql.Types#NULL} for the array sub-type), so it must be configured here.  The column index
 * or name can be used to map to a {@link JdbcFieldInfo}, and that will be used for the conversion.
 */
public final class JdbcToArrowConfig {

  public static final int DEFAULT_TARGET_BATCH_SIZE = 1024;
  public static final int NO_LIMIT_BATCH_SIZE = -1;
  private final Calendar calendar;
  private final BufferAllocator allocator;
  private final boolean includeMetadata;
  private final boolean reuseVectorSchemaRoot;
  private final Map arraySubTypesByColumnIndex;
  private final Map arraySubTypesByColumnName;
  private final Map explicitTypesByColumnIndex;
  private final Map explicitTypesByColumnName;
  private final Map schemaMetadata;
  private final Map> columnMetadataByColumnIndex;
  private final RoundingMode bigDecimalRoundingMode;
  /**
   * The maximum rowCount to read each time when partially convert data.
   * Default value is 1024 and -1 means disable partial read.
   * default is -1 which means disable partial read.
   * Note that this flag only useful for {@link JdbcToArrow#sqlToArrowVectorIterator}
   * 1) if targetBatchSize != -1, it will convert full data into multiple vectors
   * with valueCount no more than targetBatchSize.
   * 2) if targetBatchSize == -1, it will convert full data into a single vector in {@link ArrowVectorIterator}
   * 
   */
  private final int targetBatchSize;

  private final Function jdbcToArrowTypeConverter;
  private final JdbcConsumerFactory jdbcConsumerGetter;

  /**
   * Constructs a new configuration from the provided allocator and calendar.  The allocator
   * is used when constructing the Arrow vectors from the ResultSet, and the calendar is used to define
   * Arrow Timestamp fields, and to read time-based fields from the JDBC ResultSet.
   *
   * @param allocator       The memory allocator to construct the Arrow vectors with.
   * @param calendar        The calendar to use when constructing Timestamp fields and reading time-based results.
   */
  JdbcToArrowConfig(BufferAllocator allocator, Calendar calendar) {
    this(allocator, calendar,
        /* include metadata */ false,
        /* reuse vector schema root */ false,
        /* array sub-types by column index */ null,
        /* array sub-types by column name */ null,
        DEFAULT_TARGET_BATCH_SIZE, null, null);
  }

  JdbcToArrowConfig(
          BufferAllocator allocator,
          Calendar calendar,
          boolean includeMetadata,
          boolean reuseVectorSchemaRoot,
          Map arraySubTypesByColumnIndex,
          Map arraySubTypesByColumnName,
          int targetBatchSize,
          Function jdbcToArrowTypeConverter) {
    this(allocator, calendar, includeMetadata, reuseVectorSchemaRoot, arraySubTypesByColumnIndex,
        arraySubTypesByColumnName, targetBatchSize, jdbcToArrowTypeConverter, null);
  }

  /**
   * Constructs a new configuration from the provided allocator and calendar.  The allocator
   * is used when constructing the Arrow vectors from the ResultSet, and the calendar is used to define
   * Arrow Timestamp fields, and to read time-based fields from the JDBC ResultSet.
   *
   * @param allocator       The memory allocator to construct the Arrow vectors with.
   * @param calendar        The calendar to use when constructing Timestamp fields and reading time-based results.
   * @param includeMetadata Whether to include JDBC field metadata in the Arrow Schema Field metadata.
   * @param reuseVectorSchemaRoot Whether to reuse the vector schema root for each data load.
   * @param arraySubTypesByColumnIndex The type of the JDBC array at the column index (1-based).
   * @param arraySubTypesByColumnName  The type of the JDBC array at the column name.
   * @param targetBatchSize The target batch size to be used in preallcation of the resulting vectors.
   * @param jdbcToArrowTypeConverter The function that maps JDBC field type information to arrow type. If set to null,
   *                                 the default mapping will be used, which is defined as:
   *  
   *    CHAR --> ArrowType.Utf8
   *    NCHAR --> ArrowType.Utf8
   *    VARCHAR --> ArrowType.Utf8
   *    NVARCHAR --> ArrowType.Utf8
   *    LONGVARCHAR --> ArrowType.Utf8
   *    LONGNVARCHAR --> ArrowType.Utf8
   *    NUMERIC --> ArrowType.Decimal(precision, scale)
   *    DECIMAL --> ArrowType.Decimal(precision, scale)
   *    BIT --> ArrowType.Bool
   *    TINYINT --> ArrowType.Int(8, signed)
   *    SMALLINT --> ArrowType.Int(16, signed)
   *    INTEGER --> ArrowType.Int(32, signed)
   *    BIGINT --> ArrowType.Int(64, signed)
   *    REAL --> ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE)
   *    FLOAT --> ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE)
   *    DOUBLE --> ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE)
   *    BINARY --> ArrowType.Binary
   *    VARBINARY --> ArrowType.Binary
   *    LONGVARBINARY --> ArrowType.Binary
   *    DATE --> ArrowType.Date(DateUnit.DAY)
   *    TIME --> ArrowType.Time(TimeUnit.MILLISECOND, 32)
   *    TIMESTAMP --> ArrowType.Timestamp(TimeUnit.MILLISECOND, calendar timezone)
   *    CLOB --> ArrowType.Utf8
   *    BLOB --> ArrowType.Binary
   *    ARRAY --> ArrowType.List
   *    STRUCT --> ArrowType.Struct
   *    NULL --> ArrowType.Null
   *  
   * @param bigDecimalRoundingMode The java.math.RoundingMode to be used in coercion of a BigDecimal from a
   *                               ResultSet having a scale which does not match that of the target vector. Use null
   *                               (default value) to require strict scale matching.
   */
  JdbcToArrowConfig(
      BufferAllocator allocator,
      Calendar calendar,
      boolean includeMetadata,
      boolean reuseVectorSchemaRoot,
      Map arraySubTypesByColumnIndex,
      Map arraySubTypesByColumnName,
      int targetBatchSize,
      Function jdbcToArrowTypeConverter,
      RoundingMode bigDecimalRoundingMode) {

    this(
        allocator,
        calendar,
        includeMetadata,
        reuseVectorSchemaRoot,
        arraySubTypesByColumnIndex,
        arraySubTypesByColumnName,
        targetBatchSize,
        jdbcToArrowTypeConverter,
        null,
        null,
        null,
        null,
        bigDecimalRoundingMode);
  }

  JdbcToArrowConfig(
      BufferAllocator allocator,
      Calendar calendar,
      boolean includeMetadata,
      boolean reuseVectorSchemaRoot,
      Map arraySubTypesByColumnIndex,
      Map arraySubTypesByColumnName,
      int targetBatchSize,
      Function jdbcToArrowTypeConverter,
      Map explicitTypesByColumnIndex,
      Map explicitTypesByColumnName,
      Map schemaMetadata,
      Map> columnMetadataByColumnIndex,
      RoundingMode bigDecimalRoundingMode) {
    this(
        allocator,
        calendar,
        includeMetadata,
        reuseVectorSchemaRoot,
        arraySubTypesByColumnIndex,
        arraySubTypesByColumnName,
        targetBatchSize,
        jdbcToArrowTypeConverter,
        null,
        explicitTypesByColumnIndex,
        explicitTypesByColumnName,
        schemaMetadata,
        columnMetadataByColumnIndex,
        bigDecimalRoundingMode);
  }

  JdbcToArrowConfig(
      BufferAllocator allocator,
      Calendar calendar,
      boolean includeMetadata,
      boolean reuseVectorSchemaRoot,
      Map arraySubTypesByColumnIndex,
      Map arraySubTypesByColumnName,
      int targetBatchSize,
      Function jdbcToArrowTypeConverter,
      JdbcConsumerFactory jdbcConsumerGetter,
      Map explicitTypesByColumnIndex,
      Map explicitTypesByColumnName,
      Map schemaMetadata,
      Map> columnMetadataByColumnIndex,
      RoundingMode bigDecimalRoundingMode) {
    Preconditions.checkNotNull(allocator, "Memory allocator cannot be null");
    this.allocator = allocator;
    this.calendar = calendar;
    this.includeMetadata = includeMetadata;
    this.reuseVectorSchemaRoot = reuseVectorSchemaRoot;
    this.arraySubTypesByColumnIndex = arraySubTypesByColumnIndex;
    this.arraySubTypesByColumnName = arraySubTypesByColumnName;
    this.targetBatchSize = targetBatchSize;
    this.explicitTypesByColumnIndex = explicitTypesByColumnIndex;
    this.explicitTypesByColumnName = explicitTypesByColumnName;
    this.schemaMetadata = schemaMetadata;
    this.columnMetadataByColumnIndex = columnMetadataByColumnIndex;
    this.bigDecimalRoundingMode = bigDecimalRoundingMode;

    // set up type converter
    this.jdbcToArrowTypeConverter = jdbcToArrowTypeConverter != null ? jdbcToArrowTypeConverter :
        (jdbcFieldInfo) -> JdbcToArrowUtils.getArrowTypeFromJdbcType(jdbcFieldInfo, calendar);

    this.jdbcConsumerGetter = jdbcConsumerGetter != null ? jdbcConsumerGetter : JdbcToArrowUtils::getConsumer;
  }

  /**
   * The calendar to use when defining Arrow Timestamp fields
   * and retrieving {@link java.sql.Date}, {@link java.sql.Time}, or {@link java.sql.Timestamp}
   * data types from the {@link java.sql.ResultSet}, or null if not converting.
   *
   * @return the calendar.
   */
  public Calendar getCalendar() {
    return calendar;
  }

  /**
   * The Arrow memory allocator.
   *
   * @return the allocator.
   */
  public BufferAllocator getAllocator() {
    return allocator;
  }

  /**
   * Whether to include JDBC ResultSet field metadata in the Arrow Schema field metadata.
   *
   * @return true to include field metadata, false to exclude it.
   */
  public boolean shouldIncludeMetadata() {
    return includeMetadata;
  }

  /**
   * Get the target batch size for partial read.
   */
  public int getTargetBatchSize() {
    return targetBatchSize;
  }

  /**
   * Get whether it is allowed to reuse the vector schema root.
   */
  public boolean isReuseVectorSchemaRoot() {
    return reuseVectorSchemaRoot;
  }

  /**
   * Gets the mapping between JDBC type information to Arrow type.
   */
  public Function getJdbcToArrowTypeConverter() {
    return jdbcToArrowTypeConverter;
  }

  /**
   * Gets the JDBC consumer getter.
   */
  public JdbcConsumerFactory getJdbcConsumerGetter() {
    return jdbcConsumerGetter;
  }

  /**
   * Returns the array sub-type {@link JdbcFieldInfo} defined for the provided column index.
   *
   * @param index The {@link java.sql.ResultSetMetaData} column index of an {@link java.sql.Types#ARRAY} type.
   * @return The {@link JdbcFieldInfo} for that array's sub-type, or null if not defined.
   */
  public JdbcFieldInfo getArraySubTypeByColumnIndex(int index) {
    if (arraySubTypesByColumnIndex == null) {
      return null;
    } else {
      return arraySubTypesByColumnIndex.get(index);
    }
  }

  /**
   * Returns the array sub-type {@link JdbcFieldInfo} defined for the provided column name.
   *
   * @param name The {@link java.sql.ResultSetMetaData} column name of an {@link java.sql.Types#ARRAY} type.
   * @return The {@link JdbcFieldInfo} for that array's sub-type, or null if not defined.
   */
  public JdbcFieldInfo getArraySubTypeByColumnName(String name) {
    if (arraySubTypesByColumnName == null) {
      return null;
    } else {
      return arraySubTypesByColumnName.get(name);
    }
  }

  /**
   * Returns the type {@link JdbcFieldInfo} explicitly defined for the provided column index.
   *
   * @param index The {@link java.sql.ResultSetMetaData} column index to evaluate for explicit type mapping.
   * @return The {@link JdbcFieldInfo} defined for the column, or null if not defined.
   */
  public JdbcFieldInfo getExplicitTypeByColumnIndex(int index) {
    if (explicitTypesByColumnIndex == null) {
      return null;
    } else {
      return explicitTypesByColumnIndex.get(index);
    }
  }

  /**
   * Returns the type {@link JdbcFieldInfo} explicitly defined for the provided column name.
   *
   * @param name The {@link java.sql.ResultSetMetaData} column name to evaluate for explicit type mapping.
   * @return The {@link JdbcFieldInfo} defined for the column, or null if not defined.
   */
  public JdbcFieldInfo getExplicitTypeByColumnName(String name) {
    if (explicitTypesByColumnName == null) {
      return null;
    } else {
      return explicitTypesByColumnName.get(name);
    }
  }

  /**
   * Return schema level metadata or null if not provided.
   */
  public Map getSchemaMetadata() {
    return schemaMetadata;
  }

  /**
   * Return metadata from columnIndex->meta map on per field basis
   * or null if not provided.
   */
  public Map> getColumnMetadataByColumnIndex() {
    return columnMetadataByColumnIndex;
  }

  public RoundingMode getBigDecimalRoundingMode() {
    return bigDecimalRoundingMode;
  }

  /**
   * Interface for a function that gets a JDBC consumer for the given values.
   */
  @FunctionalInterface
  public interface JdbcConsumerFactory {
    JdbcConsumer apply(ArrowType arrowType, int columnIndex, boolean nullable, FieldVector vector,
                       JdbcToArrowConfig config);
  }
}