org.apache.hadoop.mapreduce.lib.db.DateSplitter Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of hadoop-apache Show documentation
Shaded version of Apache Hadoop for Presto
There is a newer version: 3.2.0-9
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.mapreduce.lib.db;

import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Time;
import java.sql.Timestamp;
import java.sql.Types;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

import io.prestosql.hadoop.$internal.org.apache.commons.logging.Log;
import io.prestosql.hadoop.$internal.org.apache.commons.logging.LogFactory;

import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.MRJobConfig;

/**
 * Implement DBSplitter over date/time values.
 * Make use of logic from IntegerSplitter, since date/time are just longs
 * in Java.
 */
@InterfaceAudience.Public
@InterfaceStability.Evolving
public class DateSplitter extends IntegerSplitter {

  private static final Log LOG = LogFactory.getLog(DateSplitter.class);

  public List split(Configuration conf, ResultSet results, String colName)
      throws SQLException {

    long minVal;
    long maxVal;

    int sqlDataType = results.getMetaData().getColumnType(1);
    minVal = resultSetColToLong(results, 1, sqlDataType);
    maxVal = resultSetColToLong(results, 2, sqlDataType);

    String lowClausePrefix = colName + " >= ";
    String highClausePrefix = colName + " < ";

    int numSplits = conf.getInt(MRJobConfig.NUM_MAPS, 1);
    if (numSplits < 1) {
      numSplits = 1;
    }

    if (minVal == Long.MIN_VALUE && maxVal == Long.MIN_VALUE) {
      // The range of acceptable dates is NULL to NULL. Just create a single split.
      List splits = new ArrayList();
      splits.add(new DataDrivenDBInputFormat.DataDrivenDBInputSplit(
          colName + " IS NULL", colName + " IS NULL"));
      return splits;
    }

    // Gather the split point integers
    List splitPoints = split(numSplits, minVal, maxVal);
    List splits = new ArrayList();

    // Turn the split points into a set of intervals.
    long start = splitPoints.get(0);
    Date startDate = longToDate(start, sqlDataType);
    if (sqlDataType == Types.TIMESTAMP) {
      // The lower bound's nanos value needs to match the actual lower-bound nanos.
      try {
        ((java.sql.Timestamp) startDate).setNanos(results.getTimestamp(1).getNanos());
      } catch (NullPointerException npe) {
        // If the lower bound was NULL, we'll get an NPE; just ignore it and don't set nanos.
      }
    }

    for (int i = 1; i < splitPoints.size(); i++) {
      long end = splitPoints.get(i);
      Date endDate = longToDate(end, sqlDataType);

      if (i == splitPoints.size() - 1) {
        if (sqlDataType == Types.TIMESTAMP) {
          // The upper bound's nanos value needs to match the actual upper-bound nanos.
          try {
            ((java.sql.Timestamp) endDate).setNanos(results.getTimestamp(2).getNanos());
          } catch (NullPointerException npe) {
            // If the upper bound was NULL, we'll get an NPE; just ignore it and don't set nanos.
          }
        }
        // This is the last one; use a closed interval.
        splits.add(new DataDrivenDBInputFormat.DataDrivenDBInputSplit(
            lowClausePrefix + dateToString(startDate),
            colName + " <= " + dateToString(endDate)));
      } else {
        // Normal open-interval case.
        splits.add(new DataDrivenDBInputFormat.DataDrivenDBInputSplit(
            lowClausePrefix + dateToString(startDate),
            highClausePrefix + dateToString(endDate)));
      }

      start = end;
      startDate = endDate;
    }

    if (minVal == Long.MIN_VALUE || maxVal == Long.MIN_VALUE) {
      // Add an extra split to handle the null case that we saw.
      splits.add(new DataDrivenDBInputFormat.DataDrivenDBInputSplit(
          colName + " IS NULL", colName + " IS NULL"));
    }

    return splits;
  }

  /** Retrieve the value from the column in a type-appropriate manner and return
      its timestamp since the epoch. If the column is null, then return Long.MIN_VALUE.
      This will cause a special split to be generated for the NULL case, but may also
      cause poorly-balanced splits if most of the actual dates are positive time
      since the epoch, etc.
    */
  private long resultSetColToLong(ResultSet rs, int colNum, int sqlDataType) throws SQLException {
    try {
      switch (sqlDataType) {
      case Types.DATE:
        return rs.getDate(colNum).getTime();
      case Types.TIME:
        return rs.getTime(colNum).getTime();
      case Types.TIMESTAMP:
        return rs.getTimestamp(colNum).getTime();
      default:
        throw new SQLException("Not a date-type field");
      }
    } catch (NullPointerException npe) {
      // null column. return minimum long value.
      LOG.warn("Encountered a NULL date in the split column. Splits may be poorly balanced.");
      return Long.MIN_VALUE;
    }
  }

  /**  Parse the long-valued timestamp into the appropriate SQL date type. */
  private Date longToDate(long val, int sqlDataType) {
    switch (sqlDataType) {
    case Types.DATE:
      return new java.sql.Date(val);
    case Types.TIME:
      return new java.sql.Time(val);
    case Types.TIMESTAMP:
      return new java.sql.Timestamp(val);
    default: // Shouldn't ever hit this case.
      return null;
    }
  }

  /**
   * Given a Date 'd', format it as a string for use in a SQL date
   * comparison operation.
   * @param d the date to format.
   * @return the string representing this date in SQL with any appropriate
   * quotation characters, etc.
   */
  protected String dateToString(Date d) {
    return "'" + d.toString() + "'";
  }
}