org.apache.hadoop.mapreduce.lib.db.TextSplitter Maven / Gradle / Ivy

Go to download
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.mapreduce.lib.db;

import java.sql.ResultSet;
import java.sql.SQLException;
import java.math.BigDecimal;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.MRJobConfig;

/**
 * Implement DBSplitter over text strings.
 */
@InterfaceAudience.Public
@InterfaceStability.Evolving
public class TextSplitter extends BigDecimalSplitter {

  private static final Log LOG = LogFactory.getLog(TextSplitter.class);

  /**
   * This method needs to determine the splits between two user-provided strings.
   * In the case where the user's strings are 'A' and 'Z', this is not hard; we 
   * could create two splits from ['A', 'M') and ['M', 'Z'], 26 splits for strings
   * beginning with each letter, etc.
   *
   * If a user has provided us with the strings "Ham" and "Haze", however, we need
   * to create splits that differ in the third letter.
   *
   * The algorithm used is as follows:
   * Since there are 2**16 unicode characters, we interpret characters as digits in
   * base 65536. Given a string 's' containing characters s_0, s_1 .. s_n, we interpret
   * the string as the number: 0.s_0 s_1 s_2.. s_n in base 65536. Having mapped the
   * low and high strings into floating-point values, we then use the BigDecimalSplitter
   * to establish the even split points, then map the resulting floating point values
   * back into strings.
   */
  public List split(Configuration conf, ResultSet results, String colName)
      throws SQLException {

    LOG.warn("Generating splits for a textual index column.");
    LOG.warn("If your database sorts in a case-insensitive order, "
        + "this may result in a partial import or duplicate records.");
    LOG.warn("You are strongly encouraged to choose an integral split column.");

    String minString = results.getString(1);
    String maxString = results.getString(2);

    boolean minIsNull = false;

    // If the min value is null, switch it to an empty string instead for purposes
    // of interpolation. Then add [null, null] as a special case split.
    if (null == minString) {
      minString = "";
      minIsNull = true;
    }

    if (null == maxString) {
      // If the max string is null, then the min string has to be null too.
      // Just return a special split for this case.
      List splits = new ArrayList();
      splits.add(new DataDrivenDBInputFormat.DataDrivenDBInputSplit(
          colName + " IS NULL", colName + " IS NULL"));
      return splits;
    }

    // Use this as a hint. May need an extra task if the size doesn't
    // divide cleanly.
    int numSplits = conf.getInt(MRJobConfig.NUM_MAPS, 1);

    String lowClausePrefix = colName + " >= '";
    String highClausePrefix = colName + " < '";

    // If there is a common prefix between minString and maxString, establish it
    // and pull it out of minString and maxString.
    int maxPrefixLen = Math.min(minString.length(), maxString.length());
    int sharedLen;
    for (sharedLen = 0; sharedLen < maxPrefixLen; sharedLen++) {
      char c1 = minString.charAt(sharedLen);
      char c2 = maxString.charAt(sharedLen);
      if (c1 != c2) {
        break;
      }
    }

    // The common prefix has length 'sharedLen'. Extract it from both.
    String commonPrefix = minString.substring(0, sharedLen);
    minString = minString.substring(sharedLen);
    maxString = maxString.substring(sharedLen);

    List splitStrings = split(numSplits, minString, maxString, commonPrefix);
    List splits = new ArrayList();

    // Convert the list of split point strings into an actual set of InputSplits.
    String start = splitStrings.get(0);
    for (int i = 1; i < splitStrings.size(); i++) {
      String end = splitStrings.get(i);

      if (i == splitStrings.size() - 1) {
        // This is the last one; use a closed interval.
        splits.add(new DataDrivenDBInputFormat.DataDrivenDBInputSplit(
            lowClausePrefix + start + "'", colName + " <= '" + end + "'"));
      } else {
        // Normal open-interval case.
        splits.add(new DataDrivenDBInputFormat.DataDrivenDBInputSplit(
            lowClausePrefix + start + "'", highClausePrefix + end + "'"));
      }
    }

    if (minIsNull) {
      // Add the special null split at the end.
      splits.add(new DataDrivenDBInputFormat.DataDrivenDBInputSplit(
          colName + " IS NULL", colName + " IS NULL"));
    }

    return splits;
  }

  List split(int numSplits, String minString, String maxString, String commonPrefix)
      throws SQLException {

    BigDecimal minVal = stringToBigDecimal(minString);
    BigDecimal maxVal = stringToBigDecimal(maxString);

    List splitPoints = split(new BigDecimal(numSplits), minVal, maxVal);
    List splitStrings = new ArrayList();

    // Convert the BigDecimal splitPoints into their string representations.
    for (BigDecimal bd : splitPoints) {
      splitStrings.add(commonPrefix + bigDecimalToString(bd));
    }

    // Make sure that our user-specified boundaries are the first and last entries
    // in the array.
    if (splitStrings.size() == 0 || !splitStrings.get(0).equals(commonPrefix + minString)) {
      splitStrings.add(0, commonPrefix + minString);
    }
    if (splitStrings.size() == 1
        || !splitStrings.get(splitStrings.size() - 1).equals(commonPrefix + maxString)) {
      splitStrings.add(commonPrefix + maxString);
    }

    return splitStrings;
  }

  private final static BigDecimal ONE_PLACE = new BigDecimal(65536);

  // Maximum number of characters to convert. This is to prevent rounding errors
  // or repeating fractions near the very bottom from getting out of control. Note
  // that this still gives us a huge number of possible splits.
  private final static int MAX_CHARS = 8;

  /**
   * Return a BigDecimal representation of string 'str' suitable for use
   * in a numerically-sorting order.
   */
  BigDecimal stringToBigDecimal(String str) {
    BigDecimal result = BigDecimal.ZERO;
    BigDecimal curPlace = ONE_PLACE; // start with 1/65536 to compute the first digit.

    int len = Math.min(str.length(), MAX_CHARS);

    for (int i = 0; i < len; i++) {
      int codePoint = str.codePointAt(i);
      result = result.add(tryDivide(new BigDecimal(codePoint), curPlace));
      // advance to the next less significant place. e.g., 1/(65536^2) for the second char.
      curPlace = curPlace.multiply(ONE_PLACE);
    }

    return result;
  }

  /**
   * Return the string encoded in a BigDecimal.
   * Repeatedly multiply the input value by 65536; the integer portion after such a multiplication
   * represents a single character in base 65536. Convert that back into a char and create a
   * string out of these until we have no data left.
   */
  String bigDecimalToString(BigDecimal bd) {
    BigDecimal cur = bd.stripTrailingZeros();
    StringBuilder sb = new StringBuilder();

    for (int numConverted = 0; numConverted < MAX_CHARS; numConverted++) {
      cur = cur.multiply(ONE_PLACE);
      int curCodePoint = cur.intValue();
      if (0 == curCodePoint) {
        break;
      }

      cur = cur.subtract(new BigDecimal(curCodePoint));
      sb.append(Character.toChars(curCodePoint));
    }

    return sb.toString();
  }
}