org.apache.hadoop.mapreduce.lib.db.DateSplitter Maven / Gradle / Ivy
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in org.apache.hadoop.shaded.com.liance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org.apache.hadoop.shaded.org.licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.shaded.org.apache.hadoop.mapreduce.lib.db;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Time;
import java.sql.Timestamp;
import java.sql.Types;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import org.apache.hadoop.shaded.org.slf4j.Logger;
import org.apache.hadoop.shaded.org.slf4j.LoggerFactory;
import org.apache.hadoop.shaded.org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.shaded.org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.shaded.org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.shaded.org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.shaded.org.apache.hadoop.mapreduce.MRJobConfig;
/**
* Implement DBSplitter over date/time values.
* Make use of logic from IntegerSplitter, since date/time are just longs
* in Java.
*/
@InterfaceAudience.Public
@InterfaceStability.Evolving
public class DateSplitter extends IntegerSplitter {
private static final Logger LOG = LoggerFactory.getLogger(DateSplitter.class);
public List split(Configuration conf, ResultSet results, String colName)
throws SQLException {
long minVal;
long maxVal;
int sqlDataType = results.getMetaData().getColumnType(1);
minVal = resultSetColToLong(results, 1, sqlDataType);
maxVal = resultSetColToLong(results, 2, sqlDataType);
String lowClausePrefix = colName + " >= ";
String highClausePrefix = colName + " < ";
int numSplits = conf.getInt(MRJobConfig.NUM_MAPS, 1);
if (numSplits < 1) {
numSplits = 1;
}
if (minVal == Long.MIN_VALUE && maxVal == Long.MIN_VALUE) {
// The range of acceptable dates is NULL to NULL. Just create a single split.
List splits = new ArrayList();
splits.add(new DataDrivenDBInputFormat.DataDrivenDBInputSplit(
colName + " IS NULL", colName + " IS NULL"));
return splits;
}
// Gather the split point integers
List splitPoints = split(numSplits, minVal, maxVal);
List splits = new ArrayList();
// Turn the split points into a set of intervals.
long start = splitPoints.get(0);
Date startDate = longToDate(start, sqlDataType);
if (sqlDataType == Types.TIMESTAMP) {
// The lower bound's nanos value needs to match the actual lower-bound nanos.
try {
((java.sql.Timestamp) startDate).setNanos(results.getTimestamp(1).getNanos());
} catch (NullPointerException npe) {
// If the lower bound was NULL, we'll get an NPE; just ignore it and don't set nanos.
}
}
for (int i = 1; i < splitPoints.size(); i++) {
long end = splitPoints.get(i);
Date endDate = longToDate(end, sqlDataType);
if (i == splitPoints.size() - 1) {
if (sqlDataType == Types.TIMESTAMP) {
// The upper bound's nanos value needs to match the actual upper-bound nanos.
try {
((java.sql.Timestamp) endDate).setNanos(results.getTimestamp(2).getNanos());
} catch (NullPointerException npe) {
// If the upper bound was NULL, we'll get an NPE; just ignore it and don't set nanos.
}
}
// This is the last one; use a closed interval.
splits.add(new DataDrivenDBInputFormat.DataDrivenDBInputSplit(
lowClausePrefix + dateToString(startDate),
colName + " <= " + dateToString(endDate)));
} else {
// Normal open-interval case.
splits.add(new DataDrivenDBInputFormat.DataDrivenDBInputSplit(
lowClausePrefix + dateToString(startDate),
highClausePrefix + dateToString(endDate)));
}
start = end;
startDate = endDate;
}
if (minVal == Long.MIN_VALUE || maxVal == Long.MIN_VALUE) {
// Add an extra split to handle the null case that we saw.
splits.add(new DataDrivenDBInputFormat.DataDrivenDBInputSplit(
colName + " IS NULL", colName + " IS NULL"));
}
return splits;
}
/** Retrieve the value from the column in a type-appropriate manner and return
its timestamp since the epoch. If the column is null, then return Long.MIN_VALUE.
This will cause a special split to be generated for the NULL case, but may also
cause poorly-balanced splits if most of the actual dates are positive time
since the epoch, etc.
*/
private long resultSetColToLong(ResultSet rs, int colNum, int sqlDataType) throws SQLException {
try {
switch (sqlDataType) {
case Types.DATE:
return rs.getDate(colNum).getTime();
case Types.TIME:
return rs.getTime(colNum).getTime();
case Types.TIMESTAMP:
return rs.getTimestamp(colNum).getTime();
default:
throw new SQLException("Not a date-type field");
}
} catch (NullPointerException npe) {
// null column. return minimum long value.
LOG.warn("Encountered a NULL date in the split column. Splits may be poorly balanced.");
return Long.MIN_VALUE;
}
}
/** Parse the long-valued timestamp into the appropriate SQL date type. */
private Date longToDate(long val, int sqlDataType) {
switch (sqlDataType) {
case Types.DATE:
return new java.sql.Date(val);
case Types.TIME:
return new java.sql.Time(val);
case Types.TIMESTAMP:
return new java.sql.Timestamp(val);
default: // Shouldn't ever hit this case.
return null;
}
}
/**
* Given a Date 'd', format it as a string for use in a SQL date
* org.apache.hadoop.shaded.com.arison operation.
* @param d the date to format.
* @return the string representing this date in SQL with any appropriate
* quotation characters, etc.
*/
protected String dateToString(Date d) {
return "'" + d.toString() + "'";
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy