/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.io;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.FileUtils;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.InvalidInputException;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
/**
* BucketizedHiveInputFormat serves a similar function to HiveInputFormat, but
* its getSplits() always groups the splits from one input file into a single
* wrapper split. It is useful for applications that require each input file to
* be processed by one mapper.
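*
* It can be selected, for example, by pointing the standard Hive configuration
* property {@code hive.input.format} at this class (a minimal usage sketch,
* assuming the usual Hive CLI syntax):
*
* <pre>
* set hive.input.format=org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat;
* </pre>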
*/
public class BucketizedHiveInputFormat<K extends WritableComparable, V extends Writable>
extends HiveInputFormat<K, V> {
public static final Log LOG = LogFactory
.getLog("org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat");
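/**
* Creates a {@link BucketizedHiveRecordReader} for the given wrapper split. The
* delegate input format class is resolved from the split, projections and
* filters are pushed into a cloned JobConf, and the returned reader iterates
* over all the underlying splits of the file.
*/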
@Override
public RecordReader getRecordReader(InputSplit split, JobConf job,
Reporter reporter) throws IOException {
BucketizedHiveInputSplit hsplit = (BucketizedHiveInputSplit) split;
String inputFormatClassName = null;
Class inputFormatClass = null;
try {
inputFormatClassName = hsplit.inputFormatClassName();
inputFormatClass = job.getClassByName(inputFormatClassName);
} catch (Exception e) {
throw new IOException("cannot find class " + inputFormatClassName);
}
// clone a jobConf for setting needed columns for reading
JobConf cloneJobConf = new JobConf(job);
pushProjectionsAndFilters(cloneJobConf, inputFormatClass, hsplit.getPath()
.toString(), hsplit.getPath().toUri().getPath());
InputFormat inputFormat = getInputFormatFromCache(inputFormatClass,
cloneJobConf);
BucketizedHiveRecordReader rr = new BucketizedHiveRecordReader(inputFormat, hsplit, cloneJobConf,
reporter);
rr.initIOContext(hsplit, cloneJobConf, inputFormatClass);
return rr;
}
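/**
* Lists all files under the given path, resolving globs first and descending
* recursively into subdirectories. Throws {@link InvalidInputException} if the
* path does not exist or the pattern matches no files.
*/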
protected FileStatus[] listStatus(JobConf job, Path path) throws IOException {
ArrayList<FileStatus> result = new ArrayList<FileStatus>();
List<IOException> errors = new ArrayList<IOException>();
FileSystem fs = path.getFileSystem(job);
FileStatus[] matches = fs.globStatus(path);
if (matches == null) {
errors.add(new IOException("Input path does not exist: " + path));
} else if (matches.length == 0) {
errors.add(new IOException("Input Pattern " + path + " matches 0 files"));
} else {
for (FileStatus globStat : matches) {
FileUtils.listStatusRecursively(fs, globStat, result);
}
}
if (!errors.isEmpty()) {
throw new InvalidInputException(errors);
}
LOG.info("Total input paths to process : " + result.size());
return result.toArray(new FileStatus[result.size()]);
}
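/**
* Generates one {@link BucketizedHiveInputSplit} per input file: for every file
* under every input directory, the splits produced by the partition's own input
* format are grouped into a single wrapper split.
*/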
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
init(job);
Path[] dirs = FileInputFormat.getInputPaths(job);
if (dirs.length == 0) {
throw new IOException("No input paths specified in job");
}
JobConf newjob = new JobConf(job);
ArrayList<BucketizedHiveInputSplit> result = new ArrayList<BucketizedHiveInputSplit>();
int numOrigSplits = 0;
// For each input dir, list all files under it, call getSplits() on each
// individual file, and then wrap the resulting splits in a
// BucketizedHiveInputSplit.
for (Path dir : dirs) {
PartitionDesc part = getPartitionDescFromPath(pathToPartitionInfo, dir);
// Create a new InputFormat instance if this is the first time this class
// is seen.
Class inputFormatClass = part.getInputFileFormatClass();
InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);
newjob.setInputFormat(inputFormat.getClass());
FileStatus[] listStatus = listStatus(newjob, dir);
for (FileStatus status : listStatus) {
LOG.info("block size: " + status.getBlockSize());
LOG.info("file length: " + status.getLen());
FileInputFormat.setInputPaths(newjob, status.getPath());
InputSplit[] iss = inputFormat.getSplits(newjob, 0);
if (iss != null && iss.length > 0) {
numOrigSplits += iss.length;
result.add(new BucketizedHiveInputSplit(iss, inputFormatClass.getName()));
}
}
}
LOG.info(result.size() + " bucketized splits generated from "
+ numOrigSplits + " original splits.");
return result.toArray(new BucketizedHiveInputSplit[result.size()]);
}
}