// org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat Maven / Gradle / Ivy
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.io;
import java.io.BufferedReader;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.io.HiveIOExceptionHandlerUtil;
import org.apache.hadoop.hive.ql.plan.MapredWork;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobConfigurable;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
/**
* Symlink file is a text file which contains a list of filename / dirname.
* This input method reads symlink files from specified job input paths and
* takes the files / directories specified in those symlink files as
* actual map-reduce input. The target input data should be in TextInputFormat.
*/
@SuppressWarnings("deprecation")
public class SymlinkTextInputFormat extends SymbolicInputFormat implements
InputFormat, JobConfigurable,
ContentSummaryInputFormat, ReworkMapredInputFormat {
/**
 * Split type returned by this input format. It carries two things: its own
 * {@link FileSplit} state, which records the symlink file path (with start
 * and length both 0), and a wrapped {@link FileSplit} describing the real
 * data to read. The symlink path is exposed as the job input path because
 * MapOperator relies on the job input path to look up the correct child
 * operators; the data file itself is only reachable through
 * {@link #getTargetSplit()}.
 */
public static class SymlinkTextInputSplit extends FileSplit {

  /** Split over the actual data file; serialized right after this split's own fields. */
  private final FileSplit target;

  /**
   * Creates a split with null paths, zero offsets and null locations for both
   * this split and the wrapped one; {@link #readFields(DataInput)} can then
   * populate them.
   */
  public SymlinkTextInputSplit() {
    super((Path) null, 0, 0, (String[]) null);
    this.target = new FileSplit((Path) null, 0, 0, (String[]) null);
  }

  /**
   * Wraps a split of the target data, reusing its locations for this split.
   *
   * @param symlinkPath path of the symlink file that listed the target
   * @param split split over the target data
   * @throws IOException if the locations of {@code split} cannot be obtained
   */
  public SymlinkTextInputSplit(Path symlinkPath, FileSplit split) throws IOException {
    super(symlinkPath, 0, 0, split.getLocations());
    this.target = split;
  }

  /**
   * Gets the target split, i.e. the split of target data.
   *
   * @return the wrapped split over the actual data
   */
  public FileSplit getTargetSplit() {
    return target;
  }

  /** Writes this split's own fields first, then the wrapped target split. */
  @Override
  public void write(DataOutput out) throws IOException {
    super.write(out);
    target.write(out);
  }

  /** Reads back in the same order used by {@link #write(DataOutput)}. */
  @Override
  public void readFields(DataInput in) throws IOException {
    super.readFields(in);
    target.readFields(in);
  }
}
/**
 * Builds a record reader over the target data referenced by a
 * {@link SymlinkTextInputSplit}. The target split is read with
 * {@link TextInputFormat} and the resulting reader is wrapped in a
 * {@code HiveRecordReader} whose IO context is initialised from the target
 * split.
 *
 * @param split must be a {@link SymlinkTextInputSplit}; it is cast without a check
 * @param job job configuration used to configure the text input format
 * @param reporter passed through to the underlying reader
 * @return the wrapping {@code HiveRecordReader}
 * @throws IOException if thrown while wrapping the reader or by the
 *         reader-creation exception handler
 */
@Override
public RecordReader getRecordReader(
    InputSplit split, JobConf job, Reporter reporter) throws IOException {
  final FileSplit dataSplit = ((SymlinkTextInputSplit) split).getTargetSplit();

  // The target data is in TextInputFormat.
  final TextInputFormat textFormat = new TextInputFormat();
  textFormat.configure(job);

  RecordReader delegate;
  try {
    delegate = textFormat.getRecordReader(dataSplit, job, reporter);
  } catch (Exception e) {
    // Any failure is handed to the shared handler; whatever it returns
    // becomes the reader that gets wrapped below.
    delegate = HiveIOExceptionHandlerUtil
        .handleRecordReaderCreationException(e, job);
  }

  final HiveRecordReader wrapper = new HiveRecordReader(delegate, job);
  wrapper.initIOContext(dataSplit, job, TextInputFormat.class, delegate);
  return wrapper;
}
/**
 * Parses all target paths from the job input directories, which contain
 * symlink files, and splits the target data using TextInputFormat.
 *
 * <p>Every split produced for a target path is wrapped in a
 * {@link SymlinkTextInputSplit} that carries the path of the symlink file
 * which listed that target.
 *
 * @param job job configuration; its input paths are the symlink directories
 * @param numSplits requested number of splits; it is divided (rounding up)
 *        across the target paths
 * @return the wrapped splits, or an empty array when the symlink files list
 *         no target paths
 * @throws IOException if no input path is configured, if the symlink files
 *         cannot be parsed, or if computing splits for a target fails
 */
@Override
public InputSplit[] getSplits(JobConf job, int numSplits)
    throws IOException {
  Path[] symlinksDirs = FileInputFormat.getInputPaths(job);
  if (symlinksDirs.length == 0) {
    throw new IOException("No input paths specified in job.");
  }

  // Get all target paths first, because the number of total target paths
  // is used to determine number of splits of each target path.
  // The lists are typed: with raw lists the element reads below
  // (Path targetPath = targetPaths.get(i)) do not compile.
  List<Path> targetPaths = new ArrayList<Path>();
  List<Path> symlinkPaths = new ArrayList<Path>();
  try {
    getTargetPathsFromSymlinksDirs(
        job,
        symlinksDirs,
        targetPaths,
        symlinkPaths);
  } catch (Exception e) {
    throw new IOException(
        "Error parsing symlinks from specified job input path.", e);
  }
  if (targetPaths.isEmpty()) {
    return new InputSplit[0];
  }

  // The input should be in TextInputFormat. Work on a copy of the job so
  // that re-pointing the input paths below does not modify the caller's job.
  TextInputFormat inputFormat = new TextInputFormat();
  JobConf newjob = new JobConf(job);
  newjob.setInputFormat(TextInputFormat.class);
  inputFormat.configure(newjob);

  List<InputSplit> result = new ArrayList<InputSplit>();

  // ceil(numSplits / numPaths), so we can get at least numSplits splits.
  int numPaths = targetPaths.size();
  int numSubSplits = (numSplits + numPaths - 1) / numPaths;

  // For each path, do getSplits().
  for (int i = 0; i < numPaths; ++i) {
    Path targetPath = targetPaths.get(i);
    Path symlinkPath = symlinkPaths.get(i);
    FileInputFormat.setInputPaths(newjob, targetPath);
    InputSplit[] iss = inputFormat.getSplits(newjob, numSubSplits);
    for (InputSplit is : iss) {
      result.add(new SymlinkTextInputSplit(symlinkPath, (FileSplit) is));
    }
  }
  return result.toArray(new InputSplit[result.size()]);
}
/**
 * Does nothing with the supplied job configuration.
 *
 * @param job ignored
 */
@Override
public void configure(JobConf job) {
  // Intentionally left empty.
}
/**
 * Given a list of directories containing symlink files, reads all target
 * paths from those files and appends them to {@code targetPaths}. For each
 * {@code targetPaths[i]}, {@code symlinkPaths[i]} is the path of the symlink
 * file that contained it, so the two lists always grow in lockstep.
 *
 * <p>Each line of a symlink file is taken verbatim as one target path.
 * Symlink files are decoded as UTF-8 regardless of the JVM default charset,
 * so non-ASCII paths are read the same way on every host.
 *
 * @param conf configuration used to resolve the file system of each directory
 * @param symlinksDirs directories whose entries are all read as symlink files
 * @param targetPaths output list receiving one entry per line read
 * @param symlinkPaths output list receiving the symlink file path for each
 *        entry added to {@code targetPaths}
 * @throws IOException if listing a directory or reading a symlink file fails
 */
private static void getTargetPathsFromSymlinksDirs(
    Configuration conf, Path[] symlinksDirs,
    List<Path> targetPaths, List<Path> symlinkPaths) throws IOException {
  // Looked up once; "UTF-8" is a charset every Java platform must provide.
  final java.nio.charset.Charset utf8 = java.nio.charset.Charset.forName("UTF-8");

  for (Path symlinkDir : symlinksDirs) {
    FileSystem fileSystem = symlinkDir.getFileSystem(conf);
    FileStatus[] symlinks = fileSystem.listStatus(symlinkDir);

    // Read paths from each symlink file.
    for (FileStatus symlink : symlinks) {
      Path symlinkPath = symlink.getPath();
      BufferedReader reader = null;
      try {
        reader = new BufferedReader(
            new InputStreamReader(fileSystem.open(symlinkPath), utf8));
        String line;
        while ((line = reader.readLine()) != null) {
          targetPaths.add(new Path(line));
          symlinkPaths.add(symlinkPath);
        }
      } finally {
        // closeStream is called even when reader is still null (open failed).
        org.apache.hadoop.io.IOUtils.closeStream(reader);
      }
    }
  }
}
/**
 * Kept for backward compatibility with hadoop 0.17; performs no validation.
 *
 * @param job ignored
 * @throws IOException declared in the signature but never thrown here
 */
public void validateInput(JobConf job) throws IOException {
  // Intentionally a no-op.
}
/**
 * Computes a content summary for a symlink directory by summing the
 * summaries of every target path listed in its symlink files, rather than
 * summarising the symlink files themselves.
 *
 * @param p directory containing symlink files
 * @param job job configuration used to resolve file systems
 * @return a summary holding the total length, file count and directory
 *         count over all target paths (all zero when there are no targets)
 * @throws IOException if the symlink files cannot be parsed or a target's
 *         summary cannot be obtained
 */
@Override
public ContentSummary getContentSummary(Path p, JobConf job)
    throws IOException {
  // Typed lists: iterating a raw List as Path below does not compile.
  List<Path> targetPaths = new ArrayList<Path>();
  // Filled by the helper alongside targetPaths; not needed for the summary.
  List<Path> symlinkPaths = new ArrayList<Path>();
  try {
    getTargetPathsFromSymlinksDirs(
        job,
        new Path[]{p},
        targetPaths,
        symlinkPaths);
  } catch (Exception e) {
    throw new IOException(
        "Error parsing symlinks from specified job input path.", e);
  }

  long length = 0;
  long fileCount = 0;
  long directoryCount = 0;
  for (Path path : targetPaths) {
    // Each target may live on a different file system, so resolve per path.
    FileSystem fs = path.getFileSystem(job);
    ContentSummary cs = fs.getContentSummary(path);
    length += cs.getLength();
    fileCount += cs.getFileCount();
    directoryCount += cs.getDirectoryCount();
  }
  return new ContentSummary(length, fileCount, directoryCount);
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy