/*
* Copyright (c) 2019 MarkLogic Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.marklogic.contentpump;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import com.marklogic.mapreduce.DocumentURIWithSourceInfo;

/**
 * InputFormat for reading documents from file systems. Each document on the
 * file system becomes one record (key-value pair).
*
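 * <p>Example job wiring (a minimal sketch; it assumes the standard Hadoop
 * {@code Job} API, with the split-size cap set through
 * {@code FileInputFormat}; the job name and input path are hypothetical):</p>
 *
 * <pre>{@code
 * Job job = Job.getInstance(new Configuration(), "import-docs");
 * job.setInputFormatClass(CombineDocumentInputFormat.class);
 * FileInputFormat.setInputPaths(job, new Path("/data/docs")); // hypothetical input directory
 * // Cap each combined split at 32 MB
 * FileInputFormat.setMaxInputSplitSize(job, 32L * 1024 * 1024);
 * }</pre>
 *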
* @author jchen
*
*/
public class CombineDocumentInputFormat<VALUE>
    extends FileAndDirectoryInputFormat<DocumentURIWithSourceInfo, VALUE> {
    public static final Log LOG =
        LogFactory.getLog(CombineDocumentInputFormat.class);
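
    // Files are never split: each input file is ingested as one whole
    // document, so a record must not straddle a split boundary.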
    @Override
    protected boolean isSplitable(JobContext context, Path filename) {
        return false;
    }
    @Override
    public RecordReader<DocumentURIWithSourceInfo, VALUE> createRecordReader(
        InputSplit split, TaskAttemptContext context)
        throws IOException, InterruptedException {
        return new CombineDocumentReader<VALUE>();
    }
    @Override
    public List<InputSplit> getSplits(JobContext job) throws IOException {
        long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
        long maxSize = getMaxSplitSize(job);

        // generate splits
        List<InputSplit> splits = super.getSplits(job);
        List<InputSplit> combinedSplits = new ArrayList<InputSplit>();
        CombineDocumentSplit split = null;
        int skippedEmptyFiles = 0;

        for (InputSplit file : splits) {
            Path path = ((FileSplit) file).getPath();
            FileSystem fs = path.getFileSystem(job.getConfiguration());
            FileStatus status = fs.getFileStatus(path);
            long length = status.getLen();
            long blockSize = status.getBlockSize();
            // Target size per combined split: effectively
            // max(minSize, min(maxSize, blockSize)), as computed by
            // FileInputFormat.computeSplitSize().
            long splitSize = computeSplitSize(blockSize, minSize, maxSize);
            if (length != 0) {
                if (split == null) {
                    split = new CombineDocumentSplit();
                }
                try {
                    // Keep packing files into the current combined split
                    // until it reaches the target split size.
                    if (split.getLength() + length < splitSize ||
                        split.getLength() < minSize) {
                        split.addSplit((FileSplit) file);
                    } else {
                        combinedSplits.add(split);
                        split = new CombineDocumentSplit();
                        split.addSplit((FileSplit) file);
                    }
                } catch (InterruptedException e) {
                    LOG.error(e);
                    throw new RuntimeException(e);
                }
            } else {
                skippedEmptyFiles++;
                if (LOG.isDebugEnabled()) {
                    LOG.debug("Skipped empty file: " + path.toUri());
                }
            }
        }
        // Flush the last, partially filled combined split.
        if (split != null) {
            combinedSplits.add(split);
        }

        if (LOG.isDebugEnabled()) {
            LOG.debug("Total # of splits: " + splits.size());
            LOG.debug("Total # of combined splits: " + combinedSplits.size());
        }
        if (skippedEmptyFiles > 0) {
            LOG.warn("Total empty files skipped during getSplits: " +
                skippedEmptyFiles);
        }
        return combinedSplits;
    }
}