/*
* Copyright (c) 2021 MarkLogic Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.marklogic.mapreduce;
import java.io.EOFException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import com.marklogic.io.BiendianDataInputStream;
import com.marklogic.mapreduce.utilities.InternalUtilities;
/**
* {@link org.apache.hadoop.mapreduce.lib.input.FileInputFormat FileInputFormat}
* subclass for reading documents from a forest using Direct Access.
*
* <p>
* Direct Access is intended primarily for extracting documents in offline or
* read-only forests, such as forests containing archived data that are
* part of a Tiered Storage data management strategy.
* <p>
* This format produces key-value pairs where the key is a {@link DocumentURI}
* and the value is a {@link ForestDocument}. The type of ForestDocument
* depends on the underlying document content type: {@link DOMDocument}
* for XML or text, or {@link BinaryDocument} for binaries. Binary
* documents can be further specialized to {@link RegularBinaryDocument} or
* {@link LargeBinaryDocument}, depending on size and the database
* configuration.
*
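* <p>
* A minimal job-setup sketch; the input path and mapper class below are
* hypothetical placeholders:
* <pre>{@code
* Configuration conf = new Configuration();
* Job job = Job.getInstance(conf, "forest-read");
* job.setInputFormatClass(ForestInputFormat.class);
* // Hypothetical forest directory containing the stands to read.
* FileInputFormat.setInputPaths(job, new Path("/forests/example-forest"));
* // Hypothetical mapper consuming (DocumentURIWithSourceInfo, ForestDocument) pairs.
* job.setMapperClass(ExampleForestMapper.class);
* }</pre>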
*
* @author jchen
*
* @param <VALUE> Only ForestDocument is currently supported, but types
* such as Text or BytesWritable are possible candidates to be added.
*/
public class ForestInputFormat<VALUE>
extends FileInputFormat<DocumentURIWithSourceInfo, VALUE>
implements MarkLogicConstants {
public static final Log LOG = LogFactory.getLog(ForestInputFormat.class);
@Override
public RecordReader<DocumentURIWithSourceInfo, VALUE> createRecordReader(
InputSplit split, TaskAttemptContext context) throws IOException,
InterruptedException {
return new ForestReader<>();
}
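/*
 * Filters the forest directory listing down to stand directories. A
 * forest-level Obsolete file means the forest is not safe to read at
 * all; the Journals and Large directories are skipped because splits
 * are built only from stand tree data.
 */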
protected List<FileStatus> listStatus(JobContext job) throws IOException {
List<FileStatus> result = super.listStatus(job);
for (Iterator<FileStatus> it = result.iterator(); it.hasNext();) {
FileStatus file = it.next();
String fileName = file.getPath().getName();
if (!file.isDirectory() && fileName.equals("Obsolete")) {
LOG.warn(
"Obsolete file found. The forest is either live or isn't "
+ "dismounted cleanly. Ignoring forest " +
file.getPath().getParent());
return Collections.emptyList();
}
if (!file.isDirectory() || fileName.equals("Journals")
|| fileName.equals("Large")) {
it.remove();
}
}
return result;
}
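/*
 * Builds FileSplits over each stand's TreeData file, using the stand's
 * TreeIndex to place split boundaries on document boundaries near the
 * configured split size.
 */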
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
long maxSize = getMaxSplitSize(job);
// generate splits
List<InputSplit> splits = new ArrayList<>();
List<FileStatus> files = listStatus(job);
for (FileStatus file : files) { // stand directories
Path path = file.getPath();
FileSystem fs = path.getFileSystem(job.getConfiguration());
FileStatus[] children = fs.listStatus(path);
FileStatus treeIndexStatus = null, treeDataStatus = null,
ordinalsStatus = null, timestampsStatus = null,
qualStatus = null;
boolean obsolete = false;
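// Identify the stand's component files; an Obsolete marker file means
// the stand must not be read.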
for (FileStatus child : children) {
String fileName = child.getPath().getName();
if (fileName.equals("TreeData")) { // inside a stand
treeDataStatus = child;
} else if (fileName.equals("TreeIndex")) {
treeIndexStatus = child;
} else if (fileName.equals("Ordinals")) {
ordinalsStatus = child;
} else if (fileName.equals("Timestamps")) {
timestampsStatus = child;
} else if (fileName.equals("Obsolete")) {
obsolete = true;
break;
} else if (fileName.equals("Qualities")) {
qualStatus = child;
}
}
if (obsolete) {
LOG.warn(
"Obsolete file found. The forest is either live or isn't "
+ "dismounted cleanly. Ignoring stand " + path);
break;
}
if (treeDataStatus == null) {
throw new RuntimeException("TreeData file not found.");
} else if (treeIndexStatus == null) {
throw new RuntimeException("TreeIndex file not found.");
} else if (ordinalsStatus == null) {
throw new RuntimeException("Ordinals file not found.");
} else if (timestampsStatus == null) {
throw new RuntimeException("Timestamps file not found.");
} else if (qualStatus == null) {
LOG.warn("Qualities file is not found.");
}
long treeDataSize = treeDataStatus.getLen();
if (treeDataSize == 0) {
// unexpected, give up this stand
LOG.warn("Found empty TreeData file. Skipping...");
continue; // skipping this stand
}
Path treeDataPath = treeDataStatus.getPath();
long blockSize = treeDataStatus.getBlockSize();
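// Target split size: the TreeData block size clamped to [minSize, maxSize].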
long splitSize = computeSplitSize(blockSize, minSize, maxSize);
// make splits based on TreeIndex
FSDataInputStream is = fs.open(treeIndexStatus.getPath());
BiendianDataInputStream in = new BiendianDataInputStream(is);
int prevDocid = -1, docid = -1, position = 0;
long prevOffset = -1L, offset = 0, splitStart = 0;
BlockLocation[] blkLocations = fs.getFileBlockLocations(
treeDataStatus, 0, treeDataSize);
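// Each TreeIndex record is read as a docid (int), a second int that is
// not used here, and the document's byte offset (long) into TreeData.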
try {
for (;; ++position) {
try {
docid = in.readInt();
in.readInt();
offset = in.readLong();
} catch (EOFException e) {
break;
}
int comp = InternalUtilities.compareUnsignedLong(offset,
treeDataSize);
if (comp > 0) {
throw new RuntimeException(
"TreeIndex offset is out of bound: position = "
+ position + ", offset = " + offset
+ ", treeDataSize = " + treeDataSize);
}
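// Docids are compared as unsigned 32-bit values and must strictly increase.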
if (prevDocid != -1 &&
(docid & 0xffffffffL) <= (prevDocid & 0xffffffffL)) {
throw new RuntimeException(
"docid out of order, position = " + position
+ ", docid = " + docid
+ ", prevDocid = " + prevDocid);
}
prevDocid = docid;
if (prevOffset != -1L
&& InternalUtilities.compareUnsignedLong(offset,
prevOffset) <= 0) {
throw new RuntimeException(
"offset out of order, position = " + position
+ ", offset = " + offset
+ ", prevOffset = " + prevOffset);
}
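// End the split at this offset if it hits the target size exactly, or
// if the overshoot past the target is no larger than the undershoot at
// prevOffset; otherwise end the split at prevOffset.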
long splitLen = offset - splitStart;
if (splitLen == splitSize ||
(splitLen > splitSize &&
splitLen - splitSize <=
splitSize - (prevOffset - splitStart))) {
int blkIndex = getBlockIndex(blkLocations, offset);
InputSplit split = new FileSplit(treeDataPath,
splitStart, splitLen,
blkLocations[blkIndex].getHosts());
if (LOG.isDebugEnabled()) {
LOG.debug("Created split: start=" + splitStart +
" len=" + splitLen + " last docid=" + docid);
}
splits.add(split);
splitStart = offset;
} else if (splitLen > splitSize) {
int blkIndex = getBlockIndex(blkLocations, prevOffset);
InputSplit split = new FileSplit(treeDataPath,
splitStart, prevOffset - splitStart,
blkLocations[blkIndex].getHosts());
if (LOG.isDebugEnabled()) {
LOG.debug("Created split: start=" + splitStart +
" len=" + (prevOffset - splitStart) +
" last docid=" + docid);
}
splits.add(split);
splitStart = prevOffset;
}
}
} finally {
in.close();
}
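// Emit a final split for any TreeData remaining past the last boundary.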
if (offset > splitStart) {
int blkIndex = getBlockIndex(blkLocations, offset - 1);
InputSplit split = new FileSplit(treeDataPath, splitStart,
offset - splitStart, blkLocations[blkIndex].getHosts());
if (LOG.isDebugEnabled()) {
LOG.debug("Created split: start=" + splitStart +
" len=" + (offset - splitStart) + " last docid=" +
docid);
}
splits.add(split);
}
}
if (LOG.isDebugEnabled()) {
LOG.debug("Made " + splits.size() + " splits.");
}
return splits;
}
}