

/*
 * Copyright 2003-2019 MarkLogic Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.marklogic.mapreduce;

import java.io.EOFException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import com.marklogic.io.BiendianDataInputStream;
import com.marklogic.mapreduce.utilities.InternalUtilities;

/**
 * {@link org.apache.hadoop.mapreduce.lib.input.FileInputFormat FileInputFormat}
 * subclass for reading documents from a forest using Direct Access.
 *
 * <p>
 * Direct Access is intended primarily for extracting documents in offline or
 * read-only forests, such as forests containing archived data that are
 * part of a Tiered Storage data management strategy.
 * </p>
 * <p>
 * This format produces key-value pairs where the key is a {@link DocumentURI}
 * and the value is a {@link ForestDocument}. The type of ForestDocument
 * depends on the underlying document content type: {@link DOMDocument}
 * for XML or text, or {@link BinaryDocument} for binaries. Binary
 * documents can be further specialized to {@link RegularBinaryDocument} or
 * {@link LargeBinaryDocument}, depending on size and the database
 * configuration. A minimal usage sketch appears after this class.
 * </p>
 *
 * @author jchen
 *
 * @param <VALUE> Only ForestDocument is currently supported, but types
 * such as Text or BytesWritable are possible candidates to be added.
 */
public class ForestInputFormat<VALUE>
extends FileInputFormat<DocumentURI, VALUE>
implements MarkLogicConstants {
    public static final Log LOG = LogFactory.getLog(ForestInputFormat.class);

    @Override
    public RecordReader<DocumentURI, VALUE> createRecordReader(
            InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        return new ForestReader<VALUE>();
    }

    @Override
    protected List<FileStatus> listStatus(JobContext job) throws IOException {
        List<FileStatus> result = super.listStatus(job);
        for (Iterator<FileStatus> it = result.iterator(); it.hasNext();) {
            FileStatus file = it.next();
            String fileName = file.getPath().getName();
            if (!file.isDirectory() && fileName.equals("Obsolete")) {
                LOG.warn(
                    "Obsolete file found. The forest is either live or isn't "
                    + "dismounted cleanly. Ignoring forest "
                    + file.getPath().getParent());
                return Collections.emptyList();
            }
            // Keep only stand directories; skip Journals and the Large
            // (binary) directory, which hold no tree data.
            if (!file.isDirectory() || fileName.equals("Journals")
                || fileName.equals("Large")) {
                it.remove();
            }
        }
        return result;
    }

    @Override
    public List<InputSplit> getSplits(JobContext job) throws IOException {
        long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
        long maxSize = getMaxSplitSize(job);

        // generate splits
        List<InputSplit> splits = new ArrayList<InputSplit>();
        List<FileStatus> files = listStatus(job);
        for (FileStatus file : files) { // stand directories
            Path path = file.getPath();
            FileSystem fs = path.getFileSystem(job.getConfiguration());
            FileStatus children[] = fs.listStatus(path);
            FileStatus treeIndexStatus = null, treeDataStatus = null,
                    ordinalsStatus = null, timestampsStatus = null,
                    qualStatus = null;
            boolean obsolete = false;
            for (FileStatus child : children) {
                String fileName = child.getPath().getName();
                if (fileName.equals("TreeData")) { // inside a stand
                    treeDataStatus = child;
                } else if (fileName.equals("TreeIndex")) {
                    treeIndexStatus = child;
                } else if (fileName.equals("Ordinals")) {
                    ordinalsStatus = child;
                } else if (fileName.equals("Timestamps")) {
                    timestampsStatus = child;
                } else if (fileName.equals("Obsolete")) {
                    obsolete = true;
                    break;
                } else if (fileName.equals("Qualities")) {
                    qualStatus = child;
                }
            }
            if (obsolete) {
                LOG.warn(
                    "Obsolete file found. The forest is either live or isn't "
                    + "dismounted cleanly. Ignoring stand " + path);
                break;
            }
            if (treeDataStatus == null) {
                throw new RuntimeException("TreeData file not found.");
            } else if (treeIndexStatus == null) {
                throw new RuntimeException("TreeIndex file not found.");
            } else if (ordinalsStatus == null) {
                throw new RuntimeException("Ordinals file not found.");
            } else if (timestampsStatus == null) {
                throw new RuntimeException("Timestamps file not found.");
            } else if (qualStatus == null) {
                LOG.warn("Qualities file not found.");
            }
            long treeDataSize = treeDataStatus.getLen();
            if (treeDataSize == 0) {
                // unexpected, give up this stand
                LOG.warn("Found empty TreeData file. Skipping...");
                continue; // skipping this stand
            }
            Path treeDataPath = treeDataStatus.getPath();
            long blockSize = treeDataStatus.getBlockSize();
            long splitSize = computeSplitSize(blockSize, minSize, maxSize);

            // make splits based on TreeIndex
            FSDataInputStream is = fs.open(treeIndexStatus.getPath());
            BiendianDataInputStream in = new BiendianDataInputStream(is);
            int prevDocid = -1, docid = -1, position = 0;
            long prevOffset = -1L, offset = 0, splitStart = 0;
            BlockLocation[] blkLocations = fs.getFileBlockLocations(
                    treeDataStatus, 0, treeDataSize);
            try {
                for (;; ++position) {
                    try {
                        docid = in.readInt();
                        in.readInt();
                        offset = in.readLong();
                    } catch (EOFException e) {
                        break;
                    }
                    int comp = InternalUtilities.compareUnsignedLong(offset,
                            treeDataSize);
                    if (comp > 0) {
                        throw new RuntimeException(
                            "TreeIndex offset is out of bound: position = "
                            + position + ", offset = " + offset
                            + ", treeDataSize = " + treeDataSize);
                    }
                    if (prevDocid != -1 &&
                        (docid & 0xffffffffL) <= (prevDocid & 0xffffffffL)) {
                        throw new RuntimeException(
                            "docid out of order, position = " + position
                            + ", docid = " + docid + ", prevDocid = "
                            + prevDocid);
                    }
                    prevDocid = docid;
                    if (prevOffset != -1L &&
                        InternalUtilities.compareUnsignedLong(offset,
                                prevOffset) <= 0) {
                        throw new RuntimeException(
                            "offset out of order, position = " + position
                            + ", offset = " + offset + ", prevOffset = "
                            + prevOffset);
                    }
                    // Close the current split at whichever document boundary
                    // (this offset or the previous one) lands closer to the
                    // target split size.
                    long splitLen = offset - splitStart;
                    if (splitLen == splitSize ||
                        (splitLen > splitSize &&
                         splitLen - splitSize <=
                             splitSize - (prevOffset - splitStart))) {
                        int blkIndex = getBlockIndex(blkLocations, offset);
                        InputSplit split = new FileSplit(treeDataPath,
                                splitStart, splitLen,
                                blkLocations[blkIndex].getHosts());
                        if (LOG.isDebugEnabled()) {
                            LOG.debug("Created split: start=" + splitStart
                                + " len=" + splitLen + " last docid=" + docid);
                        }
                        splits.add(split);
                        splitStart = offset;
                    } else if (splitLen > splitSize) {
                        int blkIndex = getBlockIndex(blkLocations, prevOffset);
                        InputSplit split = new FileSplit(treeDataPath,
                                splitStart, prevOffset - splitStart,
                                blkLocations[blkIndex].getHosts());
                        if (LOG.isDebugEnabled()) {
                            LOG.debug("Created split: start=" + splitStart
                                + " len=" + (prevOffset - splitStart)
                                + " last docid=" + docid);
                        }
                        splits.add(split);
                        splitStart = prevOffset;
                    }
                }
            } finally {
                in.close();
            }
            // Emit the final, possibly short, split for this stand.
            if (offset > splitStart) {
                int blkIndex = getBlockIndex(blkLocations, offset - 1);
                InputSplit split = new FileSplit(treeDataPath, splitStart,
                        offset - splitStart,
                        blkLocations[blkIndex].getHosts());
                if (LOG.isDebugEnabled()) {
                    LOG.debug("Created split: start=" + splitStart
                        + " len=" + (offset - splitStart)
                        + " last docid=" + docid);
                }
                splits.add(split);
            }
        }
        if (LOG.isDebugEnabled()) {
            LOG.debug("Made " + splits.size() + " splits.");
        }
        return splits;
    }
}



