/*
* Copyright (c) 2020 MarkLogic Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.marklogic.mapreduce;

import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import com.marklogic.io.BiendianDataInputStream;
import com.marklogic.mapreduce.utilities.URIUtil;
import com.marklogic.tree.CompressedTreeDecoder;
import com.marklogic.tree.ExpandedTree;
import com.marklogic.tree.NodeKind;

/**
* RecordReader that reads from forest data dirs.
*
* @author jchen
*
 * @param <VALUEIN> Currently only supports ForestDocument, but other types
 * like Text or BytesWritable are possible candidates to be added.
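 * <p>
 * A minimal usage sketch, assuming the companion {@code ForestInputFormat}
 * from this package; the job wiring below is illustrative rather than
 * prescribed by this class:
 * </p>
 * <pre>{@code
 * Configuration conf = new Configuration();
 * // optional: only emit documents in this collection
 * conf.set(MarkLogicConstants.COLLECTION_FILTER, "my-collection");
 * Job job = Job.getInstance(conf, "read forest");
 * job.setInputFormatClass(ForestInputFormat.class);
 * FileInputFormat.setInputPaths(job,
 *     new Path("/var/opt/MarkLogic/Forests/my-forest"));
 * }</pre>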
*/
public class ForestReader<VALUEIN extends Writable>
        extends RecordReader<DocumentURIWithSourceInfo, VALUEIN>
        implements MarkLogicConstants {
public static final Log LOG = LogFactory.getLog(ForestReader.class);
protected FileSplit split;
protected long bytesRead;
protected Configuration conf;
protected BiendianDataInputStream dataIs;
protected BiendianDataInputStream ordIs;
protected BiendianDataInputStream tsIs;
protected BiendianDataInputStream qualIs;
protected DocumentURIWithSourceInfo key;
protected VALUEIN value;
    protected Class<? extends Writable> valueClass;
protected int position;
protected int prevDocid = -1;
protected boolean done = false;
protected Path largeForestDir;
protected int nascentCnt = 0;
protected int deletedCnt = 0;
protected int fragCnt = 0;
    protected Collection<String> colFilters;
    protected Collection<String> dirFilters;
    protected Collection<String> typeFilters;
protected String srcId = null;
@Override
public void close() throws IOException {
if (dataIs != null) {
dataIs.close();
}
if (ordIs != null) {
ordIs.close();
}
if (tsIs != null) {
tsIs.close();
}
if (qualIs != null) {
qualIs.close();
}
if (LOG.isDebugEnabled()) {
LOG.debug("Nascent count: " + nascentCnt +
" Deleted count: " + deletedCnt + " Bytes read = " +
bytesRead + " Fragment count: " + fragCnt +
" Last docid: " + prevDocid);
}
}
@Override
public DocumentURIWithSourceInfo getCurrentKey() throws IOException, InterruptedException {
return key;
}
@Override
public VALUEIN getCurrentValue() throws IOException, InterruptedException {
return value;
}
@Override
public float getProgress() throws IOException, InterruptedException {
return done ? 1 : bytesRead / (float) split.getLength();
}
@Override
public void initialize(InputSplit split, TaskAttemptContext context)
throws IOException, InterruptedException {
this.split = (FileSplit) split;
conf = context.getConfiguration();
Path dataPath = this.split.getPath();
FileSystem fs = dataPath.getFileSystem(conf);
dataIs = new BiendianDataInputStream(fs.open(dataPath));
dataIs.skipBytes(this.split.getStart());
Path ordPath = new Path(dataPath.getParent(), "Ordinals");
ordIs = new BiendianDataInputStream(fs.open(ordPath));
Path qualPath = new Path(dataPath.getParent(), "Qualities");
qualIs = new BiendianDataInputStream(fs.open(qualPath));
Path tsPath = new Path(dataPath.getParent(), "Timestamps");
tsIs = new BiendianDataInputStream(fs.open(tsPath));
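        // TreeData is accompanied in the same stand directory by three
        // side files indexed by docid: Ordinals (one long per fragment),
        // Timestamps (two longs per fragment: nascent and deleted), and
        // Qualities (one int per fragment). getNextTree() advances them
        // in lockstep with the main data stream.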
valueClass = conf.getClass(INPUT_VALUE_CLASS, ForestDocument.class,
Writable.class);
if (!ForestDocument.class.isAssignableFrom(valueClass)) {
throw new IllegalArgumentException("Unsupported " +
INPUT_VALUE_CLASS);
}
largeForestDir = new Path(dataPath.getParent().getParent(), "Large");
colFilters = conf.getStringCollection(COLLECTION_FILTER);
dirFilters = conf.getStringCollection(DIRECTORY_FILTER);
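        // Normalize directory filters to end with "/" so the prefix match
        // in applyFilter() only matches whole directory names.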
        Collection<String> addedDirs = null;
        for (Iterator<String> it = dirFilters.iterator(); it.hasNext();) {
String dir = it.next();
if (!dir.endsWith("/")) {
String newDir = dir + "/";
it.remove();
if (addedDirs == null) {
addedDirs = new ArrayList<>();
}
addedDirs.add(newDir);
}
}
if (addedDirs != null) {
dirFilters.addAll(addedDirs);
}
typeFilters = conf.getStringCollection(TYPE_FILTER);
}
@SuppressWarnings("unchecked")
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
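        // Loop until a document that survives filtering is produced, a
        // skip record is emitted for bookkeeping, the split is exhausted,
        // or the end-of-stand marker is reached.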
while (bytesRead < split.getLength() && !done) {
ExpandedTree tree = getNextTree();
if (tree == null) {
continue;
}
String uri = tree.getDocumentURI();
            // fragments and link trees are not supported
            if (tree.containLinks() || tree.getFragmentOrdinal() != 0) {
                // send to DocumentMapper for bookkeeping
setSkipKey(uri, 0, 0, "fragment or link");
value = null;
return true;
}
if (!applyFilter(uri, tree)) {
continue;
}
value = (VALUEIN) ForestDocument.createDocument(conf,
largeForestDir, tree, uri);
            if (value == null) { // send to DocumentMapper for bookkeeping
setSkipKey(uri, 0, 0, "unsupported node type");
return true;
}
setKey(uri, uri, 0, 0);
return true;
}
return false;
}
    /**
     * Apply URI replace, prefix, and suffix configuration options and set
     * the result as the DocumentURI key.
     *
     * @param uri Source string of the document URI.
     * @param sub Sub-entry of the source of the document origin.
     * @param line Line number in the source if applicable; -1 otherwise.
     * @param col Column number in the source if applicable; -1 otherwise.
     */
protected void setKey(String uri, String sub, int line, int col) {
if (srcId == null) {
srcId = split.getPath().toString();
}
// apply prefix and suffix for URI
uri = URIUtil.applyUriReplace(uri, conf);
uri = URIUtil.applyPrefixSuffix(uri, conf);
if (key == null) {
key = new DocumentURIWithSourceInfo(uri, srcId, sub, line, col);
} else {
key.setSkipReason("");
key.setUri(uri);
key.setSrcId(srcId);
key.setSubId(sub);
key.setColNumber(col);
key.setLineNumber(line);
}
}
    /**
     * Set the key to a skip record carrying the given reason, so that the
     * skipped document can be accounted for downstream.
     *
     * @param sub Source string of the document URI, recorded as the
     *        sub-entry of the key.
     * @param line Line number in the source if applicable; -1 otherwise.
     * @param col Column number in the source if applicable; -1 otherwise.
     * @param reason Reason for skipping.
     */
protected void setSkipKey(String sub, int line, int col, String reason) {
if (srcId == null) {
srcId = split.getPath().toString();
}
if (key == null) {
key = new DocumentURIWithSourceInfo("", srcId, sub, line, col);
} else {
key.setUri("");
key.setSrcId(srcId);
key.setSubId(sub);
key.setColNumber(col);
key.setLineNumber(line);
}
key.setSkipReason(reason);
if (LOG.isDebugEnabled()) {
LOG.debug("Set key: " + key);
}
}
protected boolean applyFilter(String uri, ExpandedTree tree) {
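        // Filter categories combine with AND; within a category, matching
        // any one entry is enough.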
// apply directory filter
if (!dirFilters.isEmpty()) {
boolean match = false;
for (String dir : dirFilters) {
if (uri.startsWith(dir)) {
match = true;
break;
}
}
if (!match) return false;
}
// apply type filter
if (!typeFilters.isEmpty()) {
byte kind = tree.rootNodeKind();
boolean match = false;
for (String type : typeFilters) {
if (type.equalsIgnoreCase("BINARY") &&
kind == NodeKind.BINARY) {
match = true;
break;
} else if (type.equalsIgnoreCase("TEXT") &&
kind == NodeKind.TEXT) {
match = true;
break;
} else if (type.equalsIgnoreCase("JSON") &&
(kind == NodeKind.ARRAY ||
kind == NodeKind.OBJECT)) {
match = true;
break;
} else if (type.equalsIgnoreCase("XML") &&
(kind == NodeKind.ELEM ||
kind == NodeKind.COMMENT ||
kind == NodeKind.PI)) {
match = true;
break;
}
}
if (!match) return false;
}
// apply collection filter
if (!colFilters.isEmpty()) {
String[] cols = tree.getCollections();
boolean match = false;
for (String col : cols) {
if (colFilters.contains(col)) {
match = true;
break;
}
}
if (!match) return false;
}
return true;
}
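    /**
     * Read the next fragment header from TreeData and decode the tree it
     * describes, keeping the ordinal, timestamp, and quality streams in
     * step. Returns null when a fragment is skipped (nascent or deleted)
     * and sets done at the end-of-stand marker or on EOF.
     */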
private ExpandedTree getNextTree() throws IOException {
int j;
try {
int docid = dataIs.readInt();
int csword = dataIs.readInt();
int fdatw = dataIs.readInt();
bytesRead += 12;
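            // The low four bits of the second header word carry the tree
            // data word count; zero there means the count did not fit and
            // lives in the third header word instead (a three-word header).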
int datWords = csword & 0x0000000f;
int hdrWords = 2;
if (datWords == 0) {
datWords = fdatw;
hdrWords = 3;
LOG.trace("3 header words");
}
if (docid == 0xffffffff && csword == 0xffffffff
&& fdatw == 0xffffffff) {
LOG.trace("Reached the end");
done = true;
return null;
}
            if (prevDocid != -1 && docid <= prevDocid) {
                throw new RuntimeException("docid out of order, position = "
                    + position + ", docid = " + docid + ", prevDocid = "
                    + prevDocid);
            }
            if (prevDocid == -1 && docid != 0) {
                // first fragment in this split: advance the ordinal,
                // timestamp, and quality streams to this docid
ordIs.skipBytes(docid * 8);
tsIs.skipBytes(docid * 8 * 2);
qualIs.skipBytes(docid * 4);
if (LOG.isTraceEnabled()) {
LOG.trace("First docid: " + docid);
}
            } else {
                int docidDiff = docid - prevDocid;
                if (docidDiff > 1) {
                    // the side streams already point at the entry for
                    // prevDocid + 1, so skip one entry fewer than the
                    // docid gap to keep them aligned
                    int toSkip = (docidDiff - 1) * 8;
                    ordIs.skipBytes(toSkip);
                    tsIs.skipBytes(toSkip * 2);
                    qualIs.skipBytes((docidDiff - 1) * 4);
                }
            }
prevDocid = docid;
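            // With a two-word header, the third word already read (fdatw)
            // was actually the first word of tree data, so one fewer word
            // remains on the stream; j becomes the remaining byte count.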
if (hdrWords == 2) {
j = datWords - 1;
} else {
j = datWords;
}
j *= 4;
fragCnt++;
long nascent = tsIs.readLong();
long deleted = tsIs.readLong();
if (LOG.isTraceEnabled()) {
LOG.trace(String.format("TreeData p %d d %d w %d nt %d dt %d",
position, docid, datWords, nascent, deleted));
}
            if (nascent == 0L || deleted != -1L) { // nascent or deleted: skip
bytesRead += dataIs.skipBytes(j);
if (nascent == 0L) nascentCnt++;
if (deleted != -1L) deletedCnt++;
ordIs.skipBytes(8);
qualIs.skipBytes(4);
if (LOG.isDebugEnabled()) {
LOG.debug("Skipped a " +
(nascent == 0L ? "nascent" : "deleted") +
" document at position " + position);
}
position++;
return null;
}
} catch (EOFException e) {
done = true;
return null;
}
try {
// TODO: Is it better to read into a buffer or directly from the
// stream then reset and skip?
            byte[] buf = new byte[j];
            InputStream in = dataIs.getInputStream();
            for (int read = 0; read < j; ) {
                int n = in.read(buf, read, j - read);
                if (n < 0) { // guard against spinning forever on premature EOF
                    throw new EOFException("Unexpected EOF in tree data");
                }
                read += n;
            }
bytesRead += j;
position++;
            ExpandedTree tree = new CompressedTreeDecoder().decode(buf, j);
tree.setFragmentOrdinal(ordIs.readLong());
tree.setQuality(qualIs.readInt());
return tree;
} catch (Exception e) {
LOG.error("Unexpected error occurred reading forest data", e);
done = true;
return null;
}
}
}