/*
 * Copyright (c) 2021 MarkLogic Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.marklogic.contentpump;


import java.io.IOException;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.CounterGroup;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobID;
import org.apache.hadoop.mapreduce.JobStatus;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.TaskID;
import org.apache.hadoop.mapreduce.TaskType;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hadoop.util.StringUtils;

import com.marklogic.contentpump.utilities.ReflectionUtil;

/**
 * Runs a job in-process, potentially multi-threaded.  Only supports map-only
 * jobs.
 * 
 * @author jchen
 *
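 * <p>A minimal usage sketch (illustrative only; in mlcp the
 * {@code LocalJob}, {@code CommandLine}, and {@code Command} are
 * prepared by {@link ContentPump} before the runner is created):</p>
 * <pre>{@code
 * LocalJobRunner runner = new LocalJobRunner(job, cmdline, cmd);
 * runner.run();                      // blocks until all splits complete
 * ContentPumpReporter counters = runner.getReporter();
 * }</pre>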
 */
public class LocalJobRunner implements ConfigConstants {
    public static final Log LOG = LogFactory.getLog(LocalJobRunner.class);
    
    private LocalJob job;
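    // Per-split progress in integer percent (0..100); see computeProgress()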
    private AtomicInteger[] progress;
    private long startTime;
    private ContentPumpReporter reporter;
    private ThreadManager threadManager;
    private ThreadPoolExecutor pool;
    
    public LocalJobRunner(LocalJob job, CommandLine cmdline, Command cmd) {
        this.job = job;
        this.threadManager = job.getThreadManager();
        threadManager.parseCmdlineOptions(cmdline, cmd);
        startTime = System.currentTimeMillis();
    }

    /**
     * Run the job.  Get the input splits, create map tasks and submit them
     * to the thread pool if there is one; otherwise, run the tasks one by
     * one.
     * 
     * @param <INKEY>
     * @param <INVALUE>
     * @param <OUTKEY>
     * @param <OUTVALUE>
     * @throws Exception
     */
    @SuppressWarnings("unchecked")
    public <INKEY, INVALUE, OUTKEY, OUTVALUE, T extends InputSplit>
    void run() throws Exception {
        Configuration conf = job.getConfiguration();
        reporter = new ContentPumpReporter();
        InputFormat<INKEY, INVALUE> inputFormat = 
            (InputFormat<INKEY, INVALUE>)ReflectionUtils.newInstance(
                job.getInputFormatClass(), conf);
        List<InputSplit> splits;
        T[] array;
        try {
            splits = inputFormat.getSplits(job);
            array = (T[])splits.toArray(
                    new org.apache.hadoop.mapreduce.InputSplit[splits.size()]);
            // sort the splits into order based on size, so that the biggest
            // goes first
            Arrays.sort(array, new SplitLengthComparator());
        } catch (Exception ex) {
            if (LOG.isDebugEnabled()) {
                LOG.debug("Error getting input splits: ", ex);
            } else {
                LOG.error("Error getting input splits: ");
                LOG.error(ex.getMessage());
            }
            job.setJobState(JobStatus.State.FAILED);
            return;
        }               
        OutputFormat<OUTKEY, OUTVALUE> outputFormat = 
            (OutputFormat<OUTKEY, OUTVALUE>)ReflectionUtils.newInstance(
                job.getOutputFormatClass(), conf);
        Class<? extends Mapper<?, ?, ?, ?>> mapperClass = job.getMapperClass();
        Mapper<INKEY, INVALUE, OUTKEY, OUTVALUE> mapper = 
            (Mapper<INKEY, INVALUE, OUTKEY, OUTVALUE>)ReflectionUtils.newInstance(
                mapperClass, conf);
        try {
            // Set newServerThreads and restrictHosts in ThreadManager for
            // initializing thread pool
            outputFormat.checkOutputSpecs(job);
        } catch (Exception ex) {         
            if (LOG.isDebugEnabled()) {
                LOG.debug("Error checking output specification: ", ex);
            } else {
                LOG.error("Error checking output specification: ");
                LOG.error(ex.getMessage());
            }
            job.setJobState(JobStatus.State.FAILED);
            return;
        }
        // Initialize thread pool
        pool = threadManager.initThreadPool();
        threadManager.runThreadPoller();

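        // One progress counter per split; each map task publishes its percent
        // complete here and the Monitor thread averages them for reporting.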
        progress = new AtomicInteger[splits.size()];
        for (int i = 0; i < splits.size(); i++) {
            progress[i] = new AtomicInteger();
        }
     
        job.setJobState(JobStatus.State.RUNNING);
        Monitor monitor = new Monitor();
        monitor.start();
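        // Dispatch one map task per split: submit to the thread pool when one
        // exists; otherwise run the split inline on this thread.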
        for (int i = 0; i < array.length && !ContentPump.shutdown; i++) {        
            InputSplit split = array[i];
            if (pool != null) {
                LocalMapTask<INKEY, INVALUE, OUTKEY, OUTVALUE> task =
                    new LocalMapTask<>(
                        inputFormat, outputFormat, conf, i, split, reporter,
                        progress[i]);
                threadManager.submitTask(task, i, array.length);
            } else { // single-threaded
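                // Build a synthetic job/task attempt ID so Hadoop task-attempt
                // and mapper contexts can be constructed outside a real cluster.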
                JobID jid = new JobID();
                TaskID taskId = new TaskID(jid.getJtIdentifier(), jid.getId(), TaskType.MAP, i);
                TaskAttemptID taskAttemptId = new TaskAttemptID(taskId, 0);
                TaskAttemptContext context = 
                    ReflectionUtil.createTaskAttemptContext(conf, taskAttemptId);
                RecordReader<INKEY, INVALUE> reader = 
                    inputFormat.createRecordReader(split, context);
                RecordWriter<OUTKEY, OUTVALUE> writer = 
                    outputFormat.getRecordWriter(context);
                OutputCommitter committer = 
                    outputFormat.getOutputCommitter(context);
                TrackingRecordReader<INKEY, INVALUE> trackingReader = 
                    new TrackingRecordReader<>(reader, progress[i]);

                Mapper<INKEY, INVALUE, OUTKEY, OUTVALUE>.Context mapperContext = 
                    ReflectionUtil.createMapperContext(mapper, conf, 
                        taskAttemptId, trackingReader, writer, committer, 
                        reporter, split);
                
                trackingReader.initialize(split, mapperContext);
                
                // no thread pool (only 1 thread specified)
                Class<? extends Mapper<?, ?, ?, ?>> mapClass = 
                        job.getMapperClass();
                mapperContext.getConfiguration().setClass(
                   CONF_MAPREDUCE_JOB_MAP_CLASS, mapClass, Mapper.class);
                mapper = (Mapper<INKEY, INVALUE, OUTKEY, OUTVALUE>) 
                    ReflectionUtils.newInstance(mapClass,
                        mapperContext.getConfiguration());
                if (LOG.isDebugEnabled()) {
                    LOG.debug("Running with single thread and will not " +
                        "auto-scale");
                }
                try {
                    mapper.run(mapperContext);
                } finally {
                    try {
                        trackingReader.close();
                    } catch (Throwable t) {
                        LOG.error("Error closing reader: " + t.getMessage());
                        if (LOG.isDebugEnabled()) {
                            LOG.debug(t);
                        }
                    }
                    try {
                        writer.close(mapperContext);
                    } catch (Throwable t) {
                        LOG.error("Error closing writer: " + t.getMessage());
                        if (LOG.isDebugEnabled()) {
                            LOG.debug(t);
                        }
                    } 
                    try {
                        committer.commitTask(context);
                    } catch (Throwable t) {
                        LOG.error("Error committing task: " + t.getMessage());
                        if (LOG.isDebugEnabled()) {
                            LOG.debug(t);
                        }
                    }
                }
            }
        }
        threadManager.shutdownThreadPool();
        job.setJobState(JobStatus.State.SUCCEEDED);
        monitor.interrupt();
        monitor.join(1000);
        
        // report counters
        Iterator<CounterGroup> groupIt = 
            reporter.counters.iterator();
        while (groupIt.hasNext()) {
            CounterGroup group = groupIt.next();
            LOG.info(group.getDisplayName() + ": ");
            Iterator<Counter> counterIt = group.iterator();
            while (counterIt.hasNext()) {
                Counter counter = counterIt.next();
                LOG.info(counter.getDisplayName() + ": " + 
                                counter.getValue());
            }
        }
        LOG.info("Total execution time: " + 
                 (System.currentTimeMillis() - startTime) / 1000 + " sec");
    }

    /**
     * A map task to be run in a thread.
     * 
     * @author jchen
     *
     * @param <INKEY>
     * @param <INVALUE>
     * @param <OUTKEY>
     * @param <OUTVALUE>
     */
    public class LocalMapTask<INKEY, INVALUE, OUTKEY, OUTVALUE>
    implements Callable<Object> {
        private InputFormat<INKEY, INVALUE> inputFormat;
        private OutputFormat<OUTKEY, OUTVALUE> outputFormat;
        private Mapper<INKEY, INVALUE, OUTKEY, OUTVALUE> mapper;
        private Configuration conf;
        private int id;
        private InputSplit split;
        private AtomicInteger pctProgress;
        private ContentPumpReporter reporter;
        private Class<? extends Mapper<?, ?, ?, ?>> mapperClass;
        private int threadCount = 0;
        private AtomicBoolean isTaskDone = new AtomicBoolean(false);
        
        public LocalMapTask(InputFormat<INKEY, INVALUE> inputFormat, 
                OutputFormat<OUTKEY, OUTVALUE> outputFormat, 
                Configuration conf, int id, InputSplit split, 
                ContentPumpReporter reporter, AtomicInteger pctProgress) {
            this.inputFormat = inputFormat;
            this.outputFormat = outputFormat;
            this.conf = conf;
            this.id = id;
            this.split = split;
            this.pctProgress = pctProgress;
            this.reporter = reporter;
            try {
                mapperClass = job.getMapperClass();
            } catch (ClassNotFoundException e) {
                LOG.error("Mapper class not found", e);
            }
        }
        
        public void setThreadCount(int threads) {
            threadCount = threads;
        }

        public int getThreadCount() {
            return threadCount;
        }

        public Class<? extends Mapper<?, ?, ?, ?>> getMapperClass() {
            return mapperClass;
        }

        public void setMapperClass(
                Class<? extends Mapper<?, ?, ?, ?>> runtimeMapperClass) {
            mapperClass = runtimeMapperClass;
        }

        public Mapper<INKEY, INVALUE, OUTKEY, OUTVALUE> getMapper() {
            return mapper;
        }

        /**
         * Return whether a LocalMapTask has completed importing
         */
        public boolean isTaskDone() {
            return isTaskDone.get();
        }

        @SuppressWarnings("unchecked")
        @Override
        public Object call() {
            TaskAttemptContext context = null;
            Mapper<INKEY, INVALUE, OUTKEY, OUTVALUE>.Context mapperContext = null;
            TrackingRecordReader<INKEY, INVALUE> trackingReader = null;
            RecordWriter<OUTKEY, OUTVALUE> writer = null;
            OutputCommitter committer = null;
            JobID jid = new JobID();
            TaskID taskId = new TaskID(jid.getJtIdentifier(), jid.getId(), TaskType.MAP, id);
            TaskAttemptID taskAttemptId = new TaskAttemptID(taskId, 0);
            try {
                context = ReflectionUtil.createTaskAttemptContext(conf, 
                        taskAttemptId);
                RecordReader<INKEY, INVALUE> reader = 
                    inputFormat.createRecordReader(split, context);
                writer = outputFormat.getRecordWriter(context);
                committer = outputFormat.getOutputCommitter(context);
                trackingReader = 
                    new TrackingRecordReader<>(reader, pctProgress);
                mapper = (Mapper<INKEY, INVALUE, OUTKEY, OUTVALUE>)
                  ReflectionUtils.newInstance(mapperClass, conf);
                mapperContext = ReflectionUtil.createMapperContext(mapper, 
                        conf, taskAttemptId, trackingReader, writer, committer,
                        reporter, split);
                trackingReader.initialize(split, mapperContext);
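                // If the configured mapper is the multithreaded wrapper, hand
                // it the per-task thread budget and the shared pool.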
                if (mapperClass == (Class)MultithreadedMapper.class) {
                    ((MultithreadedMapper<INKEY, INVALUE, OUTKEY, OUTVALUE>)mapper)
                        .setThreadCount(threadCount);
                    ((MultithreadedMapper<INKEY, INVALUE, OUTKEY, OUTVALUE>)mapper)
                        .setThreadPool(pool);
                }
                mapper.run(mapperContext);
            } catch (Throwable t) {
                if (LOG.isDebugEnabled()) {
                    LOG.debug("Error running task: ", t);
                } else {
                    LOG.error("Error running task: ");
                    LOG.error(t.getMessage());
                }
                try {
                    synchronized(pool) {
                        pool.notify();
                    }
                } catch (Throwable t1) {
                    LOG.error(t1);
                }
            }
            try {
                if (trackingReader != null) {
                    trackingReader.close();
                }
            } catch (Throwable t) {
                LOG.error("Error closing reader: " + t.getMessage());
                if (LOG.isDebugEnabled()) {
                    LOG.debug(t);
                }
            } 
            try {
                if (writer != null) {
                    writer.close(mapperContext);
                }
            } catch (Throwable t) {
                LOG.error("Error closing writer: " + t.getMessage());
                if (LOG.isDebugEnabled()) {
                    LOG.debug(t);
                }
            } 
            try {
                if (committer != null) {
                    committer.commitTask(context);
                }
            } catch (Throwable t) {
                LOG.error("Error committing task: " + t.getMessage());
                if (LOG.isDebugEnabled()) {
                    LOG.debug(t);
                }
            }
            isTaskDone.set(true);
            return null;
        }      
    }
    
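    /**
     * A RecordReader that delegates to the real reader and records overall
     * progress as an integer percentage after each key-value pair.
     */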
    class TrackingRecordReader<K, V> extends RecordReader<K, V> {
        private final RecordReader<K, V> real;
        private AtomicInteger pctProgress;

        TrackingRecordReader(RecordReader<K, V> real, 
                        AtomicInteger pctProgress) {
            this.real = real;
            this.pctProgress = pctProgress;
        }

        @Override
        public void close() throws IOException {
            real.close();
        }

        @Override
        public K getCurrentKey() throws IOException, InterruptedException {
            return real.getCurrentKey();
        }

        @Override
        public V getCurrentValue() throws IOException, InterruptedException {
            return real.getCurrentValue();
        }

        @Override
        public float getProgress() throws IOException, InterruptedException {
            return real.getProgress();
        }

        @Override
        public void initialize(org.apache.hadoop.mapreduce.InputSplit split,
                        org.apache.hadoop.mapreduce.TaskAttemptContext context)
                        throws IOException, InterruptedException {
            real.initialize(split, context);
        }

        @Override
        public boolean nextKeyValue() throws IOException, InterruptedException {
            boolean result = real.nextKeyValue();
            pctProgress.set((int) (getProgress() * 100));
            return result;
        }
    }
    
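    /**
     * Background thread that logs overall percent complete about once per
     * second while the job runs, and emits a final report on exit.
     */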
    class Monitor extends Thread {
        private String lastReport;
        
        public void run() {
            try {
                while (!ContentPump.shutdown && !interrupted() &&
                        !job.done()) {
                    Thread.sleep(1000);
                    if (ContentPump.shutdown) {
                        break;
                    }
                    String report = 
                        (" completed " + 
                            StringUtils.formatPercent(computeProgress(), 0));
                    if (!report.equals(lastReport)) {
                        LOG.info(report);
                        lastReport = report;
                    }
                }
            } catch (InterruptedException e) {
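                // Interruption is the normal shutdown signal; fall through
                // to emit the final progress report.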
            } catch (Throwable t) {
                LOG.error("Error in monitor thread", t);
            }
            String report = 
                (" completed " + 
                    StringUtils.formatPercent(computeProgress(), 0));
            if (!report.equals(lastReport)) {
                LOG.info(report);
            }
        }
    }

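    /**
     * Averages the per-split percentages into an overall fraction in [0, 1].
     * A job with no splits is reported as complete.
     */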
    public double computeProgress() {
        if (progress.length == 0) {
            return 1.0;
        }
        long result = 0;
        for (AtomicInteger pct : progress) {
            result += pct.longValue();
        }
        return (double)result / progress.length / 100;
    }
    
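    /**
     * Orders splits by descending length so the largest splits are
     * scheduled first (see the sort in run()).
     */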
    private static class SplitLengthComparator implements
            Comparator<org.apache.hadoop.mapreduce.InputSplit> {

        @Override
        public int compare(org.apache.hadoop.mapreduce.InputSplit o1,
                org.apache.hadoop.mapreduce.InputSplit o2) {
            try {
                long len1 = o1.getLength();
                long len2 = o2.getLength();
                if (len1 < len2) {
                    return 1;
                } else if (len1 == len2) {
                    return 0;
                } else {
                    return -1;
                }
            } catch (IOException | InterruptedException ie) {
                throw new RuntimeException("exception in compare", ie);
            }
        }
    }

    public ContentPumpReporter getReporter() {
        return this.reporter;
    }
}