All Downloads are FREE. Search and download functionalities are using the official Maven repository.

datafu.hourglass.jobs.PartitionCollapsingExecutionPlanner Maven / Gradle / Ivy

Go to download

Libraries that make it easier to solve data problems using Hadoop and higher-level languages based on it.

There is a newer version: 1.3.3
Show newest version
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package datafu.hourglass.jobs;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collections;
import java.util.Comparator;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Properties;
import java.util.SortedMap;

import org.apache.avro.Schema;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.Logger;

import datafu.hourglass.avro.AvroDateRangeMetadata;
import datafu.hourglass.fs.DatePath;
import datafu.hourglass.fs.DateRange;
import datafu.hourglass.fs.PathUtils;

/**
 * Execution planner used by {@link AbstractPartitionCollapsingIncrementalJob} and its derived classes.
 * This creates a plan to process partitioned input data and collapse the partitions into a single output.
 * 
 * 

* To use this class, the input and output paths must be specified. In addition the desired input date * range can be specified through several methods. Then {@link #createPlan()} can be called and the * execution plan will be created. The inputs to process will be available from {@link #getInputsToProcess()}, * the number of reducers to use will be available from {@link #getNumReducers()}, and the input schemas * will be available from {@link #getInputSchemas()}. *

* *

* Previous output may be reused by using {@link #setReusePreviousOutput(boolean)}. If previous output exists * and it is to be reused then it will be available from {@link #getPreviousOutputToProcess()}. New input data * to process that is after the previous output time range is available from {@link #getNewInputsToProcess()}. * Old input data to process that is before the previous output time range and should be subtracted from the * previous output is available from {@link #getOldInputsToProcess()}. *

* *

* Configuration properties are used to configure a {@link ReduceEstimator} instance. This is used to * calculate how many reducers should be used. * The number of reducers to use is based on the input data size and the * num.reducers.bytes.per.reducer property. This setting can be controlled more granularly * through num.reducers.input.bytes.per.reducer and num.reducers.previous.bytes.per.reducer. * Check {@link ReduceEstimator} for more details on how the properties are used. *

* */ public class PartitionCollapsingExecutionPlanner extends ExecutionPlanner { private final Logger _log = Logger.getLogger(PartitionCollapsingExecutionPlanner.class); private SortedMap _outputPathsByDate; private boolean _reusePreviousOutput; // the chosen execution plan private Plan _plan; /** * An execution plan. Encapsulates what inputs will be processed. * */ private class Plan { private List _inputsToProcess = new ArrayList(); private List _newInputsToProcess = new ArrayList(); private List _oldInputsToProcess = new ArrayList(); private Map _latestInputByPath = new HashMap(); private DatePath _previousOutputToProcess; private List _inputSchemas = new ArrayList(); private Map _inputSchemasByPath = new HashMap(); private boolean _needAnotherPass; private DateRange _currentDateRange; private int _numReducers; private Long _totalBytes; public void finalizePlan() throws IOException { determineInputSchemas(); determineNumReducers(); determineTotalBytes(); } /** * Determines the number of bytes that will be consumed by this execution plan. * This is used to compare alternative plans so the one with the least bytes * consumed can be used. * * @throws IOException */ private void determineTotalBytes() throws IOException { _totalBytes = 0L; for (DatePath dp : _inputsToProcess) { _totalBytes += PathUtils.countBytes(getFileSystem(), dp.getPath()); } if (_previousOutputToProcess != null) { _totalBytes += PathUtils.countBytes(getFileSystem(), _previousOutputToProcess.getPath()); } _log.info("Total bytes consumed: " + _totalBytes); } /** * Determines the input schemas. There may be multiple input schemas because multiple inputs are allowed. * The latest available inputs are used to determine the schema, the assumption being that schemas are * backwards-compatible. 
* * @throws IOException */ private void determineInputSchemas() throws IOException { if (_latestInputByPath.size() > 0) { _log.info("Determining input schemas"); for (Entry entry : _latestInputByPath.entrySet()) { String root = entry.getKey(); String input = entry.getValue(); _log.info("Loading schema for " + input); Schema schema = PathUtils.getSchemaFromPath(getFileSystem(),new Path(input)); _inputSchemas.add(schema); _inputSchemasByPath.put(root, schema); } } } /** * Determines the number of reducers to use based on the input data size and the previous output, * if it exists and is being reused. * The number of reducers to use is based on the input data size and the * num.reducers.bytes.per.reducer property. This setting can be controlled more granularly * through num.reducers.input.bytes.per.reducer and num.reducers.previous.bytes.per.reducer. * See {@link ReduceEstimator} for details on reducer estimation. * * @throws IOException */ private void determineNumReducers() throws IOException { ReduceEstimator estimator = new ReduceEstimator(getFileSystem(),getProps()); List inputPaths = new ArrayList(); for (DatePath input : _inputsToProcess) { inputPaths.add(input.getPath().toString()); estimator.addInputPath("input",input.getPath()); } if (_previousOutputToProcess != null) { estimator.addInputPath("previous",_previousOutputToProcess.getPath()); } _numReducers = estimator.getNumReducers(); } } /** * Initializes the execution planner. * * @param fs file system * @param props configuration properties */ public PartitionCollapsingExecutionPlanner(FileSystem fs, Properties props) { super(fs, props); } /** * Create the execution plan. 
* * @throws IOException IOException */ public void createPlan() throws IOException { if (_plan != null) throw new RuntimeException("Plan already exists"); _log.info("Creating execution plan"); loadInputData(); loadOutputData(); determineAvailableInputDates(); determineDateRange(); List plans = new ArrayList(); Plan plan; if (_reusePreviousOutput) { _log.info("Output may be reused, will create alternative plan that does not reuse output"); plan = new Plan(); try { determineInputsToProcess(false,plan); plan.finalizePlan(); plans.add(plan); } catch (MaxInputDataExceededException e) { _log.info(e.getMessage()); } } _log.info(String.format("Creating plan that %s previous output",(_reusePreviousOutput ? "reuses" : "does not reuse"))); plan = new Plan(); try { determineInputsToProcess(_reusePreviousOutput,plan); } catch (MaxInputDataExceededException e) { throw new RuntimeException(e); } plan.finalizePlan(); plans.add(plan); if (plans.size() > 1) { _log.info(String.format("There are %d alternative execution plans:",plans.size())); for (Plan option : plans) { _log.info(String.format("* Consume %d new inputs, %d old inputs, %s previous output (%d bytes)", option._newInputsToProcess.size(), option._oldInputsToProcess.size(), option._previousOutputToProcess != null ? "reuse" : "no", option._totalBytes)); } // choose plan with least bytes consumed Collections.sort(plans, new Comparator() { @Override public int compare(Plan o1, Plan o2) { return o1._totalBytes.compareTo(o2._totalBytes); } }); _plan = plans.get(0); _log.info(String.format("Choosing plan consuming %d bytes",_plan._totalBytes)); } else { _plan = plans.get(0); } } /** * Gets whether previous output should be reused, if it exists. * * @return true if previous output should be reused */ public boolean getReusePreviousOutput() { return _reusePreviousOutput; } /** * Sets whether previous output should be reused, if it exists. 
* * @param reuse true if previous output should be reused */ public void setReusePreviousOutput(boolean reuse) { _reusePreviousOutput = reuse; } /** * Get the number of reducers to use based on the input and previous output data size. * Must call {@link #createPlan()} first. * * @return number of reducers to use */ public int getNumReducers() { checkPlanExists(); return getPlan()._numReducers; } public DateRange getCurrentDateRange() { checkPlanExists(); return getPlan()._currentDateRange; } /** * Gets the previous output to reuse, or null if no output is being reused. * Must call {@link #createPlan()} first. * * @return previous output to reuse, or null */ public DatePath getPreviousOutputToProcess() { return getPlan()._previousOutputToProcess; } /** * Gets all inputs that will be processed. This includes both old and new data. * Must call {@link #createPlan()} first. * * @return inputs to process */ public List getInputsToProcess() { return getPlan()._inputsToProcess; } /** * Gets only the new data that will be processed. New data is data that falls within the * desired date range. * Must call {@link #createPlan()} first. * * @return new inputs to process */ public List getNewInputsToProcess() { return getPlan()._newInputsToProcess; } /** * Gets only the old data that will be processed. Old data is data that falls before the * desired date range. It will be subtracted out from the previous output. * Must call {@link #createPlan()} first. * * @return old inputs to process */ public List getOldInputsToProcess() { return getPlan()._oldInputsToProcess; } /** * Gets whether another pass will be required. Because there may be a limit on the number of inputs processed * in a single run, multiple runs may be required to process all data in the desired date range. * Must call {@link #createPlan()} first. * * @return true if another pass is required */ public boolean getNeedsAnotherPass() { return getPlan()._needAnotherPass; } /** * Gets the input schemas. 
Because multiple inputs are allowed, there may be multiple schemas. * Must call {@link #createPlan()} first. * *

* This does not include the output schema, even though previous output may be fed back as input. * The reason is that the ouput schema it determined based on the input schema. *

* * @return input schemas */ public List getInputSchemas() { return getPlan()._inputSchemas; } /** * Gets a map from input path to schema. Because multiple inputs are allowed, there may be multiple schemas. * Must call {@link #createPlan()} first. * * @return map from path to input schema */ public Map getInputSchemasByPath() { return getPlan()._inputSchemasByPath; } /** * Determines what output data already exists. Previous output may be reused. * * @throws IOException */ private void loadOutputData() throws IOException { if (getOutputPath() == null) { throw new RuntimeException("No output path specified"); } _log.info(String.format("Searching for existing output data in " + getOutputPath())); _outputPathsByDate = getDatedData(getOutputPath()); _log.info(String.format("Found %d output paths",_outputPathsByDate.size())); } /** * Determines what input data to process. * *

* The input data to consume is determined by the desired date range. If previous output is not reused then the input data to process * will coincide with the date range. If previous output may be reused and previous output exists, then the input data to process * will consist of new data and potentially old data. The new input data to process is data that has time after the previous output date range, * so that it may be added to the previous output. * The old data to process is data that has time before the previous output date range, so that it may be subtracted from the previous output. *

* *

* If there is a limit on how many days of input data can be processed then it may be the case that not all input data will be processed in * a single run. *

* * @throws IOException * @throws MaxInputDataExceededException */ private void determineInputsToProcess(boolean reusePreviousOutput, Plan plan) throws IOException, MaxInputDataExceededException { Calendar cal = Calendar.getInstance(PathUtils.timeZone); DateRange outputDateRange = null; if (reusePreviousOutput) { if (_outputPathsByDate.size() > 0) { DatePath latestPriorOutput = _outputPathsByDate.get(Collections.max(_outputPathsByDate.keySet())); _log.info("Have previous output, determining what previous incremental data to difference out"); outputDateRange = AvroDateRangeMetadata.getOutputFileDateRange(getFileSystem(),latestPriorOutput.getPath()); _log.info(String.format("Previous output has date range %s to %s", PathUtils.datedPathFormat.format(outputDateRange.getBeginDate()), PathUtils.datedPathFormat.format(outputDateRange.getEndDate()))); for (Date currentDate=outputDateRange.getBeginDate(); currentDate.compareTo(getDateRange().getBeginDate()) < 0 && currentDate.compareTo(outputDateRange.getEndDate()) <= 0;) { if (!getAvailableInputsByDate().containsKey(currentDate)) { throw new RuntimeException(String.format("Missing incremental data for %s, so can't remove it from previous output",PathUtils.datedPathFormat.format(currentDate))); } List inputs = getAvailableInputsByDate().get(currentDate); for (DatePath input : inputs) { _log.info(String.format("Old Input: %s",input.getPath())); plan._inputsToProcess.add(input); plan._oldInputsToProcess.add(input); Path root = PathUtils.getNestedPathRoot(input.getPath()); plan._latestInputByPath.put(root.toString(), input.getPath().toString()); } cal.setTime(currentDate); cal.add(Calendar.DAY_OF_MONTH, 1); currentDate = cal.getTime(); } plan._previousOutputToProcess = latestPriorOutput; _log.info("Previous Output: " + plan._previousOutputToProcess.getPath()); } else { _log.info("No previous output to reuse"); } } // consume the incremental data and produce the final output int newDataCount = 0; Date startDate = 
getDateRange().getBeginDate(); Date endDate = startDate; for (Date currentDate=startDate; currentDate.compareTo(getDateRange().getEndDate()) <= 0; ) { if (getMaxToProcess() != null && newDataCount >= getMaxToProcess()) { if (!reusePreviousOutput) { throw new MaxInputDataExceededException(String.format("Amount of input data has exceeded max of %d however output is not being reused so cannot do in multiple passes", getMaxToProcess())); } // too much data to process in a single run, will require another pass plan._needAnotherPass = true; break; } if (outputDateRange == null || currentDate.compareTo(outputDateRange.getEndDate()) > 0) { if (!getAvailableInputsByDate().containsKey(currentDate)) { if (isFailOnMissing()) { throw new RuntimeException("missing " + PathUtils.datedPathFormat.format(currentDate)); } else { _log.info("No input data found for " + PathUtils.datedPathFormat.format(currentDate)); } } else { List inputs = getAvailableInputsByDate().get(currentDate); for (DatePath input : inputs) { _log.info(String.format("New Input: %s",input.getPath())); plan._inputsToProcess.add(input); plan._newInputsToProcess.add(input); Path root = PathUtils.getNestedPathRoot(input.getPath()); plan._latestInputByPath.put(root.toString(), input.getPath().toString()); } newDataCount++; } } cal.setTime(currentDate); endDate = cal.getTime(); cal.add(Calendar.DAY_OF_MONTH, 1); currentDate = cal.getTime(); } plan._currentDateRange = new DateRange(startDate,endDate); } /** * Throws an exception if the plan hasn't been created. */ private void checkPlanExists() { if (_plan == null) throw new RuntimeException("Must call createPlan first"); } private Plan getPlan() { checkPlanExists(); return _plan; } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy