/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package datafu.hourglass.jobs;

import java.io.IOException;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.mapreduce.AvroJob;
import org.apache.avro.mapreduce.AvroKeyOutputFormat;
import org.apache.avro.mapreduce.AvroMultipleOutputs;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.log4j.Logger;


import datafu.hourglass.avro.AvroMultipleInputsKeyInputFormat;
import datafu.hourglass.avro.AvroMultipleInputsUtil;
import datafu.hourglass.fs.DatePath;
import datafu.hourglass.fs.PathUtils;
import datafu.hourglass.mapreduce.DelegatingCombiner;
import datafu.hourglass.mapreduce.DelegatingMapper;
import datafu.hourglass.mapreduce.DelegatingReducer;
import datafu.hourglass.mapreduce.DistributedCacheHelper;
import datafu.hourglass.mapreduce.ObjectMapper;
import datafu.hourglass.mapreduce.ObjectReducer;
import datafu.hourglass.mapreduce.Parameters;
import datafu.hourglass.mapreduce.PartitioningCombiner;
import datafu.hourglass.mapreduce.PartitioningMapper;
import datafu.hourglass.mapreduce.PartitioningReducer;
import datafu.hourglass.model.Accumulator;
import datafu.hourglass.model.Mapper;
import datafu.hourglass.schemas.PartitionPreservingSchemas;

/**
 * An {@link IncrementalJob} that consumes partitioned input data and produces
 * output data having the same partitions.
 * Typically this is used in conjunction with {@link AbstractPartitionCollapsingIncrementalJob}
 * when computing aggregates over sliding windows.  A partition-preserving job can perform
 * initial aggregation per-day, which can then be consumed by a partition-collapsing job to
 * produce the final aggregates over the time window. 
 * Only Avro is supported for the input, intermediate, and output data.
 * 
 * <p>
 * Implementations of this class must provide key, intermediate value, and output value schemas.
 * The key and intermediate value schemas define the output for the mapper and combiner.
 * The key and output value schemas define the output for the reducer.
 * These are defined by overriding {@link #getKeySchema()}, {@link #getIntermediateValueSchema()},
 * and {@link #getOutputValueSchema()}.
 * </p>
 * 
 * <p>
 * Implementations must also provide a mapper by overriding {@link #getMapper()} and an accumulator
 * for the reducer by overriding {@link #getReducerAccumulator()}.  An optional combiner may be
 * provided by overriding {@link #getCombinerAccumulator()}.  For the combiner to be used
 * the property <em>use.combiner</em> must also be set to true.
 * </p>
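 * 
 * <p>
 * As a rough illustration only, a subclass might look like the following sketch.  The class name,
 * schemas, field names, and counting logic are hypothetical and are not part of this library;
 * the schema getters are assumed here to be declared protected by the base class.
 * </p>
 * <pre>{@code
 * // Hypothetical example: counts records per id for each day of input (illustrative only).
 * public class EventCountJob extends AbstractPartitionPreservingIncrementalJob
 * {
 *   // Illustrative Avro schemas: a long "id" key and a long "count" value.
 *   private static final Schema KEY_SCHEMA = new Schema.Parser().parse(
 *     "{\"type\":\"record\",\"name\":\"Key\",\"fields\":[{\"name\":\"id\",\"type\":\"long\"}]}");
 *   private static final Schema VALUE_SCHEMA = new Schema.Parser().parse(
 *     "{\"type\":\"record\",\"name\":\"Value\",\"fields\":[{\"name\":\"count\",\"type\":\"long\"}]}");
 *
 *   public EventCountJob(String name, Properties props) throws IOException
 *   {
 *     super(name, props);
 *   }
 *
 *   @Override
 *   protected Schema getKeySchema() { return KEY_SCHEMA; }
 *
 *   @Override
 *   protected Schema getIntermediateValueSchema() { return VALUE_SCHEMA; }
 *
 *   @Override
 *   protected Schema getOutputValueSchema() { return VALUE_SCHEMA; }
 *
 *   @Override
 *   public Mapper<GenericRecord,GenericRecord,GenericRecord> getMapper()
 *   {
 *     return new CountMapper();
 *   }
 *
 *   @Override
 *   public Accumulator<GenericRecord,GenericRecord> getReducerAccumulator()
 *   {
 *     return new CountAccumulator();
 *   }
 *
 *   // Static nested classes are used because the mapper and accumulator are serialized.
 *   static class CountMapper implements Mapper<GenericRecord,GenericRecord,GenericRecord>
 *   {
 *     @Override
 *     public void map(GenericRecord input, KeyValueCollector<GenericRecord,GenericRecord> collector)
 *       throws IOException, InterruptedException
 *     {
 *       // Emit a count of one per input record, keyed by the record's "id" field.
 *       GenericRecord key = new GenericData.Record(KEY_SCHEMA);
 *       key.put("id", input.get("id"));
 *       GenericRecord value = new GenericData.Record(VALUE_SCHEMA);
 *       value.put("count", 1L);
 *       collector.collect(key, value);
 *     }
 *   }
 *
 *   static class CountAccumulator implements Accumulator<GenericRecord,GenericRecord>
 *   {
 *     private long count;
 *
 *     @Override
 *     public void accumulate(GenericRecord value) { count += (Long)value.get("count"); }
 *
 *     @Override
 *     public GenericRecord getFinal()
 *     {
 *       GenericRecord output = new GenericData.Record(VALUE_SCHEMA);
 *       output.put("count", count);
 *       return output;
 *     }
 *
 *     @Override
 *     public void cleanup() { count = 0L; }
 *   }
 * }
 * }</pre>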

 * <p>
 * The distinguishing feature of this type of job is that the input partitioning is preserved in the output.
 * The data from each partition is processed independently of other partitions and then output separately.
 * For example, input that is partitioned by day can be aggregated by day and then output by day.
 * This is achieved by attaching a long value to each key, which represents the partition, so that the reducer
 * receives data grouped by the key and partition together.  Multiple outputs are then used so that the output
 * will have the same partitions as the input.
 * </p>
 * 
 * <p>
 * The input path can be provided either through the property <em>input.path</em>
 * or by calling {@link #setInputPaths(List)}.  If multiple input paths are provided then
 * this implicitly means a join is to be performed.  Multiple input paths can be provided via
 * properties by prefixing each with <em>input.path.</em>, such as <em>input.path.first</em>
 * and <em>input.path.second</em>, as in the example below.
 * Input data must be partitioned by day according to the naming convention <em>yyyy/MM/dd</em>.
 * The output path can be provided either through the property <em>output.path</em>
 * or by calling {@link #setOutputPath(Path)}.
 * Output data will be written using the same naming convention as the input, namely <em>yyyy/MM/dd</em>, where the date used
 * to format the output path is the same as the date for the input it was derived from.
 * For example, if the desired time range to process is 2013/01/01 through 2013/01/14,
 * then the output will be named 2013/01/01 through 2013/01/14.
 * By default the job will fail if any input data in the desired time window is missing.  This can be overridden by setting
 * <em>fail.on.missing</em> to false.
 * </p>
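 * 
 * <p>
 * For example, a join of two inputs could be configured through properties along these lines
 * (the paths shown are purely illustrative):
 * </p>
 * <pre>{@code
 * Properties props = new Properties();
 * // illustrative input and output locations
 * props.setProperty("input.path.first", "/data/event_first");
 * props.setProperty("input.path.second", "/data/event_second");
 * props.setProperty("output.path", "/output/event_counts");
 * props.setProperty("fail.on.missing", "false");   // tolerate missing days in the window
 * }</pre>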

 * <p>
 * The job will not process input for which a corresponding output already exists.  For example, if the desired date
 * range is 2013/01/01 through 2013/01/14 and the outputs 2013/01/01 through 2013/01/12 exist, then only
 * 2013/01/13 and 2013/01/14 will be processed and only 2013/01/13 and 2013/01/14 will be produced.
 * </p>
 * 
 * <p>
 * The number of paths in the output to retain can be configured through the property <em>retention.count</em>,
 * or by calling {@link #setRetentionCount(Integer)}.  When this property is set only the latest paths in the output
 * will be kept; the remainder will be removed.  By default there is no retention count set so all output paths are kept.
 * </p>
 * 
 * <p>
 * The inputs to process can be controlled by defining a desired date range.  By default the job will process all input
 * data available.  To limit the number of days of input to process one can set the property <em>num.days</em>
 * or call {@link #setNumDays(Integer)}.  This defines a processing window with the same number of days,
 * where the end date of the window is the latest available input and the start date is <em>num.days</em> ago.
 * Only inputs within this window will be processed.
 * Because the end date is the same as the latest available input, as new input data becomes available the end of the
 * window will advance forward to include it.  The end date can be adjusted backwards relative to the latest input
 * through the property <em>days.ago</em>, or by calling {@link #setDaysAgo(Integer)}.  The end date is determined by
 * subtracting this many days from the date of the latest available input.  The start date or end date can also be fixed
 * by setting the properties <em>start.date</em> or <em>end.date</em>, or by calling {@link #setStartDate(Date)}
 * or {@link #setEndDate(Date)}.
 * </p>
 * 
 * <p>
 * The number of reducers to use is automatically determined based on the size of the data to process.
 * The total size is computed and then divided by the value of the property <em>num.reducers.bytes.per.reducer</em>, which
 * defaults to 256 MB.  The resulting value is the number of reducers that will be used.
 * The number of reducers can also be set to a fixed value through the property <em>num.reducers</em>.
 * </p>
 * 
 * <p>
 * This type of job is capable of performing its work over multiple iterations.
 * The number of days to process at a time can be limited by setting the property <em>max.days.to.process</em>,
 * or by calling {@link #setMaxToProcess(Integer)}.  The default is 90 days.
 * This can be useful when there are restrictions on how many tasks
 * can be used by a single MapReduce job in the cluster.  When this property is set, the job will process no more than
 * this many days at a time, and it will perform one or more iterations if necessary to complete the work.
 * The number of iterations can be limited by setting the property <em>max.iterations</em>, or by calling {@link #setMaxIterations(Integer)}.
 * If the number of iterations is exceeded the job will fail.  By default the maximum number of iterations is 20.
 * </p>
 * 
 * <p>
 * Hadoop configuration may be provided by setting a property with the prefix <em>hadoop-conf.</em>.
 * For example, <em>mapred.min.split.size</em> can be configured by setting property
 * <em>hadoop-conf.mapred.min.split.size</em> to the desired value.
 * </p>
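 * 
 * <p>
 * Putting these pieces together, a job might be configured and run roughly as follows.  The
 * property values and the EventCountJob class are hypothetical and only illustrate how the
 * properties described above map onto a concrete invocation.
 * </p>
 * <pre>{@code
 * Properties props = new Properties();
 * props.setProperty("input.path", "/data/event");
 * props.setProperty("output.path", "/output/event_counts");
 * props.setProperty("num.days", "30");                 // process at most the latest 30 days of input
 * props.setProperty("retention.count", "30");          // keep only the latest 30 output partitions
 * props.setProperty("use.combiner", "true");           // enable the optional combiner
 * props.setProperty("max.days.to.process", "10");      // limit each iteration to 10 days
 * props.setProperty("hadoop-conf.mapred.min.split.size", "536870912");
 * 
 * // hypothetical subclass from the sketch above
 * AbstractPartitionPreservingIncrementalJob job = new EventCountJob("event-count", props);
 * job.run();
 * }</pre>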

 */
public abstract class AbstractPartitionPreservingIncrementalJob extends IncrementalJob
{
  private final Logger _log = Logger.getLogger(AbstractPartitionPreservingIncrementalJob.class);

  private List<Report> _reports = new ArrayList<Report>();
  private PartitioningMapper _mapper;
  private PartitioningCombiner _combiner;
  private PartitioningReducer _reducer;
  private FileCleaner _garbage;

  /**
   * Initializes the job.
   * @throws IOException IOException
   */
  public AbstractPartitionPreservingIncrementalJob() throws IOException
  {
  }

  /**
   * Initializes the job with a job name and properties.
   * 
   * @param name job name
   * @param props configuration properties
   * @throws IOException IOException
   */
  public AbstractPartitionPreservingIncrementalJob(String name, Properties props) throws IOException
  {
    super(name,props);
  }

  /**
   * Gets the mapper.
   * 
   * @return mapper
   */
  public abstract Mapper<GenericRecord,GenericRecord,GenericRecord> getMapper();

  /**
   * Gets the accumulator used for the combiner.
   * 
   * @return combiner accumulator
   */
  public Accumulator<GenericRecord,GenericRecord> getCombinerAccumulator()
  {
    return null;
  }

  /**
   * Gets the accumulator used for the reducer.
   * 
   * @return reducer accumulator
   */
  public abstract Accumulator<GenericRecord,GenericRecord> getReducerAccumulator();

  /**
   * Run the job.
   * 
   * @throws IOException IOException
   * @throws InterruptedException InterruptedException
   * @throws ClassNotFoundException ClassNotFoundException
   */
  @Override
  public void run() throws IOException, InterruptedException, ClassNotFoundException
  {
    try
    {
      initialize();
      validate();
      execute();
    }
    finally
    {
      cleanup();
    }
  }

  /**
   * Get reports that summarize each of the job iterations.
   * 
   * @return reports
   */
  public List<Report> getReports()
  {
    return Collections.unmodifiableList(_reports);
  }

  @Override
  protected void initialize()
  {
    _garbage = new FileCleaner(getFileSystem());

    if (getMaxIterations() == null)
    {
      setMaxIterations(20);
    }

    if (getMaxToProcess() == null)
    {
      if (getNumDays() != null)
      {
        setMaxToProcess(getNumDays());
      }
      else
      {
        setMaxToProcess(90);
      }
    }

    super.initialize();
  }

  /**
   * Get the name for the reduce output schema.
   * By default this is the name of the class with "Output" appended.
   * 
   * @return output schema name
   */
  protected String getOutputSchemaName()
  {
    return this.getClass().getSimpleName() + "Output";
  }

  /**
   * Get the namespace for the reduce output schema.
   * By default this is the package of the class.
   * 
   * @return output schema namespace
   */
  protected String getOutputSchemaNamespace()
  {
    return this.getClass().getPackage().getName();
  }

  protected ObjectMapper getMapProcessor()
  {
    return _mapper;
  }

  protected ObjectReducer getCombineProcessor()
  {
    return _combiner;
  }

  protected ObjectReducer getReduceProcessor()
  {
    return _reducer;
  }

  /**
   * Execute the job.
   * 
   * @throws IOException
   * @throws InterruptedException
   * @throws ClassNotFoundException
   */
  private void execute() throws IOException, InterruptedException, ClassNotFoundException
  {
    int iterations = 0;

    while(true)
    {
      // plan the work: determine which input partitions still need to be processed
      PartitionPreservingExecutionPlanner planner = new PartitionPreservingExecutionPlanner(getFileSystem(),getProperties());
      planner.setInputPaths(getInputPaths());
      planner.setOutputPath(getOutputPath());
      planner.setStartDate(getStartDate());
      planner.setEndDate(getEndDate());
      planner.setDaysAgo(getDaysAgo());
      planner.setNumDays(getNumDays());
      planner.setMaxToProcess(getMaxToProcess());
      planner.setFailOnMissing(isFailOnMissing());
      planner.createPlan();

      if (planner.getInputsToProcess().size() == 0)
      {
        _log.info("Found all necessary incremental data");
        break;
      }

      if (iterations >= getMaxIterations())
      {
        throw new RuntimeException(String.format("Already completed %d iterations but the max is %d and there are still %d inputs to process",
                                                 iterations,
                                                 getMaxIterations(),
                                                 planner.getInputsToProcess().size()));
      }

      Path jobTempPath = createRandomTempPath();
      _garbage.add(jobTempPath);
      ensurePath(getOutputPath());

      Path incrementalStagingPath = ensurePath(new Path(jobTempPath,".incremental-staging"));
      Path incrementalStagingTmpPath = ensurePath(new Path(jobTempPath,".incremental-staging-tmp"));

      Report report = new Report();

      // create input paths for job
      List<String> inputPaths = new ArrayList<String>();
      for (DatePath input : planner.getInputsToProcess())
      {
        inputPaths.add(input.getPath().toString());
        report.inputFiles.add(input);
      }

      _log.info("Staging path: " + incrementalStagingPath);
      final StagedOutputJob job = StagedOutputJob.createStagedJob(
        getConf(),
        getName() + "-" + "incremental",
        inputPaths,
        incrementalStagingTmpPath.toString(),
        incrementalStagingPath.toString(),
        _log);

      job.setCountersParentPath(getCountersParentPath());

      final Configuration conf = job.getConfiguration();

      config(conf);

      PartitionPreservingSchemas fpSchemas = new PartitionPreservingSchemas(getSchemas(),
                                                                            planner.getInputSchemasByPath(),
                                                                            getOutputSchemaName(),
                                                                            getOutputSchemaNamespace());

      job.setInputFormatClass(AvroMultipleInputsKeyInputFormat.class);
      job.setOutputFormatClass(AvroKeyOutputFormat.class);

      _log.info("Setting input path to schema mappings");
      for (String path : fpSchemas.getMapInputSchemas().keySet())
      {
        Schema schema = fpSchemas.getMapInputSchemas().get(path);
        _log.info("*** " + path);
        _log.info("*** => " + schema.toString());
        AvroMultipleInputsUtil.setInputKeySchemaForPath(job, schema, path);
      }

      AvroJob.setMapOutputKeySchema(job, fpSchemas.getMapOutputKeySchema());
      AvroJob.setMapOutputValueSchema(job, fpSchemas.getMapOutputValueSchema());
      AvroJob.setOutputKeySchema(job, fpSchemas.getReduceOutputSchema());

      StringBuilder inputTimesJoined = new StringBuilder();
      for (Date input : planner.getDatesToProcess())
      {
        String namedOutput = PathUtils.datedPathFormat.format(input);
        _log.info(String.format("Adding named output %s",namedOutput));
        AvroMultipleOutputs.addNamedOutput(job,
                                           namedOutput,
                                           AvroKeyOutputFormat.class,
                                           fpSchemas.getReduceOutputSchema());

        inputTimesJoined.append(Long.toString(input.getTime()));
        inputTimesJoined.append(",");
      }

      int numReducers;

      if (getNumReducers() != null)
      {
        numReducers = getNumReducers();
        _log.info(String.format("Using %d reducers (fixed)",numReducers));
      }
      else
      {
        numReducers = planner.getNumReducers();
        _log.info(String.format("Using %d reducers (computed)",numReducers));
      }

      int avgReducersPerInput = (int)Math.ceil(numReducers/(double)planner.getDatesToProcess().size());

      _log.info(String.format("Reducers per input path: %d", avgReducersPerInput));

      // counters for multiple outputs
      // conf.set("mo.counters", "true");

      conf.set(TimePartitioner.REDUCERS_PER_INPUT, Integer.toString(avgReducersPerInput));
      conf.set(TimePartitioner.INPUT_TIMES, inputTimesJoined.substring(0,inputTimesJoined.length()-1));

      job.setNumReduceTasks(numReducers);

      Path mapperPath = new Path(incrementalStagingPath,".mapper_impl");
      Path reducerPath = new Path(incrementalStagingPath,".reducer_impl");
      Path combinerPath = new Path(incrementalStagingPath,".combiner_impl");

      conf.set(Parameters.REDUCER_IMPL_PATH, reducerPath.toString());
      conf.set(Parameters.MAPPER_IMPL_PATH, mapperPath.toString());

      _mapper = new PartitioningMapper();
      _mapper.setSchemas(fpSchemas);
      _mapper.setMapper(getMapper());

      _reducer = new PartitioningReducer();
      _reducer.setSchemas(fpSchemas);
      _reducer.setAccumulator(getReducerAccumulator());

      // write the map and reduce processor implementations so the delegating tasks can load them
      DistributedCacheHelper.writeObject(conf, getMapProcessor(), mapperPath);
      DistributedCacheHelper.writeObject(conf, getReduceProcessor(), reducerPath);

      job.setMapperClass(DelegatingMapper.class);
      job.setReducerClass(DelegatingReducer.class);

      if (isUseCombiner())
      {
        _combiner = new PartitioningCombiner();
        _combiner.setAccumulator(getCombinerAccumulator());
        conf.set(Parameters.COMBINER_IMPL_PATH, combinerPath.toString());
        job.setCombinerClass(DelegatingCombiner.class);
        DistributedCacheHelper.writeObject(conf, getCombineProcessor(), combinerPath);
      }

      job.setPartitionerClass(TimePartitioner.class);

      if (!job.waitForCompletion(true))
      {
        _log.error("Job failed! Quitting...");
        throw new RuntimeException("Job failed");
      }

      report.jobName = job.getJobName();
      report.jobId = job.getJobID().toString();

      moveStagedFiles(report,incrementalStagingPath);

      if (getCountersParentPath() == null && job.getCountersPath() != null)
      {
        // save the counters in the target path, for lack of a better place to put it
        Path counters = job.getCountersPath();
        if (getFileSystem().exists(counters))
        {
          Path target = new Path(getOutputPath(),counters.getName());
          if (getFileSystem().exists(target))
          {
            _log.info(String.format("Removing old counters at %s",target));
            getFileSystem().delete(target, true);
          }
          _log.info(String.format("Moving %s to %s",counters.getName(),getOutputPath()));
          getFileSystem().rename(counters, target);

          report.countersPath = target;
        }
        else
        {
          _log.error("Could not find counters at " + counters);
        }
      }

      applyRetention();

      _reports.add(report);

      if (!planner.getNeedsAnotherPass())
      {
        break;
      }

      cleanup();

      iterations++;
    }
  }

  /**
   * Remove all temporary paths.
   * 
   * @throws IOException
   */
  private void cleanup() throws IOException
  {
    if (_garbage != null)
    {
      _garbage.clean();
    }
  }

  /**
   * Removes all but the most recent days from the output that are within the retention period, if one is specified.
   * 
   * @throws IOException
   */
  private void applyRetention() throws IOException
  {
    if (getRetentionCount() != null)
    {
      PathUtils.keepLatestNestedDatedPaths(getFileSystem(), getOutputPath(), getRetentionCount());
    }
  }

  /**
   * Moves files from the staging path to the final output path.
   * 
   * @param report report to update with output paths
   * @param sourcePath source of data to move
   * @throws IOException
   */
  private void moveStagedFiles(Report report, Path sourcePath) throws IOException
  {
    _log.info("Following files produced in staging path:");
    for (FileStatus stat : getFileSystem().globStatus(new Path(sourcePath,"*.avro")))
    {
      _log.info(String.format("* %s (%d bytes)",stat.getPath(),stat.getLen()));
    }

    FileStatus[] incrementalParts = getFileSystem().globStatus(new Path(sourcePath,"*"), new PathFilter() {
      @Override
      public boolean accept(Path path)
      {
        String[] pathParts = path.getName().split("-");
        try
        {
          Long.parseLong(pathParts[0]);
          return true;
        }
        catch (NumberFormatException e)
        {
          return false;
        }
      }
    });

    // collect the new incremental data from the temp folder and move to subfolders
    Map<String,Path> incrementalTargetPaths = new HashMap<String,Path>();
    for (FileStatus stat : incrementalParts)
    {
      String[] pathParts = stat.getPath().getName().split("-");
      try
      {
        String timestamp = pathParts[0];

        if (!incrementalTargetPaths.containsKey(timestamp))
        {
          Path parent = new Path(sourcePath,timestamp);

          if (!getFileSystem().exists(parent))
          {
            getFileSystem().mkdirs(parent);
          }
          else
          {
            throw new RuntimeException("already exists: " + parent.toString());
          }

          incrementalTargetPaths.put(timestamp,parent);
        }

        Path parent = incrementalTargetPaths.get(timestamp);
        _log.info(String.format("Moving %s to %s",stat.getPath().getName(),parent.toString()));
        getFileSystem().rename(stat.getPath(), new Path(parent,stat.getPath().getName()));
      }
      catch (NumberFormatException e)
      {
        throw new RuntimeException(e);
      }
    }

    for (Path src : incrementalTargetPaths.values())
    {
      Date srcDate;
      try
      {
        srcDate = PathUtils.datedPathFormat.parse(src.getName());
      }
      catch (ParseException e)
      {
        throw new RuntimeException(e);
      }
      Path target = new Path(getOutputPath(),PathUtils.nestedDatedPathFormat.format(srcDate));
      _log.info(String.format("Moving %s to %s",src.getName(),target));
      getFileSystem().mkdirs(target.getParent());
      if (!getFileSystem().rename(src, target))
      {
        throw new RuntimeException("Failed to rename " + src + " to " + target);
      }
      report.outputFiles.add(new DatePath(srcDate,target));
    }
  }

  /**
   * Reports files created and processed for an iteration of the job.
   */
  public static class Report
  {
    private String jobName;
    private String jobId;
    private Path countersPath;
    private List<DatePath> inputFiles = new ArrayList<DatePath>();
    private List<DatePath> outputFiles = new ArrayList<DatePath>();

    /**
     * Gets the job name.
     * 
     * @return job name
     */
    public String getJobName()
    {
      return jobName;
    }

    /**
     * Gets the job ID.
     * 
     * @return job ID
     */
    public String getJobId()
    {
      return jobId;
    }

    /**
     * Gets the path to the counters file, if one was written.
     * 
     * @return counters path
     */
    public Path getCountersPath()
    {
      return countersPath;
    }

    /**
     * Gets input files that were processed.  These are files that are within
     * the desired date range.
     * 
     * @return input files
     */
    public List<DatePath> getInputFiles()
    {
      return Collections.unmodifiableList(inputFiles);
    }

    /**
     * Gets the output files that were produced by the job.
     * 
     * @return output files
     */
    public List<DatePath> getOutputFiles()
    {
      return Collections.unmodifiableList(outputFiles);
    }
  }
}



