
datafu.hourglass.mapreduce.PartitioningReducer Maven / Gradle / Ivy
/**
* Copyright 2013 LinkedIn, Inc
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package datafu.hourglass.mapreduce;
import java.io.IOException;
import java.io.Serializable;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.mapred.AvroKey;
import org.apache.avro.mapred.AvroValue;
import org.apache.avro.mapreduce.AvroMultipleOutputs;
import org.apache.hadoop.mapreduce.ReduceContext;
import org.apache.hadoop.mapreduce.TaskInputOutputContext;
import datafu.hourglass.fs.PathUtils;
import datafu.hourglass.model.Accumulator;
import datafu.hourglass.schemas.PartitionPreservingSchemas;
/**
* The reducer used by {@link datafu.hourglass.jobs.AbstractPartitionPreservingIncrementalJob} and its derived classes.
*
* <p>
* An implementation of {@link datafu.hourglass.model.Accumulator} is used to perform aggregation and produce the
* output value.
* </p>
*
* <p>
* The input key is assumed to have {@code time} and {@code value} fields. The value here is the true key,
* and the time represents the input partition the data was derived from. The true key is
* used as the key in the reducer output and the time is dropped.
* This reducer uses multiple outputs; the time is used to determine which output to write to,
* where the named outputs have the form {@code yyyyMMdd}, derived from the time.
* </p>
*
* @author "Matthew Hayes"
*
*/
public class PartitioningReducer extends ObjectReducer implements Serializable
{
private transient AvroMultipleOutputs _multipleOutputs;
private transient Map _timeToNamedOutput;
private PartitionPreservingSchemas _schemas;
private Accumulator accumulator;
@SuppressWarnings("unchecked")
public void reduce(Object keyObj,
Iterable
© 2015 - 2025 Weber Informatics LLC | Privacy Policy