/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.contrib.utils.join;
import java.io.IOException;
import java.util.Iterator;
import java.util.SortedMap;
import java.util.TreeMap;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
/**
* This abstract class serves as the base class for the reducer class of a data
* join job. The reduce function will first group the values according to their
* input tags, and then compute the cross product over the groups. For each
* tuple in the cross product, it calls the following method, which is expected
* to be implemented in a subclass.
*
* protected abstract TaggedMapOutput combine(Object[] tags, Object[] values);
*
* The above method is expected to produce one output value from an array of
* records from different sources. The user code can also perform filtering
* here: it can return null if it decides that the records do not meet certain
* conditions.
*
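* As a rough illustration only (the SampleTaggedMapOutput subclass, the
* tab-joined value format, and the inner-join filter below are assumptions
* made for the sake of the example, not part of this class), a combine()
* implementation might look like:
*
* <pre>{@code
* protected TaggedMapOutput combine(Object[] tags, Object[] values) {
*   // Inner join: drop tuples that lack a record from every tagged source.
*   if (tags.length < 2) {
*     return null;
*   }
*   StringBuilder joined = new StringBuilder();
*   for (int i = 0; i < values.length; i++) {
*     if (i > 0) {
*       joined.append("\t");
*     }
*     // Each value is a TaggedMapOutput wrapping one source record.
*     TaggedMapOutput tmo = (TaggedMapOutput) values[i];
*     joined.append(tmo.getData().toString());
*   }
*   // SampleTaggedMapOutput is assumed to be a TaggedMapOutput subclass
*   // that wraps a Text record.
*   TaggedMapOutput result = new SampleTaggedMapOutput(new Text(joined.toString()));
*   result.setTag((Text) tags[0]);
*   return result;
* }
* }</pre>
*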
*/
public abstract class DataJoinReducerBase extends JobBase {
/** Reporter for the current reduce task; used to publish the final status in close(). */
protected Reporter reporter = null;
/** Cap on the number of values processed per reduce group, from datajoin.maxNumOfValuesPerGroup. */
private long maxNumOfValuesPerGroup = 100;
/** Largest number of values seen for any single key so far. */
protected long largestNumOfValues = 0;
/** Number of values seen for the current key. */
protected long numOfValues = 0;
/** Count of output records collected. */
protected long collected = 0;
/** The job configuration, saved in configure(). */
protected JobConf job;
public void close() throws IOException {
if (this.reporter != null) {
this.reporter.setStatus(super.getReport());
}
}
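/**
 * Saves the job configuration and reads the per-group value cap from
 * datajoin.maxNumOfValuesPerGroup (default 100).
 */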
public void configure(JobConf job) {
super.configure(job);
this.job = job;
this.maxNumOfValuesPerGroup = job.getLong("datajoin.maxNumOfValuesPerGroup", 100);
}
/**
* The subclass can provide a different implementation of ResetableIterator.
* This is necessary if the number of values in a reduce call is very high.
*
* The default provided here uses ArrayListBackedIterator
*
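* For example, if the values for a key may not fit in memory, a subclass
* might return an iterator that spills to disk instead. A minimal sketch
* (DiskBackedIterator below is a hypothetical implementation, not provided
* by this package):
*
* <pre>{@code
* protected ResetableIterator createResetableIterator() {
*   // Hypothetical ResetableIterator that buffers values on local disk
*   // rather than in an in-memory ArrayList.
*   return new DiskBackedIterator();
* }
* }</pre>
*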
* @return a ResetableIterator instance.
*/
protected ResetableIterator createResetableIterator() {
return new ArrayListBackedIterator();
}
/**
* This is the function that re-groups values for a key into sub-groups based
* on a secondary key (input tag).
*
* @param arg1 the iterator over the values for the current key
* @return a map from input tag to the sub-group of values carrying that tag
*/
private SortedMap