All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.hadoop.mapreduce.lib.aggregate.UniqValueCount Maven / Gradle / Ivy

There is a newer version: 3.4.1
Show newest version
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.shaded.org.apache.hadoop.mapreduce.lib.aggregate;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.Set;
import java.util.TreeMap;

import org.apache.hadoop.shaded.org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.shaded.org.apache.hadoop.classification.InterfaceStability;

/**
 * This class implements a value aggregator that dedupes a sequence of objects.
 *
 * Each added object is reduced to its {@code toString()} form, so two objects
 * are considered the same value exactly when their string forms are equal.
 * The distinct strings are kept in a sorted map, and an optional upper bound
 * caps how many distinct values are retained; once the bound is reached,
 * further values are silently dropped.
 */
@InterfaceAudience.Public
@InterfaceStability.Stable
public class UniqValueCount implements ValueAggregator {
  /**
   * Name of the configuration property for the maximum number of unique
   * values. It is not read by this class itself; presumably callers look it up
   * and pass the result to {@link #UniqValueCount(long)} or
   * {@link #setMaxItems(long)}.
   */
  public static final String MAX_NUM_UNIQUE_VALUES = 
    "mapreduce.aggregate.max.num.unique.values";

  /** Placeholder stored as the map value; only the keys carry information. */
  private static final String PRESENT = "1";

  /** Distinct string forms seen so far, used as keys in sorted order. */
  private TreeMap<String, String> uniqItems = null;

  /** Cached number of distinct values currently held in {@link #uniqItems}. */
  private long numItems = 0;
  
  /** Upper bound on the number of distinct values that will be retained. */
  private long maxNumItems = Long.MAX_VALUE;

  /**
   * the default constructor; places no practical limit on the number of
   * unique values.
   */
  public UniqValueCount() {
    this(Long.MAX_VALUE);
  }
  
  /**
   * constructor
   * @param maxNum the limit in the number of unique values to keep. A value
   *          that is zero or negative is ignored and the limit stays at
   *          {@code Long.MAX_VALUE}.
   */
  public UniqValueCount(long maxNum) {
    uniqItems = new TreeMap<String, String>();
    // numItems and maxNumItems already carry their field defaults
    // (0 and Long.MAX_VALUE); only a positive request overrides the limit.
    if (maxNum > 0) {
      this.maxNumItems = maxNum;
    }
  }

  /**
   * Set the limit on the number of unique values.
   *
   * The limit is never lowered below the number of values already held: if
   * {@code n} is smaller than the current count, the limit is clamped to the
   * current count (so no further values are accepted) unless the existing
   * limit is already below that count, in which case it is left unchanged.
   *
   * @param n the desired limit on the number of unique values
   * @return the new limit on the number of unique values
   */
  public long setMaxItems(long n) {
    if (n >= numItems) {
      this.maxNumItems = n;
    } else if (this.maxNumItems >= this.numItems) {
      this.maxNumItems = this.numItems;
    }
    return this.maxNumItems;
  }
  
  /**
   * add a value to the aggregator
   *
   * The value is recorded by its {@code toString()} form. Once the number of
   * distinct values has reached the limit, the call is a no-op.
   *
   * @param val
   *          an object; must not be null while the aggregator is below its
   *          limit, since {@code toString()} is invoked on it.
   */
  public void addNextValue(Object val) {
    // Strictly below the limit: with "<=" one value beyond the limit would be
    // admitted. Re-adding an existing value at the limit would not change the
    // map anyway, so skipping it here loses nothing.
    if (this.numItems < this.maxNumItems) {
      uniqItems.put(val.toString(), PRESENT);
      this.numItems = this.uniqItems.size();
    }
  }

  /**
   * @return return the number of unique objects aggregated
   */
  public String getReport() {
    return String.valueOf(uniqItems.size());
  }

  /**
   * 
   * @return the set of the unique objects, as a live view of the keys of the
   *         current backing map
   */
  public Set getUniqueItems() {
    return uniqItems.keySet();
  }

  /**
   * reset the aggregator
   *
   * A fresh map is allocated rather than clearing the old one, so key sets
   * handed out earlier by {@link #getUniqueItems()} keep their contents. The
   * configured limit is retained.
   */
  public void reset() {
    uniqItems = new TreeMap<String, String>();
    // Keep the cached count in step with the (now empty) map; otherwise an
    // aggregator that had reached its limit would reject every later value.
    this.numItems = 0;
  }

  /**
   * @return return an array of the unique objects. The return value is
   *         expected to be used by the a combiner.
   */
  public ArrayList getCombinerOutput() {
    // Copy the keys so the caller gets an independent list in key order.
    return new ArrayList<String>(uniqItems.keySet());
  }
}