All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.gobblin.compaction.mapreduce.avro.AvroKeyDedupReducer Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.gobblin.compaction.mapreduce.avro;

import java.io.IOException;
import java.util.Comparator;

import org.apache.avro.generic.GenericRecord;
import org.apache.avro.mapred.AvroKey;
import org.apache.avro.mapred.AvroValue;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Optional;

import org.apache.gobblin.util.reflection.GobblinConstructorUtils;


/**
 * Reducer class for compaction MR job for Avro data.
 *
 * If there are multiple values of the same key, it keeps the last value read.
 *
 * @author Ziyang Liu
 */
public class AvroKeyDedupReducer extends Reducer, AvroValue, AvroKey, NullWritable> {

  public enum EVENT_COUNTER {
    MORE_THAN_1,
    DEDUPED,
    RECORD_COUNT
  }

  public static final String DELTA_SCHEMA_PROVIDER =
      "org.apache.gobblin.compaction." + AvroKeyDedupReducer.class.getSimpleName() + ".deltaFieldsProvider";
  private AvroKey outKey;
  private Optional deltaComparatorOptional;
  private AvroDeltaFieldNameProvider deltaFieldNamesProvider;

  @Override
  protected void setup(Context context)
      throws IOException, InterruptedException {
    this.outKey = new AvroKey<>();
    this.deltaComparatorOptional = Optional.absent();
    Configuration conf = context.getConfiguration();
    String deltaSchemaProviderClassName = conf.get(DELTA_SCHEMA_PROVIDER);
    if (deltaSchemaProviderClassName != null) {
      this.deltaFieldNamesProvider =
          GobblinConstructorUtils.invokeConstructor(AvroDeltaFieldNameProvider.class, deltaSchemaProviderClassName, conf);
      this.deltaComparatorOptional = Optional.of(new AvroValueDeltaSchemaComparator(deltaFieldNamesProvider));
    }
  }

  @Override
  protected void reduce(AvroKey key, Iterable> values, Context context)
      throws IOException, InterruptedException {
    int numVals = 0;

    AvroValue valueToRetain = null;

    for (AvroValue value : values) {
      if (valueToRetain == null) {
        valueToRetain = value;
      } else if (this.deltaComparatorOptional.isPresent()) {
        valueToRetain = this.deltaComparatorOptional.get().compare(valueToRetain, value) >= 0 ? valueToRetain : value;
      }
      numVals++;
    }
    this.outKey.datum(valueToRetain.datum());

    if (numVals > 1) {
      context.getCounter(EVENT_COUNTER.MORE_THAN_1).increment(1);
      context.getCounter(EVENT_COUNTER.DEDUPED).increment(numVals - 1);
    }

    context.getCounter(EVENT_COUNTER.RECORD_COUNT).increment(1);

    context.write(this.outKey, NullWritable.get());
  }

  @VisibleForTesting
  protected static class AvroValueDeltaSchemaComparator implements Comparator> {
    private final AvroDeltaFieldNameProvider deltaSchemaProvider;

    public AvroValueDeltaSchemaComparator(AvroDeltaFieldNameProvider provider) {
      this.deltaSchemaProvider = provider;
    }

    @Override
    public int compare(AvroValue o1, AvroValue o2) {
      GenericRecord record1= o1.datum();
      GenericRecord record2 = o2.datum();
      for (String deltaFieldName : this.deltaSchemaProvider.getDeltaFieldNames(record1)) {
        if (record1.get(deltaFieldName).equals(record2.get(deltaFieldName))) {
          continue;
        }
        return ((Comparable)record1.get(deltaFieldName)).compareTo(record2.get(deltaFieldName));
      }

      return 0;
    }
  }

  @VisibleForTesting
  protected AvroKey getOutKey() {
    return this.outKey;
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy