All downloads are FREE. Search and download functionality uses the official Maven repository.

org.apache.crunch.impl.spark.fn.CombineMapsideFunction Maven / Gradle / Ivy

There is a newer version: 1.0.0
Show newest version
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.crunch.impl.spark.fn;

import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.UnmodifiableIterator;
import org.apache.crunch.CombineFn;
import org.apache.crunch.Pair;
import org.apache.crunch.impl.mem.emit.InMemoryEmitter;
import org.apache.crunch.impl.spark.SparkRuntimeContext;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import scala.Tuple2;

import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Set;

public class CombineMapsideFunction implements PairFlatMapFunction>, K, V> {

  private static final int REDUCE_EVERY_N = 50000;

  private final CombineFn combineFn;
  private final SparkRuntimeContext ctxt;

  public CombineMapsideFunction(CombineFn combineFn, SparkRuntimeContext ctxt) {
    this.combineFn = combineFn;
    this.ctxt = ctxt;
  }

  @Override
  public Iterable> call(Iterator> iter) throws Exception {
    ctxt.initialize(combineFn, null);
    Map> cache = Maps.newHashMap();
    int cnt = 0;
    while (iter.hasNext()) {
      Tuple2 t = iter.next();
      List values = cache.get(t._1());
      if (values == null) {
        values = Lists.newArrayList();
        cache.put(t._1(), values);
      }
      values.add(t._2());
      cnt++;
      if (cnt % REDUCE_EVERY_N == 0) {
        cache = reduce(cache);
      }
    }

    return new Flattener(cache);
  }

  private Map> reduce(Map> cache) {
    Set keys = cache.keySet();
    Map> res = Maps.newHashMap();
    for (K key : keys) {
      for (Pair p : reduce(key, cache.get(key))) {
        List values = res.get(p.first());
        if (values == null) {
          values = Lists.newArrayList();
          res.put(p.first(), values);
        }
        values.add(p.second());
      }
    }
    return res;
  }

  private List> reduce(K key, Iterable values) {
    InMemoryEmitter> emitter = new InMemoryEmitter>();
    combineFn.process(Pair.of(key, values), emitter);
    combineFn.cleanup(emitter);
    return emitter.getOutput();
  }

  private static class Flattener implements Iterable> {
    private final Map> entries;

    public Flattener(Map> entries) {
      this.entries = entries;
    }

    @Override
    public Iterator> iterator() {
      return new UnmodifiableIterator>() {
        private Iterator keyIter = entries.keySet().iterator();
        private K currentKey;
        private Iterator valueIter = null;

        @Override
        public boolean hasNext() {
          while (valueIter == null || !valueIter.hasNext()) {
            if (keyIter.hasNext()) {
              currentKey = keyIter.next();
              valueIter = entries.get(currentKey).iterator();
            } else {
              return false;
            }
          }
          return true;
        }

        @Override
        public Tuple2 next() {
          return new Tuple2(currentKey, valueIter.next());
        }
      };
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy