All downloads are free. The search and download functionality uses the official Maven repository.

org.apache.crunch.impl.spark.collect.CreatedTable Maven / Gradle / Ivy

There is a newer version: 1.0.0
Show newest version
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.crunch.impl.spark.collect;

import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import org.apache.crunch.CreateOptions;
import org.apache.crunch.CrunchRuntimeException;
import org.apache.crunch.MapFn;
import org.apache.crunch.Pair;
import org.apache.crunch.ReadableData;
import org.apache.crunch.impl.dist.collect.PCollectionImpl;
import org.apache.crunch.impl.dist.collect.PTableBase;
import org.apache.crunch.impl.spark.ByteArray;
import org.apache.crunch.impl.spark.SparkCollection;
import org.apache.crunch.impl.spark.SparkPipeline;
import org.apache.crunch.impl.spark.SparkRuntime;
import org.apache.crunch.impl.spark.SparkRuntimeContext;
import org.apache.crunch.impl.spark.serde.SerDe;
import org.apache.crunch.impl.spark.serde.SerDeFactory;
import org.apache.crunch.types.PTableType;
import org.apache.crunch.types.PType;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDDLike;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.storage.StorageLevel;
import scala.Tuple2;

import java.io.IOException;
import java.util.List;

/**
 * Represents a Spark-based PTable that was created from a Java {@code Iterable} of
 * key-value pairs.
 */
public class CreatedTable extends PTableBase implements SparkCollection {

  private final Iterable> contents;
  private final PTableType ptype;
  private final int parallelism;
  private JavaPairRDD rdd;

  public CreatedTable(
      SparkPipeline pipeline,
      Iterable> contents,
      PTableType ptype,
      CreateOptions options) {
    super(options.getName(), pipeline);
    this.contents = contents;
    this.ptype = ptype;
    this.parallelism = options.getParallelism();
  }

  @Override
  protected void acceptInternal(Visitor visitor) {
    // No-op
  }

  @Override
  public List> getParents() {
    return ImmutableList.of();
  }

  @Override
  protected ReadableData> getReadableDataInternal() {
    try {
      return ptype.createSourceTarget(pipeline.getConfiguration(),
              pipeline.createTempPath(), contents, parallelism).asReadable();
    } catch (IOException e) {
      throw new CrunchRuntimeException(e);
    }
  }

  @Override
  protected long getSizeInternal() {
    return Iterables.size(contents);
  }

  @Override
  public long getLastModifiedAt() {
    return -1;
  }

  @Override
  public PTableType getPTableType() {
    return ptype;
  }

  @Override
  public PType> getPType() {
    return ptype;
  }

  @Override
  public JavaRDDLike getJavaRDDLike(SparkRuntime runtime) {
    if (!runtime.isValid(rdd)) {
      rdd = getJavaRDDLikeInternal(runtime);
      rdd.rdd().setName(getName());
      StorageLevel sl = runtime.getStorageLevel(this);
      if (sl != null) {
        rdd.rdd().persist(sl);
      }
    }
    return rdd;
  }

  private JavaPairRDD getJavaRDDLikeInternal(SparkRuntime runtime) {
    ptype.initialize(runtime.getConfiguration());
    PType keyType = ptype.getKeyType();
    PType valueType = ptype.getValueType();
    SerDe keySerde = SerDeFactory.create(keyType, runtime.getConfiguration());
    SerDe valueSerde = SerDeFactory.create(valueType, runtime.getConfiguration());
    List> res = Lists.newLinkedList();
    try {
      for (Pair p : contents) {
        ByteArray key = keySerde.toBytes(keyType.getOutputMapFn().map(p.first()));
        ByteArray value = valueSerde.toBytes(valueType.getOutputMapFn().map(p.second()));
        res.add(new Tuple2(key, value));
      }
    } catch (Exception e) {
      throw new CrunchRuntimeException(e);
    }
    return runtime.getSparkContext()
        .parallelizePairs(res, parallelism)
        .mapToPair(new MapPairInputFn(
            keySerde, valueSerde, keyType.getInputMapFn(), valueType.getInputMapFn(), runtime.getRuntimeContext()));
  }

  static class MapPairInputFn implements PairFunction, K, V> {

    private final SerDe keySerde;
    private final SerDe valueSerde;
    private final MapFn keyFn;
    private final MapFn valueFn;
    private final SparkRuntimeContext context;
    private boolean initialized;

    public MapPairInputFn(
        SerDe keySerde,
        SerDe valueSerde,
        MapFn keyFn,
        MapFn valueFn,
        SparkRuntimeContext context) {
      this.keySerde = keySerde;
      this.valueSerde = valueSerde;
      this.keyFn = keyFn;
      this.valueFn = valueFn;
      this.context = context;
      this.initialized = false;
    }

    @Override
    public Tuple2 call(Tuple2 in) throws Exception {
      if (!initialized) {
        context.initialize(keyFn, -1);
        context.initialize(valueFn, -1);
        initialized = true;
      }
      return new Tuple2(
          keyFn.map(keySerde.fromBytes(in._1().value)),
              valueFn.map(valueSerde.fromBytes(in._2().value)));
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy