com.datastax.spark.connector.japi.PairRDDJavaFunctions Maven / Gradle / Ivy

Go to download

Show more of this group Show more artifacts with this name
Show all versions of spark-cassandra-connector-java_2.10 Show documentation

String

The newest version!

package com.datastax.spark.connector.japi;

import com.datastax.spark.connector.PairRDDFunctions;
import com.datastax.spark.connector.util.JavaApiHelper;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.rdd.RDD;
import scala.Tuple2;
import scala.collection.Seq;
import scala.reflect.ClassTag;

import java.util.Collection;

import static com.datastax.spark.connector.japi.CassandraJavaUtil.classTag;

public class PairRDDJavaFunctions extends RDDJavaFunctions> {

    public final PairRDDFunctions pairRDDFunctions;

    public PairRDDJavaFunctions(RDD> rdd) {
        super(rdd);
        pairRDDFunctions = new PairRDDFunctions<>(rdd);
    }

    /**
     * Groups items with the same key, assuming the items with the same key are next to each other in the
     * collection. It does not perform shuffle, therefore it is much faster than using much more
     * universal Spark RDD `groupByKey`. For this method to be useful with Cassandra tables, the key must
     * represent a prefix of the primary key, containing at least the partition key of the Cassandra
     * table.
     */
    public JavaPairRDD> spanByKey(ClassTag keyClassTag) {
        ClassTag>> tupleClassTag = classTag(Tuple2.class);
        ClassTag> vClassTag = classTag(Collection.class);
        RDD>> newRDD = pairRDDFunctions.spanByKey()
                .map(JavaApiHelper.>valuesAsJavaCollection(), tupleClassTag);

        return new JavaPairRDD<>(newRDD, keyClassTag, vClassTag);
    }
}