package com.datastax.spark.connector.japi;

import com.datastax.spark.connector.rdd.reader.RowReader;
import com.datastax.spark.connector.writer.RowWriter;
import com.datastax.spark.connector.writer.TokenRangeAccumulator;
import scala.Option;
import scala.Tuple2;
import scala.reflect.ClassTag;

import static com.datastax.spark.connector.japi.CassandraJavaUtil.classTag;
import static com.datastax.spark.connector.util.JavaApiHelper.toScalaFunction1;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.rdd.RDD;

import com.datastax.spark.connector.ColumnSelector;
import com.datastax.spark.connector.RDDFunctions;
import com.datastax.spark.connector.cql.CassandraConnector;
import com.datastax.spark.connector.cql.CassandraConnector$;
import com.datastax.spark.connector.japi.rdd.CassandraJavaPairRDD;
import com.datastax.spark.connector.rdd.*;
import com.datastax.spark.connector.rdd.partitioner.CassandraPartitionedRDD;
import com.datastax.spark.connector.rdd.reader.RowReaderFactory;
import com.datastax.spark.connector.util.JavaApiHelper;
import com.datastax.spark.connector.writer.RowWriterFactory;
import com.datastax.spark.connector.writer.WriteConf;


/**
 * A Java API wrapper over {@link RDD} to provide Spark Cassandra Connector functionality.
 * <p/>
 * To obtain an instance of this wrapper, use one of the factory methods in {@link CassandraJavaUtil}
 * class.
 */
public class RDDJavaFunctions<T> extends RDDAndDStreamCommonJavaFunctions<T> {
    public final RDD<T> rdd;
    public final RDDFunctions<T> rddFunctions;

    public RDDJavaFunctions(RDD<T> rdd) {
        this.rdd = rdd;
        this.rddFunctions = new RDDFunctions<>(rdd);
    }

    public CassandraConnector defaultConnector() {
        return CassandraConnector$.MODULE$.apply(rdd.conf());
    }

    public SparkConf getConf() {
        return rdd.conf();
    }

    public void saveToCassandra(
            String keyspace,
            String table,
            RowWriterFactory<T> rowWriterFactory,
            ColumnSelector columnNames,
            WriteConf conf,
            CassandraConnector connector
    ) {
        rddFunctions.saveToCassandra(keyspace, table, columnNames, conf,
                Option.<TokenRangeAccumulator>apply(null), connector, rowWriterFactory);
    }

    public void deleteFromCassandra(
            String keyspace,
            String table,
            RowWriterFactory<T> rowWriterFactory,
            ColumnSelector deleteColumns,
            ColumnSelector keyColumns,
            WriteConf conf,
            CassandraConnector connector
    ) {
        rddFunctions.deleteFromCassandra(keyspace, table, deleteColumns, keyColumns, conf,
                connector, rowWriterFactory);
    }
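    // A minimal save usage sketch for the write path above, going through the public
    // CassandraJavaUtil entry point (writerBuilder, defined on the parent class) rather
    // than calling saveToCassandra directly. The "store.customers" table and the Customer
    // bean are illustrative assumptions, not part of this file:
    //
    //   JavaRDD<Customer> customers = ...;
    //   CassandraJavaUtil.javaFunctions(customers)
    //       .writerBuilder("store", "customers", CassandraJavaUtil.mapToRow(Customer.class))
    //       .saveToCassandra();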

    /**
     * Applies a function to each item, and groups consecutive items having the same value together.
     * Contrary to {@code groupBy}, items from the same group must be already next to each other in the
     * original collection. Works locally on each partition, so items from different partitions will
     * never be placed in the same group.
     */
    public <K> JavaPairRDD<K, Iterable<T>> spanBy(final Function<T, K> f, ClassTag<K> keyClassTag) {
        ClassTag<Tuple2<K, Iterable<T>>> tupleClassTag = classTag(Tuple2.class);
        ClassTag<Iterable<T>> iterableClassTag = CassandraJavaUtil.classTag(Iterable.class);
        RDD<Tuple2<K, Iterable<T>>> newRDD = rddFunctions.spanBy(toScalaFunction1(f))
                .map(JavaApiHelper.<K, T, scala.collection.Iterable<T>>valuesAsJavaIterable(), tupleClassTag);
        return new JavaPairRDD<>(newRDD, keyClassTag, iterableClassTag);
    }
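
    // A minimal spanBy usage sketch. Grouping happens per Spark partition and relies on
    // equal keys being adjacent, e.g. the rows of one Cassandra partition ordered by a
    // clustering column. The WebLog bean and its getDay() key are illustrative assumptions:
    //
    //   JavaRDD<WebLog> logs = ...;  // already ordered within each partition
    //   JavaPairRDD<String, Iterable<WebLog>> byDay =
    //       CassandraJavaUtil.javaFunctions(logs)
    //           .spanBy(log -> log.getDay(), CassandraJavaUtil.classTag(String.class));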
    /**
     * Uses the data from {@code RDD} to join with a Cassandra table without retrieving the entire table.
     * Any RDD which can be used to saveToCassandra can be used to joinWithCassandra as well as any RDD
     * which only specifies the partition Key of a Cassandra Table. This method executes single partition
     * requests against the Cassandra Table and accepts the functional modifiers that a normal
     * {@link CassandraTableScanRDD} takes.
     * <p/>
     * By default this method only uses the Partition Key for joining but any combination of columns
     * which are acceptable to C* can be used in the join. Specify columns using joinColumns as a
     * parameter or the {@code on()} method.
     */
    public <R> CassandraJavaPairRDD<T, R> joinWithCassandraTable(
            String keyspaceName,
            String tableName,
            ColumnSelector selectedColumns,
            ColumnSelector joinColumns,
            RowReaderFactory<R> rowReaderFactory,
            RowWriterFactory<T> rowWriterFactory
    ) {
        ClassTag<T> classTagT = rdd.toJavaRDD().classTag();
        ClassTag<R> classTagR = JavaApiHelper.getClassTag(rowReaderFactory.targetClass());
        CassandraConnector connector = defaultConnector();
        Option<ClusteringOrder> clusteringOrder = Option.empty();
        Option<CassandraLimit> limit = Option.empty();
        CqlWhereClause whereClause = CqlWhereClause.empty();
        ReadConf readConf = ReadConf.fromSparkConf(rdd.conf());

        CassandraJoinRDD<T, R> joinRDD = new CassandraJoinRDD<>(
                rdd,
                keyspaceName,
                tableName,
                connector,
                selectedColumns,
                joinColumns,
                whereClause,
                limit,
                clusteringOrder,
                readConf,
                Option.<RowReader<R>>empty(),
                Option.<RowWriter<T>>empty(),
                classTagT,
                classTagR,
                rowWriterFactory,
                rowReaderFactory);

        return new CassandraJavaPairRDD<>(joinRDD, classTagT, classTagR);
    }

    /**
     * Repartitions the data (via a shuffle) based upon the replication of the given {@code keyspaceName}
     * and {@code tableName}. Calling this method before using joinWithCassandraTable will ensure that
     * requests will be coordinator local. {@code partitionsPerHost} controls the number of Spark
     * partitions that will be created in this repartitioning event. The calling RDD must have rows that
     * can be converted into the partition key of the given Cassandra Table.
     */
    public JavaRDD<T> repartitionByCassandraReplica(
            String keyspaceName,
            String tableName,
            int partitionsPerHost,
            ColumnSelector partitionKeyMapper,
            RowWriterFactory<T> rowWriterFactory
    ) {
        CassandraConnector connector = defaultConnector();
        ClassTag<T> ctT = rdd.toJavaRDD().classTag();
        CassandraPartitionedRDD<T> newRDD = rddFunctions.repartitionByCassandraReplica(
                keyspaceName,
                tableName,
                partitionsPerHost,
                partitionKeyMapper,
                connector,
                ctT,
                rowWriterFactory);
        return new JavaRDD<>(newRDD, ctT);
    }
}
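
// A minimal joinWithCassandraTable / repartitionByCassandraReplica usage sketch.
// The "store.customers" table, the CustomerId key bean, and the Customer result bean
// are illustrative assumptions, not part of this API:
//
//   JavaRDD<CustomerId> ids = ...;
//
//   // Shuffle keys onto Spark executors co-located with their Cassandra replicas,
//   // so the single-partition join requests below stay coordinator local.
//   JavaRDD<CustomerId> local = CassandraJavaUtil.javaFunctions(ids)
//       .repartitionByCassandraReplica("store", "customers", 10,
//           CassandraJavaUtil.someColumns("id"),
//           CassandraJavaUtil.mapToRow(CustomerId.class));
//
//   CassandraJavaPairRDD<CustomerId, Customer> joined =
//       CassandraJavaUtil.javaFunctions(local)
//           .joinWithCassandraTable("store", "customers",
//               CassandraJavaUtil.someColumns("id", "name"),  // columns to select
//               CassandraJavaUtil.someColumns("id"),          // join on the partition key
//               CassandraJavaUtil.mapRowTo(Customer.class),
//               CassandraJavaUtil.mapToRow(CustomerId.class));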