/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.sysml.runtime.instructions.spark.utils;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Iterator;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.broadcast.Broadcast;
import scala.Tuple2;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.DMLUnsupportedOperationException;
import org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext;
import org.apache.sysml.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
import org.apache.sysml.runtime.functionobjects.SortIndex;
import org.apache.sysml.runtime.instructions.spark.data.PartitionedMatrixBlock;
import org.apache.sysml.runtime.instructions.spark.data.RowMatrixBlock;
import org.apache.sysml.runtime.instructions.spark.functions.ReplicateVectorFunction;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
import org.apache.sysml.runtime.matrix.data.MatrixIndexes;
import org.apache.sysml.runtime.matrix.operators.ReorgOperator;
import org.apache.sysml.runtime.util.DataConverter;
import org.apache.sysml.runtime.util.UtilFunctions;
/**
 * Utility methods for value-based sorting of matrices represented as
 * Spark RDDs of (MatrixIndexes, MatrixBlock) pairs.
 */
public class RDDSortUtils
{
/**
 * Sorts the values of the given matrix in ascending order and returns
 * the sorted values as a binary-block matrix RDD.
 *
 * @param in input matrix as (index, block) pairs
 * @param rlen number of rows of the input matrix
 * @param brlen number of rows per block
 * @return sorted values as a binary-block matrix RDD
 */
public static JavaPairRDD<MatrixIndexes,MatrixBlock> sortByVal( JavaPairRDD<MatrixIndexes,MatrixBlock> in, long rlen, int brlen )
{
	//create value rdd from input blocks
	JavaRDD<Double> dvals = in.values()
		.flatMap(new ExtractDoubleValuesFunction());

	//sort (creates sorted range per partition); #partitions chosen such
	//that each output partition fits one HDFS block (8 bytes per value)
	long hdfsBlocksize = InfrastructureAnalyzer.getHDFSBlockSize();
	int numPartitions = (int)Math.ceil(((double)rlen*8)/hdfsBlocksize);
	JavaRDD<Double> sdvals = dvals
		.sortBy(new CreateDoubleKeyFunction(), true, numPartitions);

	//create binary block output (merge partial blocks per index)
	JavaPairRDD<MatrixIndexes,MatrixBlock> ret = sdvals
		.zipWithIndex()
		.mapPartitionsToPair(new ConvertToBinaryBlockFunction(rlen, brlen));
	ret = RDDAggregateUtils.mergeByKey(ret);

	return ret;
}
/**
 * Joins two conforming matrices, sorts the joined value pairs (the first
 * input provides the sort key), and returns the result as a binary-block
 * matrix RDD.
 *
 * @param in first input matrix (provides sort key) as (index, block) pairs
 * @param in2 second input matrix as (index, block) pairs
 * @param rlen number of rows of the input matrices
 * @param brlen number of rows per block
 * @return sorted value pairs as a binary-block matrix RDD
 */
public static JavaPairRDD<MatrixIndexes,MatrixBlock> sortByVal( JavaPairRDD<MatrixIndexes,MatrixBlock> in,
	JavaPairRDD<MatrixIndexes,MatrixBlock> in2, long rlen, int brlen )
{
	//create value-pair rdd from joined inputs (the pair type is declared
	//elsewhere in this class, hence the raw intermediate rdd types)
	JavaRDD dvals = in.join(in2).values()
		.flatMap(new ExtractDoubleValuesFunction2());

	//sort (creates sorted range per partition); #partitions chosen such
	//that each output partition fits one HDFS block (8 bytes per key)
	long hdfsBlocksize = InfrastructureAnalyzer.getHDFSBlockSize();
	int numPartitions = (int)Math.ceil(((double)rlen*8)/hdfsBlocksize);
	JavaRDD sdvals = dvals
		.sortBy(new CreateDoubleKeyFunction2(), true, numPartitions);

	//create binary block output (merge partial blocks per index)
	@SuppressWarnings("unchecked") //raw intermediates, see above
	JavaPairRDD<MatrixIndexes,MatrixBlock> ret = sdvals
		.zipWithIndex()
		.mapPartitionsToPair(new ConvertToBinaryBlockFunction2(rlen, brlen));
	ret = RDDAggregateUtils.mergeByKey(ret);

	return ret;
}
/**
 * Sorts the row indexes of the given matrix by its values and returns
 * the source indexes, in sorted order, as a binary-block matrix RDD.
 *
 * @param val input matrix (order-by values) as (index, block) pairs
 * @param asc true for ascending, false for descending sort order
 * @param rlen number of rows of the input matrix
 * @param brlen number of rows per block
 * @return sorted row indexes as a binary-block matrix RDD
 */
public static JavaPairRDD<MatrixIndexes,MatrixBlock> sortIndexesByVal( JavaPairRDD<MatrixIndexes,MatrixBlock> val,
	boolean asc, long rlen, int brlen )
{
	//create value-index rdd from inputs (the value-index pair type is declared
	//elsewhere in this class, hence the raw intermediate rdd types)
	JavaPairRDD dvals = val
		.flatMapToPair(new ExtractDoubleValuesWithIndexFunction(brlen));

	//sort (creates sorted range per partition); #partitions chosen such that
	//each output partition fits one HDFS block (16 bytes per value-index entry)
	long hdfsBlocksize = InfrastructureAnalyzer.getHDFSBlockSize();
	int numPartitions = (int)Math.ceil(((double)rlen*16)/hdfsBlocksize);
	JavaRDD sdvals = dvals
		.sortByKey(new IndexComparator(asc), true, numPartitions)
		.keys(); //workaround for index comparator

	//create binary block output (merge partial blocks per index)
	@SuppressWarnings("unchecked") //raw intermediates, see above
	JavaPairRDD<MatrixIndexes,MatrixBlock> ret = sdvals
		.zipWithIndex()
		.mapPartitionsToPair(new ConvertToBinaryBlockFunction3(rlen, brlen));
	ret = RDDAggregateUtils.mergeByKey(ret);

	return ret;
}
/**
 * Sorts an entire matrix by the values of a given order-by column via a
 * distributed shuffle of matrix rows into their target positions.
 *
 * @param val order-by column as (index, block) pairs
 * @param data matrix to reorder as (index, block) pairs
 * @param asc true for ascending, false for descending sort order
 * @param rlen number of rows
 * @param clen number of columns of the data matrix
 * @param brlen number of rows per block
 * @param bclen number of columns per block
 * @return data matrix with rows shuffled into sorted order
 */
public static JavaPairRDD<MatrixIndexes,MatrixBlock> sortDataByVal( JavaPairRDD<MatrixIndexes,MatrixBlock> val,
	JavaPairRDD<MatrixIndexes,MatrixBlock> data, boolean asc, long rlen, long clen, int brlen, int bclen )
{
	//create value-index rdd from inputs (the value-index pair type is declared
	//elsewhere in this class, hence the raw intermediate rdd types)
	JavaPairRDD dvals = val
		.flatMapToPair(new ExtractDoubleValuesWithIndexFunction(brlen));

	//sort (creates sorted range per partition); #partitions chosen such that
	//each output partition fits one HDFS block (16 bytes per value-index entry)
	long hdfsBlocksize = InfrastructureAnalyzer.getHDFSBlockSize();
	int numPartitions = (int)Math.ceil(((double)rlen*16)/hdfsBlocksize);
	JavaRDD sdvals = dvals
		.sortByKey(new IndexComparator(asc), true, numPartitions)
		.keys(); //workaround for index comparator

	//create target indexes by original index (re-sorted into original order)
	long numRep = (long)Math.ceil((double)clen/bclen);
	@SuppressWarnings("unchecked") //raw intermediates, see above
	JavaPairRDD<MatrixIndexes,MatrixBlock> ixmap = sdvals
		.zipWithIndex()
		.mapToPair(new ExtractIndexFunction())
		.sortByKey()
		.mapPartitionsToPair(new ConvertToBinaryBlockFunction4(rlen, brlen));
	ixmap = RDDAggregateUtils.mergeByKey(ixmap);

	//replicate index vector once per column block of the data matrix
	JavaPairRDD<MatrixIndexes,MatrixBlock> rixmap = ixmap
		.flatMapToPair(new ReplicateVectorFunction(false, numRep));

	//create binary block output via row shuffle, merging partial rows per block
	JavaPairRDD<MatrixIndexes,RowMatrixBlock> ret = data
		.join(rixmap)
		.mapPartitionsToPair(new ShuffleMatrixBlockRowsFunction(rlen, brlen));
	return RDDAggregateUtils.mergeRowsByKey(ret);
}
/**
*
* @param val
* @param data
* @param asc
* @param rlen
* @param brlen
* @param bclen
* @param ec
* @param r_op
* @return
* @throws DMLRuntimeException
* @throws DMLUnsupportedOperationException
*/
/* This function collects and sorts value column through cluster distribution and then broadcasts it.
*
* For now, its commented out until it gets evaluated completely through experiments.
*/
// public static JavaPairRDD sortDataByValDistSort( JavaPairRDD val,
// JavaPairRDD data, boolean asc, long rlen, long clen, int brlen, int bclen,
// ExecutionContext ec, ReorgOperator r_op)
// throws DMLRuntimeException, DMLUnsupportedOperationException
// {
// SparkExecutionContext sec = (SparkExecutionContext)ec;
// MatrixBlock sortedBlock;
//
// //create value-index rdd from inputs
// JavaPairRDD dvals = val
// .flatMapToPair(new ExtractDoubleValuesWithIndexFunction(brlen));
//
// //sort (creates sorted range per partition)
// long hdfsBlocksize = InfrastructureAnalyzer.getHDFSBlockSize();
// int numPartitions = (int)Math.ceil(((double)rlen*16)/hdfsBlocksize);
// JavaRDD sdvals = dvals
// .sortByKey(new IndexComparator(asc), true, numPartitions)
// .keys(); //workaround for index comparator
//
// //create target indexes by original index
// JavaPairRDD ixmap = sdvals
// .zipWithIndex()
// .mapToPair(new ExtractIndexFunction()) // Original Index sorted by values
// .sortByKey(); // Original index sorted to original order, with target index associated with them.
//
// JavaPairRDD ixmap2 = ixmap
// .mapPartitions(new ConvertToBinaryBlockFunction4(rlen, brlen))
// .mapToPair(new UnfoldBinaryBlockFunction());
//
// sortedBlock = SparkExecutionContext.toMatrixBlock(ixmap2, (int)rlen, 1, brlen, bclen, -1);
//
// PartitionedMatrixBlock pmb = new PartitionedMatrixBlock(sortedBlock, brlen, bclen);
// Broadcast _pmb = sec.getSparkContext().broadcast(pmb);
//
// JavaPairRDD ret = data
// .flatMapToPair(new ShuffleMatrixBlockRowsInMemFunction(rlen, brlen, _pmb));
// ret = RDDAggregateUtils.mergeByKey(ret);
//
// return ret;
// }
/**
* This function collects and sorts value column in memory and then broadcasts it.
*
* @param val
* @param data
* @param asc
* @param rlen
* @param brlen
* @param bclen
* @param ec
* @param r_op
* @return
* @throws DMLRuntimeException
* @throws DMLUnsupportedOperationException
*/
public static JavaPairRDD sortDataByValMemSort( JavaPairRDD val,
JavaPairRDD data, boolean asc, long rlen, long clen, int brlen, int bclen,
SparkExecutionContext sec, ReorgOperator r_op)
throws DMLRuntimeException, DMLUnsupportedOperationException
{
//collect orderby column for in-memory sorting
MatrixBlock inMatBlock = SparkExecutionContext
.toMatrixBlock(val, (int)rlen, 1, brlen, bclen, -1);
//in-memory sort operation (w/ index return: source index in target position)
ReorgOperator lrop = new ReorgOperator(SortIndex.getSortIndexFnObject(1, !asc, true));
MatrixBlock sortedIx = (MatrixBlock) inMatBlock
.reorgOperations(lrop, new MatrixBlock(), -1, -1, -1);
//flip sort indices from