it.polimi.genomics.spark.implementation.MetaOperators.OrderMD.scala Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of GMQL-Spark Show documentation
Show all versions of GMQL-Spark Show documentation
Spark implementation of GMQL operators
The newest version!
package it.polimi.genomics.spark.implementation.MetaOperators
import com.google.common.base.Charsets
import com.google.common.hash.Hashing
import it.polimi.genomics.core.DataStructures.GroupMDParameters.Direction.Direction
import it.polimi.genomics.core.DataStructures.GroupMDParameters._
import it.polimi.genomics.core.DataStructures.MetaOperator
import it.polimi.genomics.core.exception.SelectFormatException
import it.polimi.genomics.core.{GDouble, GNull, GString, GValue}
import it.polimi.genomics.spark.implementation.GMQLSparkExecutor
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.slf4j.LoggerFactory
import scala.util.{Failure, Success, Try}
/**
* Created by abdulrahman kaitoua on 09/06/15.
*/
object OrderMD {
// SLF4J logger named after this object's runtime class.
private final val logger = LoggerFactory.getLogger(getClass)
/**
 * Orders the samples of a metadata dataset by the values of the requested
 * attributes and optionally keeps only the first ones.
 *
 * For every retained sample a tuple `(sampleId, (newAttribute, position))` is
 * added, where `position` starts at 1. When `topParameter` is a `TopG` the
 * position is relative to the sample's group and a second tuple
 * `(sampleId, (newAttribute + "_groupId", groupId))` is added as well.
 *
 * Samples that carry none of the ordering attributes never enter the ranking
 * and are therefore absent from the result.
 *
 * The tuples matching the ordering attributes are collected to the driver and
 * ranked there; only the final filtering of the input runs on the RDD.
 *
 * @param executor      executor used to materialise `inputDataset`
 * @param ordering      attribute names with their sort direction, most significant first;
 *                      must not be empty (the composite ordering is built with `reduceLeft`)
 * @param newAttribute  name of the metadata attribute that receives the position
 * @param topParameter  `NoTop` keeps everything, `Top(k)` keeps the first k samples,
 *                      `TopP(p)` keeps the first p percent, `TopG(k)` keeps the first k of each group
 * @param inputDataset  metadata operator to be ordered
 * @param sc            Spark context used to parallelise the generated tuples
 * @return metadata of the retained samples united with the generated position/group tuples
 * @throws SelectFormatException declared by the signature; not raised directly in this method
 */
@throws[SelectFormatException]
def apply(executor: GMQLSparkExecutor, ordering: List[(String, Direction)], newAttribute: String, topParameter: TopParameter, inputDataset: MetaOperator, sc: SparkContext): RDD[(Long, (String, String))] = {
  logger.info("----------------OrderMD executing..")

  val ds: RDD[(Long, (String, String))] =
    executor.implement_md(inputDataset, sc)

  // Only TopG ranks samples inside groups; every other mode ranks them globally.
  val grouping: Boolean =
    topParameter match {
      case NoTop() => false
      case Top(_) => false
      case TopG(_) => true
      case TopP(_) => false
    }

  // 0 means "no limit"; for TopP the value is a percentage rather than a count.
  val top: Int =
    topParameter match {
      case NoTop() => 0
      case Top(v) => v
      case TopP(v) => v
      case TopG(v) => v
    }

  //extract the key of metadata that will be used in grouping/ordering
  val keys: List[String] =
    ordering.map(_._1)

  //meta data that match the grouping/ordering field (brought to the driver)
  val metaFilteredDs: Seq[(Long, (String, String))] =
    ds.filter { m =>
      keys.contains(m._2._1)
    }.collect

  val distinctFilteredValues = metaFilteredDs.distinct

  // attribute name -> position of the attribute in the requested ordering
  val orderingWithIndex: Map[String, Int] = ordering.zipWithIndex.map(x => (x._1._1, x._2)).toMap

  // Used by the grouping branch: for every sample, its (attribute, value) pairs
  // arranged in the requested attribute order. Values that parse as a number
  // become GDouble, everything else GString; an attribute the sample does not
  // have is filled with GDouble(Double.MinValue).
  val samplesWithSortingValues: Array[(Long, List[(String, GValue)])] =
    distinctFilteredValues.groupBy(_._1).map { x =>
      val missingAtt = keys.filterNot(k => x._2.exists(_._2._1.equals(k)))
      val missingPairs: Array[(Int, String, GValue)] =
        missingAtt.map(m => (orderingWithIndex(m), m, GDouble(Double.MinValue))).toArray
      (x._1, (x._2.map { f =>
        // Try only intercepts non-fatal failures, so fatal JVM errors are no longer swallowed.
        val value: GValue = Try(GDouble(f._2._2.toDouble)).getOrElse(GString(f._2._2))
        (orderingWithIndex(f._2._1), f._2._1, value)
      }.toArray ++ missingPairs).toList
        .sortBy(b => b._1)
        .map(f => (f._2, f._3)))
    }.toArray

  // One ordering per requested attribute (column i of the value array, reversed
  // for DESC), chained so that later attributes only break ties of earlier ones.
  val valuesOrdering: Ordering[Array[GValue]] = ordering.zipWithIndex.map { case ((_, direction), column) =>
    direction match {
      case Direction.ASC => orderByColumn(column)
      case Direction.DESC => orderByColumn(column).reverse
    }
  }.reduceLeft((res, x) => res orElse x)

  //for each sample for each key take the first value that will be used for grouping/sorting
  //SampleID, List[metavalues to be used in grouping/ordering ordered as requested in input]
  val valueList: Map[Long, List[String]] =
    metaFilteredDs
      .groupBy(_._1)
      .map { g =>
        keys.foldLeft((g._1, List(): List[String])) { (z, k) =>
          val matchedValues: Seq[(Long, (String, String))] = g._2.filter(m => m._2._1.equals(k))
          val head: String =
            if (matchedValues.isEmpty) {
              // missing attribute: the null is turned into GNull() below
              null
            } else {
              // the representative value is chosen by String comparison
              ordering.filter(o => o._1.equals(matchedValues.head._2._1)).head._2 match {
                case Direction.ASC => matchedValues.sortWith((a, b) => a._2._2.compareTo(b._2._2) < 0).head._2._2
                case Direction.DESC => matchedValues.sortWith((a, b) => a._2._2.compareTo(b._2._2) > 0).head._2._2
              }
            }
          (z._1, z._2 :+ head)
        }
      }

  // Used by the non-grouping branch: numeric strings become GDouble, other
  // strings GString, and the null placeholder of a missing attribute GNull.
  val valueListBoth: Map[Long, List[GValue]] = valueList.map { g =>
    val inputs = g._2.map { inp =>
      Try(inp.toDouble) match {
        case Success(v) => GDouble(v)
        case Failure(_) => Try(inp.toString) match {
          case Success(v) => GString(v)
          case Failure(_) => GNull()
        }
      }
    }
    (g._1, inputs)
  }

  //list of meta data that will be added
  //if grouping is defined there are 2 meta tuple for each sample
  //they are in the form
  //sampleID newAttribute [ position | positionInGroup ]
  //sampleID newAttribute_group groupID
  val sortedTop: List[(Long, (String, String))] =
    if (grouping) {
      //Grouping group by the first n-1 ordering attributes and then selects the top k.
      //The group id is the MD5 hash (as Long) of those values joined with "§".
      val sortedGroups = samplesWithSortingValues.groupBy { record =>
        Hashing.md5.newHasher.putString(record._2.map(_._2).toList.init.mkString("§"), Charsets.UTF_8).hash.asLong
      }.map(s => (s._1, s._2.sortBy((sort: (Long, List[(String, GValue)])) => sort._2.map(_._2).toArray[GValue])(valuesOrdering)))

      // Go through a List before flatMap: flat-mapping the Map itself with a
      // function that yields (sampleId, ...) pairs rebuilds a Map keyed by
      // sample id, which keeps just one of the two tuples of every sample.
      sortedGroups.toList.flatMap { g =>
        //TOPG
        val gFiltered: List[Long] =
          if (top == 0) {
            g._2.map(_._1).toList
          } else {
            g._2.map(_._1).take(top).toList
          }
        //create metadata
        assignPosition(Some(g._1), gFiltered, 1, newAttribute, List())
      }
    } else {
      val comparator: ((Long, List[GValue]), (Long, List[GValue])) => Boolean =
        metaSampleComparator(ordering)
      val sortedSamples: List[Long] = valueListBoth.toList.sortWith { (a, b) => comparator(a, b) }.map(_._1)

      //TOPP: number of samples corresponding to the requested percentage
      val percentages: Option[Int] = topParameter match {
        case TopP(_) => Some(sortedSamples.size * top / 100)
        case _ => None
      }

      //TOP
      val filteredSortedSamples: List[Long] =
        if (top != 0)
          sortedSamples.take(percentages.getOrElse(top))
        else sortedSamples

      //create metadata
      assignPosition(None, filteredSortedSamples, 1, newAttribute, List())
    }

  //extract id of resulting set
  //if top is applied it is a subset of original set
  //(a Set gives constant-time membership tests inside the RDD filter)
  val filteredId: Set[Long] = sortedTop.map(_._1).toSet

  //filter input dataset by id
  val topDs: RDD[(Long, (String, String))] =
    ds.filter(m => filteredId.contains(m._1))

  //Create the dataset, merge with input and return as output
  topDs.union(sc.parallelize(sortedTop))
}
/**
 * Creates 1 or 2 metadata lines for each sample:
 * one with the position of the sample in `list`,
 * one with the groupID (only if `groupId` is defined).
 *
 * The samples are numbered consecutively in list order, the first one
 * receiving `step`. The generated tuples are appended after `acc`.
 *
 * Built in a single pass: the earlier formulation measured the list and
 * appended to the accumulator at every step, which is quadratic in the
 * number of samples.
 *
 * @param groupId      groupId, when the samples belong to a group
 * @param list         ordered list of sample ids
 * @param step         position assigned to the first sample of `list`
 * @param newAttribute new attribute name for position, group attribute will be called newAttribute_groupId
 * @param acc          tuples to be placed in front of the generated ones
 * @return `acc` followed by the generated metadata tuples
 */
def assignPosition(groupId: Option[Long], list: List[(Long)], step: Int, newAttribute: String, acc: List[(Long, (String, String))]): List[(Long, (String, String))] = {
  val groupAttribute = newAttribute + "_groupId"
  val generated: List[(Long, (String, String))] =
    list.zipWithIndex.flatMap { case (sampleId, offset) =>
      val position: (Long, (String, String)) = (sampleId, (newAttribute, (step + offset).toString))
      groupId match {
        case Some(gid) => List(position, (sampleId, (groupAttribute, gid.toString)))
        case None => List(position)
      }
    }
  acc ++ generated
}
/////////////////////
//comparator
/////////////////////
//true if a < b
def metaSampleComparator(ordering: List[(String, Direction)])(a: (Long, List[GValue]), b: (Long, List[GValue])): Boolean = {
val res: Option[Boolean] = comparatorHelper(ordering, a, b)
// if res is not defined then "a == b", so (a Some(aVal.compareTo(bVal) < 0)
case Direction.DESC => Some(aVal.compareTo(bVal) > 0)
}
}
}
}
// Functions and objects needed for sorting by multi values.
/**
 * Ordering over arrays of values that compares two arrays by the element
 * found at index `col`, using the implicit ordering of [[GValue]].
 *
 * Comparing an array that has no element at `col` fails with an
 * index-out-of-bounds error, since the element is read by direct indexing.
 *
 * @param col zero-based index of the element used as sort key
 * @return ordering that looks only at column `col`
 */
def orderByColumn(col: Int): Ordering[Array[GValue]] =
  Ordering.by[Array[GValue], GValue](row => row(col))
/**
 * Ordering that consults `ord1` first and falls back to `ord2` only when
 * `ord1` considers the two values equal.
 *
 * @param ord1 primary ordering
 * @param ord2 tie-breaking ordering
 */
final class CompositeOrdering[T](val ord1: Ordering[T], val ord2: Ordering[T]) extends Ordering[T] {
  def compare(x: T, y: T): Int =
    ord1.compare(x, y) match {
      case 0 => ord2.compare(x, y)
      case decided => decided
    }
}

/** Factory chaining any number of orderings, most significant first. */
object CompositeOrdering {
  // Folds left to right; fails on an empty argument list because there is nothing to reduce.
  def apply[T](orderings: Ordering[T]*) =
    orderings.reduceLeft[Ordering[T]]((combined, next) => new CompositeOrdering[T](combined, next))
}
/**
 * Extension syntax for chaining orderings: `a orElse b` yields an ordering
 * that uses `b` to break the ties left by `a`.
 */
implicit class OrderingOps[T](val ord: Ordering[T]) extends AnyVal {
  def orElse(ord2: Ordering[T]): CompositeOrdering[T] =
    new CompositeOrdering(ord, ord2)
}
}