
zhao.algorithmMagic.operands.vector.SparkVector.scala

package zhao.algorithmMagic.operands.vector

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import zhao.algorithmMagic.SerialVersionUID
import zhao.algorithmMagic.exception.OperatorOperationException
import zhao.algorithmMagic.utils.ASMath

/**
 * Spark vector object. Through this class you can connect Spark's API to this framework,
 * integrating cleanly with distributed in-memory computing technology.
 *
 * @param sparkContext the Spark context object
 * @param vector       the Spark vector object being wrapped
 */
final class SparkVector(sparkContext: SparkContext, vector: org.apache.spark.mllib.linalg.Vector)
  extends ThirdVector[SparkVector, Double, Array[Double], org.apache.spark.mllib.linalg.Vector] {

  private val size: Int = vector.size

  private val serialVersionUID = SerialVersionUID.SparkVector.getNum


  /**
   * Calculate the module (norm) length of this vector. For the specific implementation,
   * please refer to the API description.
   *
   * @return the module length of the vector
   */
  override def moduleLength(): Double = {
    // Square root of the sum of squared components (the original summed the raw
    // components, which is not a norm).
    var res = 0.0
    for (i <- 0 until this.size) {
      val v = this.vector.apply(i)
      res += v * v
    }
    math.sqrt(res)
  }

  /**
   * Multiply two vectors; this is also the outer product of the two vectors. For the
   * specific implementation, please refer to the API description.
   *
   * @param vector the vector to multiply by
   * @return the outer product of the two vectors
   */
  override def multiply(vector: SparkVector): SparkVector = {
    val vectorArray1 = this.copyToNewArray()
    val vectorArray2 = vector.copyToNewArray()
    val length1 = vectorArray1.length
    val length2 = vectorArray2.length
    if (length1 == length2)
      SparkVector.parse(sparkContext, ASMath.CrossMultiplication(vectorArray1, vectorArray2))
    else
      throw new OperatorOperationException(
        "When 'DoubleVector1 multiply DoubleVector2', the two vectors of 'DoubleVector' contain different quantities, DoubleVector1=[" + length1 + "], DoubleVector2=[" + length2 + "]")
  }

  /**
   * Calculate the inner product (also known as the scalar product) of two vectors. For
   * the specific implementation, please refer to the API description.
   *
   * @param v the second vector in the computation
   * @return the inner product of the two vectors
   */
  override def innerProduct(v: SparkVector): Double = {
    // Computed element by element, since mllib's Vector does not expose a public dot method.
    val doubles1 = this.copyToNewArray()
    val doubles2 = v.copyToNewArray()
    var res = 0.0
    for (i <- doubles1.indices) res += doubles1(i) * doubles2(i)
    res
  }

  /**
   * @return the vector maintained by the third-party library. Through this function you
   *         can directly obtain the object from the third-party library.
   */
  override def toThirdArray: org.apache.spark.mllib.linalg.Vector = this.vector

  /**
   * Sum two operands. For specific usage, please refer to the API description.
   *
   * @param value the parameter to be summed
   * @return the value after the sum
   */
  override def add(value: SparkVector): SparkVector = {
    val numberOfDimensions1 = this.getNumberOfDimensions
    val numberOfDimensions2 = value.getNumberOfDimensions
    if (numberOfDimensions1 == numberOfDimensions2) {
      val res = new Array[Double](numberOfDimensions1)
      val doubles1 = this.copyToNewArray()
      // Copy from the right-hand operand (the original copied `this` twice by mistake).
      val doubles2 = value.copyToNewArray()
      for (i <- 0 until numberOfDimensions1) {
        res(i) = doubles1(i) + doubles2(i)
      }
      SparkVector.parse(sparkContext, res)
    } else
      throw new OperatorOperationException(
        "When 'DoubleVector1 add DoubleVector2', the two vectors of 'DoubleVector' contain different quantities, DoubleVector1=[" + numberOfDimensions1 + "], DoubleVector2=[" + numberOfDimensions2 + "]")
  }

  /**
   * Take the difference of two operands. For specific usage, please refer to the API
   * description.
   *
   * @param value the parameter to subtract
   * @return the difference value
   */
  override def diff(value: SparkVector): SparkVector = {
    val numberOfDimensions1 = this.getNumberOfDimensions
    val numberOfDimensions2 = value.getNumberOfDimensions
    if (numberOfDimensions1 == numberOfDimensions2) {
      val res = new Array[Double](numberOfDimensions1)
      val doubles1 = this.copyToNewArray()
      val doubles2 = value.copyToNewArray()
      for (i <- 0 until numberOfDimensions1) {
        res(i) = doubles1(i) - doubles2(i)
      }
      SparkVector.parse(sparkContext, res)
    } else
      throw new OperatorOperationException(
        "When 'DoubleVector1 diff DoubleVector2', the two vectors of 'DoubleVector' contain different quantities, DoubleVector1=[" + numberOfDimensions1 + "], DoubleVector2=[" + numberOfDimensions2 + "]")
  }

  /**
   * Shuffle all the data in this object, randomly redistributing the arrangement of the
   * data rows.
   *
   * @param seed the random seed required by the shuffle algorithm
   * @return the object after shuffling
   */
  override def shuffle(seed: Long): SparkVector =
    SparkVector.parse(sparkContext, ASMath.shuffle(this.copyToNewArray(), seed, false))

  /**
   * Add a scalar to every dimension of this vector. For specific usage, please refer to
   * the API description.
   *
   * @param value the parameter to be summed
   * @return the value after the sum
   */
  override def add(value: Number): SparkVector = {
    val dimensions = this.getNumberOfDimensions
    val doubles = this.copyToNewArray()
    val v = value.doubleValue()
    for (i <- 0 until dimensions) {
      // Add the scalar (the original subtracted here, inverting the operation).
      doubles(i) += v
    }
    SparkVector.parse(sparkContext, doubles)
  }

  /**
   * @return a copy of the vector sequence stored in this object. A new array is returned
   *         here, so it supports modification and other operations.
   */
  override def copyToNewArray(): Array[Double] =
    // clone() guards against DenseVector.toArray returning the backing array itself.
    vector.toArray.clone()

  /**
   * @return the number of dimensions contained in the vector
   */
  override def getNumberOfDimensions: Int = size

  /**
   * Subtract a scalar from every dimension of this vector. For specific usage, please
   * refer to the API description.
   *
   * @param value the parameter to subtract
   * @return the difference value
   */
  override def diff(value: Number): SparkVector = {
    val dimensions = this.getNumberOfDimensions
    val doubles = this.copyToNewArray()
    val v = value.doubleValue()
    for (i <- 0 until dimensions) {
      // Subtract the scalar (the original added here, inverting the operation).
      doubles(i) -= v
    }
    SparkVector.parse(sparkContext, doubles)
  }

  /**
   * Convert the current object to its subclass implementation. This has a powerful
   * type-extension effect, enabling conversion from the parent class to the subclass.
   *
   * @return an object of the subclass data type corresponding to the current class.
   */
  override def expand(): SparkVector = this

  /**
   * @return the serialized value of the current object or class; it is the same for
   *         objects of the same type.
   */
  override def getSerialVersionUID: Long = serialVersionUID
}

object SparkVector {

  /**
   * Obtain a Spark vector by building a vector set from the vector data in a file.
   *
   * @param sparkContext the Spark context object, used for the various computations
   * @param textFilePath the file containing the vector data
   * @param splitRex     the regular expression used to split the element data of the
   *                     vectors within the data file
   * @return SparkVector
   */
  def parse(sparkContext: SparkContext, textFilePath: String, splitRex: String): SparkVector = {
    parse(sparkContext, sparkContext.textFile(textFilePath)
      .flatMap(data => data.split(splitRex))
      .map(_.toDouble)
    )
  }

  /**
   * Obtain a Spark vector by building a vector set from an RDD collection.
   *
   * @param sparkContext the Spark context object, used for the various computations
   * @param rdd          the collection to be built into a vector; each of its elements is
   *                     one dimension of the vector
   * @return SparkVector
   */
  def parse(sparkContext: SparkContext, rdd: RDD[Double]): SparkVector = {
    new SparkVector(sparkContext, org.apache.spark.mllib.linalg.Vectors.dense(rdd.collect()))
  }

  /**
   * Obtain a Spark vector by building a vector set from an array of doubles.
   *
   * @param sparkContext the Spark context object, used for the various computations
   * @param doubles      the collection to be built into a vector; each of its elements is
   *                     one dimension of the vector
   * @return SparkVector
   */
  def parse(sparkContext: SparkContext, doubles: Array[Double]): SparkVector = {
    new SparkVector(sparkContext, org.apache.spark.mllib.linalg.Vectors.dense(doubles))
  }
}
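Below is a minimal usage sketch, not part of the published source: it assumes a local SparkContext (the master URL, app name, and variable names here are illustrative) and exercises the parse overloads together with the arithmetic API above.

import org.apache.spark.{SparkConf, SparkContext}
import zhao.algorithmMagic.operands.vector.SparkVector

object SparkVectorExample {
  def main(args: Array[String]): Unit = {
    // A local context, assumed purely for illustration.
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("SparkVectorExample"))

    // Build two vectors from in-memory arrays; parse also accepts an RDD[Double]
    // or a text-file path plus a split regex.
    val v1 = SparkVector.parse(sc, Array(1.0, 2.0, 3.0))
    val v2 = SparkVector.parse(sc, Array(4.0, 5.0, 6.0))

    println(v1.moduleLength())   // Euclidean norm: sqrt(14)
    println(v1.innerProduct(v2)) // 1*4 + 2*5 + 3*6 = 32.0
    println(v1.add(v2).copyToNewArray().mkString(", ")) // 5.0, 7.0, 9.0

    sc.stop()
  }
}

Note that parse(sparkContext, rdd) calls rdd.collect(), so the vector's data always ends up on the driver; the Spark integration here is about ingesting data through Spark, not about storing the vector itself in distributed memory.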




