// com.cloudera.sparkts.TimeSeries.scala Maven / Gradle / Ivy
//
// Show more of this group Show more artifacts with this name
// Show all versions of sparkts Show documentation
// A library for analyzing large-scale time series data.
// The newest version!
/**
 * Copyright (c) 2015, Cloudera, Inc. All Rights Reserved.
 *
 * Cloudera, Inc. licenses this file to you under the Apache License,
 * Version 2.0 (the "License"). You may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied. See the License for
 * the specific language governing permissions and limitations under the
 * License.
 */

package com.cloudera.sparkts

import java.time._
import breeze.linalg.{diff, DenseMatrix => BDM, DenseVector => BDV}
import org.apache.spark.mllib.linalg.{DenseMatrix, Vector}

import MatrixUtil._
import TimeSeries._

import scala.reflect.ClassTag

/**
 * A set of univariate time series that share one date-time index, stored as the columns of a
 * dense matrix: row i holds the observations at index location i, and column j holds the series
 * identified by keys(j).
 *
 * @param index date-time index shared by every series
 * @param data observations, one column per series
 * @param keys key of each column of data, in column order
 */
class TimeSeries[K](val index: DateTimeIndex, val data: DenseMatrix,
                    val keys: Array[K])(implicit val kClassTag: ClassTag[K])
  extends Serializable {

  // Relies on the implicit Spark-to-Breeze matrix conversion imported from MatrixUtil.
  private def dataToBreeze: BDM[Double] = data

  /**
   * IMPORTANT: this function assumes that the DateTimeIndex is a UniformDateTimeIndex, not an
   * Irregular one.
   *
   * Lags all individual time series of the TimeSeries instance by up to maxLag amount.
   * The lagged time series has its keys generated by the laggedKey function which takes
   * two input parameters: the original key and the lag order, and should return a
   * corresponding lagged key.
   *
   * Example input TimeSeries:
   *   time   a   b
   *   4 pm   1   6
   *   5 pm   2   7
   *   6 pm   3   8
   *   7 pm   4   9
   *   8 pm   5   10
   *
   * With maxLag 2, includeOriginals = true and TimeSeries.laggedStringKey, we would get:
   *   time   a   lag1(a)   lag2(a)  b   lag1(b)  lag2(b)
   *   6 pm   3   2         1         8   7         6
   *   7 pm   4   3         2         9   8         7
   *   8 pm   5   4         3         10  9         8
   *
   * @param maxLag highest lag order to generate; the first maxLag rows are dropped
   * @param includeOriginals whether each series' unlagged values are kept as an extra column
   *                         (keyed with lag order 0)
   * @param laggedKey builds the key of a lagged column from the original key and the lag order
   */
  def lags[U: ClassTag](maxLag: Int, includeOriginals: Boolean, laggedKey: (K, Int) => U)
    : TimeSeries[U] = {
    // Every source column expands into maxLag lagged columns, plus the original if requested.
    val colsPerSeries = maxLag + (if (includeOriginals) 1 else 0)
    val numRows = data.numRows - maxLag

    val dataBreeze = dataToBreeze
    val laggedDataBreeze = new BDM[Double](numRows, colsPerSeries * keys.length)
    (0 until data.numCols).foreach { colIndex =>
      Lag.lagMatTrimBoth(dataBreeze(::, colIndex), laggedDataBreeze, maxLag, includeOriginals,
        colIndex * colsPerSeries)
    }

    // flatMap (rather than reduce) so that a TimeSeries without any keys yields an empty key
    // array instead of throwing.
    val firstOrder = if (includeOriginals) 0 else 1
    val newKeys: Array[U] = keys.flatMap { key =>
      (firstOrder to maxLag).map(lagOrder => laggedKey(key, lagOrder))
    }

    val newDatetimeIndex = index.islice(maxLag, data.rows)

    new TimeSeries[U](newDatetimeIndex, laggedDataBreeze, newKeys)
  }

  /**
   * This is equivalent to lags(maxLag, includeOriginals, TimeSeries.laggedPairKey _).
   * It returns TimeSeries with a new key that is a pair of (original key, lag order).
   */
  def lags[U >: (K, Int)](maxLag: Int, includeOriginals: Boolean): TimeSeries[(K, Int)] =
    lags(maxLag, includeOriginals, laggedPairKey[K])

  /**
   * IMPORTANT: this function assumes that the DateTimeIndex is a UniformDateTimeIndex, not an
   * Irregular one.
   *
   * Lags the specified individual time series of the TimeSeries instance by up to their matching
   * lag amount. Each time series can be indicated to either retain the original value, or drop it.
   *
   * In other words, the lagsPerCol has the following structure:
   *
   *    ("variableName1" -> (keepOriginalValue, maxLag),
   *     "variableName2" -> (keepOriginalValue, maxLag),
   *     ...)
   *
   * Series whose key has no entry in lagsPerCol contribute no columns to the result, and entries
   * whose key is not a key of this TimeSeries are ignored. At least one key of this TimeSeries
   * must have an entry.
   *
   * See description of the above lags function for an example of the lagging process.
   *
   * @param lagsPerCol per-key (keepOriginalValue, maxLag) specification
   * @param laggedKey builds the key of a lagged column from the original key and the lag order
   */
  def lags[U: ClassTag](lagsPerCol: Map[K, (Boolean, Int)], laggedKey: (K, Int) => U)
    : TimeSeries[U] = {
    // (column index, keepOriginal, lag) for every series that has an entry, in column order.
    // All sizes below are derived from this list so that the matrix width, the columns written
    // and the generated keys always agree.
    val specs = keys.indices.flatMap { colIndex =>
      lagsPerCol.get(keys(colIndex)).map { case (keepOriginal, lag) =>
        (colIndex, keepOriginal, lag)
      }.toList
    }

    val maxLag = specs.map(_._3).max
    val numCols = specs.map { case (_, keepOriginal, lag) =>
      lag + (if (keepOriginal) 1 else 0)
    }.sum
    val numRows = data.rows - maxLag

    val dataBreeze = dataToBreeze
    val laggedDataBreeze = new BDM[Double](numRows, numCols)

    // Output column at which the next series' block of lagged columns begins.
    var curStart = 0
    specs.foreach { case (colIndex, keepOriginal, lag) =>
      Lag.lagMatTrimBoth(dataBreeze(::, colIndex), laggedDataBreeze, lag, keepOriginal, curStart)
      curStart += lag + (if (keepOriginal) 1 else 0)
    }

    val newKeys: Array[U] = specs.flatMap { case (colIndex, keepOriginal, lag) =>
      val firstOrder = if (keepOriginal) 0 else 1
      (firstOrder to lag).map(lagOrder => laggedKey(keys(colIndex), lagOrder))
    }.toArray

    // This assumes the datetimeindex's 0 index represents the oldest data point
    val newIndex = index.islice(maxLag, data.rows)

    new TimeSeries[U](newIndex, laggedDataBreeze, newKeys)
  }

  /**
   * This is equivalent to lags(lagsPerCol, TimeSeries.laggedPairKey _).
   * It returns TimeSeries with a new key that is a pair of (original key, lag order).
   */
  def lags[U >: (K, Int)](lagsPerCol: Map[K, (Boolean, Int)])
    : TimeSeries[(K, Int)] = {
    lags(lagsPerCol, laggedPairKey[K]_)
  }

  /**
   * Returns the TimeSeries restricted to the given range of index locations (rows); the keys are
   * unchanged.
   */
  def slice(range: Range): TimeSeries[K] = {
    new TimeSeries[K](index.islice(range), dataToBreeze(range, ::), keys)
  }

  /**
   * Returns a new TimeSeries with vec appended as the last column under the given key. The
   * index is unchanged.
   */
  def union(vec: Vector, key: K): TimeSeries[K] = {
    val mat = BDM.zeros[Double](data.rows, data.cols + 1)
    (0 until data.cols).foreach(c => mat(::, c to c) := dataToBreeze(::, c to c))
    mat(::, -1 to -1) := toBreeze(vec)
    new TimeSeries[K](index, mat, keys :+ key)
  }

  /**
   * Returns a TimeSeries where each time series is differenced with the given order. The new
   * TimeSeries will be missing the first n date-times.
   */
  def differences(lag: Int): TimeSeries[K] = {
    mapSeries(vec => diff(toBreeze(vec).toDenseVector, lag), index.islice(lag, index.size))
  }

  /**
   * Returns a TimeSeries where each time series is differenced with order 1. The new TimeSeries
   * will be missing the first date-time.
   */
  def differences(): TimeSeries[K] = differences(1)

  /**
   * Returns a TimeSeries where each time series is quotiented with the given order. The new
   * TimeSeries will be missing the first n date-times.
   */
  def quotients(lag: Int): TimeSeries[K] = {
    mapSeries(vec => UnivariateTimeSeries.quotients(vec, lag), index.islice(lag, index.size))
  }

  /**
   * Returns a TimeSeries where each time series is quotiented with order 1. The new TimeSeries will
   * be missing the first date-time.
   */
  def quotients(): TimeSeries[K] = quotients(1)

  /**
   * Returns a return series for each time series. Assumes periodic (as opposed to continuously
   * compounded) returns.
   */
  def price2ret(): TimeSeries[K] = {
    mapSeries(vec => UnivariateTimeSeries.price2ret(vec, 1), index.islice(1, index.size))
  }

  /**
   * Iterates over the univariate series (the columns of data) in column order.
   */
  def univariateSeriesIterator(): Iterator[Vector] = {
    new Iterator[Vector] {
      var i = 0
      def hasNext: Boolean = i < data.cols
      def next(): Vector = {
        // Honour the Iterator contract instead of failing with an index error past the end.
        if (!hasNext) throw new NoSuchElementException("next on exhausted series iterator")
        i += 1
        dataToBreeze(::, i - 1)
      }
    }
  }

  /**
   * Iterates over (key, univariate series) pairs in column order.
   */
  def univariateKeyAndSeriesIterator(): Iterator[(K, Vector)] = {
    new Iterator[(K, Vector)] {
      var i = 0
      def hasNext: Boolean = i < data.cols
      def next(): (K, Vector) = {
        // Honour the Iterator contract instead of failing with an index error past the end.
        if (!hasNext) throw new NoSuchElementException("next on exhausted series iterator")
        i += 1
        (keys(i - 1), dataToBreeze(::, i - 1))
      }
    }
  }

  /**
   * Returns one (date-time, row of observations) pair per index location, in index order.
   */
  def toInstants(): IndexedSeq[(ZonedDateTime, Vector)] = {
    (0 until data.rows).map(rowIndex => (index.dateTimeAtLoc(rowIndex),
      fromBreeze(dataToBreeze(rowIndex, ::).inner.toVector)))
  }

  /**
   * Applies a transformation to each series that preserves the time index.
   */
  def mapSeries(f: (Vector) => Vector): TimeSeries[K] = {
    mapSeries(f, index)
  }

  /**
   * Applies a transformation to each series that preserves the time index. Passes the key along
   * with each series.
   */
  def mapSeriesWithKey(f: (K, Vector) => Vector): TimeSeries[K] = {
    val newData = new BDM[Double](index.size, data.cols)
    univariateKeyAndSeriesIterator().zipWithIndex.foreach { case ((key, series), i) =>
      newData(::, i) := toBreeze(f(key, series))
    }
    new TimeSeries[K](index, newData, keys)
  }

  /**
   * Applies a transformation to each series such that the resulting series align with the given
   * time index. Each transformed series is written into a column of newIndex.size rows.
   */
  def mapSeries(f: (Vector) => Vector, newIndex: DateTimeIndex): TimeSeries[K] = {
    val newSize = newIndex.size
    val newData = new BDM[Double](newSize, data.cols)
    univariateSeriesIterator().zipWithIndex.foreach { case (vec, i) =>
      newData(::, i) := toBreeze(f(vec))
    }
    new TimeSeries[K](newIndex, newData, keys)
  }

  /**
   * Applies f to each univariate series and pairs every result with that series' key.
   */
  def mapValues[U](f: (Vector) => U): Seq[(K, U)] = {
    univariateKeyAndSeriesIterator().map(ks => (ks._1, f(ks._2))).toSeq
  }

  /**
   * Gets the first univariate series and its key.
   */
  def head(): (K, Vector) = univariateKeyAndSeriesIterator().next()

  /**
   * Returns a TimeSeries with each univariate series resampled to a new date-time index. Resampling
   * provides flexible semantics for specifying which date-times in each input series correspond to
   * which date-times in the output series, and for aggregating observations when downsampling.
   *
   * Based on the closedRight and stampRight parameters, resampling partitions time into non-
   * overlapping intervals, each corresponding to a date-time in the target index. Each resulting
   * value in the output series is determined by applying an aggregation function over all the
   * values that fall within the corresponding window in the input series. If no values in the
   * input series fall within the window, a NaN is used.
   *
   * Compare with the equivalent functionality in Pandas:
   * http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.resample.html
   *
   * @param targetIndex The date-time index of the resulting series.
   * @param aggr Function for aggregating multiple points that fall within a window.
   * @param closedRight If true, the windows are open on the left and closed on the right. Otherwise
   *                    the windows are closed on the left and open on the right.
   * @param stampRight If true, each date-time in the resulting series marks the end of a window.
   *                   Otherwise, each date-time in the resulting series marks the start of a
   *                   window. In either case, all observations after the end of the last window
   *                   will be ignored.
   * @return The values of the resampled series.
   */
  def resample(
      targetIndex: DateTimeIndex,
      aggr: (Array[Double], Int, Int) => Double,
      closedRight: Boolean,
      stampRight: Boolean): TimeSeries[K] = {
    mapSeries(Resample.resample(_, index, targetIndex, aggr, closedRight, stampRight), targetIndex)
  }
}

/**
 * Key-generation helpers for lagged series and factory methods that assemble a TimeSeries from
 * row-wise samples or column-wise vectors.
 */
object TimeSeries {
  /**
   * Key of a lagged String-keyed series: "lag<lagOrder>(<key>)" for a positive lag order, and
   * the unchanged key otherwise.
   */
  def laggedStringKey(key: String, lagOrder: Int): String =
    if (lagOrder > 0) s"lag${lagOrder}($key)" else key

  /** Key of a lagged series expressed as the pair (original key, lag order). */
  def laggedPairKey[K](key: K, lagOrder: Int): (K, Int) = (key, lagOrder)

  /**
   * Builds a TimeSeries over an IrregularDateTimeIndex from timestamped rows of observations.
   *
   * @param samples one (date-time, observations) pair per row; must be non-empty, and the width
   *                of the matrix is taken from the first sample
   * @param keys key of each column
   * @param zone time zone passed to the resulting index
   */
  def timeSeriesFromIrregularSamples[K](
      samples: Seq[(ZonedDateTime, Array[Double])],
      keys: Array[K],
      zone: ZoneId = ZoneId.systemDefault())
      (implicit kClassTag: ClassTag[K])
    : TimeSeries[K] = {
    val mat = new BDM[Double](samples.length, samples.head._2.length)
    val dts: Array[Long] = samples.map(pair => TimeSeriesUtils.zonedDateTimeToLong(pair._1)).toArray
    // Walk the samples once; indexing samples(i) would be linear per access on a List.
    samples.iterator.zipWithIndex.foreach { case ((_, values), i) =>
      mat(i to i, ::) := new BDV[Double](values)
    }
    new TimeSeries[K](new IrregularDateTimeIndex(dts, zone), mat, keys)
  }

  /**
   * This function should only be called when you can safely make the assumption that the time
   * samples are uniform (monotonically increasing) across time.
   *
   * @param samples one array of observations per row; must be non-empty, and the width of the
   *                matrix is taken from the first sample
   * @param index uniform date-time index of the rows
   * @param keys key of each column
   */
  def timeSeriesFromUniformSamples[K](
      samples: Seq[Array[Double]],
      index: UniformDateTimeIndex,
      keys: Array[K])
     (implicit kClassTag: ClassTag[K]): TimeSeries[K] = {
    val mat = new BDM[Double](samples.length, samples.head.length)

    // Walk the samples once; indexing samples(i) would be linear per access on a List.
    samples.iterator.zipWithIndex.foreach { case (values, i) =>
      mat(i to i, ::) := new BDV[Double](values)
    }
    new TimeSeries[K](index, mat, keys)
  }

  /**
   * Builds a TimeSeries whose columns are the given vectors, in iteration order.
   *
   * @param vectors one vector per series; each is written into a column of index.size rows
   * @param index date-time index of the rows
   * @param keys key of each column
   */
  def timeSeriesFromVectors[K](
      vectors: Iterable[Vector],
      index: DateTimeIndex,
      keys: Array[K])
      (implicit kClassTag: ClassTag[K])
    : TimeSeries[K] = {
    val mat = new BDM[Double](index.size, vectors.size)

    vectors.iterator.zipWithIndex.foreach { case (series, i) =>
      mat(::, i to i) := toBreeze(series)
    }

    new TimeSeries[K](index, mat, keys)
  }
}

/**
 * A model that can filter an array of observations into a destination array. Implementations
 * are Serializable.
 */
trait TimeSeriesFilter extends Serializable {
  /**
   * Takes a time series of i.i.d. observations and filters it to take on this model's
   * characteristics.
   * @param ts Time series of i.i.d. observations.
   * @param dest Array to put the filtered time series, can be the same as ts.
   * @return the dest param.
   */
  def filter(ts: Array[Double], dest: Array[Double]): Array[Double]
}