/*
 * net.seninp.jmotif.sax.TSProcessor Maven / Gradle / Ivy
 *
 * Go to download
 * Show more of this group Show more artifacts with this name
 * Show all versions of jmotif-sax Show documentation
 * An implementation of time series Symbolic Aggregate approXimation and HOTSAX algorithms.
 * The newest version!
 */
package net.seninp.jmotif.sax;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Arrays;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Implements algorithms for low-level data manipulation.
 * 
 * @author Pavel Senin
 * 
 */
public class TSProcessor {

  /** The charset used to decode data files opened by name in {@code readTS(String, int)}. */
  private static final Charset DEFAULT_CHARSET = StandardCharsets.UTF_8;

  /**
   * The latin alphabet, lower case letters a-z. The {@code num2char} methods index into this
   * array, so at most 26 distinct symbols are available.
   */
  public static final char[] ALPHABET = { 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k',
      'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z' };

  // the class-wide logger, created once when the class is initialized
  //
  private static final Logger LOGGER = LoggerFactory.getLogger(TSProcessor.class);

  /**
   * Constructor. The class declares no instance fields, so there is nothing to initialize beyond
   * the implicit superclass construction.
   */
  public TSProcessor() {
    super();
  }

  /**
   * Reads one column of a whitespace-delimited text file as a timeseries. The file is decoded as
   * UTF-8 and parsing is delegated to {@code readTS(BufferedReader, int, int)}.
   * 
   * @param filename The file to read from.
   * @param columnIdx The zero-based index of the column to read.
   * @param sizeLimit The number of values to read, 0 == all.
   * @return data.
   * @throws IOException if the file cannot be opened or read.
   * @throws SAXException if the file does not exist, or if the delegate reader reports a problem
   *         with the requested column.
   */
  public static double[] readFileColumn(String filename, int columnIdx, int sizeLimit)
      throws IOException, SAXException {

    // make sure the path exists
    Path path = Paths.get(filename);
    if (!(Files.exists(path))) {
      throw new SAXException("unable to load data - data source not found.");
    }

    // try-with-resources releases the file handle even when reading fails half-way through
    try (BufferedReader br = new BufferedReader(
        new InputStreamReader(new FileInputStream(filename), StandardCharsets.UTF_8))) {
      return readTS(br, columnIdx, sizeLimit);
    }
  }

  /**
   * Reads a timeseries from a reader. Every line is trimmed and split on whitespace, and the field
   * at {@code columnIdx} is parsed as a double. Lines whose selected field is not a valid number
   * (a header or a blank line, for example) are logged and skipped. The reader is always closed
   * before this method returns or throws.
   * 
   * @param br The reader to use; it is closed by this method.
   * @param columnIdx The zero-based index of the column to read.
   * @param sizeLimit The number of successfully parsed values to read, 0 == all.
   * @return data, in the order read.
   * @throws IOException if reading fails.
   * @throws SAXException if {@code columnIdx} is negative or a line has no such column.
   */
  public static double[] readTS(BufferedReader br, int columnIdx, int sizeLimit)
      throws IOException, SAXException {
    ArrayList<Double> preRes = new ArrayList<>();
    int lineCounter = 0;

    // binding the reader to a try-with-resources closes it on every exit path
    try (BufferedReader reader = br) {
      String line = null;
      while ((line = reader.readLine()) != null) {
        String[] split = line.trim().split("\\s+");
        // the index is zero-based, so a row needs at least columnIdx + 1 fields
        if ((columnIdx < 0) || (split.length <= columnIdx)) {
          throw new SAXException("Unable to read data from column " + columnIdx);
        }
        String str = split[columnIdx];
        double num;
        try {
          num = Double.parseDouble(str);
        }
        catch (NumberFormatException e) {
          // lineCounter counts the values accepted so far, not the physical line number
          LOGGER.info("Skipping the row {} with value \"{}\"", lineCounter, str);
          continue;
        }
        preRes.add(num);
        lineCounter++;
        if ((0 != sizeLimit) && (lineCounter >= sizeLimit)) {
          break;
        }
      }
    }

    double[] res = new double[preRes.size()];
    for (int i = 0; i < res.length; i++) {
      res[i] = preRes.get(i);
    }
    return res;

  }

  /**
   * Reads at most {@code loadLimit} values from the first column of a text file. The file is
   * decoded using the class default charset and parsing is delegated to
   * {@code readTS(BufferedReader, int, int)} with column index 0.
   * 
   * @param dataFileName the file name.
   * @param loadLimit the maximal number of values to load, 0 == all.
   * @return the read data or empty array if nothing to load.
   * @throws SAXException if the file does not exist or the delegate reader reports an error.
   * @throws IOException if the file cannot be opened or read.
   */
  public double[] readTS(String dataFileName, int loadLimit) throws SAXException, IOException {

    Path path = Paths.get(dataFileName);
    if (!(Files.exists(path))) {
      throw new SAXException("unable to load data - data source not found.");
    }

    // try-with-resources releases the file handle even when reading fails half-way through
    try (BufferedReader reader = Files.newBufferedReader(path, DEFAULT_CHARSET)) {
      return readTS(reader, 0, loadLimit);
    }

  }

  /**
   * Finds the maximal value in timeseries. NaN elements never compare greater and are therefore
   * ignored.
   * 
   * @param series The timeseries.
   * @return The max value, or {@code Double.NEGATIVE_INFINITY} if the series is empty or holds
   *         only NaN values.
   */
  public double max(double[] series) {
    // Double.MIN_VALUE is the smallest POSITIVE double, so seeding with it gives a wrong answer
    // for any series whose maximum is zero or negative; start below every possible value instead
    double max = Double.NEGATIVE_INFINITY;
    for (double value : series) {
      if (max < value) {
        max = value;
      }
    }
    return max;
  }

  /**
   * Scans the timeseries for its smallest element.
   * 
   * @param series The timeseries.
   * @return The min value; {@code Double.MAX_VALUE} when no element is smaller than that (an
   *         empty series, for instance).
   */
  public double min(double[] series) {
    double smallest = Double.MAX_VALUE;
    for (double candidate : series) {
      if (candidate < smallest) {
        smallest = candidate;
      }
    }
    return smallest;
  }

  /**
   * Computes the arithmetic mean of the timeseries values.
   * 
   * @param series The timeseries.
   * @return The mean value, or NaN for an empty series.
   */
  public double mean(double[] series) {
    final int n = series.length;
    if (n == 0) {
      return Double.NaN;
    }
    double total = 0D;
    for (int i = 0; i < n; i++) {
      total += series[i];
    }
    return total / n;
  }

  /**
   * Computes the arithmetic mean of integer timeseries values; the sum is accumulated as a
   * double.
   * 
   * @param series The timeseries.
   * @return The mean value, or NaN for an empty series.
   */
  public double mean(int[] series) {
    final int n = series.length;
    if (n == 0) {
      return Double.NaN;
    }
    double total = 0D;
    for (int i = 0; i < n; i++) {
      total += series[i];
    }
    return total / n;
  }

  /**
   * Computes the median value of timeseries. The input array is not modified; a sorted copy is
   * used. For an even number of elements the median is the average of the two middle values.
   * 
   * @param series The timeseries.
   * @return The median value, or NaN for an empty series.
   */
  public double median(double[] series) {
    // an empty series has no middle element; report NaN, as mean() does, instead of failing
    // with an ArrayIndexOutOfBoundsException
    if (series.length == 0) {
      return Double.NaN;
    }

    double[] sorted = series.clone();
    Arrays.sort(sorted);

    int middle = sorted.length / 2;
    if (sorted.length % 2 == 0) {
      return (sorted[middle] + sorted[middle - 1]) / 2;
    }
    return sorted[middle];
  }

  /**
   * Computes the variance of the timeseries: the sum of squared deviations from the mean divided
   * by (n - 1).
   * 
   * @param series The timeseries.
   * @return The variance, or NaN for an empty series.
   */
  public double var(double[] series) {
    final int n = series.length;
    if (n == 0) {
      return Double.NaN;
    }
    final double center = mean(series);
    double squaredDeviations = 0D;
    for (int i = 0; i < n; i++) {
      final double delta = series[i] - center;
      squaredDeviations += delta * delta;
    }
    return squaredDeviations / (n - 1);
  }

  /**
   * Computes the standard deviation (with the n - 1 divisor) in a single pass over the data,
   * using the sum and the sum of squares.
   * 
   * @param series The timeseries.
   * @return the standard deviation, or NaN if the series has fewer than two elements.
   */
  public double stDev(double[] series) {
    double sumOfSquares = 0D;
    double sum = 0D;
    for (double tp : series) {
      sumOfSquares = sumOfSquares + tp * tp;
      sum = sum + tp;
    }
    double len = series.length;
    // For a (nearly) constant series the two terms are almost equal and rounding can make the
    // difference slightly negative, which would turn the square root into NaN. Clamp at zero;
    // a NaN numerator (NaN input) passes through Math.max unchanged.
    double numerator = Math.max(0D, len * sumOfSquares - sum * sum);
    return Math.sqrt(numerator / (len * (len - 1)));
  }

  /**
   * Z-normalizes the series: subtracts the mean from every point and divides by the standard
   * deviation. When the standard deviation is below the threshold, an array of zeros of the same
   * length is returned instead.
   * 
   * @param series the input timeseries.
   * @param normalizationThreshold the zNormalization threshold value.
   * @return Z-normalized time-series.
   */
  public double[] znorm(double[] series, double normalizationThreshold) {
    final double[] normalized = new double[series.length];
    final double deviation = stDev(series);
    // the negated form keeps the comparison outcome unchanged when the deviation is NaN
    if (!(deviation < normalizationThreshold)) {
      final double center = mean(series);
      for (int idx = 0; idx < normalized.length; idx++) {
        normalized[idx] = (series[idx] - center) / deviation;
      }
    }
    return normalized;
  }

  /**
   * Approximates the timeseries using PAA (piecewise aggregate approximation). The series is cut
   * into {@code paaSize} segments of equal, possibly fractional, width; a point straddling a
   * segment border contributes to each side in proportion to the covered fraction. Each output
   * value is the weighted sum of the segment's points divided by the segment width. NaN values
   * get no special treatment: they take part in the sum like any other value.
   * 
   * @param ts The timeseries to approximate.
   * @param paaSize The desired length of approximated timeseries.
   * @return PAA-approximated timeseries; a copy of the input when the sizes are equal.
   * @throws SAXException if {@code paaSize} is greater than the timeseries length.
   * 
   */
  public double[] paa(double[] ts, int paaSize) throws SAXException {
    final int tsLength = ts.length;
    if (paaSize > tsLength) {
      throw new SAXException("PAA size can't be greater than the timeseries size.");
    }
    // trivial case: every point is its own segment
    if (paaSize == tsLength) {
      return Arrays.copyOf(ts, tsLength);
    }

    final double[] approximation = new double[paaSize];
    final double segmentWidth = (double) tsLength / (double) paaSize;

    for (int seg = 0; seg < paaSize; seg++) {
      // fractional borders of the current segment
      final double left = seg * segmentWidth;
      final double right = (seg + 1) * segmentWidth;

      // share of the first and the last touched point that belongs to this segment
      final double headWeight = Math.ceil(left) - left;
      final double tailWeight = right - Math.floor(right);

      // every point touched by the segment, including partially covered border points
      final int firstIdx = (int) Math.floor(left);
      final int endIdx = (int) Math.ceil(right);
      final double[] window = Arrays.copyOfRange(ts, firstIdx, endIdx);

      if (headWeight > 0) {
        window[0] *= headWeight;
      }
      if (tailWeight > 0) {
        window[window.length - 1] *= tailWeight;
      }

      double weightedSum = 0.0;
      for (int k = 0; k < window.length; k++) {
        weightedSum += window[k];
      }
      approximation[seg] = weightedSum / segmentWidth;
    }
    return approximation;
  }

  /**
   * Maps every value of the timeseries to a letter using the given cuts, via
   * {@code num2char(double, double[])}. Useful for not-normal distribution cuts.
   * 
   * @param vals The timeseries.
   * @param cuts The cut intervals.
   * @return The timeseries SAX representation, one char per input value.
   */
  public char[] ts2String(double[] vals, double[] cuts) {
    final char[] letters = new char[vals.length];
    int pos = 0;
    for (double v : vals) {
      letters[pos++] = num2char(v, cuts);
    }
    return letters;
  }

  /**
   * Maps every value of the timeseries to its cut index, via {@code num2index}.
   * 
   * @param series The timeseries to convert.
   * @param cuts The alphabet cuts.
   * @return SAX cuts indices, one per input value.
   * @throws Exception if error occurs.
   */
  public int[] ts2Index(double[] series, double[] cuts) throws Exception {
    final int[] indices = new int[series.length];
    int pos = 0;
    for (double point : series) {
      indices[pos++] = num2index(point, cuts);
    }
    return indices;
  }

  /**
   * Maps a number to the alphabet letter of the interval it falls into. Non-negative values are
   * located by scanning the cuts from the top down, all other values by scanning from the bottom
   * up.
   * 
   * @param value the value to map.
   * @param cuts the array of intervals.
   * @return character corresponding to numeric value.
   */
  public char num2char(double value, double[] cuts) {
    int position;
    if (value >= 0) {
      // walk down from the highest cut while the cut is still above the value
      position = cuts.length;
      for (; position > 0; position--) {
        if (!(cuts[position - 1] > value)) {
          break;
        }
      }
    }
    else {
      // walk up from the lowest cut while the cut does not exceed the value
      position = 0;
      for (; position < cuts.length; position++) {
        if (!(cuts[position] <= value)) {
          break;
        }
      }
    }
    return ALPHABET[position];
  }

  /**
   * Converts index into char by a direct lookup in {@code ALPHABET}.
   * 
   * @param idx The index value; it must be a valid index of {@code ALPHABET} (0-25), otherwise
   *        the array access fails with an {@code ArrayIndexOutOfBoundsException}.
   * @return The char by index.
   */
  public char num2char(int idx) {
    return ALPHABET[idx];
  }

  /**
   * Maps a number to a cut index: the position of the first cut that is not less than or equal
   * to the value, or the number of cuts if there is none.
   * 
   * @param value the value to map.
   * @param cuts the array of intervals.
   * @return the index, between 0 and {@code cuts.length} inclusive.
   */
  public int num2index(double value, double[] cuts) {
    for (int i = 0; i < cuts.length; i++) {
      if (!(cuts[i] <= value)) {
        return i;
      }
    }
    return cuts.length;
  }

  /**
   * Extract subseries out of series. The result is a new array holding the elements from
   * {@code start} (inclusive) to {@code end} (exclusive).
   * 
   * @param series The series array.
   * @param start the fragment start, inclusive.
   * @param end the fragment end, exclusive.
   * @return The subseries.
   * @throws IndexOutOfBoundsException if {@code start} is negative, greater than {@code end}, or
   *         {@code end} is beyond the series length.
   */
  public double[] subseriesByCopy(double[] series, int start, int end)
      throws IndexOutOfBoundsException {
    if ((start > end) || (start < 0) || (end > series.length)) {
      // report the requested end itself; the former message printed (end - start) under the
      // "end" label
      throw new IndexOutOfBoundsException("Unable to extract subseries, series length: "
          + series.length + ", start: " + start + ", end: " + end);
    }
    return Arrays.copyOfRange(series, start, end);
  }

  /**
   * Prettyfies the timeseries for screen output: the formatted values are separated by commas
   * and enclosed in square brackets, e.g. {@code [1.5,2.25]}. An empty series yields {@code []}.
   * 
   * @param series the data.
   * @param df the number format to use.
   * 
   * @return The timeseries formatted for screen output.
   */
  public String seriesToString(double[] series, NumberFormat df) {
    StringBuilder sb = new StringBuilder();
    sb.append('[');
    for (int i = 0; i < series.length; i++) {
      // put the separator before every value but the first: there is then no trailing comma to
      // cut off, which the former delete() call got wrong (it removed the last digit instead)
      if (i > 0) {
        sb.append(',');
      }
      sb.append(df.format(series[i]));
    }
    return sb.append(']').toString();
  }

  /**
   * Scales the data by dividing every value by the maximum reported by {@code max(data)}.
   * NOTE(review): the result lies within 0-1 only if all values are non-negative and the maximum
   * is positive - confirm that callers guarantee this.
   * 
   * @param data the dataset.
   * @return normalized dataset, a new array of the same length.
   */
  public double[] normOne(double[] data) {
    final double[] scaled = new double[data.length];
    final double peak = max(data);
    int pos = 0;
    for (double value : data) {
      scaled[pos++] = value / peak;
    }
    return scaled;
  }

}