All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.mahout.h2obindings.H2OHelper Maven / Gradle / Ivy

There is a newer version: 0.13.0
Show newest version
/*
 *  Licensed to the Apache Software Foundation (ASF) under one or more
 *  contributor license agreements.  See the NOTICE file distributed with
 *  this work for additional information regarding copyright ownership.
 *  The ASF licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package org.apache.mahout.h2obindings;

import org.apache.mahout.math.Matrix;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.DenseMatrix;
import org.apache.mahout.math.SparseMatrix;
import org.apache.mahout.math.DenseVector;

import water.MRTask;
import water.Futures;
import water.fvec.Frame;
import water.fvec.Vec;
import water.fvec.Chunk;
import water.parser.ValueString;
import water.util.ArrayUtils;

import java.util.Map;
import java.util.HashMap;
import java.io.Serializable;

import org.apache.mahout.h2obindings.drm.H2ODrm;
import org.apache.mahout.h2obindings.drm.H2OBCast;

// for makeEmptyStrVec
import water.Key;
import water.DKV;
import water.fvec.CStrChunk;

import scala.Function1;
import scala.Function2;

/**
 * Collection of helper methods for H2O backend.
 */
public class H2OHelper {
  /**
   * Predicate to check if data is sparse in Frame.
   *
   * If the number of missing elements is 32x times the number of present
   * elements, consider it as sparse.
   *
   * @param frame Frame storing matrix data.
   * @return True if data is sparse in Frame.
   */
  public static boolean isSparse(Frame frame) {
    long rows = frame.numRows();
    long cols = frame.numCols();

    /**
     * MRTask to aggregate precalculated per-chunk sparse lengths
     */
    class MRTaskNZ extends MRTask {
      long sparselen;
      @Override
      public void map(Chunk chks[]) {
        for (Chunk chk : chks) {
          sparselen += chk.sparseLen();
        }
      }
      @Override
      public void reduce(MRTaskNZ other) {
        sparselen += other.sparselen;
      }
    }

    long sparselen = new MRTaskNZ().doAll(frame).sparselen;

    return (((rows * cols) / (sparselen + 1)) > 32);
  }

  /**
   * Create a Mahout Matrix from a DRM.
   *
   * Create either Sparse or Dense Matrix depending on number of missing
   * elements in DRM.
   *
   * @param drm DRM object to create Matrix from.
   * @return created Matrix.
   */
  public static Matrix matrixFromDrm(H2ODrm drm) {
    Frame frame = drm.frame;
    Vec labels = drm.keys;
    Matrix m;

    if (isSparse(frame)) {
      m = new SparseMatrix((int)frame.numRows(), frame.numCols());
    } else {
      m = new DenseMatrix((int)frame.numRows(), frame.numCols());
    }

    int c = 0;
    // Fill matrix, column at a time.
    for (Vec v : frame.vecs()) {
      for (int r = 0; r < frame.numRows(); r++) {
        double d;
        if (!v.isNA(r) && ((d = v.at(r)) != 0.0)) {
          m.setQuick(r, c, d);
        }
      }
      c++;
    }

    // If string keyed, set the stings as rowlabels.
    if (labels != null) {
      Map map = new HashMap<>();
      ValueString vstr = new ValueString();
      for (long i = 0; i < labels.length(); i++) {
        map.put(labels.atStr(vstr, i).toString(), (int)i);
      }
      m.setRowLabelBindings(map);
    }
    return m;
  }

  /**
   * Calculate Means of elements in a column, and return as a Vector.
   *
   * H2O precalculates means in a Vec, and a Vec corresponds to a column.
   *
   * @param frame Frame backing the H2O DRM.
   * @return Vector of pre-calculated means.
   */
  public static Vector colMeans(Frame frame) {
    double means[] = new double[frame.numCols()];
    for (int i = 0; i < frame.numCols(); i++) {
      means[i] = frame.vecs()[i].mean();
    }
    return new DenseVector(means);
  }

  /**
   * Calculate Sums of elements in a column, and return as a Vector.
   *
   * Run an MRTask Job to add up sums.
   * WARNING: Vulnerable to overflow. No way around it.
   *
   * @param frame Frame backing the H2O DRM.
   * @return Vector of calculated sums.
   */
  public static Vector colSums(Frame frame) {
    /**
     * MRTask to calculate sums of elements in all columns.
     */
    class MRTaskSum extends MRTask {
      public double sums[];
      @Override
      public void map(Chunk chks[]) {
        sums = new double[chks.length];

        for (int c = 0; c < chks.length; c++) {
          for (int r = 0; r < chks[c].len(); r++) {
            sums[c] += chks[c].atd(r);
          }
        }
      }
      @Override
      public void reduce(MRTaskSum other) {
        ArrayUtils.add(sums, other.sums);
      }
    }
    return new DenseVector(new MRTaskSum().doAll(frame).sums);
  }

  /**
   * Calculate Sum of squares of all elements in the DRM.
   *
   * Run an MRTask Job to add up sums of squares.
   * WARNING: Vulnerable to overflow. No way around it.
   *
   * @param frame Frame backing the H2O DRM.
   * @return Sum of squares of all elements in the DRM.
   */
  public static double sumSqr(Frame frame) {
    /**
     * MRTask to calculate sums of squares of all elements.
     */
    class MRTaskSumSqr extends MRTask {
      public double sumSqr;
      @Override
      public void map(Chunk chks[]) {
        for (Chunk chk : chks) {
          for (int r = 0; r < chk.len(); r++) {
            sumSqr += (chk.atd(r) * chk.atd(r));
          }
        }
      }
      @Override
      public void reduce(MRTaskSumSqr other) {
        sumSqr += other.sumSqr;
      }
    }
    return new MRTaskSumSqr().doAll(frame).sumSqr;
  }

  /**
   * Count non-zero elements in all columns, and return as a Vector.
   *
   * Run an MRTask Job to count non-zero elements per column.
   *
   * @param frame Frame backing the H2O DRM.
   * @return Vector of counted non-zero elements.
   */
  public static Vector nonZeroCnt(Frame frame) {
    /**
     * MRTask to count all non-zero elements.
     */
    class MRTaskNonZero extends MRTask {
      public double sums[];
      @Override
      public void map(Chunk chks[]) {
        sums = new double[chks.length];

        for (int c = 0; c < chks.length; c++) {
          for (int r = 0; r < chks[c].len(); r++) {
            if ((long)chks[c].atd(r) != 0) {
              sums[c] ++;
            }
          }
        }
      }
      @Override
      public void reduce(MRTaskNonZero other) {
        ArrayUtils.add(sums, other.sums);
      }
    }
    return new DenseVector(new MRTaskNonZero().doAll(frame).sums);
  }

  /** Convert String->Integer map to Integer->String map */
  private static Map reverseMap(Map map) {
    if (map == null) {
      return null;
    }

    Map rmap = new HashMap<>();

    for(Map.Entry entry : map.entrySet()) {
      rmap.put(entry.getValue(),entry.getKey());
    }

    return rmap;
  }

  /**
   * Calculate optimum chunk size for given parameters.
   *
   * Chunk size is the number of elements stored per partition per column.
   *
   * @param nrow Number of rows in the DRM.
   * @param minHint Minimum number of partitions to create, if passed value is not -1.
   * @param exactHint Exact number of partitions to create, if passed value is not -1.
   * @return Calculated optimum chunk size.
   */
  private static int chunkSize(long nrow, int minHint, int exactHint) {
    int chunkSz;
    int partsHint = Math.max(minHint, exactHint);

    if (partsHint < 1) {
      /* XXX: calculate based on cloud size and # of cpu */
      partsHint = 4;
    }

    chunkSz = (int)(((nrow - 1) / partsHint) + 1);
    if (exactHint > 0) {
      return chunkSz;
    }

    if (chunkSz > 1e6) {
      chunkSz = (int)1e6;
    }

    if (minHint > 0) {
      return chunkSz;
    }

    if (chunkSz < 1e3) {
      chunkSz = (int)1e3;
    }

    return chunkSz;
  }

  /**
   * Ingest a Mahout Matrix into an H2O DRM.
   *
   * Frame is the backing data structure behind CheckpointedDrm.
   *
   * @param m Mahout Matrix to ingest data from.
   * @param minHint Hint for minimum number of partitions in created DRM.
   * @param exactHint Hint for exact number of partitions in created DRM.
   * @return Created H2O backed DRM.
   */
  public static H2ODrm drmFromMatrix(Matrix m, int minHint, int exactHint) {
    // First create an empty (0-filled) frame of the required dimensions
    Frame frame = emptyFrame(m.rowSize(), m.columnSize(), minHint, exactHint);
    Vec labels = null;
    Vec.Writer writers[] = new Vec.Writer[m.columnSize()];
    Futures closer = new Futures();

    // "open" vectors for writing efficiently in bulk
    for (int i = 0; i < writers.length; i++) {
      writers[i] = frame.vecs()[i].open();
    }

    for (int r = 0; r < m.rowSize(); r++) {
      for (int c = 0; c < m.columnSize(); c++) {
        writers[c].set(r, m.getQuick(r, c));
      }
    }

    for (int c = 0; c < m.columnSize(); c++) {
      writers[c].close(closer);
    }
    // If string labeled matrix, create aux Vec
    Map map = m.getRowLabelBindings();
    if (map != null) {
      // label vector must be similarly partitioned like the Frame
      labels = makeEmptyStrVec(frame.anyVec());
      Vec.Writer writer = labels.open();
      Map rmap = reverseMap(map);
      for (int r = 0; r < m.rowSize(); r++) {
        writer.set(r, rmap.get(r));
      }

      writer.close(closer);
    }

    closer.blockForPending();

    return new H2ODrm(frame, labels);
  }

  /**
   * Create an empty (zero-filled) H2O Frame efficiently.
   *
   * Create a zero filled Frame with specified cardinality.
   * Do not actually fill zeroes in each cell, create pre-compressed chunks.
   * Time taken per column asymptotically at O(nChunks), not O(nrow).
   *
   * @param nrow Number of rows in the Frame.
   * @param ncol Number of columns in the Frame.
   * @param minHint Hint for minimum number of chunks per column in created Frame.
   * @param exactHint Hint for exact number of chunks per column in created Frame.
   * @return Created Frame.
   */
  public static Frame emptyFrame(long nrow, int ncol, int minHint, int exactHint) {
    Vec.VectorGroup vg = new Vec.VectorGroup();

    return emptyFrame(nrow, ncol, minHint, exactHint, vg);
  }

  /**
   * Create an empty (zero-filled) H2O Frame efficiently.
   *
   * Create a zero filled Frame with specified cardinality.
   * Do not actually fill zeroes in each cell, create pre-compressed chunks.
   * Time taken per column asymptotically at O(nChunks), not O(nrow).
   *
   * @param nrow Number of rows in the Frame.
   * @param ncol Number of columns in the Frame.
   * @param minHint Hint for minimum number of chunks per column in created Frame.
   * @param exactHint Hint for exact number of chunks per column in created Frame.
   * @param vg Shared VectorGroup so that all columns are similarly partitioned.
   * @return Created Frame.
   */
  public static Frame emptyFrame(long nrow, int ncol, int minHint, int exactHint, Vec.VectorGroup vg) {
    int chunkSz = chunkSize(nrow, minHint, exactHint);
    int nchunks = (int)((nrow - 1) / chunkSz) + 1; // Final number of Chunks per Vec
    long espc[] = new long[nchunks + 1];

    for (int i = 0; i < nchunks; i++) {
      espc[i] = i * chunkSz;
    }
    espc[nchunks] = nrow;
    // Create a vector template for new vectors
    Vec vtemplate = new Vec(vg.addVec(), espc);
    // Make ncol-numeric vectors
    Vec[] vecs = vtemplate.makeCons(ncol, 0, null, null);

    return new Frame(vecs);
  }


  /**
   * The following two methods: vecChunkLen and makeEmptyStrVec
   * are h2o-0.1.25 specific.
   */
  public static Vec makeEmptyStrVec(final Vec template) {
    final int nChunks = template.nChunks();
    Key key = template.group().addVec();
    final Vec emptystr = new Vec(key, template._espc, null, Vec.T_NUM);

    new MRTask() {
      @Override protected void setupLocal() {
        for (int i = 0; i < nChunks; i++) {
          Key k = emptystr.chunkKey(i);
          int chklen = vecChunkLen(template, i);
          int stridx[] = new int[chklen];
          byte b[] = new byte[1]; b[0] = 0;
          for (int j = 0; j < chklen; j++) stridx[j] = -1;
          if (k.home()) DKV.put(k, new CStrChunk(1, b, chklen, stridx), _fs);
        }
        if (emptystr._key.home()) DKV.put(emptystr._key, emptystr, _fs);
      }
    }.doAllNodes();
    return emptystr;
  }

  public static int vecChunkLen(Vec template, int chunk) {
    return (int) (template._espc[chunk + 1] - template._espc[chunk]);
  }

  /**
   * Create an empty (zero-filled) H2O DRM.
   *
   * Create a zero filled DRM with specified cardinality.
   * Use the efficient emptyFrame() method internally.
   *
   * @param nrow Number of rows in the Frame.
   * @param ncol Number of columns in the Frame.
   * @param minHint Hint for minimum number of chunks per column in created Frame.
   * @param exactHint Hint for exact number of chunks per column in created Frame.
   * @return Created DRM.
   */
  public static H2ODrm emptyDrm(long nrow, int ncol, int minHint, int exactHint) {
    return new H2ODrm(emptyFrame(nrow, ncol, minHint, exactHint));
  }

  public static Matrix allreduceBlock(H2ODrm drmA, Object bmfn, Object rfn) {
    class MRTaskMR extends MRTask {
      H2OBCast bmf_out;
      Serializable bmf;
      Serializable rf;

      public MRTaskMR(Object _bmf, Object _rf) {
        bmf = (Serializable) _bmf;
        rf = (Serializable) _rf;
      }

      @Override
      public void map(Chunk chks[]) {
        Function1 f = (Function1) bmf;
        bmf_out = new H2OBCast((Matrix)f.apply(new scala.Tuple2(null, new H2OBlockMatrix(chks))));
      }

      @Override
      public void reduce(MRTaskMR that) {
        Function2 f = (Function2) rf;
        bmf_out = new H2OBCast((Matrix)f.apply(this.bmf_out.value(), that.bmf_out.value()));
      }
    }

    return new MRTaskMR(bmfn, rfn).doAll(drmA.frame).bmf_out.value();
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy