All Downloads are FREE. Search and download functionalities are using the official Maven repository.

water.util.FrameUtils Maven / Gradle / Ivy

There is a newer version: 3.8.2.9
Show newest version
package water.util;

import java.io.*;
import java.net.URI;
import java.util.Random;

import water.*;
import water.fvec.Chunk;
import water.fvec.Frame;
import water.fvec.NFSFileVec;
import water.fvec.Vec;
import water.parser.ParseDataset;
import water.parser.ParseSetup;

public class FrameUtils {


  /** Parse given file(s) into the form of single frame represented by the given key.
   *
   * @param okey  destination key for parsed frame
   * @param files  files to parse
   * @return a new frame
   */
  public static Frame parseFrame(Key okey, File ...files) throws IOException {
    if (files == null || files.length == 0) {
      throw new IllegalArgumentException("List of files is empty!");
    }
    for (File f : files) {
      if (!f.exists())
        throw new FileNotFoundException("File not found " + f);
    }
    // Create output key if it is not given
    if(okey == null) okey = Key.make(files[0].getName());
    Key[] inKeys = new Key[files.length];
    for (int i=0; i {
    final int N;
    public double [] res;
    public Vec2ArryTsk(int N){this.N = N;}
    @Override public void setupLocal(){
      res = MemoryManager.malloc8d(N);
    }
    @Override public void map(Chunk c){
      final int off = (int)c.start();
      for(int i = 0; i < c._len; i = c.nextNZ(i))
        res[off+i] = c.atd(i);
    }
    @Override public void reduce(Vec2ArryTsk other){
      if(res != other.res) {
        for(int i = 0; i < res.length; ++i) {
          assert res[i] == 0 || other.res[i] == 0;
          res[i] += other.res[i]; // assuming only one nonzero
        }
      }
    }
  }
  public static double [] asDoubles(Vec v){
    if(v.length() > 100000) throw new IllegalArgumentException("Vec is too big to be extracted into array");
    return new Vec2ArryTsk((int)v.length()).doAll(v).res;
  }
  private static class Vec2IntArryTsk extends MRTask {
    final int N;
    public int [] res;
    public Vec2IntArryTsk(int N){this.N = N;}
    @Override public void setupLocal(){
      res = MemoryManager.malloc4(N);
    }
    @Override public void map(Chunk c){
      final int off = (int)c.start();
      for(int i = 0; i < c._len; i = c.nextNZ(i))
        res[off+i] = (int)c.at8(i);
    }
    @Override public void reduce(Vec2IntArryTsk other){
      if(res != other.res) {
        for(int i = 0; i < res.length; ++i) {
          assert res[i] == 0 || other.res[i] == 0;
          res[i] += other.res[i]; // assuming only one nonzero
        }
      }
    }
  }
  public static int [] asInts(Vec v){
    if(v.length() > 100000) throw new IllegalArgumentException("Vec is too big to be extracted into array");
    return new Vec2IntArryTsk((int)v.length()).doAll(v).res;
  }

  /**
   * Compute a chunk summary (how many chunks of each type, relative size, total size)
   * @param fr
   * @return chunk summary
   */
  public static ChunkSummary chunkSummary(Frame fr) {
    return new ChunkSummary().doAll(fr);
  }

  /** Generate given numbers of keys by suffixing key by given numbered suffix. */
  public static Key[] generateNumKeys(Key mk, int num) { return generateNumKeys(mk, num, "_part"); }
  public static Key[] generateNumKeys(Key mk, int num, String delim) {
    Key[] ks = new Key[num];
    String n = mk!=null ? mk.toString() : "noname";
    String suffix = "";
    if (n.endsWith(".hex")) {
      n = n.substring(0, n.length()-4); // be nice
      suffix = ".hex";
    }
    for (int i=0; i _job;
    final Key _dataset;
    final double _fraction;
    final long _seed;

    public MissingInserter(Key frame, long seed, double frac){
      _dataset = frame; _seed = seed; _fraction = frac;
    }

    /**
     * Driver for MissingInserter
     */
    class MissingInserterDriver extends H2O.H2OCountedCompleter {
      final Frame _frame;
      MissingInserterDriver(Frame frame) {_frame = frame; }
      @Override
      public void compute2() {
        new MRTask() {
          @Override public void map (Chunk[]cs){
            final Random rng = RandomUtils.getRNG(0);
            for (int c = 0; c < cs.length; c++) {
              for (int r = 0; r < cs[c]._len; r++) {
                rng.setSeed(_seed + 1234 * c ^ 1723 * (cs[c].start() + r));
                if (rng.nextDouble() < _fraction) cs[c].setNA(r);
              }
            }
            _job.update(1);
          }
        }.doAll(_frame);
        tryComplete();
      }
    }

    public Job execImpl() {
      _job = new Job(_dataset, Frame.class.getName(), "MissingValueInserter");
      if (DKV.get(_dataset) == null)
        throw new IllegalArgumentException("Invalid Frame key " + _dataset + " (Frame doesn't exist).");
      if (_fraction < 0 || _fraction > 1 ) throw new IllegalArgumentException("fraction must be between 0 and 1.");
      final Frame frame = DKV.getGet(_dataset);
      MissingInserterDriver mid = new MissingInserterDriver(frame);
      int work = frame.vecs()[0].nChunks();
      return _job.start(mid, work);
    }
  }

  /**
   * compute fraction of sparse chunks in this array.
   * @param chks
   * @return
   */
  public static double sparseRatio(Chunk [] chks) {
    double cnt = 0;
    double reg = 1.0/chks.length;
    for(Chunk c :chks)
      if(c.isSparseNA()){
        cnt += c.sparseLenNA()/(double)c.len();
      } else if(c.isSparseZero()){
        cnt += c.sparseLenZero()/(double)c.len();
      } else cnt += 1;
    return cnt * reg;
  }

  public static class WeightedMean extends MRTask {
    private double _wresponse;
    private double _wsum;
    public  double weightedMean() {
      return _wsum == 0 ? 0 : _wresponse / _wsum;
    }
    @Override public void map(Chunk response, Chunk weight, Chunk offset) {
      for (int i=0;i {
    final InputStream _csv;
    final String _path;
    final String _frameName;
    final boolean _overwrite;
    final Job _j;

    public ExportTask(InputStream csv, String path, String frameName, boolean overwrite, Job j) {
      _csv = csv;
      _path = path;
      _frameName = frameName;
      _overwrite = overwrite;
      _j = j;
    }

    private long copyStream(OutputStream os, final int buffer_size) {
      long len = 0;
      int curIdx = 0;
      try {
        byte[] bytes = new byte[buffer_size];
        for (; ; ) {
          int count = _csv.read(bytes, 0, buffer_size);
          if (count <= 0) {
            break;
          }
          len += count;
          os.write(bytes, 0, count);
          int workDone = ((Frame.CSVStream) _csv)._curChkIdx;
          if (curIdx != workDone) {
            _j.update(workDone - curIdx);
            curIdx = workDone;
          }
        }
      } catch (Exception ex) {
        throw new RuntimeException(ex);
      }
      return len;
    }

    @Override public void compute2() {
      OutputStream os = null;
      long written = -1;
      try {
        os = H2O.getPM().create(_path, _overwrite);
        written = copyStream(os, 4 * 1024 * 1024);
      } finally {
        if (os != null) {
          try {
            os.flush(); // Seems redundant, but seeing a short-file-read on windows sometimes
            os.close();
            Log.info("Key '" + _frameName + "' of "+written+" bytes was written to " + _path + ".");
          } catch (Exception e) {
            Log.err(e);
          }
        }
      }
      tryComplete();
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy