
water.fvec.Frame Maven / Gradle / Ivy
package water.fvec; import org.joda.time.format.DateTimeFormat; import org.joda.time.format.DateTimeFormatter; import water.*; import water.exceptions.H2OIllegalArgumentException; import water.parser.BufferedString; import water.util.FrameUtils; import water.util.Log; import water.util.PrettyPrint; import water.util.TwoDimTable; import java.io.IOException; import java.io.InputStream; import java.util.Arrays; import java.util.HashMap; /** A collection of named {@link Vec}s, essentially an R-like Distributed Data Frame. * *
Frames represent a large distributed 2-D table with named columns * ({@link Vec}s) and numbered rows. A reasonable column limit is * 100K columns, but there's no hard-coded limit. There's no real row * limit except memory; Frames (and Vecs) with many billions of rows are used * routinely. * *
A Frame is a collection of named Vecs; a Vec is a collection of numbered * {@link Chunk}s. A Frame is small, cheaply and easily manipulated, it is * commonly passed-by-Value. It exists on one node, and may be * stored in the {@link DKV}. Vecs, on the other hand, must be stored in the * {@link DKV}, as they represent the shared common management state for a collection * of distributed Chunks. * *
Multiple Frames can reference the same Vecs, although this sharing can * make Vec lifetime management complex. Commonly temporary Frames are used * to work with a subset of some other Frame (often during algorithm * execution, when some columns are dropped from the modeling process). The * temporary Frame can simply be ignored, allowing the normal GC process to * reclaim it. Such temp Frames usually have a {@code null} key. * *
All the Vecs in a Frame belong to the same {@link Vec.VectorGroup} which * then enforces {@link Chunk} row alignment across Vecs (or at least enforces * a low-cost access model). Parallel and distributed execution touching all * the data in a Frame relies on this alignment to get good performance. * *
Example: Make a Frame from a CSV file:
* File file = ... * NFSFileVec nfs = NFSFileVec.make(file); // NFS-backed Vec, lazily read on demand * Frame fr = water.parser.ParseDataset.parse(Key.make("myKey"),nfs._key); ** *Example: Find and remove the Vec called "unique_id" from the Frame, * since modeling with a unique_id can lead to overfitting: *
* Vec uid = fr.remove("unique_id"); ** *Example: Move the response column to the last position: *
* fr.add("response",fr.remove("response")); ** */
// NOTE(review): this file is an HTML-extracted copy of water/fvec/Frame.java; all text
// between '<' and '>' (generic type parameters and some code spans) was stripped by the
// extractor -- e.g. "Lockable" below was "Lockable<Frame>".  Recover from upstream h2o-3.
public class Frame extends Lockable {
  /** Vec names */
  public String[] _names;
  private boolean _lastNameBig; // Last name is "Cxxx" and has largest number
  private Key[] _keys;          // Keys for the vectors
  private transient Vec[] _vecs; // The Vectors (transient to avoid network traffic)
  private transient Vec _col0; // First readable vec; fast access to the VectorGroup's Chunk layout

  /** @return true iff any Vec of this Frame reports at least one NA.
   *  NOTE(review): reads the transient {@code _vecs} directly (not {@code vecs()});
   *  presumably callers ensure the vecs are already loaded -- confirm. */
  public boolean hasNAs(){ for(Vec v:_vecs) if(v.naCnt() > 0) return true; return false; }

  /** Creates an internal frame composed of the given Vecs and default names.  The frame has no key. */
  public Frame( Vec... vecs ){ this(null,vecs);}

  /** Creates an internal frame composed of the given Vecs and names.  The frame has no key. */
  public Frame( String names[], Vec vecs[] ) { this(null,names,vecs); }

  /** Creates an empty frame with given key. */
  public Frame( Key key ) { this(key,null,new Vec[0]); }

  /**
   * Special constructor for data with unnamed columns (e.g. svmlight) bypassing *all* checks.
   * @param key frame key, may be null
   * @param vecs column Vecs; default "C1".."Cn" names are generated for them
   * @param noChecks must be true; only documents the caller's intent to skip validation
   */
  public Frame( Key key, Vec vecs[], boolean noChecks) {
    super(key);
    assert noChecks;
    _vecs = vecs;
    _names = new String[vecs.length];
    _keys = new Key[vecs.length];
    for (int i = 0; i < vecs.length; i++) {
      _names[i] = defaultColName(i);
      _keys[i] = vecs[i]._key;
    }
  }

  /** Creates a frame with given key, names and vectors. 
*/ public Frame( Key key, String names[], Vec vecs[] ) { super(key); // Require all Vecs already be installed in the K/V store for( Vec vec : vecs ) DKV.prefetch(vec._key); for( Vec vec : vecs ) assert DKV.get(vec._key) != null : " null vec: "+vec._key; // Always require names if( names==null ) { // Make default names, all known to be unique _names = new String[vecs.length]; _keys = new Key [vecs.length]; _vecs = vecs; for( int i=0; i make() ); _names= fr._names.clone(); _keys = fr._keys .clone(); _vecs = fr.vecs().clone(); _lastNameBig = fr._lastNameBig; } /** Default column name maker */ public static String defaultColName( int col ) { return "C"+(1+col); } // Make unique names. Efficient for the special case of appending endless // versions of "C123" style names where the next name is +1 over the prior // name. All other names take the O(n^2) lookup. private int pint( String name ) { try { return Integer.valueOf(name.substring(1)); } catch( NumberFormatException fe ) { } return 0; } public String uniquify( String name ) { String n = name; int lastName = 0; if( name.length() > 0 && name.charAt(0)=='C' ) lastName = pint(name); if( _lastNameBig && _names.length > 0 ) { String last = _names[_names.length-1]; if( !last.equals("") && last.charAt(0)=='C' && lastName == pint(last)+1 ) return name; } int cnt=0, again, max=0; do { again = cnt; for( String s : _names ) { if( lastName > 0 && s.charAt(0)=='C' ) max = Math.max(max,pint(s)); if( n.equals(s) ) n = name+(cnt++); } } while( again != cnt ); if( lastName == max+1 ) _lastNameBig = true; return n; } /** Check that the vectors are all compatible. All Vecs have their content * sharded using same number of rows per chunk, and all names are unique. * Throw an IAE if something does not match. 
*/
  private void checkCompatible( String name, Vec vec ) {
    if( vec instanceof AppendableVec ) return; // New Vectors are endlessly compatible
    Vec v0 = anyVec();
    if( v0 == null ) return; // No fixed-size Vecs in the Frame
    // Vector group has to be the same, or else the layout has to be the same,
    // or else the total length has to be small.
    if( !v0.checkCompatible(vec) ) {
      if(!Vec.VectorGroup.sameGroup(v0,vec))
        Log.err("Unexpected incompatible vector group, " + v0.group() + " != " + vec.group());
      if(!Arrays.equals(v0.espc(), vec.espc()))
        Log.err("Unexpected incompatible espc, " + Arrays.toString(v0.espc()) + " != " + Arrays.toString(vec.espc()));
      throw new IllegalArgumentException("Vec " + name + " is not compatible with the rest of the frame");
    }
  }

  /** Quick compatibility check between Frames.  Used by some tests for efficient equality checks. */
  public boolean isCompatible( Frame fr ) {
    if( numCols() != fr.numCols() ) return false;
    if( numRows() != fr.numRows() ) return false;
    // NOTE(review): a span of code was stripped by HTML extraction here -- the per-column
    // compatibility loop and the headers of the numCols()/numRows()/anyVec()/vecs()
    // accessors are missing.  Recover from the upstream h2o-3 sources.
    for( int i=0; i key : _keys ) DKV.prefetch(key);
    Vec [] vecs = new Vec[_keys.length];
    for( int i=0; i<_keys.length; i++ ) vecs[i] = _keys[i].get();
    return vecs;
  }

  /** Convenience to accessor for last Vec
   *  @return last Vec */
  public Vec lastVec() { vecs(); return _vecs [_vecs.length -1]; }

  /** Convenience to accessor for last Vec name
   *  @return last Vec name */
  public String lastVecName() { return _names[_names.length-1]; }

  /** Force a cache-flush and reload, assuming vec mappings were altered
   *  remotely, or that the _vecs array was shared and now needs to be a
   *  defensive copy.
   *  @return the new instance of the Frame's Vec[] */
  public final Vec[] reloadVecs() { _vecs=null; return vecs(); }

  /** Returns the Vec by given index, implemented by code: {@code vecs()[idx]}.
   *  @param idx idx of column
   *  @return this frame idx-th vector, never returns null
*/
  public final Vec vec(int idx) { return vecs()[idx]; }

  /** Return a Vec by name, or null if missing
   *  @return a Vec by name, or null if missing */
  public Vec vec(String name) { int idx = find(name); return idx==-1 ? null : vecs()[idx]; }

  /** Finds the column index with a matching name, or -1 if missing
   *  @return the column index with a matching name, or -1 if missing */
  public int find( String name ) {
    if( name == null ) return -1;
    assert _names != null;
    for( int i=0; i<_names.length; i++ )
      if( name.equals(_names[i]) ) return i;
    return -1;
  }

  /** Finds the matching column index, or -1 if missing
   *  @return the matching column index, or -1 if missing */
  public int find( Vec vec ) {
    Vec[] vecs = vecs(); //warning: side-effect
    if (vec == null) return -1;
    // NOTE(review): code was stripped by HTML extraction here -- the find-by-Vec loop,
    // find-by-Key, bulkRollups(), and the head of the VecSpecifier helper class (with
    // its Key-typed _frame field) are missing.  Recover from upstream h2o-3.
    for( int i=0; i_frame;
    String _column_name;
    public Vec vec() {
      Value v = DKV.get(_frame);
      if (null == v) return null;
      Frame f = v.get();
      if (null == f) return null;
      return f.vec(_column_name);
    }
  }

  /** Type for every Vec */
  public byte[] types() {
    Vec[] vecs = vecs();
    byte bs[] = new byte[vecs.length];
    // NOTE(review): stripped span -- the types() loop and several rollup accessors
    // (domains()/lengths()/means()/mults()?) are missing up to the "> 1e-6" remnant.
    for( int i=0; i 1e-6;
  }

  /** The {@code Vec.byteSize} of all Vecs
   *  @return the {@code Vec.byteSize} of all Vecs */
  public long byteSize() {
    Vec[] vecs = bulkRollups();
    long sum=0;
    for (Vec vec : vecs) sum += vec.byteSize();
    return sum;
  }

  /** 64-bit checksum of the checksums of the vecs.  SHA-265 checksums of the
   *  chunks are XORed together.  Since parse always parses the same pieces of
   *  files into the same offsets in some chunk this checksum will be
   *  consistent across reparses.
   *  @return 64-bit Frame checksum */
  @Override protected long checksum_impl() {
    Vec[] vecs = vecs();
    long _checksum = 0;
    for( int i = 0; i < _names.length; ++i ) {
      long vec_checksum = vecs[i].checksum();
      _checksum ^= vec_checksum;
      // fold the column position in, so column order affects the checksum
      long tmp = (2147483647L * i);
      _checksum ^= tmp;
    }
    _checksum *= (0xBABE + Arrays.hashCode(_names));
    // TODO: include column types? Vec.checksum() should include type? 
    return _checksum;
  }

  // Add a bunch of vecs
  public void add( String[] names, Vec[] vecs) { bulkAdd(names, vecs); }
  public void add( String[] names, Vec[] vecs, int cols ) {
    if (null == vecs || null == names) return;
    if (cols == names.length && cols == vecs.length) {
      bulkAdd(names, vecs);
    } else {
      for (int i = 0; i < cols; i++) add(names[i], vecs[i]);
    }
  }

  /** Append multiple named Vecs to the Frame.  Names are forced unique, by appending a
   *  unique number if needed. */
  private void bulkAdd(String[] names, Vec[] vecs) {
    String[] tmpnames = names.clone();
    int N = names.length;
    assert(names.length == vecs.length):"names = " + Arrays.toString(names) + ", vecs len = " + vecs.length;
    // NOTE(review): a large span was stripped by HTML extraction here -- the rest of
    // bulkAdd, the single-column add() variants, and the subframe(String[]) /
    // subframe(String[],double) methods are missing; only the tail of the
    // subframe(String[],boolean,double) javadoc survives below.  Recover from upstream.
    for (int i=0; i replaceBy false
. * Else replace a missing column by a constant column with given value.
 *
 * @param names list of column names to extract
 * @param replaceBy should be missing column replaced by a constant column
 * @param c value for constant column
 * @return array of 2 frames, the first is containing a desired subframe, the second one contains newly created columns or null
 * @throws IllegalArgumentException ifreplaceBy
is false and there is a missing column in this frame */
  private Frame[] subframe(String[] names, boolean replaceBy, double c){
    Vec [] vecs = new Vec[names.length];
    Vec [] cvecs = replaceBy ? new Vec [names.length] : null;
    String[] cnames = replaceBy ? new String[names.length] : null;
    int ccv = 0; // counter of constant columns
    vecs(); // Preload the vecs
    // NOTE(review): the map's generic parameters were stripped by HTML extraction;
    // this was presumably "HashMap<String,Integer> map" -- recover from upstream.
    HashMapmap = new HashMap<>((int) ((names.length/0.75f)+1)); // avoid rehashing by set up initial capacity
    for(int i = 0; i < _names.length; ++i) map.put(_names[i], i);
    for(int i = 0; i < names.length; ++i)
      if(map.containsKey(names[i])) vecs[i] = _vecs[map.get(names[i])];
      else if (replaceBy) {
        Log.warn("Column " + names[i] + " is missing, filling it in with " + c);
        assert cnames != null;
        cnames[ccv] = names[i];
        vecs[i] = cvecs[ccv++] = anyVec().makeCon(c);
      }
    return new Frame[] {
      new Frame(Key.make("subframe"+Key.make().toString()), names,vecs),
      ccv>0 ? new Frame(Key.make("subframe"+Key.make().toString()), Arrays.copyOf(cnames, ccv), Arrays.copyOf(cvecs,ccv)) : null
    };
  }

  /** Allow rollups for all written-into vecs; used by {@link MRTask} once
   *  writing is complete.
   *  @return the original Futures, for flow-coding */
  public Futures postWrite(Futures fs) {
    for( Vec v : vecs() ) v.postWrite(fs);
    return fs;
  }

  /** Actually remove/delete all Vecs from memory, not just from the Frame.
   *  @return the original Futures, for flow-coding */
  @Override protected Futures remove_impl(Futures fs) {
    final Key[] keys = _keys;
    if( keys.length==0 ) return fs;
    // Get the nChunks without calling anyVec - which loads all Vecs eagerly,
    // only to delete them.  Supports Frames with some Vecs already deleted, as
    // a Scope cleanup action might delete Vecs out of order. 
Vec v = _col0;
    if( v == null ) {
      Vec[] vecs = _vecs; // Read once, in case racily being cleared
      // NOTE(review): a span was stripped by HTML extraction here -- the rest of
      // remove_impl (chunk deletion) and the head of remove(int[] idxs), including the
      // bounds check whose ">= vecs().length" remnant survives below.  Recover from upstream.
      if( vecs != null ) for( int i=0; i = vecs().length) throw new ArrayIndexOutOfBoundsException();
    Arrays.sort(idxs);
    Vec[] res = new Vec[idxs.length];
    Vec[] rem = new Vec[_vecs.length-idxs.length];
    String[] names = new String[rem.length];
    Key [] keys = new Key [rem.length];
    int j = 0;
    int k = 0;
    int l = 0;
    for(int i = 0; i < _vecs.length; ++i) {
      if(j < idxs.length && i == idxs[j]) {
        ++j;
        res[k++] = _vecs[i];  // removed column, in idxs order
      } else {
        rem [l] = _vecs [i];  // surviving column, original order preserved
        names[l] = _names[i];
        keys [l] = _keys [i];
        ++l;
      }
    }
    _vecs = rem;
    _names= names;
    _keys = keys;
    assert l == rem.length && k == idxs.length;
    return res;
  }

  /** Removes a numbered column.
   *  @return the removed column */
  public final Vec remove( int idx ) {
    int len = _names.length;
    if( idx < 0 || idx >= len ) return null;
    Vec v = vecs()[idx];
    // shift the tail down one slot, then shrink all three parallel arrays
    System.arraycopy(_names,idx+1,_names,idx,len-idx-1);
    System.arraycopy(_vecs ,idx+1,_vecs ,idx,len-idx-1);
    System.arraycopy(_keys ,idx+1,_keys ,idx,len-idx-1);
    _names = Arrays.copyOf(_names,len-1);
    _vecs = Arrays.copyOf(_vecs ,len-1);
    _keys = Arrays.copyOf(_keys ,len-1);
    if( v == _col0 ) _col0 = null; // invalidate the cached first-readable-vec
    return v;
  }

  /** Remove given interval of columns from frame.  Motivated by R intervals. 
* @param startIdx - start index of column (inclusive)
   *  @param endIdx - end index of column (exclusive)
   *  @return array of removed columns */
  Vec[] remove(int startIdx, int endIdx) {
    int len = _names.length;
    int nlen = len - (endIdx-startIdx);
    String[] names = new String[nlen];
    Key[] keys = new Key[nlen];
    Vec[] vecs = new Vec[nlen];
    vecs(); // ensure _vecs is populated before we slice it
    if (startIdx > 0) {
      // keep the prefix [0, startIdx)
      System.arraycopy(_names, 0, names, 0, startIdx);
      System.arraycopy(_vecs, 0, vecs, 0, startIdx);
      System.arraycopy(_keys, 0, keys, 0, startIdx);
    }
    nlen -= startIdx;
    if (endIdx < _names.length+1) {
      // keep the suffix [endIdx, len)
      System.arraycopy(_names, endIdx, names, startIdx, nlen);
      System.arraycopy(_vecs, endIdx, vecs, startIdx, nlen);
      System.arraycopy(_keys, endIdx, keys, startIdx, nlen);
    }
    Vec[] vecX = Arrays.copyOfRange(_vecs,startIdx,endIdx);
    _names = names;
    _vecs = vecs;
    _keys = keys;
    _col0 = null;
    return vecX;
  }

  /** Restructure a Frame completely */
  public void restructure( String[] names, Vec[] vecs) {
    restructure(names, vecs, vecs.length);
  }

  /** Restructure a Frame completely, but only for a specified number of columns (counting up) */
  public void restructure( String[] names, Vec[] vecs, int cols) {
    // Make empty to dodge asserts, then "add()" them all which will check for
    // compatible Vecs & names.
    _names = new String[0];
    _keys = new Key [0];
    _vecs = new Vec [0];
    add(names,vecs,cols);
  }

  // --------------------------------------------
  // Utilities to help external Frame constructors, e.g. Spark.

  // Make an initial Frame & lock it for writing.  Build Vec Keys.
  void preparePartialFrame( String[] names ) {
    // Nuke any prior frame (including freeing storage) & lock this one
    if( _keys != null ) delete_and_lock();
    else write_lock();
    _names = names;
    _keys = new Vec.VectorGroup().addVecs(names.length);
    // No Vectors tho!!! These will be added *after* the import
  }

  // Only serialize strings, not H2O internal structures

  // Make NewChunks to for holding data from e.g. Spark.  Once per set of
  // Chunks in a Frame, before filling them. 
// This can be called in parallel
  // for different Chunk#'s (cidx); each Chunk can be filled in parallel.
  static NewChunk[] createNewChunks(String name, byte[] type, int cidx) {
    Frame fr = (Frame) Key.make(name).get();
    NewChunk[] nchks = new NewChunk[fr.numCols()];
    for (int i = 0; i < nchks.length; i++) {
      nchks[i] = new NewChunk(new AppendableVec(fr._keys[i], type[i]), cidx);
    }
    return nchks;
  }

  // Compress & DKV.put NewChunks.  Once per set of Chunks in a Frame, after
  // filling them.  Can be called in parallel for different sets of Chunks.
  static void closeNewChunks(NewChunk[] nchks) {
    Futures fs = new Futures();
    for (NewChunk nchk : nchks) {
      nchk.close(fs);
    }
    fs.blockForPending();
  }

  // Build real Vecs from loose Chunks, and finalize this Frame.  Called once
  // after any number of [create,close]NewChunks.
  void finalizePartialFrame( long[] espc, String[][] domains, byte[] types ) {
    // Compute elems-per-chunk.
    // Roll-up elem counts, so espc[i] is the starting element# of chunk i.
    int nchunk = espc.length;
    long espc2[] = new long[nchunk+1]; // Shorter array
    long x=0; // Total row count so far
    // NOTE(review): a large span was stripped by HTML extraction here -- the espc
    // roll-up loop, the Vec construction/unlock, and the head of the deepSlice javadoc
    // are missing.  Recover from the upstream h2o-3 sources.
    for( int i=0; i Semantics are a little odd, to match R's.  Each dimension spec can be: *
* *- null - all of them *
- a sorted list of negative numbers (no dups) - all BUT these *
- an unordered list of positive - just these, allowing dups *
The numbering is 1-based; zero's are not allowed in the lists, nor are out-of-range values.
   *  @return the sliced Frame */
  public Frame deepSlice( Object orows, Object ocols ) {
    // ocols is either a long[] or a Frame-of-1-Vec
    long[] cols;
    if( ocols == null ) cols = null;
    else if (ocols instanceof long[]) cols = (long[])ocols;
    else if (ocols instanceof Frame) {
      Frame fr = (Frame) ocols;
      if (fr.numCols() != 1)
        throw new IllegalArgumentException("Columns Frame must have only one column (actually has " + fr.numCols() + " columns)");
      long n = fr.anyVec().length();
      if (n > MAX_EQ2_COLS)
        throw new IllegalArgumentException("Too many requested columns (requested " + n +", max " + MAX_EQ2_COLS + ")");
      cols = new long[(int)n];
      Vec.Reader v = fr.anyVec().new Reader();
      for (long i = 0; i < v.length(); i++) cols[(int)i] = v.at8(i);
    } else
      throw new IllegalArgumentException("Columns is specified by an unsupported data type (" + ocols.getClass().getName() + ")");
    // Since cols is probably short convert to a positive list.
    int c2[];
    if( cols==null ) {
      c2 = new int[numCols()];
      // NOTE(review): a span was stripped by HTML extraction here -- the identity fill,
      // the positive-list branch test, and the negative-list setup survive only in part
      // ("= 0 )" below is the remnant of a comparison).  Recover from upstream.
      for( int i=0; i
= 0 ) {
      c2 = new int[cols.length];
      for( int i=0; i = cols.length || i < (-(1+cols[j])) ) c2[i-j] = i;
        else j++;
      }
    }
    // validate the selection against the actual column count
    for (int aC2 : c2)
      if (aC2 >= numCols())
        throw new IllegalArgumentException("Trying to select column " + (aC2 + 1) + " but only " + numCols() + " present.");
    if( c2.length==0 )
      throw new IllegalArgumentException("No columns selected (did you try to select column 0 instead of column 1?)");
    // Do Da Slice
    // orows is either a long[] or a Vec
    if (numRows() == 0) {
      return new MRTask() {
        @Override public void map(Chunk[] chks, NewChunk[] nchks) { for (NewChunk nc : nchks) nc.addNA(); }
      }.doAll(types(c2), this).outputFrame(names(c2), domains(c2));
    }
    if (orows == null)
      return new DeepSlice(null,c2,vecs()).doAll(types(c2),this).outputFrame(names(c2),domains(c2));
    else if (orows instanceof long[]) {
      final long CHK_ROWS=1000000;
      final long[] rows = (long[])orows;
      if (this.numRows() == 0) {
        return this;
      }
      if( rows.length==0 || rows[0] < 0 ) {
        if (rows.length != 0 && rows[0] < 0) {
          // negative row selection: build a 0/1 mask vec of the excluded rows
          Vec v0 = this.anyVec().makeZero();
          Vec v = new MRTask() {
            @Override public void map(Chunk cs) {
              for (long er : rows) {
                if (er >= 0) continue;
                er = Math.abs(er);
                if (er < cs._start || er > (cs._len + cs._start - 1)) continue;
                cs.set((int) (er - cs._start), 1);
              }
            }
          }.doAll(v0).getResult()._fr.anyVec();
          Keyed.remove(v0._key);
          Frame slicedFrame = new DeepSlice(rows, c2, vecs()).doAll(types(c2), this.add("select_vec", v)).outputFrame(names(c2), domains(c2));
          Keyed.remove(v._key);
          Keyed.remove(this.remove(this.numCols() - 1)._key); // drop the temp mask column again
          return slicedFrame;
        } else {
          return new DeepSlice(rows.length == 0 ? 
null : rows, c2, vecs()).doAll(types(c2), this).outputFrame(names(c2), domains(c2));
        }
      }
      // Vec'ize the index array
      Futures fs = new Futures();
      AppendableVec av = new AppendableVec(Vec.newKey(),Vec.T_NUM);
      int r = 0;
      int c = 0;
      while (r < rows.length) {
        NewChunk nc = new NewChunk(av, c);
        long end = Math.min(r+CHK_ROWS, rows.length);
        for (; r < end; r++) {
          nc.addNum(rows[r]);
        }
        nc.close(c++, fs);
      }
      Vec c0 = av.layout_and_close(fs); // c0 is the row index vec
      fs.blockForPending();
      Frame ff = new Frame(new String[]{"rownames"}, new Vec[]{c0});
      Frame fr2 = new Slice(c2, this).doAll(types(c2),ff).outputFrame(names(c2), domains(c2));
      Keyed.remove(c0._key);
      Keyed.remove(av._key);
      ff.delete();
      return fr2;
    }
    Frame frows = (Frame)orows; // It's a compatible Vec; use it as boolean selector.
    // Build column names for the result.
    Vec [] vecs = new Vec[c2.length];
    String [] names = new String[c2.length];
    for(int i = 0; i < c2.length; ++i){
      vecs[i] = _vecs[c2[i]];
      names[i] = _names[c2[i]];
    }
    Frame ff = new Frame(names, vecs);
    ff.add("predicate", frows.anyVec());
    return new DeepSelect().doAll(types(c2),ff).outputFrame(names(c2),domains(c2));
  }

  // Slice and return in the form of new chunks. 
// Slices rows of _base by the row-index chunk 'ix'; out-of-range indices produce NAs.
  private static class Slice extends MRTask {
    final Frame _base; // the base frame to slice from
    final int[] _cols;
    Slice(int[] cols, Frame base) { _cols = cols; _base = base; }
    @Override public void map(Chunk[] ix, NewChunk[] ncs) {
      final Vec[] vecs = new Vec[_cols.length];
      final Vec anyv = _base.anyVec();
      final long nrow = anyv.length();
      long r = ix[0].at8(0);
      // NOTE(review): a span was stripped by HTML extraction here -- the chunk-cache
      // setup (last_c0/last_c1/last_cs) and the per-row loop header are missing; only
      // the ">= nrow" out-of-range test remnant survives below.  Recover from upstream.
      int last_ci = anyv.elem2ChunkIdx(r = nrow) {
          for (int c = 0; c < vecs.length; c++) ncs[c].addNum(Double.NaN);
        } else {
          if (r < last_c0 || r >= last_c1) {
            // row left the cached chunk window; re-resolve and cache the new chunks
            last_ci = anyv.elem2ChunkIdx(r);
            last_c0 = anyv.espc()[last_ci];
            last_c1 = anyv.espc()[last_ci + 1];
            for (int c = 0; c < vecs.length; c++)
              last_cs[c] = vecs[c].chunkForChunkIdx(last_ci);
          }
          for (int c = 0; c < vecs.length; c++)
            if( vecs[c].isUUID() ) ncs[c].addUUID(last_cs[c], r);
            else if( vecs[c].isString() ) ncs[c].addStr(last_cs[c],r);
            else ncs[c].addNum (last_cs[c].at_abs(r));
        }
      }
    }
  }

  // Convert first 100 rows to a 2-d table
  @Override public String toString( ) { return toString(0,20); }

  // Convert len rows starting at off to a 2-d ascii table
  public String toString( long off, int len ) {
    if( off > numRows() ) off = numRows();
    if( off+len > numRows() ) len = (int)(numRows()-off);
    String[] rowHeaders = new String[len+5];
    rowHeaders[0] = "min";
    rowHeaders[1] = "mean";
    rowHeaders[2] = "stddev";
    rowHeaders[3] = "max";
    rowHeaders[4] = "missing";
    // NOTE(review): a very large span was stripped by HTML extraction here -- the rest
    // of toString (TwoDimTable construction) and the head of the DeepSlice task (class
    // header, fields, constructor, and most of map()) are missing; only fragments
    // survive below.  Recover from the upstream h2o-3 sources.
    for( int i=0; i {
    final int _cols[];
    final long _rows[];
    final byte _isInt[];
    DeepSlice( long rows[], int cols[], Vec vecs[] ) {
      _cols=cols;
      _rows=rows;
      _isInt = new byte[cols.length];
      for( int i=0; i = _rows.length) break; // All done with row selections
        long r = _rows[rx++];// Next row selector
        if (r < rstart) continue;
        rlo = (int) (r - rstart);
        rhi = rlo + 1; // Stop at the next row
        while (rx < _rows.length && (_rows[rx] - rstart) == rhi && rhi < rlen) {
          rx++;
          rhi++; // Grab sequential rows
        }
      }
      // Process this next set of rows
      // For all cols in the new set;
      BufferedString tmpStr = new BufferedString();
      for (int i = 
0; i < _cols.length; i++) {
        Chunk oc = chks[_cols[i]];
        NewChunk nc = nchks[i];
        if (_isInt[i] == 1) { // Slice on integer columns
          for (int j = rlo; j < rhi; j++)
            if (oc._vec.isUUID()) nc.addUUID(oc, j);
            else if (oc.isNA(j)) nc.addNA();
            else nc.addNum(oc.at8(j), 0);
        } else if (oc._vec.isString()) {
          for (int j = rlo; j < rhi; j++)
            nc.addStr(oc.atStr(tmpStr, j));
        } else {// Slice on double columns
          for (int j = rlo; j < rhi; j++)
            nc.addNum(oc.atd(j));
        }
      }
      rlo = rhi;
      if (_rows == null) break;
    }
  }
}

  /**
   * Create a copy of the input Frame and return that copied Frame.  All Vecs in this are copied in parallel.
   * Caller mut do the DKV.put
   * @param keyName Key for resulting frame.  If null, no key will be given.
   * @return The fresh copy of fr.
   */
  public Frame deepCopy(String keyName) {
    // NOTE(review): a span was stripped by HTML extraction inside the copy loop below
    // ("col=0;col" is the remnant of the per-column for-loop); the rest of deepCopy and
    // the DoCopyFrame header generic parameters are also damaged.  Recover from upstream.
    return new MRTask() {
      @Override public void map(Chunk[] cs, NewChunk[] ncs) {
        for(int col=0;col {
    final Vec[] _vecs;
    DoCopyFrame(Vec[] vecs) {
      _vecs = new Vec[vecs.length];
      int rowLayout = _vecs[0]._rowLayout;
      for(int i=0;i {
    @Override public void map( Chunk chks[], NewChunk nchks[] ) {
      Chunk pred = chks[chks.length - 1]; // last column is the 0/1 selection predicate
      int[] ids = new int[pred._len];
      double[] vals = new double[ids.length];
      int selected = pred.nonzeros(ids);
      int[] selectedIds = new int[selected];
      // todo keeping old behavior of ignoring missing, while R does insert missing into result
      // filter out missing
      int non_nas = 0;
      for(int i = 0; i < selected; ++i)
        if(!pred.isNA(ids[i]))
          selectedIds[non_nas++] = ids[i];
      selected = non_nas;
      for (int i = 0; i < chks.length - 1; ++i) {
        Chunk c = chks[i];
        // do not need to inflate cause sparse does not compress doubles
        NewChunk nc = nchks[i];
        if (c.isSparseZero() || c.isSparseNA()) {
          // sparse path: merge the chunk's nonzero ids against the selected ids
          nc.alloc_indices(selectedIds.length);
          nc.alloc_doubles(selectedIds.length);
          nc._sparseNA = c.isSparseNA();
          int n = c.asSparseDoubles(vals, ids);
          int k = 0;
          int l = -1;
          for (int j = 0; j < n; ++j) {
            while (k < selected && selectedIds[k] < ids[j]) ++k;
            if (k == selected) break;
            if (selectedIds[k] == ids[j]) {
              int 
add = k - l - 1;
              if(add > 0) {
                // pad the gap between the previous emitted row and this one
                if (c.isSparseZero()) nc.addZeros(add);
                else nc.addNAs(add);
              }
              nc.addNum(vals[j]);
              l = k;
              k++;
            }
          }
          int add = selected - l - 1;
          if(add > 0) {
            // pad out the tail of the output chunk
            if (c.isSparseZero()) nc.addZeros(add);
            else nc.addNAs(add);
          }
          assert nc.len() == selected:"len = " + nc.len() + ", selected = " + selected;
        } else {
          NewChunk src = new NewChunk(c);
          src = c.inflate_impl(src);
          // if(src.sparseNA() || src.sparseZero())
          //   src.cancel_sparse();
          for (int j = 0; j < selected; ++j)
            src.add2Chunk(nc, selectedIds[j]);
        }
      }
    }
  }

  /** Per-column domains for the given column subset. */
  private String[][] domains(int [] cols){
    Vec[] vecs = vecs();
    String[][] res = new String[cols.length][];
    for(int i = 0; i < cols.length; ++i)
      res[i] = vecs[cols[i]].domain();
    return res;
  }

  /** Column names for the given column subset; null when this Frame has no names. */
  private String [] names(int [] cols){
    if(_names == null)return null;
    String [] res = new String[cols.length];
    for(int i = 0; i < cols.length; ++i)
      res[i] = _names[cols[i]];
    return res;
  }

  /** Per-column types for the given column subset. */
  private byte[] types(int [] cols){
    Vec[] vecs = vecs();
    byte[] res = new byte[cols.length];
    for(int i = 0; i < cols.length; ++i)
      res[i] = vecs[cols[i]]._type;
    return res;
  }

  public Vec[] makeCompatible( Frame f) {return makeCompatible(f,false);}

  /** Return array of Vectors if 'f' is compatible with 'this', else return a new
   *  array of Vectors compatible with 'this' and a copy of 'f's data otherwise.  Note
   *  that this can, in the worst case, copy all of {@code this}s' data.
   *  @return This Frame's data in an array of Vectors that is compatible with {@code f}. 
*/ public Vec[] makeCompatible( Frame f, boolean force) { // Small data frames are always "compatible" if (anyVec() == null) // Or it is small return f.vecs(); // Then must be compatible Vec v1 = anyVec(); Vec v2 = f.anyVec(); if(v1.length() != v2.length()) throw new IllegalArgumentException("Can not make vectors of different length compatible!"); if (v2 == null || (!force && v1.checkCompatible(v2))) return f.vecs(); // Ok, here make some new Vecs with compatible layout Key k = Key.make(); H2O.submitTask(new RebalanceDataSet(this, f, k)).join(); Frame f2 = (Frame)k.get(); DKV.remove(k); for (Vec v : f2.vecs()) Scope.track(v); return f2.vecs(); } private boolean isLastRowOfCurrentNonEmptyChunk(int chunkIdx, long row) { long[] espc = anyVec().espc(); long lastRowOfCurrentChunk = espc[chunkIdx + 1] - 1; // Assert chunk is non-empty. assert espc[chunkIdx + 1] > espc[chunkIdx]; // Assert row numbering sanity. assert row <= lastRowOfCurrentChunk; return row >= lastRowOfCurrentChunk; } public static Job export(Frame fr, String path, String frameName, boolean overwrite) { // Validate input boolean fileExists = H2O.getPM().exists(path); if (overwrite && fileExists) { Log.warn("File " + path + " exists, but will be overwritten!"); } else if (!overwrite && fileExists) { throw new H2OIllegalArgumentException(path, "exportFrame", "File " + path + " already exists!"); } InputStream is = (fr).toCSV(true, false); Job job = new Job(fr._key,"water.fvec.Frame","Export dataset"); FrameUtils.ExportTask t = new FrameUtils.ExportTask(is, path, frameName, overwrite, job); return job.start(t, fr.anyVec().nChunks()); } /** Convert this Frame to a CSV (in an {@link InputStream}), that optionally * is compatible with R 3.1's recent change to read.csv()'s behavior. 
* @return An InputStream containing this Frame as a CSV */
  public InputStream toCSV(boolean headers, boolean hex_string) {
    return new CSVStream(headers, hex_string);
  }

  // Row-at-a-time CSV view over this Frame; produces one header line (optional) and one
  // line per row, freeing remote chunks as they are consumed.
  public class CSVStream extends InputStream {
    private final boolean _hex_string;
    byte[] _line;      // bytes of the line currently being served
    int _position;     // read cursor within _line
    public volatile int _curChkIdx;
    long _row;         // next row to render

    CSVStream(boolean headers, boolean hex_string) {
      _curChkIdx=0;
      _hex_string = hex_string;
      StringBuilder sb = new StringBuilder();
      Vec vs[] = vecs();
      if( headers ) {
        // NOTE(review): assumes at least one column when headers are requested
        // (_names[0] would throw on an empty Frame) -- confirm callers guarantee this.
        sb.append('"').append(_names[0]).append('"');
        for(int i = 1; i < vs.length; i++)
          sb.append(',').append('"').append(_names[i]).append('"');
        sb.append('\n');
      }
      // NOTE(review): getBytes() uses the platform default charset -- presumably UTF-8
      // is intended; confirm before relying on non-ASCII column names.
      _line = sb.toString().getBytes();
    }

    /** Render row {@code _row} as one CSV line (trailing newline included). */
    byte[] getBytesForRow() {
      StringBuilder sb = new StringBuilder();
      Vec vs[] = vecs();
      BufferedString tmpStr = new BufferedString();
      for( int i = 0; i < vs.length; i++ ) {
        if(i > 0) sb.append(',');
        if(!vs[i].isNA(_row)) {
          if( vs[i].isCategorical() ) sb.append('"').append(vs[i].factor(vs[i].at8(_row))).append('"');
          else if( vs[i].isUUID() ) sb.append(PrettyPrint.UUID(vs[i].at16l(_row), vs[i].at16h(_row)));
          else if( vs[i].isInt() ) sb.append(vs[i].at8(_row));
          else if (vs[i].isString()) sb.append('"').append(vs[i].atStr(tmpStr, _row)).append('"');
          else {
            double d = vs[i].at(_row);
            // R 3.1 unfortunately changed the behavior of read.csv().
            // (Really type.convert()).
            //
            // Numeric values with too much precision now trigger a type conversion in R 3.1 into a factor.
            //
            // See these discussions:
            //   https://bugs.r-project.org/bugzilla/show_bug.cgi?id=15751
            //   https://stat.ethz.ch/pipermail/r-devel/2014-April/068778.html
            //   http://stackoverflow.com/questions/23072988/preserve-old-pre-3-1-0-type-convert-behavior
            String s = _hex_string ? Double.toHexString(d) : Double.toString(d);
            sb.append(s);
          }
        }
      }
      sb.append('\n');
      return sb.toString().getBytes();
    }

    @Override public int available() throws IOException {
      // Case 1: There is more data left to read from the current line. 
if (_position != _line.length) { return _line.length - _position; } // Case 2: Out of data. if (_row == numRows()) { return 0; } // Case 3: Return data for the current row. // Note this will fast-forward past empty chunks. _curChkIdx = anyVec().elem2ChunkIdx(_row); _line = getBytesForRow(); _position = 0; // Flush non-empty remote chunk if we're done with it. if (isLastRowOfCurrentNonEmptyChunk(_curChkIdx, _row)) { for (Vec v : vecs()) { Key k = v.chunkKey(_curChkIdx); if( !k.home() ) H2O.raw_remove(k); } } _row++; return _line.length; } @Override public void close() throws IOException { super.close(); _line = null; } @Override public int read() throws IOException { return available() == 0 ? -1 : _line[_position++]; } @Override public int read(byte[] b, int off, int len) throws IOException { int n = available(); if(n > 0) { n = Math.min(n, len); System.arraycopy(_line, _position, b, off, n); _position += n; } return n; } } @Override public Class makeSchema() { return water.api.KeyV3.FrameKeyV3.class; } }