
water.fvec.Frame Maven / Gradle / Ivy
package water.fvec; import org.joda.time.format.DateTimeFormat; import org.joda.time.format.DateTimeFormatter; import water.*; import water.exceptions.H2OIllegalArgumentException; import water.parser.BufferedString; import water.util.FrameUtils; import water.util.Log; import water.util.PrettyPrint; import water.util.TwoDimTable; import java.io.IOException; import java.io.InputStream; import java.util.Arrays; import java.util.HashMap; /** A collection of named {@link Vec}s, essentially an R-like Distributed Data Frame. * *
Frames represent a large distributed 2-D table with named columns * ({@link Vec}s) and numbered rows. A reasonable column limit is * 100K columns, but there's no hard-coded limit. There's no real row * limit except memory; Frames (and Vecs) with many billions of rows are used * routinely. * *
A Frame is a collection of named Vecs; a Vec is a collection of numbered * {@link Chunk}s. A Frame is small, cheaply and easily manipulated, it is * commonly passed-by-Value. It exists on one node, and may be * stored in the {@link DKV}. Vecs, on the other hand, must be stored in the * {@link DKV}, as they represent the shared common management state for a collection * of distributed Chunks. * *
Multiple Frames can reference the same Vecs, although this sharing can * make Vec lifetime management complex. Commonly temporary Frames are used * to work with a subset of some other Frame (often during algorithm * execution, when some columns are dropped from the modeling process). The * temporary Frame can simply be ignored, allowing the normal GC process to * reclaim it. Such temp Frames usually have a {@code null} key. * *
All the Vecs in a Frame belong to the same {@link Vec.VectorGroup} which * then enforces {@link Chunk} row alignment across Vecs (or at least enforces * a low-cost access model). Parallel and distributed execution touching all * the data in a Frame relies on this alignment to get good performance. * *
Example: Make a Frame from a CSV file:
* File file = ... * NFSFileVec nfs = NFSFileVec.make(file); // NFS-backed Vec, lazily read on demand * Frame fr = water.parser.ParseDataset.parse(Key.make("myKey"),nfs._key); ** *Example: Find and remove the Vec called "unique_id" from the Frame, * since modeling with a unique_id can lead to overfitting: *
* Vec uid = fr.remove("unique_id"); ** *Example: Move the response column to the last position: *
* fr.add("response",fr.remove("response")); ** */
// NOTE(review): this file is an HTML-extracted copy of water/fvec/Frame.java; all text
// between '<' and '>' (generic type parameters and some code spans) was stripped by the
// extractor -- e.g. "Lockable" below was "Lockable<Frame>".  Recover from upstream h2o-3.
public class Frame extends Lockable {
  /** Vec names */
  public String[] _names;
  private boolean _lastNameBig; // Last name is "Cxxx" and has largest number
  private Key[] _keys;          // Keys for the vectors
  private transient Vec[] _vecs; // The Vectors (transient to avoid network traffic)
  private transient Vec _col0; // First readable vec; fast access to the VectorGroup's Chunk layout

  /** @return true iff any Vec of this Frame reports at least one NA.
   *  NOTE(review): reads the transient {@code _vecs} directly (not {@code vecs()});
   *  presumably callers ensure the vecs are already loaded -- confirm. */
  public boolean hasNAs(){ for(Vec v:_vecs) if(v.naCnt() > 0) return true; return false; }

  /** Creates an internal frame composed of the given Vecs and default names.  The frame has no key. */
  public Frame( Vec... vecs ){ this(null,vecs);}

  /** Creates an internal frame composed of the given Vecs and names.  The frame has no key. */
  public Frame( String names[], Vec vecs[] ) { this(null,names,vecs); }

  /** Creates an empty frame with given key. */
  public Frame( Key key ) { this(key,null,new Vec[0]); }

  /**
   * Special constructor for data with unnamed columns (e.g. svmlight) bypassing *all* checks.
   * @param key frame key, may be null
   * @param vecs column Vecs; default "C1".."Cn" names are generated for them
   * @param noChecks must be true; only documents the caller's intent to skip validation
   */
  public Frame( Key key, Vec vecs[], boolean noChecks) {
    super(key);
    assert noChecks;
    _vecs = vecs;
    _names = new String[vecs.length];
    _keys = new Key[vecs.length];
    for (int i = 0; i < vecs.length; i++) {
      _names[i] = defaultColName(i);
      _keys[i] = vecs[i]._key;
    }
  }

  /** Creates a frame with given key, names and vectors. 
*/ public Frame( Key key, String names[], Vec vecs[] ) { super(key); // Require all Vecs already be installed in the K/V store for( Vec vec : vecs ) DKV.prefetch(vec._key); for( Vec vec : vecs ) assert DKV.get(vec._key) != null : " null vec: "+vec._key; // Always require names if( names==null ) { // Make default names, all known to be unique _names = new String[vecs.length]; _keys = new Key [vecs.length]; _vecs = vecs; for( int i=0; i make() ); _names= fr._names.clone(); _keys = fr._keys .clone(); _vecs = fr.vecs().clone(); _lastNameBig = fr._lastNameBig; } /** Default column name maker */ public static String defaultColName( int col ) { return "C"+(1+col); } // Make unique names. Efficient for the special case of appending endless // versions of "C123" style names where the next name is +1 over the prior // name. All other names take the O(n^2) lookup. private int pint( String name ) { try { return Integer.valueOf(name.substring(1)); } catch( NumberFormatException fe ) { } return 0; } public String uniquify( String name ) { String n = name; int lastName = 0; if( name.length() > 0 && name.charAt(0)=='C' ) lastName = pint(name); if( _lastNameBig && _names.length > 0 ) { String last = _names[_names.length-1]; if( !last.equals("") && last.charAt(0)=='C' && lastName == pint(last)+1 ) return name; } int cnt=0, again, max=0; do { again = cnt; for( String s : _names ) { if( lastName > 0 && s.charAt(0)=='C' ) max = Math.max(max,pint(s)); if( n.equals(s) ) n = name+(cnt++); } } while( again != cnt ); if( lastName == max+1 ) _lastNameBig = true; return n; } /** Check that the vectors are all compatible. All Vecs have their content * sharded using same number of rows per chunk, and all names are unique. * Throw an IAE if something does not match. 
*/
  private void checkCompatible( String name, Vec vec ) {
    if( vec instanceof AppendableVec ) return; // New Vectors are endlessly compatible
    Vec v0 = anyVec();
    if( v0 == null ) return; // No fixed-size Vecs in the Frame
    // Vector group has to be the same, or else the layout has to be the same,
    // or else the total length has to be small.
    if( !v0.checkCompatible(vec) ) {
      if(!Vec.VectorGroup.sameGroup(v0,vec))
        Log.err("Unexpected incompatible vector group, " + v0.group() + " != " + vec.group());
      if(!Arrays.equals(v0.espc(), vec.espc()))
        Log.err("Unexpected incompatible espc, " + Arrays.toString(v0.espc()) + " != " + Arrays.toString(vec.espc()));
      throw new IllegalArgumentException("Vec " + name + " is not compatible with the rest of the frame");
    }
  }

  /** Quick compatibility check between Frames.  Used by some tests for efficient equality checks. */
  public boolean isCompatible( Frame fr ) {
    if( numCols() != fr.numCols() ) return false;
    if( numRows() != fr.numRows() ) return false;
    // NOTE(review): a span of code was stripped by HTML extraction here -- the per-column
    // compatibility loop and the headers of the numCols()/numRows()/anyVec()/vecs()
    // accessors are missing.  Recover from the upstream h2o-3 sources.
    for( int i=0; i key : _keys ) DKV.prefetch(key);
    Vec [] vecs = new Vec[_keys.length];
    for( int i=0; i<_keys.length; i++ ) vecs[i] = _keys[i].get();
    return vecs;
  }

  /** Convenience to accessor for last Vec
   *  @return last Vec */
  public Vec lastVec() { vecs(); return _vecs [_vecs.length -1]; }

  /** Convenience to accessor for last Vec name
   *  @return last Vec name */
  public String lastVecName() { return _names[_names.length-1]; }

  /** Force a cache-flush and reload, assuming vec mappings were altered
   *  remotely, or that the _vecs array was shared and now needs to be a
   *  defensive copy.
   *  @return the new instance of the Frame's Vec[] */
  public final Vec[] reloadVecs() { _vecs=null; return vecs(); }

  /** Returns the Vec by given index, implemented by code: {@code vecs()[idx]}.
   *  @param idx idx of column
   *  @return this frame idx-th vector, never returns null
*/
  public final Vec vec(int idx) { return vecs()[idx]; }

  /** Return a Vec by name, or null if missing
   *  @return a Vec by name, or null if missing */
  public Vec vec(String name) { int idx = find(name); return idx==-1 ? null : vecs()[idx]; }

  /** Finds the column index with a matching name, or -1 if missing
   *  @return the column index with a matching name, or -1 if missing */
  public int find( String name ) {
    if( name == null ) return -1;
    assert _names != null;
    for( int i=0; i<_names.length; i++ )
      if( name.equals(_names[i]) ) return i;
    return -1;
  }

  /** Finds the matching column index, or -1 if missing
   *  @return the matching column index, or -1 if missing */
  public int find( Vec vec ) {
    Vec[] vecs = vecs(); //warning: side-effect
    if (vec == null) return -1;
    // NOTE(review): code was stripped by HTML extraction here -- the find-by-Vec loop,
    // find-by-Key, bulkRollups(), and the head of the VecSpecifier helper class (with
    // its Key-typed _frame field) are missing.  Recover from upstream h2o-3.
    for( int i=0; i_frame;
    String _column_name;
    public Vec vec() {
      Value v = DKV.get(_frame);
      if (null == v) return null;
      Frame f = v.get();
      if (null == f) return null;
      return f.vec(_column_name);
    }
  }

  /** Type for every Vec */
  public byte[] types() {
    Vec[] vecs = vecs();
    byte bs[] = new byte[vecs.length];
    // NOTE(review): stripped span -- the types() loop and several rollup accessors
    // (domains()/lengths()/means()/mults()?) are missing up to the "> 1e-6" remnant.
    for( int i=0; i 1e-6;
  }

  /** The {@code Vec.byteSize} of all Vecs
   *  @return the {@code Vec.byteSize} of all Vecs */
  public long byteSize() {
    Vec[] vecs = bulkRollups();
    long sum=0;
    for (Vec vec : vecs) sum += vec.byteSize();
    return sum;
  }

  /** 64-bit checksum of the checksums of the vecs.  SHA-265 checksums of the
   *  chunks are XORed together.  Since parse always parses the same pieces of
   *  files into the same offsets in some chunk this checksum will be
   *  consistent across reparses.
   *  @return 64-bit Frame checksum */
  @Override protected long checksum_impl() {
    Vec[] vecs = vecs();
    long _checksum = 0;
    for( int i = 0; i < _names.length; ++i ) {
      long vec_checksum = vecs[i].checksum();
      _checksum ^= vec_checksum;
      // fold the column position in, so column order affects the checksum
      long tmp = (2147483647L * i);
      _checksum ^= tmp;
    }
    _checksum *= (0xBABE + Arrays.hashCode(_names));
    // TODO: include column types? Vec.checksum() should include type? 
    return _checksum;
  }

  // Add a bunch of vecs
  public void add( String[] names, Vec[] vecs) { bulkAdd(names, vecs); }
  public void add( String[] names, Vec[] vecs, int cols ) {
    if (null == vecs || null == names) return;
    if (cols == names.length && cols == vecs.length) {
      bulkAdd(names, vecs);
    } else {
      for (int i = 0; i < cols; i++) add(names[i], vecs[i]);
    }
  }

  /** Append multiple named Vecs to the Frame.  Names are forced unique, by appending a
   *  unique number if needed. */
  private void bulkAdd(String[] names, Vec[] vecs) {
    String[] tmpnames = names.clone();
    int N = names.length;
    assert(names.length == vecs.length):"names = " + Arrays.toString(names) + ", vecs len = " + vecs.length;
    // NOTE(review): a large span was stripped by HTML extraction here -- the rest of
    // bulkAdd, the single-column add() variants, and the subframe(String[]) /
    // subframe(String[],double) methods are missing; only the tail of the
    // subframe(String[],boolean,double) javadoc survives below.  Recover from upstream.
    for (int i=0; i replaceBy false
. * Else replace a missing column by a constant column with given value.
 *
 * @param names list of column names to extract
 * @param replaceBy should be missing column replaced by a constant column
 * @param c value for constant column
 * @return array of 2 frames, the first is containing a desired subframe, the second one contains newly created columns or null
 * @throws IllegalArgumentException ifreplaceBy
is false and there is a missing column in this frame */
  private Frame[] subframe(String[] names, boolean replaceBy, double c){
    Vec [] vecs = new Vec[names.length];
    Vec [] cvecs = replaceBy ? new Vec [names.length] : null;
    String[] cnames = replaceBy ? new String[names.length] : null;
    int ccv = 0; // counter of constant columns
    vecs(); // Preload the vecs
    // NOTE(review): the map's generic parameters were stripped by HTML extraction;
    // this was presumably "HashMap<String,Integer> map" -- recover from upstream.
    HashMapmap = new HashMap<>((int) ((names.length/0.75f)+1)); // avoid rehashing by set up initial capacity
    for(int i = 0; i < _names.length; ++i) map.put(_names[i], i);
    for(int i = 0; i < names.length; ++i)
      if(map.containsKey(names[i])) vecs[i] = _vecs[map.get(names[i])];
      else if (replaceBy) {
        Log.warn("Column " + names[i] + " is missing, filling it in with " + c);
        assert cnames != null;
        cnames[ccv] = names[i];
        vecs[i] = cvecs[ccv++] = anyVec().makeCon(c);
      }
    return new Frame[] {
      new Frame(Key.make("subframe"+Key.make().toString()), names,vecs),
      ccv>0 ? new Frame(Key.make("subframe"+Key.make().toString()), Arrays.copyOf(cnames, ccv), Arrays.copyOf(cvecs,ccv)) : null
    };
  }

  /** Allow rollups for all written-into vecs; used by {@link MRTask} once
   *  writing is complete.
   *  @return the original Futures, for flow-coding */
  public Futures postWrite(Futures fs) {
    for( Vec v : vecs() ) v.postWrite(fs);
    return fs;
  }

  /** Actually remove/delete all Vecs from memory, not just from the Frame.
   *  @return the original Futures, for flow-coding */
  @Override protected Futures remove_impl(Futures fs) {
    final Key[] keys = _keys;
    if( keys.length==0 ) return fs;
    // Get the nChunks without calling anyVec - which loads all Vecs eagerly,
    // only to delete them.  Supports Frames with some Vecs already deleted, as
    // a Scope cleanup action might delete Vecs out of order. 
Vec v = _col0;
    if( v == null ) {
      Vec[] vecs = _vecs; // Read once, in case racily being cleared
      // NOTE(review): a span was stripped by HTML extraction here -- the rest of
      // remove_impl (chunk deletion) and the head of remove(int[] idxs), including the
      // bounds check whose ">= vecs().length" remnant survives below.  Recover from upstream.
      if( vecs != null ) for( int i=0; i = vecs().length) throw new ArrayIndexOutOfBoundsException();
    Arrays.sort(idxs);
    Vec[] res = new Vec[idxs.length];
    Vec[] rem = new Vec[_vecs.length-idxs.length];
    String[] names = new String[rem.length];
    Key [] keys = new Key [rem.length];
    int j = 0;
    int k = 0;
    int l = 0;
    for(int i = 0; i < _vecs.length; ++i) {
      if(j < idxs.length && i == idxs[j]) {
        ++j;
        res[k++] = _vecs[i];  // removed column, in idxs order
      } else {
        rem [l] = _vecs [i];  // surviving column, original order preserved
        names[l] = _names[i];
        keys [l] = _keys [i];
        ++l;
      }
    }
    _vecs = rem;
    _names= names;
    _keys = keys;
    assert l == rem.length && k == idxs.length;
    return res;
  }

  /** Removes a numbered column.
   *  @return the removed column */
  public final Vec remove( int idx ) {
    int len = _names.length;
    if( idx < 0 || idx >= len ) return null;
    Vec v = vecs()[idx];
    // shift the tail down one slot, then shrink all three parallel arrays
    System.arraycopy(_names,idx+1,_names,idx,len-idx-1);
    System.arraycopy(_vecs ,idx+1,_vecs ,idx,len-idx-1);
    System.arraycopy(_keys ,idx+1,_keys ,idx,len-idx-1);
    _names = Arrays.copyOf(_names,len-1);
    _vecs = Arrays.copyOf(_vecs ,len-1);
    _keys = Arrays.copyOf(_keys ,len-1);
    if( v == _col0 ) _col0 = null; // invalidate the cached first-readable-vec
    return v;
  }

  /** Remove given interval of columns from frame.  Motivated by R intervals. 
* @param startIdx - start index of column (inclusive)
   *  @param endIdx - end index of column (exclusive)
   *  @return array of removed columns */
  Vec[] remove(int startIdx, int endIdx) {
    int len = _names.length;
    int nlen = len - (endIdx-startIdx);
    String[] names = new String[nlen];
    Key[] keys = new Key[nlen];
    Vec[] vecs = new Vec[nlen];
    vecs(); // ensure _vecs is populated before we slice it
    if (startIdx > 0) {
      // keep the prefix [0, startIdx)
      System.arraycopy(_names, 0, names, 0, startIdx);
      System.arraycopy(_vecs, 0, vecs, 0, startIdx);
      System.arraycopy(_keys, 0, keys, 0, startIdx);
    }
    nlen -= startIdx;
    if (endIdx < _names.length+1) {
      // keep the suffix [endIdx, len)
      System.arraycopy(_names, endIdx, names, startIdx, nlen);
      System.arraycopy(_vecs, endIdx, vecs, startIdx, nlen);
      System.arraycopy(_keys, endIdx, keys, startIdx, nlen);
    }
    Vec[] vecX = Arrays.copyOfRange(_vecs,startIdx,endIdx);
    _names = names;
    _vecs = vecs;
    _keys = keys;
    _col0 = null;
    return vecX;
  }

  /** Restructure a Frame completely */
  public void restructure( String[] names, Vec[] vecs) {
    restructure(names, vecs, vecs.length);
  }

  /** Restructure a Frame completely, but only for a specified number of columns (counting up) */
  public void restructure( String[] names, Vec[] vecs, int cols) {
    // Make empty to dodge asserts, then "add()" them all which will check for
    // compatible Vecs & names.
    _names = new String[0];
    _keys = new Key [0];
    _vecs = new Vec [0];
    add(names,vecs,cols);
  }

  // --------------------------------------------
  // Utilities to help external Frame constructors, e.g. Spark.

  // Make an initial Frame & lock it for writing.  Build Vec Keys.
  void preparePartialFrame( String[] names ) {
    // Nuke any prior frame (including freeing storage) & lock this one
    if( _keys != null ) delete_and_lock();
    else write_lock();
    _names = names;
    _keys = new Vec.VectorGroup().addVecs(names.length);
    // No Vectors tho!!! These will be added *after* the import
  }

  // Only serialize strings, not H2O internal structures

  // Make NewChunks to for holding data from e.g. Spark.  Once per set of
  // Chunks in a Frame, before filling them. 
// This can be called in parallel
  // for different Chunk#'s (cidx); each Chunk can be filled in parallel.
  static NewChunk[] createNewChunks(String name, byte[] type, int cidx) {
    Frame fr = (Frame) Key.make(name).get();
    NewChunk[] nchks = new NewChunk[fr.numCols()];
    for (int i = 0; i < nchks.length; i++) {
      nchks[i] = new NewChunk(new AppendableVec(fr._keys[i], type[i]), cidx);
    }
    return nchks;
  }

  // Compress & DKV.put NewChunks.  Once per set of Chunks in a Frame, after
  // filling them.  Can be called in parallel for different sets of Chunks.
  static void closeNewChunks(NewChunk[] nchks) {
    Futures fs = new Futures();
    for (NewChunk nchk : nchks) {
      nchk.close(fs);
    }
    fs.blockForPending();
  }

  // Build real Vecs from loose Chunks, and finalize this Frame.  Called once
  // after any number of [create,close]NewChunks.
  void finalizePartialFrame( long[] espc, String[][] domains, byte[] types ) {
    // Compute elems-per-chunk.
    // Roll-up elem counts, so espc[i] is the starting element# of chunk i.
    int nchunk = espc.length;
    long espc2[] = new long[nchunk+1]; // Shorter array
    long x=0; // Total row count so far
    // NOTE(review): a large span was stripped by HTML extraction here -- the espc
    // roll-up loop, the Vec construction/unlock, and the head of the deepSlice javadoc
    // are missing.  Recover from the upstream h2o-3 sources.
    for( int i=0; i Semantics are a little odd, to match R's.  Each dimension spec can be: *
* *- null - all of them *
- a sorted list of negative numbers (no dups) - all BUT these *
- an unordered list of positive - just these, allowing dups *
The numbering is 1-based; zero's are not allowed in the lists, nor are out-of-range values.
   *  @return the sliced Frame */
  public Frame deepSlice( Object orows, Object ocols ) {
    // ocols is either a long[] or a Frame-of-1-Vec
    long[] cols;
    if( ocols == null ) cols = null;
    else if (ocols instanceof long[]) cols = (long[])ocols;
    else if (ocols instanceof Frame) {
      Frame fr = (Frame) ocols;
      if (fr.numCols() != 1)
        throw new IllegalArgumentException("Columns Frame must have only one column (actually has " + fr.numCols() + " columns)");
      long n = fr.anyVec().length();
      if (n > MAX_EQ2_COLS)
        throw new IllegalArgumentException("Too many requested columns (requested " + n +", max " + MAX_EQ2_COLS + ")");
      cols = new long[(int)n];
      Vec.Reader v = fr.anyVec().new Reader();
      for (long i = 0; i < v.length(); i++) cols[(int)i] = v.at8(i);
    } else
      throw new IllegalArgumentException("Columns is specified by an unsupported data type (" + ocols.getClass().getName() + ")");
    // Since cols is probably short convert to a positive list.
    int c2[];
    if( cols==null ) {
      c2 = new int[numCols()];
      // NOTE(review): a span was stripped by HTML extraction here -- the identity fill,
      // the positive-list branch test, and the negative-list setup survive only in part
      // ("= 0 )" below is the remnant of a comparison).  Recover from upstream.
      for( int i=0; i
= 0 ) {
      c2 = new int[cols.length];
      for( int i=0; i = cols.length || i < (-(1+cols[j])) ) c2[i-j] = i;
        else j++;
      }
    }
    // validate the selection against the actual column count
    for (int aC2 : c2)
      if (aC2 >= numCols())
        throw new IllegalArgumentException("Trying to select column " + (aC2 + 1) + " but only " + numCols() + " present.");
    if( c2.length==0 )
      throw new IllegalArgumentException("No columns selected (did you try to select column 0 instead of column 1?)");
    // Do Da Slice
    // orows is either a long[] or a Vec
    if (numRows() == 0) {
      return new MRTask() {
        @Override public void map(Chunk[] chks, NewChunk[] nchks) { for (NewChunk nc : nchks) nc.addNA(); }
      }.doAll(types(c2), this).outputFrame(names(c2), domains(c2));
    }
    if (orows == null)
      return new DeepSlice(null,c2,vecs()).doAll(types(c2),this).outputFrame(names(c2),domains(c2));
    else if (orows instanceof long[]) {
      final long CHK_ROWS=1000000;
      final long[] rows = (long[])orows;
      if (this.numRows() == 0) {
        return this;
      }
      if( rows.length==0 || rows[0] < 0 ) {
        if (rows.length != 0 && rows[0] < 0) {
          // negative row selection: build a 0/1 mask vec of the excluded rows
          Vec v0 = this.anyVec().makeZero();
          Vec v = new MRTask() {
            @Override public void map(Chunk cs) {
              for (long er : rows) {
                if (er >= 0) continue;
                er = Math.abs(er);
                if (er < cs._start || er > (cs._len + cs._start - 1)) continue;
                cs.set((int) (er - cs._start), 1);
              }
            }
          }.doAll(v0).getResult()._fr.anyVec();
          Keyed.remove(v0._key);
          Frame slicedFrame = new DeepSlice(rows, c2, vecs()).doAll(types(c2), this.add("select_vec", v)).outputFrame(names(c2), domains(c2));
          Keyed.remove(v._key);
          Keyed.remove(this.remove(this.numCols() - 1)._key); // drop the temp mask column again
          return slicedFrame;
        } else {
          return new DeepSlice(rows.length == 0 ? 
null : rows, c2, vecs()).doAll(types(c2), this).outputFrame(names(c2), domains(c2));
        }
      }
      // Vec'ize the index array
      Futures fs = new Futures();
      AppendableVec av = new AppendableVec(Vec.newKey(),Vec.T_NUM);
      int r = 0;
      int c = 0;
      while (r < rows.length) {
        NewChunk nc = new NewChunk(av, c);
        long end = Math.min(r+CHK_ROWS, rows.length);
        for (; r < end; r++) {
          nc.addNum(rows[r]);
        }
        nc.close(c++, fs);
      }
      Vec c0 = av.layout_and_close(fs); // c0 is the row index vec
      fs.blockForPending();
      Frame ff = new Frame(new String[]{"rownames"}, new Vec[]{c0});
      Frame fr2 = new Slice(c2, this).doAll(types(c2),ff).outputFrame(names(c2), domains(c2));
      Keyed.remove(c0._key);
      Keyed.remove(av._key);
      ff.delete();
      return fr2;
    }
    Frame frows = (Frame)orows; // It's a compatible Vec; use it as boolean selector.
    // Build column names for the result.
    Vec [] vecs = new Vec[c2.length];
    String [] names = new String[c2.length];
    for(int i = 0; i < c2.length; ++i){
      vecs[i] = _vecs[c2[i]];
      names[i] = _names[c2[i]];
    }
    Frame ff = new Frame(names, vecs);
    ff.add("predicate", frows.anyVec());
    return new DeepSelect().doAll(types(c2),ff).outputFrame(names(c2),domains(c2));
  }

  // Slice and return in the form of new chunks. 
// Slices rows of _base by the row-index chunk 'ix'; out-of-range indices produce NAs.
  private static class Slice extends MRTask {
    final Frame _base; // the base frame to slice from
    final int[] _cols;
    Slice(int[] cols, Frame base) { _cols = cols; _base = base; }
    @Override public void map(Chunk[] ix, NewChunk[] ncs) {
      final Vec[] vecs = new Vec[_cols.length];
      final Vec anyv = _base.anyVec();
      final long nrow = anyv.length();
      long r = ix[0].at8(0);
      // NOTE(review): a span was stripped by HTML extraction here -- the chunk-cache
      // setup (last_c0/last_c1/last_cs) and the per-row loop header are missing; only
      // the ">= nrow" out-of-range test remnant survives below.  Recover from upstream.
      int last_ci = anyv.elem2ChunkIdx(r = nrow) {
          for (int c = 0; c < vecs.length; c++) ncs[c].addNum(Double.NaN);
        } else {
          if (r < last_c0 || r >= last_c1) {
            // row left the cached chunk window; re-resolve and cache the new chunks
            last_ci = anyv.elem2ChunkIdx(r);
            last_c0 = anyv.espc()[last_ci];
            last_c1 = anyv.espc()[last_ci + 1];
            for (int c = 0; c < vecs.length; c++)
              last_cs[c] = vecs[c].chunkForChunkIdx(last_ci);
          }
          for (int c = 0; c < vecs.length; c++)
            if( vecs[c].isUUID() ) ncs[c].addUUID(last_cs[c], r);
            else if( vecs[c].isString() ) ncs[c].addStr(last_cs[c],r);
            else ncs[c].addNum (last_cs[c].at_abs(r));
        }
      }
    }
  }

  // Convert first 100 rows to a 2-d table
  @Override public String toString( ) { return toString(0,20); }

  // Convert len rows starting at off to a 2-d ascii table
  public String toString( long off, int len ) {
    if( off > numRows() ) off = numRows();
    if( off+len > numRows() ) len = (int)(numRows()-off);
    String[] rowHeaders = new String[len+5];
    rowHeaders[0] = "min";
    rowHeaders[1] = "mean";
    rowHeaders[2] = "stddev";
    rowHeaders[3] = "max";
    rowHeaders[4] = "missing";
    // NOTE(review): a very large span was stripped by HTML extraction here -- the rest
    // of toString (TwoDimTable construction) and the head of the DeepSlice task (class
    // header, fields, constructor, and most of map()) are missing; only fragments
    // survive below.  Recover from the upstream h2o-3 sources.
    for( int i=0; i {
    final int _cols[];
    final long _rows[];
    final byte _isInt[];
    DeepSlice( long rows[], int cols[], Vec vecs[] ) {
      _cols=cols;
      _rows=rows;
      _isInt = new byte[cols.length];
      for( int i=0; i = _rows.length) break; // All done with row selections
        long r = _rows[rx++];// Next row selector
        if (r < rstart) continue;
        rlo = (int) (r - rstart);
        rhi = rlo + 1; // Stop at the next row
        while (rx < _rows.length && (_rows[rx] - rstart) == rhi && rhi < rlen) {
          rx++;
          rhi++; // Grab sequential rows
        }
      }
      // Process this next set of rows
      // For all cols in the new set;
      BufferedString tmpStr = new BufferedString();
      for (int i = 
0; i < _cols.length; i++) {
        Chunk oc = chks[_cols[i]];
        NewChunk nc = nchks[i];
        if (_isInt[i] == 1) { // Slice on integer columns
          for (int j = rlo; j < rhi; j++)
            if (oc._vec.isUUID()) nc.addUUID(oc, j);
            else if (oc.isNA(j)) nc.addNA();
            else nc.addNum(oc.at8(j), 0);
        } else if (oc._vec.isString()) {
          for (int j = rlo; j < rhi; j++)
            nc.addStr(oc.atStr(tmpStr, j));
        } else {// Slice on double columns
          for (int j = rlo; j < rhi; j++)
            nc.addNum(oc.atd(j));
        }
      }
      rlo = rhi;
      if (_rows == null) break;
    }
  }
}

  /**
   * Create a copy of the input Frame and return that copied Frame.  All Vecs in this are copied in parallel.
   * Caller mut do the DKV.put
   * @param keyName Key for resulting frame.  If null, no key will be given.
   * @return The fresh copy of fr.
   */
  public Frame deepCopy(String keyName) {
    // NOTE(review): a span was stripped by HTML extraction inside the copy loop below
    // ("col=0;col" is the remnant of the per-column for-loop); the rest of deepCopy and
    // the DoCopyFrame header generic parameters are also damaged.  Recover from upstream.
    return new MRTask() {
      @Override public void map(Chunk[] cs, NewChunk[] ncs) {
        for(int col=0;col {
    final Vec[] _vecs;
    DoCopyFrame(Vec[] vecs) {
      _vecs = new Vec[vecs.length];
      int rowLayout = _vecs[0]._rowLayout;
      for(int i=0;i {
    @Override public void map( Chunk chks[], NewChunk nchks[] ) {
      Chunk pred = chks[chks.length - 1]; // last column is the 0/1 selection predicate
      int[] ids = new int[pred._len];
      double[] vals = new double[ids.length];
      int selected = pred.nonzeros(ids);
      int[] selectedIds = new int[selected];
      // todo keeping old behavior of ignoring missing, while R does insert missing into result
      // filter out missing
      int non_nas = 0;
      for(int i = 0; i < selected; ++i)
        if(!pred.isNA(ids[i]))
          selectedIds[non_nas++] = ids[i];
      selected = non_nas;
      for (int i = 0; i < chks.length - 1; ++i) {
        Chunk c = chks[i];
        // do not need to inflate cause sparse does not compress doubles
        NewChunk nc = nchks[i];
        if (c.isSparseZero() || c.isSparseNA()) {
          // sparse path: merge the chunk's nonzero ids against the selected ids
          nc.alloc_indices(selectedIds.length);
          nc.alloc_doubles(selectedIds.length);
          nc._sparseNA = c.isSparseNA();
          int n = c.asSparseDoubles(vals, ids);
          int k = 0;
          int l = -1;
          for (int j = 0; j < n; ++j) {
            while (k < selected && selectedIds[k] < ids[j]) ++k;
            if (k == selected) break;
            if (selectedIds[k] == ids[j]) {
              int 
add = k - l - 1;
              if(add > 0) {
                // pad the gap between the previous emitted row and this one
                if (c.isSparseZero()) nc.addZeros(add);
                else nc.addNAs(add);
              }
              nc.addNum(vals[j]);
              l = k;
              k++;
            }
          }
          int add = selected - l - 1;
          if(add > 0) {
            // pad out the tail of the output chunk
            if (c.isSparseZero()) nc.addZeros(add);
            else nc.addNAs(add);
          }
          assert nc.len() == selected:"len = " + nc.len() + ", selected = " + selected;
        } else {
          NewChunk src = new NewChunk(c);
          src = c.inflate_impl(src);
          // if(src.sparseNA() || src.sparseZero())
          //   src.cancel_sparse();
          for (int j = 0; j < selected; ++j)
            src.add2Chunk(nc, selectedIds[j]);
        }
      }
    }
  }

  /** Per-column domains for the given column subset. */
  private String[][] domains(int [] cols){
    Vec[] vecs = vecs();
    String[][] res = new String[cols.length][];
    for(int i = 0; i < cols.length; ++i)
      res[i] = vecs[cols[i]].domain();
    return res;
  }

  /** Column names for the given column subset; null when this Frame has no names. */
  private String [] names(int [] cols){
    if(_names == null)return null;
    String [] res = new String[cols.length];
    for(int i = 0; i < cols.length; ++i)
      res[i] = _names[cols[i]];
    return res;
  }

  /** Per-column types for the given column subset. */
  private byte[] types(int [] cols){
    Vec[] vecs = vecs();
    byte[] res = new byte[cols.length];
    for(int i = 0; i < cols.length; ++i)
      res[i] = vecs[cols[i]]._type;
    return res;
  }

  public Vec[] makeCompatible( Frame f) {return makeCompatible(f,false);}

  /** Return array of Vectors if 'f' is compatible with 'this', else return a new
   *  array of Vectors compatible with 'this' and a copy of 'f's data otherwise.  Note
   *  that this can, in the worst case, copy all of {@code this}s' data.
   *  @return This Frame's data in an array of Vectors that is compatible with {@code f}. 
*/ public Vec[] makeCompatible( Frame f, boolean force) { // Small data frames are always "compatible" if (anyVec() == null) // Or it is small return f.vecs(); // Then must be compatible Vec v1 = anyVec(); Vec v2 = f.anyVec(); if(v1.length() != v2.length()) throw new IllegalArgumentException("Can not make vectors of different length compatible!"); if (v2 == null || (!force && v1.checkCompatible(v2))) return f.vecs(); // Ok, here make some new Vecs with compatible layout Key k = Key.make(); H2O.submitTask(new RebalanceDataSet(this, f, k)).join(); Frame f2 = (Frame)k.get(); DKV.remove(k); for (Vec v : f2.vecs()) Scope.track(v); return f2.vecs(); } private boolean isLastRowOfCurrentNonEmptyChunk(int chunkIdx, long row) { long[] espc = anyVec().espc(); long lastRowOfCurrentChunk = espc[chunkIdx + 1] - 1; // Assert chunk is non-empty. assert espc[chunkIdx + 1] > espc[chunkIdx]; // Assert row numbering sanity. assert row <= lastRowOfCurrentChunk; return row >= lastRowOfCurrentChunk; } public static Job export(Frame fr, String path, String frameName, boolean overwrite) { // Validate input boolean fileExists = H2O.getPM().exists(path); if (overwrite && fileExists) { Log.warn("File " + path + " exists, but will be overwritten!"); } else if (!overwrite && fileExists) { throw new H2OIllegalArgumentException(path, "exportFrame", "File " + path + " already exists!"); } InputStream is = (fr).toCSV(true, false); Job job = new Job(fr._key,"water.fvec.Frame","Export dataset"); FrameUtils.ExportTask t = new FrameUtils.ExportTask(is, path, frameName, overwrite, job); return job.start(t, fr.anyVec().nChunks()); } /** Convert this Frame to a CSV (in an {@link InputStream}), that optionally * is compatible with R 3.1's recent change to read.csv()'s behavior. 
* @return An InputStream containing this Frame as a CSV */
  public InputStream toCSV(boolean headers, boolean hex_string) {
    return new CSVStream(headers, hex_string);
  }

  // Row-at-a-time CSV view over this Frame; produces one header line (optional) and one
  // line per row, freeing remote chunks as they are consumed.
  public class CSVStream extends InputStream {
    private final boolean _hex_string;
    byte[] _line;      // bytes of the line currently being served
    int _position;     // read cursor within _line
    public volatile int _curChkIdx;
    long _row;         // next row to render

    CSVStream(boolean headers, boolean hex_string) {
      _curChkIdx=0;
      _hex_string = hex_string;
      StringBuilder sb = new StringBuilder();
      Vec vs[] = vecs();
      if( headers ) {
        // NOTE(review): assumes at least one column when headers are requested
        // (_names[0] would throw on an empty Frame) -- confirm callers guarantee this.
        sb.append('"').append(_names[0]).append('"');
        for(int i = 1; i < vs.length; i++)
          sb.append(',').append('"').append(_names[i]).append('"');
        sb.append('\n');
      }
      // NOTE(review): getBytes() uses the platform default charset -- presumably UTF-8
      // is intended; confirm before relying on non-ASCII column names.
      _line = sb.toString().getBytes();
    }

    /** Render row {@code _row} as one CSV line (trailing newline included). */
    byte[] getBytesForRow() {
      StringBuilder sb = new StringBuilder();
      Vec vs[] = vecs();
      BufferedString tmpStr = new BufferedString();
      for( int i = 0; i < vs.length; i++ ) {
        if(i > 0) sb.append(',');
        if(!vs[i].isNA(_row)) {
          if( vs[i].isCategorical() ) sb.append('"').append(vs[i].factor(vs[i].at8(_row))).append('"');
          else if( vs[i].isUUID() ) sb.append(PrettyPrint.UUID(vs[i].at16l(_row), vs[i].at16h(_row)));
          else if( vs[i].isInt() ) sb.append(vs[i].at8(_row));
          else if (vs[i].isString()) sb.append('"').append(vs[i].atStr(tmpStr, _row)).append('"');
          else {
            double d = vs[i].at(_row);
            // R 3.1 unfortunately changed the behavior of read.csv().
            // (Really type.convert()).
            //
            // Numeric values with too much precision now trigger a type conversion in R 3.1 into a factor.
            //
            // See these discussions:
            //   https://bugs.r-project.org/bugzilla/show_bug.cgi?id=15751
            //   https://stat.ethz.ch/pipermail/r-devel/2014-April/068778.html
            //   http://stackoverflow.com/questions/23072988/preserve-old-pre-3-1-0-type-convert-behavior
            String s = _hex_string ? Double.toHexString(d) : Double.toString(d);
            sb.append(s);
          }
        }
      }
      sb.append('\n');
      return sb.toString().getBytes();
    }

    @Override public int available() throws IOException {
      // Case 1: There is more data left to read from the current line. 
if (_position != _line.length) { return _line.length - _position; } // Case 2: Out of data. if (_row == numRows()) { return 0; } // Case 3: Return data for the current row. // Note this will fast-forward past empty chunks. _curChkIdx = anyVec().elem2ChunkIdx(_row); _line = getBytesForRow(); _position = 0; // Flush non-empty remote chunk if we're done with it. if (isLastRowOfCurrentNonEmptyChunk(_curChkIdx, _row)) { for (Vec v : vecs()) { Key k = v.chunkKey(_curChkIdx); if( !k.home() ) H2O.raw_remove(k); } } _row++; return _line.length; } @Override public void close() throws IOException { super.close(); _line = null; } @Override public int read() throws IOException { return available() == 0 ? -1 : _line[_position++]; } @Override public int read(byte[] b, int off, int len) throws IOException { int n = available(); if(n > 0) { n = Math.min(n, len); System.arraycopy(_line, _position, b, off, n); _position += n; } return n; } } @Override public Class makeSchema() { return water.api.KeyV3.FrameKeyV3.class; } }