
water.util.FrameUtils Maven / Gradle / Ivy
package water.util;
import java.io.*;
import java.net.URI;
import java.util.Random;
import water.*;
import water.fvec.Chunk;
import water.fvec.Frame;
import water.fvec.NFSFileVec;
import water.fvec.Vec;
import water.parser.ParseDataset;
import water.parser.ParseSetup;
public class FrameUtils {
/** Parse the given file(s) into a single frame stored under the given key.
*
* @param okey destination key for parsed frame
* @param files files to parse
* @return a new frame
*/
public static Frame parseFrame(Key okey, File ...files) throws IOException {
if (files == null || files.length == 0) {
throw new IllegalArgumentException("List of files is empty!");
}
for (File f : files) {
if (!f.exists())
throw new FileNotFoundException("File not found " + f);
}
// Create output key if it is not given
if(okey == null) okey = Key.make(files[0].getName());
Key[] inKeys = new Key[files.length];
for (int i=0; i {
/** Length of the result array; read by setupLocal() when allocating {@code res}. */
final int N;

/** Extracted values; map() writes each value at index (chunk start + row within chunk). */
public double[] res;

/**
 * @param N length of the result array to allocate
 */
public Vec2ArryTsk(int N) {
  this.N = N;
}
/** Allocates the result array of {@code N} doubles through {@code MemoryManager.malloc8d}. */
@Override
public void setupLocal() {
  res = MemoryManager.malloc8d(N);
}
/**
 * Copies values of the given chunk into {@code res}.
 *
 * Row 0 is always read; subsequent rows are the ones reported by {@code c.nextNZ(row)}.
 * Rows that nextNZ skips are never written, so they keep whatever value the array was
 * allocated with (presumably 0 from malloc8d -- confirm).
 *
 * @param c chunk to copy from
 */
@Override
public void map(Chunk c) {
  // Position of this chunk's first row inside the result array.
  final int base = (int) c.start();
  int row = 0;
  while (row < c._len) {
    res[base + row] = c.atd(row);
    row = c.nextNZ(row);
  }
}
/**
 * Merges the partial result of another task into this one by element-wise addition.
 *
 * Nothing is done when both tasks already share the same array instance.
 *
 * @param other task whose {@code res} is folded into this task's {@code res}
 */
@Override
public void reduce(Vec2ArryTsk other) {
  if (res == other.res) {
    return; // same backing array, nothing to merge
  }
  for (int idx = 0; idx < res.length; ++idx) {
    // At most one side is expected to hold a nonzero value for any given index.
    assert res[idx] == 0 || other.res[idx] == 0;
    res[idx] += other.res[idx];
  }
}
}
/**
 * Extracts the values of the given Vec into a double array.
 *
 * @param v Vec to extract; must not have more than 100000 rows
 * @return array filled by running a Vec2ArryTsk over {@code v}
 * @throws IllegalArgumentException if {@code v} has more than 100000 rows
 */
public static double [] asDoubles(Vec v) {
  if (v.length() > 100000) {
    throw new IllegalArgumentException("Vec is too big to be extracted into array");
  }
  final int rows = (int) v.length();
  final Vec2ArryTsk extractor = new Vec2ArryTsk(rows);
  return extractor.doAll(v).res;
}
/**
 * Task that copies the values of a Vec into an int array.
 *
 * The class is parameterized with itself: {@code reduce(Vec2IntArryTsk)} is declared as an
 * override and callers read {@code res} straight off the value returned by {@code doAll},
 * both of which require the self-type argument rather than the raw {@code MRTask}.
 */
private static class Vec2IntArryTsk extends MRTask<Vec2IntArryTsk> {
  /** Length of the result array; read by setupLocal() when allocating {@code res}. */
  final int N;

  /** Extracted values; map() writes each value at index (chunk start + row within chunk). */
  public int[] res;

  /**
   * @param N length of the result array to allocate
   */
  public Vec2IntArryTsk(int N) {
    this.N = N;
  }

  /** Allocates the result array of {@code N} ints through {@code MemoryManager.malloc4}. */
  @Override
  public void setupLocal() {
    res = MemoryManager.malloc4(N);
  }

  /**
   * Copies values of the given chunk into {@code res}, narrowing each long to int.
   *
   * Row 0 is always read; subsequent rows are the ones reported by {@code c.nextNZ(i)}.
   * Rows that nextNZ skips are never written.
   *
   * @param c chunk to copy from
   */
  @Override
  public void map(Chunk c) {
    // Position of this chunk's first row inside the result array.
    final int off = (int) c.start();
    for (int i = 0; i < c._len; i = c.nextNZ(i)) {
      res[off + i] = (int) c.at8(i);
    }
  }

  /**
   * Merges the partial result of another task into this one by element-wise addition.
   * Nothing is done when both tasks already share the same array instance.
   *
   * @param other task whose {@code res} is folded into this task's {@code res}
   */
  @Override
  public void reduce(Vec2IntArryTsk other) {
    if (res != other.res) {
      for (int i = 0; i < res.length; ++i) {
        // At most one side is expected to hold a nonzero value for any given index.
        assert res[i] == 0 || other.res[i] == 0;
        res[i] += other.res[i];
      }
    }
  }
}
/**
 * Extracts the values of the given Vec into an int array.
 *
 * @param v Vec to extract; must not have more than 100000 rows
 * @return array filled by running a Vec2IntArryTsk over {@code v}
 * @throws IllegalArgumentException if {@code v} has more than 100000 rows
 */
public static int [] asInts(Vec v) {
  if (v.length() > 100000) {
    throw new IllegalArgumentException("Vec is too big to be extracted into array");
  }
  final int rows = (int) v.length();
  final Vec2IntArryTsk extractor = new Vec2IntArryTsk(rows);
  return extractor.doAll(v).res;
}
/**
 * Compute a chunk summary (how many chunks of each type, relative size, total size).
 *
 * @param fr frame over which a new ChunkSummary task is run
 * @return the chunk summary returned by running the task over {@code fr}
 */
public static ChunkSummary chunkSummary(Frame fr) {
  final ChunkSummary summary = new ChunkSummary();
  return summary.doAll(fr);
}
/**
 * Generate the given number of keys by suffixing the key with a numbered suffix.
 *
 * Delegates to {@code generateNumKeys(Key, int, String)} with {@code "_part"} as the
 * {@code delim} argument.
 *
 * @param mk  key the generated keys are derived from
 * @param num number of keys to generate
 * @return the generated keys
 */
public static Key[] generateNumKeys(Key mk, int num) {
  final String defaultDelim = "_part";
  return generateNumKeys(mk, num, defaultDelim);
}
public static Key[] generateNumKeys(Key mk, int num, String delim) {
Key[] ks = new Key[num];
String n = mk!=null ? mk.toString() : "noname";
String suffix = "";
if (n.endsWith(".hex")) {
n = n.substring(0, n.length()-4); // be nice
suffix = ".hex";
}
for (int i=0; i _job;
/** Key of the frame to modify; execImpl() looks it up in DKV. */
final Key _dataset;

/** A cell is set to NA when its random draw is below this value; execImpl() requires [0, 1]. */
final double _fraction;

/** Base seed, combined with column and row position to reseed the RNG for every cell. */
final long _seed;

/**
 * @param frame key of the frame to insert missing values into
 * @param seed  base seed for the per-cell random draws
 * @param frac  fraction threshold compared against each random draw
 */
public MissingInserter(Key frame, long seed, double frac) {
  _dataset = frame;
  _seed = seed;
  _fraction = frac;
}
/**
 * Driver for MissingInserter: runs a task over the frame that sets randomly chosen
 * cells to NA, then signals completion.
 */
class MissingInserterDriver extends H2O.H2OCountedCompleter {
  /** Frame whose cells are overwritten with NAs. */
  final Frame _frame;

  MissingInserterDriver(Frame frame) {
    _frame = frame;
  }

  @Override
  public void compute2() {
    new MRTask() {
      @Override
      public void map(Chunk[] cs) {
        final Random rng = RandomUtils.getRNG(0);
        for (int col = 0; col < cs.length; col++) {
          final Chunk chunk = cs[col];
          for (int row = 0; row < chunk._len; row++) {
            // Reseed for every cell from the base seed, column index and absolute row.
            // Parentheses spell out the precedence: '^' binds looser than '+' and '*'.
            final long cellSeed = (_seed + 1234 * col) ^ (1723 * (chunk.start() + row));
            rng.setSeed(cellSeed);
            if (rng.nextDouble() < _fraction) {
              chunk.setNA(row);
            }
          }
        }
        // One unit of job progress per map() call.
        _job.update(1);
      }
    }.doAll(_frame);
    tryComplete();
  }
}
/**
 * Creates the job, validates the inputs and starts the MissingInserterDriver.
 *
 * The job's work amount is the number of chunks in the frame's first Vec.
 *
 * @return the started job
 * @throws IllegalArgumentException if the frame key is not present in DKV, or the
 *         fraction is below 0 or above 1
 */
public Job execImpl() {
  _job = new Job(_dataset, Frame.class.getName(), "MissingValueInserter");
  if (DKV.get(_dataset) == null) {
    throw new IllegalArgumentException("Invalid Frame key " + _dataset + " (Frame doesn't exist).");
  }
  if (_fraction < 0 || _fraction > 1 ) {
    throw new IllegalArgumentException("fraction must be between 0 and 1.");
  }
  final Frame target = DKV.getGet(_dataset);
  final int chunkCount = target.vecs()[0].nChunks();
  final MissingInserterDriver driver = new MissingInserterDriver(target);
  return _job.start(driver, chunkCount);
}
}
/**
 * Compute the average per-chunk stored fraction of the given chunks.
 *
 * Each chunk contributes {@code sparseLenNA()/len()} when it is NA-sparse,
 * {@code sparseLenZero()/len()} when it is zero-sparse, and 1 otherwise; the
 * contributions are then averaged over all chunks.
 *
 * Note: for an empty array the result is NaN (0 multiplied by 1.0/0).
 *
 * @param chks chunks to inspect
 * @return mean of the per-chunk contributions described above
 */
public static double sparseRatio(Chunk [] chks) {
  final double weight = 1.0 / chks.length;
  double total = 0;
  for (int k = 0; k < chks.length; k++) {
    final Chunk chk = chks[k];
    final double contribution;
    if (chk.isSparseNA()) {
      contribution = chk.sparseLenNA() / (double) chk.len();
    } else if (chk.isSparseZero()) {
      contribution = chk.sparseLenZero() / (double) chk.len();
    } else {
      contribution = 1;
    }
    total += contribution;
  }
  return total * weight;
}
public static class WeightedMean extends MRTask {
private double _wresponse;
private double _wsum;
public double weightedMean() {
return _wsum == 0 ? 0 : _wresponse / _wsum;
}
@Override public void map(Chunk response, Chunk weight, Chunk offset) {
for (int i=0;i {
/** Source stream that copyStream() reads from. */
final InputStream _csv;

/** Destination path handed to the persist manager in compute2(). */
final String _path;

/** Frame name, used only in the completion log message. */
final String _frameName;

/** Overwrite flag handed to the persist manager when creating the output. */
final boolean _overwrite;

/** Job that receives progress updates while copying. */
final Job _j;

/**
 * @param csv       stream to read the exported data from
 * @param path      destination path
 * @param frameName name reported in the log message
 * @param overwrite overwrite flag passed on when the destination is created
 * @param j         job to report progress to
 */
public ExportTask(InputStream csv, String path, String frameName, boolean overwrite, Job j) {
  this._csv = csv;
  this._path = path;
  this._frameName = frameName;
  this._overwrite = overwrite;
  this._j = j;
}
/**
 * Copies everything readable from {@code _csv} to the given stream and reports progress.
 *
 * After every written buffer the source stream's current chunk index is read (the
 * stream is cast to Frame.CSVStream) and the job is updated by the number of chunks
 * advanced since the previous report.
 *
 * @param os          destination stream
 * @param buffer_size size of the copy buffer in bytes
 * @return number of bytes copied
 * @throws RuntimeException wrapping any exception raised while reading, writing or
 *         reporting progress
 */
private long copyStream(OutputStream os, final int buffer_size) {
  long total = 0;
  int reportedIdx = 0;
  try {
    final byte[] buf = new byte[buffer_size];
    int n;
    // A non-positive read count ends the copy.
    while ((n = _csv.read(buf, 0, buffer_size)) > 0) {
      total += n;
      os.write(buf, 0, n);
      final int currentIdx = ((Frame.CSVStream) _csv)._curChkIdx;
      if (currentIdx != reportedIdx) {
        _j.update(currentIdx - reportedIdx);
        reportedIdx = currentIdx;
      }
    }
  } catch (Exception ex) {
    throw new RuntimeException(ex);
  }
  return total;
}
/**
 * Creates the destination through the persist manager, copies the CSV stream into it
 * with a 4 MB buffer, then flushes and closes the destination.
 *
 * The stream is closed even when flush() fails, and the "was written" message is logged
 * only when the copy actually finished. Failures while flushing or closing are logged
 * and not rethrown, so they never mask an exception coming from the copy itself.
 */
@Override public void compute2() {
  OutputStream os = null;
  long written = -1; // stays -1 when create() or copyStream() throws
  try {
    os = H2O.getPM().create(_path, _overwrite);
    written = copyStream(os, 4 * 1024 * 1024);
  } finally {
    if (os != null) {
      try {
        try {
          os.flush(); // Seems redundant, but seeing a short-file-read on windows sometimes
        } finally {
          os.close(); // must run even if flush() threw, otherwise the stream leaks
        }
        if (written >= 0) {
          // Report success only when the copy completed.
          Log.info("Key '" + _frameName + "' of "+written+" bytes was written to " + _path + ".");
        }
      } catch (Exception e) {
        Log.err(e);
      }
    }
  }
  tryComplete();
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy