All Downloads are FREE. Search and download functionalities are using the official Maven repository.

water.fvec.TestFrameBuilder Maven / Gradle / Ivy

The newest version!
package water.fvec;

import org.junit.Ignore;
import water.DKV;
import water.Key;
import water.Scope;
import water.rapids.Env;
import water.rapids.Session;
import water.util.ArrayUtils;

import java.util.*;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.Stream;

/**
 * Class used for creating simple test frames using the builder pattern.
 *
 * <p>Example usage:
 * <pre>{@code
 * final Frame builder = new TestFrameBuilder()
 *   .withName("testFrame")
 *   .withColNames("ColA", "ColB", "ColC")
 *   .withVecTypes(Vec.T_NUM, Vec.T_STR, Vec.T_CAT)
 *   .withDataForCol(0, ard(Double.NaN, 1, 2, 3, 4, 5.6, 7))
 *   .withDataForCol(1, ar("A", "B", "C", "E", "F", "I", "J"))
 *   .withDataForCol(2, ar("A", "B,", "A", "C", "A", "B", "A"))
 *   .withChunkLayout(2, 2, 2, 1)
 *   .build();
 * }</pre>
 *
 * <p>Data for a categorical column are set in the same way as for a string column and the levels are
 * created automatically.
 *
 * <p>All methods in this builder are optional:
 * <ul>
 *   <li>Frame name is created if not provided.</li>
 *   <li>Column names are created automatically if not provided.</li>
 *   <li>Vector types are initialized to all T_NUMs when not provided. For example, an empty frame
 *       (no data, no columns) can be created as {@code Frame fr = new TestFrameBuilder().build()}.</li>
 *   <li>Column data are initialized to an empty array when not provided. The following example creates
 *       a frame with 1 column (one vector type is given), but no data:
 *       {@code Frame fr = new TestFrameBuilder().withVecTypes(Vec.T_NUM).build()}.</li>
 *   <li>Only one chunk is created when chunk layout is not provided.</li>
 * </ul>
 *
 * <p>The frame created will be automatically tracked in the currently active {@link Scope}.
 *
 * <p>NOTE(review): this copy of the source looks damaged by text extraction and will not compile as is.
 * The Map fields and the Map parameters of the private helpers have lost their generic type arguments
 * (the code indexes and iterates their values as typed arrays and strings), and the text between the
 * start of the last loop in checkVecTypes and the middle of checkChunkLayout is missing, including the
 * declarations of checkNames and checkFrameName that prepareAndCheck calls. In addition, the private
 * section is collapsed onto lines where a line comment swallows the code that follows it. Restore the
 * file from the upstream repository before relying on it.
 */ @Ignore public class TestFrameBuilder { private static final long NOT_SET = -1; private Map stringData = new HashMap<>(); private Map numericData = new HashMap<>(); private Map givenDomains = new HashMap<>(); private String frameName; private byte[] vecTypes; private String[] colNames; private long[] chunkLayout; private int numCols; private Key key; private long numRows = NOT_SET; private String[][] domains = null; private Map categoriesPerCol = new HashMap<>(); /** * Sets the name for the frame. Default name is created if this method is not called. * * @param frameName name of the frame; must not start with a dollar sign * @throws IllegalArgumentException if the name starts with a dollar sign (such names may only be used through the overload taking a Session) */ public TestFrameBuilder withName(String frameName) { throwIf(frameName.startsWith("$"), "Frame name " + frameName + " may only be used with a Session object."); this.frameName = frameName; return this; } /** * Sets the name for the frame after expanding it with an {@link Env} created for the given session; the expanded name is then passed to the single-argument overload. * * @param frameName name to be expanded by Env.expand * @param session session used to construct the Env */ public TestFrameBuilder withName(String frameName, Session session) { return withName(new Env(session).expand(frameName)); } /** * Sets the names for the columns. Default names are created if this method is not called. */ public TestFrameBuilder withColNames(String... colNames) { this.colNames = colNames; return this; } /** * Sets the vector types. If this method is not called, the types default to an empty array when no column names were set, and to T_NUM for every column name otherwise. */ public TestFrameBuilder withVecTypes(byte... 
vecTypes) { this.vecTypes = vecTypes; return this; } /** * Sets the vector types to a single, uniform value for each vector * * @param nvecs Number of vectors in the frame * @param vecType Uniform type of the vectors */ public TestFrameBuilder withUniformVecTypes(final int nvecs, final byte vecType) { byte[] vecTypes = new byte[nvecs]; for (int i = 0; i < nvecs; i++) { vecTypes[i] = vecType; } this.vecTypes = vecTypes; return this; } /** * Generates a sequence of integer data for a particular column. * * NOTE(review): the backing array has length to - from but is indexed by the generated value itself, so the loop only stays within bounds when from is 0; the index was probably meant to be i - from. Confirm and fix upstream. * * @param column for which to set data * @param from minimal value to generate (included) * @param to maximum value to generate (excluded); must be greater than from (checked by an assert only) */ public TestFrameBuilder withSequenceIntDataForCol(int column, int from, int to) { assert to > from; int size = to-from; double[] arr = new double[size]; for(int i = from; i < to; i++) { arr[i] = i; } numericData.put(column, arr); return this; } /** * Generates random integer data for a particular column; each value is min plus Random.nextInt(max - min) and is stored as a double. A new Random is created for every row, seeded with seed + i * size. * * NOTE(review): the second assert compares a long expression against Long.MAX_VALUE and therefore can never fail; it does not guard against overflow. * * @param column for which to set data * @param size size of randomly generated column * @param min minimal value to generate * @param max upper bound of the generated values; must be greater than min (checked by an assert only) * @param seed base seed from which the per-row seeds are derived */ public TestFrameBuilder withRandomIntDataForCol(int column, int size, int min, int max, long seed) { assert max > min; assert seed + size * size <= Long.MAX_VALUE; double[] arr = new double[size]; for(int i = 0; i < size; i++) { arr[i] = min + new Random(seed + i * size).nextInt(max - min); } numericData.put(column, arr); return this; } /** * Generates random double data for a particular column; each value is min + (max - min) * Random.nextDouble(). A new Random is created for every row, seeded with seed + i * size. * * @param column for which to set data * @param size size of randomly generated column * @param min minimal value to generate * @param max upper bound of the generated values; must not be smaller than min (checked by an assert only) * @param seed base seed from which the per-row seeds are derived */ public TestFrameBuilder withRandomDoubleDataForCol(int column, int size, int min, int max, long seed) { assert max >= min; double[] arr = new double[size]; for(int i = 0; i < size; i++) { arr[i] = min + (max - min) * new Random(seed + i * size).nextDouble(); } numericData.put(column, arr); return this; } /** * Generates random binary data for a 
particular column. The values are the strings produced by Boolean.toString and are stored as string data for the column. * * @param column for which to set data * @param size number of values to generate * @param seed base seed; the generator is re-seeded for every row with seed + i * 0x5DEECE66D */ public TestFrameBuilder withRandomBinaryDataForCol(int column, int size, long seed) { String[] arr = new String[size]; Random generator = new Random(); long multiplierFromRandomClass = 0x5DEECE66DL; assert seed + size * multiplierFromRandomClass < Long.MAX_VALUE; for(int i = 0; i < size; i++) { generator.setSeed(seed + i * multiplierFromRandomClass); arr[i] = Boolean.toString( generator.nextBoolean()); } stringData.put(column, arr); return this; } /** * Sets data for a particular column * * @param column for which to set data * @param data array of string data */ public TestFrameBuilder withDataForCol(int column, String[] data) { stringData.put(column, data); return this; } /** * Sets data for a particular column * * @param column for which to set data * @param data array of double data */ public TestFrameBuilder withDataForCol(int column, double[] data) { numericData.put(column, data); return this; } /** * Sets data for a particular column * * @param column for which to set data * @param data array of long data; every value is converted to double, and a null array is stored as null */ public TestFrameBuilder withDataForCol(int column, long[] data) { if(data == null){ numericData.put(column, null); }else { double[] doubles = new double[data.length]; for (int i = 0; i < data.length; i++) { doubles[i] = data[i]; } numericData.put(column, doubles); } return this; } /** * Sets data for a particular column * * @param column for which to set data * @param data array of int data; converted with ArrayUtils.toDouble and stored as numeric data */ public TestFrameBuilder withDataForCol(int column, int[] data) { double[] doubles = ArrayUtils.toDouble(data); return withDataForCol(column, doubles); } /** * Sets an explicit domain for a column. When a domain is given, the category levels follow the order of this array instead of the default lexicographic order. * * @param column for which to set the domain * @param domain domain values in level order */ public TestFrameBuilder withDomain(int column, String[] domain) { givenDomains.put(column, domain); return this; } /** * Sets the chunk layout: one entry per chunk, giving the number of rows stored in that chunk. * * @param chunkLayout sizes of the individual chunks */ public TestFrameBuilder withChunkLayout(long... 
chunkLayout) { this.chunkLayout = chunkLayout; return this; } /** * Checks and completes the collected settings, creates the frame chunk by chunk, finalizes it and tracks it in the current {@link Scope}. * * @return the finalized frame */ public Frame build() { prepareAndCheck(); // Create a frame Frame f = new Frame(key); f.preparePartialFrame(colNames); f.update(); // Create chunks int cidx = 0; long start = 0; for (long chnkSize : chunkLayout) { createChunks(start, chnkSize, cidx); cidx++; start = start + chnkSize; } // Reload frame from DKV f = DKV.get(key).get(); // Finalize frame f.finalizePartialFrame(chunkLayout, domains, vecTypes); Scope.track(f); return f; } //-------------------------------------------------------------------------------------------------------------------- // Private //-------------------------------------------------------------------------------------------------------------------- private void prepareAndCheck(){ // this check has to be run as the first one checkVecTypes(); checkNames(); // check that we have data for all columns and all columns have the same number of elements checkColumnData(); checkFrameName(); checkChunkLayout(); prepareCategoricals(); } // Utility method to get unique values from categorical domain private String[] getUniqueValues(Map mapping){ String[] values = new String[mapping.size()]; for (String key : mapping.keySet()) values[mapping.get(key)] = key; return values; } // Utility method to convert domain into categories private Integer[] applyDomainMapping(Map mapping, String[] original){ Integer[] categoricals = new Integer[original.length]; for(int i = 0; i < original.length; i++) { categoricals[i] = original[i] == null ? null : mapping.get(original[i]); } return categoricals; } private Map getMapping(String[] array) { return getMapping(array, false); } // Utility method to get mapping from domain member to its level private Map getMapping(String[] array, boolean useOrderInArray){ Map mapping = new TreeMap<>(); int level = 0; for (String item : array) { if ((item != null) && (! mapping.containsKey(item))) { mapping.put(item, useOrderInArray ? 
level++ : 0); } } if (!useOrderInArray) { // use lexicographic order instead (default behaviour of H2O parser) for (Map.Entry entry : mapping.entrySet()) { entry.setValue(level++); } } return mapping; } private void prepareCategoricals(){ // domains is not null if there is any T_CAT for (int colIdx = 0; colIdx < vecTypes.length; colIdx++) { if (givenDomains.containsKey(colIdx)) { // domain set explicitly String[] doms = givenDomains.get(colIdx); domains[colIdx] = doms; Map mapping = getMapping(doms, true); Integer[] categories = applyDomainMapping(mapping, stringData.get(colIdx)); categoriesPerCol.put(colIdx, categories); } else if (vecTypes[colIdx]==Vec.T_CAT) { // default domain extraction (use lexicographical order) Map mapping = getMapping(stringData.get(colIdx)); Integer[] categories = applyDomainMapping(mapping, stringData.get(colIdx)); domains[colIdx] = getUniqueValues(mapping); categoriesPerCol.put(colIdx, categories); } else { if(domains != null) { domains[colIdx] = null; } } } } private void createChunks(long start, long length, int cidx) { NewChunk[] nchunks = Frame.createNewChunks(key.toString(), vecTypes, cidx); for (int i = (int) start; i < start + length; i++) { for (int colIdx = 0; colIdx < vecTypes.length; colIdx++) { switch (vecTypes[colIdx]) { case Vec.T_NUM: nchunks[colIdx].addNum(numericData.get(colIdx)[i]); break; case Vec.T_STR: nchunks[colIdx].addStr(stringData.get(colIdx)[i]); break; case Vec.T_TIME: nchunks[colIdx].addNum(numericData.get(colIdx)[i]); break; case Vec.T_CAT: Integer cat = categoriesPerCol.get(colIdx)[i]; if (cat != null) nchunks[colIdx].addCategorical(cat); else nchunks[colIdx].addNA(); break; case Vec.T_UUID: final String strValue = stringData.get(colIdx)[i]; if (strValue == null) nchunks[colIdx].addNA(); else { UUID uuidValue = UUID.fromString(strValue); nchunks[colIdx].addUUID(uuidValue); } break; case Vec.T_BAD: nchunks[colIdx].addNum(numericData.get(colIdx)[i]); break; default: throw new 
UnsupportedOperationException("Unsupported Vector type for the builder"); } } } Frame.closeNewChunks(nchunks); } // this check has to be called as the first one private void checkVecTypes() { if(vecTypes==null){ if (colNames == null) { vecTypes = new byte[0]; } else { vecTypes = new byte[colNames.length]; for (int i = 0; i < colNames.length; i++) vecTypes[i] = Vec.T_NUM; } } numCols = vecTypes.length; for(int i=0; i numRows, "Total chunk capacity is higher then available number of elements. " + "Check withChunkLayout() and make sure that sum of the arguments is equal to number of the rows in frame."); throwIf(sum < numRows, "Not enough chunk capacity to store " + numRows + " rows. " + "Check withChunkLayout() and make sure that sum of the arguments is equal to number of the rows in frame."); } else { // create chunk layout - by default 1 chunk chunkLayout = new long[]{numRows}; } } private void checkColumnData() { for (int colIdx = 0; colIdx < numCols; colIdx++) { switch (vecTypes[colIdx]) { case Vec.T_TIME: // fall-through to T_NUM case Vec.T_NUM: if (numRows == NOT_SET) { numRows = numericData.get(colIdx).length; } else { throwIf(numRows != numericData.get(colIdx).length, "Columns have different number of elements"); } break; case Vec.T_CAT: // fall-through to T_CAT case Vec.T_STR: case Vec.T_UUID: if (numRows == NOT_SET) { numRows = stringData.get(colIdx).length; } else { throwIf(numRows != stringData.get(colIdx).length, "Columns have different number of elements"); } break; case Vec.T_BAD: final double[] data = numericData.get(colIdx); numRows = data.length; for (double datum : data) { throwIf(!Double.isNaN(datum), "All elements in a bad column must be NAs."); } break; default: throw new UnsupportedOperationException("Unsupported Vector type for the builder"); } } } private void throwIf(boolean condition, String msg){ if(condition){ throw new IllegalArgumentException(msg); } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy