
package water.parser;
import water.DKV;
import water.Iced;
import water.Key;
import water.MRTask;
import water.api.ParseSetupV3;
import water.exceptions.H2OIllegalArgumentException;
import water.exceptions.H2OParseException;
import water.exceptions.H2OParseSetupException;
import water.fvec.Frame;
import water.fvec.Vec;
import water.fvec.UploadFileVec;
import water.fvec.FileVec;
import water.fvec.ByteVec;
import java.util.Arrays;
import java.util.HashSet;
/**
* Configuration and base guesser for a parse.
*/
public final class ParseSetup extends Iced {
public static final byte GUESS_SEP = -1;
public static final int NO_HEADER = -1;
public static final int GUESS_HEADER = 0;
public static final int HAS_HEADER = 1;
public static final int GUESS_COL_CNT = -1;
ParserType _parse_type; // CSV, XLS, XLSX, SVMLight, Auto, ARFF
byte _separator; // Field separator, usually comma ',' or TAB or space ' '
// Whether or not single-quotes quote a field. E.g. how do we parse:
// raw data: 123,'Mally,456,O'Mally
// singleQuotes==True ==> 2 columns: 123 and Mally,456,OMally
// singleQuotes==False ==> 4 columns: 123 and 'Mally and 456 and O'Mally
boolean _single_quotes;
int _check_header; // 1st row: 0: guess, +1 header, -1 data
int _number_columns; // Columns to parse
String[] _column_names;
byte[] _column_types; // Column types
String[][] _domains; // Domains for each column (null if numeric)
String[][] _na_strings; // Strings for NA in a given column
String[][] _data; // First few rows of parsed/tokenized data
int _chunk_size = FileVec.DFLT_CHUNK_SIZE; // Optimal chunk size to be used to store values
PreviewParseWriter _column_previews = null;
public ParseSetup(ParseSetup ps) {
this(ps._parse_type,
ps._separator, ps._single_quotes, ps._check_header, ps._number_columns,
ps._column_names, ps._column_types, ps._domains, ps._na_strings, ps._data, ps._chunk_size);
}
public ParseSetup(ParserType t, byte sep, boolean singleQuotes, int checkHeader, int ncols, String[] columnNames, byte[] ctypes, String[][] domains, String[][] naStrings, String[][] data, int chunkSize) {
_parse_type = t;
_separator = sep;
_single_quotes = singleQuotes;
_check_header = checkHeader;
_number_columns = ncols;
_column_names = columnNames;
_column_types = ctypes;
_domains = domains;
_na_strings = naStrings;
_data = data;
_chunk_size = chunkSize;
}
/**
* Create a ParseSetup with parameters from the client.
*
* Typically used to guide sampling of the data,
* to verify the chosen settings and fill in any missing ones.
*
* @param ps Parse setup settings from client
*/
public ParseSetup(ParseSetupV3 ps) {
this(ps.parse_type, ps.separator, ps.single_quotes, ps.check_header,
GUESS_COL_CNT, ps.column_names, strToColumnTypes(ps.column_types),
null, ps.na_strings, null, ps.chunk_size);
if(ps.parse_type == null) _parse_type = ParserType.GUESS;
if(ps.separator == 0) _separator = GUESS_SEP;
}
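// Defaulting behavior (sketch, based on the two checks above): a client
// ParseSetupV3 that arrives with parse_type == null and separator == 0 ends up
// with _parse_type == ParserType.GUESS and _separator == GUESS_SEP, so both
// are filled in later by the guessing pass.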
/**
* Create a ParseSetup with all parameters except chunk size.
*
* Typically used by file type parsers for returning final valid results.
* _chunk_size will be set later using results from all files.
*/
public ParseSetup(ParserType t, byte sep, boolean singleQuotes, int checkHeader,
int ncols, String[] columnNames, byte[] ctypes,
String[][] domains, String[][] naStrings, String[][] data) {
this(t, sep, singleQuotes, checkHeader, ncols, columnNames, ctypes,
domains, naStrings, data, FileVec.DFLT_CHUNK_SIZE);
}
/**
* Create a ParseSetup without any column information
*
* Typically used by file type parsers for returning final invalid results
*/
public ParseSetup(ParserType t, byte sep, boolean singleQuotes, int checkHeader, int ncols, String[][] data) {
this(t, sep, singleQuotes, checkHeader, ncols, null, null, null, null, data, FileVec.DFLT_CHUNK_SIZE);
}
/**
* Create a default ParseSetup
*
* Used by Ray's schema magic
*/
public ParseSetup() {}
public String[] getColumnNames() { return _column_names; }
public String[][] getData() { return _data; }
public String[] getColumnTypeStrings() {
String[] types = new String[_column_types.length];
for(int i=0; i< types.length; i++)
types[i] = Vec.TYPE_STR[_column_types[i]];
return types;
}
public static byte[] strToColumnTypes(String[] strs) {
if (strs == null) return null;
byte[] types = new byte[strs.length];
for(int i=0; i< types.length;i++) {
switch (strs[i].toLowerCase()) {
case "unknown": types[i] = Vec.T_BAD; break;
case "uuid": types[i] = Vec.T_UUID; break;
case "string": types[i] = Vec.T_STR; break;
case "numeric": types[i] = Vec.T_NUM; break;
case "enum": types[i] = Vec.T_ENUM; break;
case "time": types[i] = Vec.T_TIME; break;
default: types[i] = Vec.T_BAD;
throw new H2OIllegalArgumentException("Provided column type "+ strs[i] + " is unknown. ",
"Cannot proceed with parse due to invalid argument.");
}
}
return types;
}
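// Illustrative sketch (not part of the original source): mapping client type
// names to Vec type codes. Any unrecognized name aborts the parse.
//   byte[] t = ParseSetup.strToColumnTypes(new String[]{"numeric", "enum", "time"});
//   // t == { Vec.T_NUM, Vec.T_ENUM, Vec.T_TIME }
//   ParseSetup.strToColumnTypes(new String[]{"float"}); // throws H2OIllegalArgumentException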
public Parser parser(Key jobKey) {
switch(_parse_type) {
case CSV: return new CsvParser(this, jobKey);
case XLS: return new XlsParser(this, jobKey);
case SVMLight: return new SVMLightParser(this, jobKey);
case ARFF: return new ARFFParser(this, jobKey);
}
throw new H2OIllegalArgumentException("Unknown file type. Parse cannot be completed.",
"Attempted to invoke a parser for ParseType:" + _parse_type +", which doesn't exist.");
}
// Set of duplicated column names
HashSet<String> checkDupColumnNames() {
HashSet<String> conflictingNames = new HashSet<>();
if( _column_names == null ) return conflictingNames;
HashSet<String> uniqueNames = new HashSet<>();
for( String n : _column_names)
if (n != null)
(uniqueNames.contains(n) ? conflictingNames : uniqueNames).add(n);
return conflictingNames;
}
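// Example (illustrative only): for column names {"a", "b", "a", null, "b", "a"}
// the first occurrence of each name lands in uniqueNames and every repeat lands
// in conflictingNames, so the result is {"a", "b"}; nulls are skipped entirely.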
@Override public String toString() {
return _parse_type.toString(_number_columns, _separator);
}
static boolean allStrings(String [] line){
ValueString str = new ValueString();
for( String s : line ) {
try {
Double.parseDouble(s);
return false; // Number in 1st row guesses: No Column Header
} catch (NumberFormatException e) { /*Pass - determining if number is possible*/ }
str.setTo(s);
if(ParseTime.isTime(str)) return false;
if(ParseUUID.isUUID(str)) return false;
}
return true;
}
// simple heuristic to determine if we have headers:
// return true iff the first line is all strings and second line has at least one number
static boolean hasHeader(String[] l1, String[] l2) {
return allStrings(l1) && !allStrings(l2);
}
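// Worked examples of the heuristic (illustrative only):
//   hasHeader({"name","age"}, {"alice","7"}) -> true  (row 1 all strings, row 2 has a number)
//   hasHeader({"1","2"},      {"3","4"})     -> false (row 1 already contains numbers)
//   hasHeader({"a","b"},      {"c","d"})     -> false (row 2 has no number; could be all-string data)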
/**
* Used by test harnesses for simple parsing of test data. Presumes
* auto-detection for file and separator types.
*
* @param fkeys Keys to input vectors to be parsed
* @param singleQuote single quotes quote fields
* @param checkHeader check for a header
* @return ParseSetup settings from looking at all files
*/
public static ParseSetup guessSetup(Key[] fkeys, boolean singleQuote, int checkHeader) {
return guessSetup(fkeys, new ParseSetup(ParserType.GUESS, GUESS_SEP, singleQuote, checkHeader, GUESS_COL_CNT, null));
}
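// Minimal usage sketch for tests (the key and file path are hypothetical):
//   Key[] fkeys = new Key[]{ NFSFileVec.make(new File("smalldata/iris.csv"))._key };
//   ParseSetup guessed = ParseSetup.guessSetup(fkeys, true /*singleQuote*/, ParseSetup.GUESS_HEADER);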
/**
* Discover the parse setup needed to correctly parse all files.
* This takes a ParseSetup as guidance. Each file is examined
* individually and then results merged. If a conflict exists
* between any results all files are re-examined using the
* best guess from the first examination.
*
* @param fkeys Keys to input vectors to be parsed
* @param userSetup Setup guidance from user
* @return ParseSetup settings from looking at all files
*/
public static ParseSetup guessSetup( Key[] fkeys, ParseSetup userSetup ) {
//Guess setup of each file and collect results
GuessSetupTsk t = new GuessSetupTsk(userSetup);
t.doAll(fkeys).getResult();
//Calc chunk-size
Iced ice = DKV.getGet(fkeys[0]);
if (ice instanceof Frame && ((Frame) ice).vec(0) instanceof UploadFileVec) {
t._gblSetup._chunk_size = FileVec.DFLT_CHUNK_SIZE;
} else {
t._gblSetup._chunk_size = FileVec.calcOptimalChunkSize(t._totalParseSize, t._gblSetup._number_columns);
}
return t._gblSetup;
}
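// Hypothetical invocation with user guidance (sketch): force CSV but let the
// guesser pick the separator, header, and column count from the data itself.
//   ParseSetup guidance = new ParseSetup(ParserType.CSV, ParseSetup.GUESS_SEP, false,
//                                        ParseSetup.GUESS_HEADER, ParseSetup.GUESS_COL_CNT, null);
//   ParseSetup merged = ParseSetup.guessSetup(fkeys, guidance);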
/**
* Try to determine the ParseSetup on a file by file basis
* and merge results.
*/
public static class GuessSetupTsk extends MRTask<GuessSetupTsk> {
// Input
final ParseSetup _userSetup;
boolean _empty = true;
// Output
public ParseSetup _gblSetup;
public long _totalParseSize;
/**
*
* @param userSetup ParseSetup to guide examination of files
*/
public GuessSetupTsk(ParseSetup userSetup) { _userSetup = userSetup; }
/**
* Runs once on each file to guess that file's ParseSetup
*
* For ByteVecs, UploadFileVecs, compressed files and small files,
* the ParseSetup is guessed from a single DFLT_CHUNK_SIZE chunk from
* the start of the file. This is because UploadFileVecs and compressed
* files don't allow random sampling, small files don't need it, and
* ByteVecs tend to be small.
*
* For larger NSFFileVecs and HDFSFileVecs 1M samples are taken at the
* beginning of every 100M, and an additional sample is taken from the
* last chunk of the file. The results of these samples are merged
* together (and compared for consistency).
*
* Sampling more than the first bytes is preferred, since large data sets
* with sorted columns may have all the same value in their first bytes,
* making for poor type guesses.
*
*/
@Override public void map(Key key) {
Iced ice = DKV.getGet(key);
if(ice == null) throw new H2OIllegalArgumentException("Missing data","Did not find any data under key " + key);
ByteVec bv = (ByteVec)(ice instanceof ByteVec ? ice : ((Frame)ice).vecs()[0]);
byte [] bits = ZipUtil.getFirstUnzippedBytes(bv);
if(bits.length > 0) {
_empty = false;
// get file size
float decompRatio = ZipUtil.decompressionRatio(bv);
if (decompRatio > 1.0)
_totalParseSize += bv.length() * decompRatio; // estimate file size
else // avoid numerical distortion of file size when not compressed
_totalParseSize += bv.length();
// Check for supported character encodings
checkCharEncoding(bits);
// only preview 1 DFLT_CHUNK_SIZE for ByteVecs, UploadFileVecs, compressed, and small files
/* if (ice instanceof ByteVec
|| ((Frame)ice).vecs()[0] instanceof UploadFileVec
|| bv.length() <= FileVec.DFLT_CHUNK_SIZE
|| decompRatio > 1.0) { */
try {
_gblSetup = guessSetup(bits, _userSetup);
} catch (H2OParseException pse) {
throw new H2OParseSetupException(key, pse);
}
/* } else { // file is an uncompressed NFSFileVec or HDFSFileVec & larger than the DFLT_CHUNK_SIZE
FileVec fv = (FileVec) ((Frame) ice).vecs()[0];
// reset chunk size to 1M (uncompressed)
int chkSize = (int) ((1<<20) /decompRatio);
fv.setChunkSize((Frame) ice, chkSize);
// guessSetup from first chunk
_gblSetup = guessSetup(fv.getPreviewChunkBytes(0), _userSetup);
_userSetup._check_header = -1; // remaining chunks shouldn't check for header
_userSetup._parse_type = _gblSetup._parse_type; // or guess parse type
//preview 1M data every 100M
int numChunks = fv.nChunks();
for (int i=100; i < numChunks;i += 100) {
bits = fv.getPreviewChunkBytes(i);
if (bits != null)
_gblSetup = mergeSetups(_gblSetup, guessSetup(bits, _userSetup));
}
// grab sample at end of file (if not done by prev loop)
if (numChunks % 100 > 1){
bits = fv.getPreviewChunkBytes(numChunks - 1);
if (bits != null)
_gblSetup = mergeSetups(_gblSetup, guessSetup(bits, _userSetup));
}
// return chunk size to DFLT
fv.setChunkSize((Frame) ice, FileVec.DFLT_CHUNK_SIZE);
} */
// report if multiple files exist in zip archive
/* if (ZipUtil.getFileCount(bv) > 1) {
if (_gblSetup._errors != null)
_gblSetup._errors = Arrays.copyOf(_gblSetup._errors, _gblSetup._errors.length + 1);
else
_gblSetup._errors = new String[1];
_gblSetup._errors[_gblSetup._errors.length - 1] = "Only single file zip " +
"archives are currently supported, only the first file has been parsed. " +
"Remaining files have been ignored.";
}*/
}
}
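// Back-of-the-envelope for the sampling scheme described in the javadoc above
// (illustrative; note the multi-chunk path is commented out in this version):
// an uncompressed 1 GB NFS/HDFS file read with 1 MB preview chunks has ~1024
// chunks; chunk 0, chunks 100, 200, ..., 1000, and the final chunk would be
// previewed, so roughly 12 MB (about 1.2% of the file) drives the type guesses.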
/**
* Merges ParseSetup results, conflicts, and errors from several files
*/
@Override
public void reduce(GuessSetupTsk other) {
if (other._empty) return;
if (_gblSetup == null) {
_empty = false;
_gblSetup = other._gblSetup;
assert (_gblSetup != null);
return;
}
_gblSetup = mergeSetups(_gblSetup, other._gblSetup);
_totalParseSize += other._totalParseSize;
}
@Override public void postGlobal() {
if (_gblSetup._column_previews != null && _gblSetup._parse_type != ParserType.ARFF) {
_gblSetup._column_types = _gblSetup._column_previews.guessTypes();
if (_userSetup._na_strings == null)
_gblSetup._na_strings = _gblSetup._column_previews.guessNAStrings(_gblSetup._column_types);
else
_gblSetup._na_strings = _userSetup._na_strings;
}
}
private ParseSetup mergeSetups(ParseSetup setupA, ParseSetup setupB) {
if (setupA == null) return setupB;
ParseSetup mergedSetup = setupA;
mergedSetup._check_header = unifyCheckHeader(setupA._check_header, setupB._check_header);
mergedSetup._separator = unifyColumnSeparators(setupA._separator, setupB._separator);
mergedSetup._number_columns = unifyColumnCount(setupA._number_columns, setupB._number_columns);
mergedSetup._column_names = unifyColumnNames(setupA._column_names, setupB._column_names);
if (setupA._parse_type == ParserType.ARFF && setupB._parse_type == ParserType.CSV)
; // do nothing: parse_type and col_types are already set correctly
else if (setupA._parse_type == ParserType.CSV && setupB._parse_type == ParserType.ARFF) {
mergedSetup._parse_type = ParserType.ARFF;
mergedSetup._column_types = setupB._column_types;
} else if (setupA._parse_type == setupB._parse_type) {
mergedSetup._column_previews = PreviewParseWriter.unifyColumnPreviews(setupA._column_previews, setupB._column_previews);
} else
throw new H2OParseSetupException("File type mismatch. Cannot parse files of type "
+ setupA._parse_type + " and " + setupB._parse_type + " as one dataset.");
if (mergedSetup._data.length < PreviewParseWriter.MAX_PREVIEW_LINES) {
int n = mergedSetup._data.length;
int m = Math.min(PreviewParseWriter.MAX_PREVIEW_LINES, n + setupB._data.length - 1);
mergedSetup._data = Arrays.copyOf(mergedSetup._data, m);
System.arraycopy(setupB._data, 0, mergedSetup._data, n, m - n);
}
return mergedSetup;
}
private static int unifyCheckHeader(int chkHdrA, int chkHdrB){
if (chkHdrA == GUESS_HEADER || chkHdrB == GUESS_HEADER)
throw new H2OParseSetupException("Unable to determine header on a file. Not expected.");
if (chkHdrA == HAS_HEADER || chkHdrB == HAS_HEADER) return HAS_HEADER;
else return NO_HEADER;
}
private static byte unifyColumnSeparators(byte sepA, byte sepB) {
if( sepA == sepB) return sepA;
else if (sepA == GUESS_SEP) return sepB;
else if (sepB == GUESS_SEP) return sepA;
// TODO: Point out which file is problem
throw new H2OParseSetupException("Column separator mismatch. One file seems to use \""
+ (char) sepA + "\" and the other uses \"" + (char) sepB + "\".");
}
private static int unifyColumnCount(int cntA, int cntB) {
if (cntA == cntB) return cntA;
else if (cntA == 0) return cntB;
else if (cntB == 0) return cntA;
else { // files contain different numbers of columns
// TODO: Point out which file is problem
throw new H2OParseSetupException("Files conflict in number of columns. " + cntA
+ " vs. " + cntB + ".");
}
}
private static String[] unifyColumnNames(String[] namesA, String[] namesB){
if (namesA == null) return namesB;
else if (namesB == null) return namesA;
else {
for (int i = 0; i < namesA.length; i++) {
if (i >= namesB.length || !namesA[i].equals(namesB[i])) { // >= guards against indexing past the end of namesB
// TODO improvement: if files match except for blanks, merge?
throw new H2OParseSetupException("Column names do not match between files.");
}
}
return namesA;
}
}
}
/**
* Guess everything from a single pile-o-bits. Used in tests, or in initial
* parser inspections when the user has not told us anything about separators
* or headers.
*
* @param bits Initial bytes from a parse source
* @return ParseSetup settings from looking at all files
*/
public static ParseSetup guessSetup( byte[] bits, ParseSetup userSetup ) {
return guessSetup(bits, userSetup._parse_type, userSetup._separator, GUESS_COL_CNT, userSetup._single_quotes, userSetup._check_header, userSetup._column_names, userSetup._column_types, null, null);
}
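// Sketch of guessing straight from bytes (illustrative; the bits are made up):
//   byte[] bits = "sepal_len,sepal_wid\n5.1,3.5\n4.9,3.0\n".getBytes();
//   ParseSetup guess = ParseSetup.guessSetup(bits,
//       new ParseSetup(ParserType.GUESS, ParseSetup.GUESS_SEP, false,
//                      ParseSetup.GUESS_HEADER, ParseSetup.GUESS_COL_CNT, null));
//   // expected: CSV type, ',' separator, 2 columns, header detected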
private static final ParserType[] guessFileTypeOrder = {ParserType.ARFF, ParserType.XLS, ParserType.XLSX, ParserType.SVMLight, ParserType.CSV};
public static ParseSetup guessSetup( byte[] bits, ParserType pType, byte sep, int ncols, boolean singleQuotes, int checkHeader, String[] columnNames, byte[] columnTypes, String[][] domains, String[][] naStrings ) {
switch( pType ) {
case CSV: return CsvParser.guessSetup(bits, sep, ncols, singleQuotes, checkHeader, columnNames, columnTypes, naStrings);
case SVMLight: return SVMLightParser.guessSetup(bits);
case XLS: return XlsParser.guessSetup(bits);
case ARFF: return ARFFParser.guessSetup(bits, sep, singleQuotes, columnNames, naStrings);
case GUESS:
for( ParserType pTypeGuess : guessFileTypeOrder ) {
try {
ParseSetup ps = guessSetup(bits,pTypeGuess,sep,ncols,singleQuotes,checkHeader,columnNames,columnTypes, domains, naStrings);
if( ps != null) return ps;
} catch( Throwable ignore ) { /*ignore failed parse attempt*/ }
}
}
throw new H2OParseSetupException("Cannot determine file type.");
}
/**
* Cleans up the file name to make .hex name
* to be used as a destination key. Eliminates
* common file extensions, and replaces odd
* characters.
*
* @param n filename to be cleaned
* @return cleaned name
*/
public static String createHexName(String n) {
// blahblahblah/myName.ext ==> myName
// blahblahblah/myName.csv.ext ==> myName
int sep = n.lastIndexOf(java.io.File.separatorChar);
if( sep > 0 ) n = n.substring(sep+1);
int dot = n.lastIndexOf('.');
while (dot > 0 && (n.endsWith("zip")
|| n.endsWith("gz")
|| n.endsWith("csv")
|| n.endsWith("xls")
|| n.endsWith("txt")
|| n.endsWith("svm")
|| n.endsWith("arff"))) {
n = n.substring(0, dot);
dot = n.lastIndexOf('.');
}
// "2012_somedata" ==> "X2012_somedata"
if( !Character.isJavaIdentifierStart(n.charAt(0)) ) n = "X"+n;
// "human%Percent" ==> "human_Percent"
char[] cs = n.toCharArray();
for( int i=1; i<cs.length; i++ )
  if( !Character.isJavaIdentifierPart(cs[i]) )
    cs[i] = '_';
// "myName" ==> "myName.hex"
n = new String(cs);
int i = 0;
String res = n + ".hex";
Key k = Key.make(res);
// Renumber to handle dup names
while(DKV.get(k) != null)
k = Key.make(res = n + ++i + ".hex");
return res;
}
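// Worked examples (illustrative): assuming no existing keys collide,
//   createHexName("/data/2015/myName.csv.gz") -> "myName.hex"
//   createHexName("2012_somedata.txt")        -> "X2012_somedata.hex"
//   createHexName("human%Percent.csv")        -> "human_Percent.hex"
// If "myName.hex" is already taken in the DKV, the result is renumbered to
// "myName1.hex", "myName2.hex", and so on.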
/**
* Reject unsupported encodings
*
* For the curious, this is hardly a complete test; it only catches the
* most polite UTF-16 cases. Switch to the jChardet or guessEncoding libraries
* for a more robust solution. WARNING: not all UTF-16 files
* use a BOM to indicate their encoding. Even worse, some datasets may be
* made from disparate sources, and could use a mix that wouldn't be
* detected by this.
*
* @param bits data to be examined for encoding
*/
private static void checkCharEncoding(byte[] bits) {
if (bits.length >= 2) {
if ((bits[0] == (byte) 0xff && bits[1] == (byte) 0xfe) /* UTF-16, little endian */ ||
(bits[0] == (byte) 0xfe && bits[1] == (byte) 0xff) /* UTF-16, big endian */) {
throw new H2OParseSetupException("UTF16 encoding detected, but is not supported.");
}
}
}
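// Example (illustrative): a UTF-16LE file starts with the byte-order mark
// 0xFF 0xFE, so checkCharEncoding(new byte[]{(byte)0xff, (byte)0xfe, ...})
// throws H2OParseSetupException before any parsing is attempted.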
} // ParseSetup state class