water.parser.ARFFParser Maven / Gradle / Ivy

Go to download
package water.parser;

import java.util.ArrayList;
import water.Key;
import water.exceptions.H2OParseSetupException;
import water.fvec.Vec;

class ARFFParser extends CsvParser {
  private static final byte GUESS_SEP = ParseSetup.GUESS_SEP;

  ARFFParser(ParseSetup ps, Key jobKey) { super(ps, jobKey); }

  /** Try to parse the bytes as ARFF format  */
  static ParseSetup guessSetup(byte[] bits, byte sep, boolean singleQuotes, String[] columnNames, String[][] naStrings) {
    if (columnNames != null) throw new UnsupportedOperationException("ARFFParser doesn't accept columnNames.");

    // Parse all lines starting with @ until EOF or @DATA
    boolean haveData = false;
    int offset = 0;
    String[][] data;
    String[] labels;
    String[][] domains;
    String[] headerlines = new String[0];
    byte[] ctypes;

    // header section
    ArrayList header = new ArrayList<>();
    offset = readArffHeader(offset, header, bits, singleQuotes);
    if (offset < bits.length && !CsvParser.isEOL(bits[offset]))
      haveData = true; //more than just the header

    if (header.size() == 0)
      throw new H2OParseSetupException("No data!");
    headerlines = header.toArray(headerlines);

    // process header
    final int nlines = headerlines.length;
    int ncols = nlines;
    data = new String[ncols][];
    labels = new String[ncols];
    domains = new String[ncols][];
    ctypes = new byte[ncols];
    processArffHeader(ncols, headerlines, data, labels, domains, ctypes);

    // data section (for preview)
    if (haveData) {
      String[] datalines = new String[0];
      ArrayList datablock = new ArrayList<>();
      while (offset < bits.length) {
        int lineStart = offset;
        while (offset < bits.length && !CsvParser.isEOL(bits[offset])) ++offset;
        int lineEnd = offset;
        ++offset;
        // For Windoze, skip a trailing LF after CR
        if ((offset < bits.length) && (bits[offset] == CsvParser.CHAR_LF)) ++offset;
        if (bits[lineStart] == '#') continue; // Ignore      comment lines
        if (bits[lineStart] == '%') continue; // Ignore ARFF comment lines
        if (lineEnd > lineStart) {
          String str = new String(bits, lineStart, lineEnd - lineStart).trim();
          if (!str.isEmpty()) datablock.add(str);
        }
      }
      if (datablock.size() == 0)
        throw new H2OParseSetupException("Unexpected line.");
      datalines = datablock.toArray(datalines);

      // process data section
      int nlines2 = Math.min(10, datalines.length);
      data = new String[nlines2][];

      // First guess the field separator by counting occurrences in first few lines
      if (nlines2 == 1) {
        if (sep == GUESS_SEP) {
          if (datalines[0].split(",").length > 2) sep = (byte) ',';
          else if (datalines[0].split(" ").length > 2) sep = ' ';
          else
            throw new H2OParseSetupException("Failed to detect separator.");
        }
        data[0] = determineTokens(datalines[0], sep, singleQuotes);
        ncols = (ncols > 0) ? ncols : data[0].length;
        labels = null;
      } else {                    // 2 or more lines
        if (sep == GUESS_SEP) {   // first guess the separator
          sep = guessSeparator(datalines[0], datalines[1], singleQuotes);
          if (sep == GUESS_SEP && nlines2 > 2) {
            sep = guessSeparator(datalines[1], datalines[2], singleQuotes);
            if (sep == GUESS_SEP) sep = guessSeparator(datalines[0], datalines[2], singleQuotes);
          }
          if (sep == GUESS_SEP) sep = (byte) ' '; // Bail out, go for space
        }

        for (int i = 0; i < nlines2; ++i) {
          data[i] = determineTokens(datalines[i], sep, singleQuotes);
        }
      }
    }

    // Return the final setup
    return new ParseSetup(ParserType.ARFF, sep, singleQuotes, ParseSetup.NO_HEADER, ncols, labels, ctypes, domains, naStrings, data);
  }

  private static int readArffHeader(int offset, ArrayList header, byte[] bits, boolean singleQuotes) {
    while (offset < bits.length) {
      int lineStart = offset;
      while (offset < bits.length && !CsvParser.isEOL(bits[offset])) ++offset;
      int lineEnd = offset;
      ++offset;
      // For Windoze, skip a trailing LF after CR
      if ((offset < bits.length) && (bits[offset] == CsvParser.CHAR_LF)) ++offset;
      if (bits[lineStart] == '#') continue; // Ignore      comment lines
      if (bits[lineStart] == '%') continue; // Ignore ARFF comment lines
      if (lineEnd > lineStart) {
        if (bits[lineStart] == '@' &&
                (bits[lineStart+1] == 'D' || bits[lineStart+1] =='d' ) &&
                (bits[lineStart+2] == 'A' || bits[lineStart+2] =='a' ) &&
                (bits[lineStart+3] == 'T' || bits[lineStart+3] =='t' ) &&
                (bits[lineStart+4] == 'A' || bits[lineStart+4] =='a' )){
          break;
        }
        String str = new String(bits, lineStart, lineEnd - lineStart).trim();
        String[] tok = determineTokens(str, CHAR_SPACE, singleQuotes);
        if (tok.length > 0 && tok[0].equalsIgnoreCase("@RELATION")) continue; // Ignore name of dataset
        if (!str.isEmpty()) header.add(str);
      }
    }
    return offset;
  }

  private static void processArffHeader(int ncols, String[] headerlines, String[][] data, String[] labels, String[][] domains, byte[] ctypes) {
    for (int i=0; i ");
        }
        labels[i] = data[i][1];
        String type = data[i][2];
        domains[i] = null;
        if (type.equalsIgnoreCase("NUMERIC") || type.equalsIgnoreCase("REAL") || type.equalsIgnoreCase("INTEGER") || type.equalsIgnoreCase("INT")) {
          ctypes[i] = Vec.T_NUM;
          continue;
        }
        else if (type.equalsIgnoreCase("DATE") || type.equalsIgnoreCase("TIME")) {
          ctypes[i] = Vec.T_TIME;
          continue;
        }
        else if (type.equalsIgnoreCase("ENUM")) {
          ctypes[i] = Vec.T_ENUM;
          continue;
        }
        else if (type.equalsIgnoreCase("STRING")) {
          ctypes[i] = Vec.T_STR;
          continue;
        }
        else if (type.equalsIgnoreCase("UUID")) { //extension of ARFF
          ctypes[i] = Vec.T_UUID;
          continue;
        }
        else if (type.equalsIgnoreCase("RELATIONAL")) {
          throw new UnsupportedOperationException("Relational ARFF format is not supported.");
        }
        else if (type.startsWith("{") && data[i][data[i].length-1].endsWith("}")) {
          StringBuilder builder = new StringBuilder();
          for(int j = 2; j < data[i].length; j++) {
            builder.append(data[i][j]);
          }
          domains[i] = builder.toString().replaceAll("[{}]", "").split(",");
          if (domains[i][0].length() > 0) {
            // case of {A,B,C} (valid list of factors)
            ctypes[i] = Vec.T_ENUM;
            continue;
          }
        }

        // only get here if data is invalid ARFF
        throw new H2OParseSetupException("Unexpected line.");
      }
    }

  }
}