package water.parser;

import jsr166y.CountedCompleter;
import jsr166y.ForkJoinTask;
import jsr166y.RecursiveAction;
import water.*;
import water.H2O.H2OCountedCompleter;
import water.exceptions.H2OIllegalArgumentException;
import water.exceptions.H2OParseException;
import water.fvec.*;
import water.fvec.Vec.VectorGroup;
import water.nbhm.NonBlockingHashMap;
import water.nbhm.NonBlockingSetInt;
import water.util.*;

import java.io.IOException;
import java.io.InputStream;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import java.util.zip.GZIPInputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;

import com.google.common.base.Charsets;

public final class ParseDataset extends Job {
  private MultiFileParseTask _mfpt; // Access to partially built vectors for cleanup after parser crash

  // Keys are limited to ByteVec Keys and Frames-of-1-ByteVec Keys
  public static Frame parse(Key okey, Key... keys) { return parse(okey,keys,true, false, ParseSetup.GUESS_HEADER); }

  // Guess setup from inspecting the first Key only, then parse.
  // Suitable for e.g. testing setups, where the data is known to be sane.
  // NOT suitable for random user input!
  public static Frame parse(Key okey, Key[] keys, boolean deleteOnDone, boolean singleQuote, int checkHeader) {
    return parse(okey,keys,deleteOnDone,ParseSetup.guessSetup(keys, singleQuote, checkHeader));
  }
  public static Frame parse(Key okey, Key[] keys, boolean deleteOnDone, ParseSetup globalSetup) {
    return parse(okey,keys,deleteOnDone,globalSetup,true).get();
  }
  public static ParseDataset parse(Key okey, Key[] keys, boolean deleteOnDone, ParseSetup globalSetup, boolean blocking) {
    ParseDataset job = forkParseDataset(okey, keys, globalSetup, deleteOnDone);
    if( blocking )
      job.get();
    return job;
  }
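
  // Illustrative usage (not from the original source; key names are hypothetical and assume
  // the raw file has already been imported into the DKV):
  //
  //   Key okey = Key.make("myfile.hex");            // destination Frame key
  //   Key fkey = Key.make("myfile.csv");            // imported raw-file key
  //   Frame fr = ParseDataset.parse(okey, fkey);    // guess setup from the file, block until done
  //
  //   // Non-blocking variant with an explicit setup:
  //   ParseSetup setup = ParseSetup.guessSetup(new Key[]{fkey}, false /*singleQuote*/, ParseSetup.GUESS_HEADER);
  //   ParseDataset job = ParseDataset.parse(okey, new Key[]{fkey}, true /*deleteOnDone*/, setup, false /*blocking*/);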

  // Allow both ByteVec keys and Frame-of-1-ByteVec
  static ByteVec getByteVec(Key key) {
    Iced ice = DKV.getGet(key);
    if(ice == null)
      throw new H2OIllegalArgumentException("Missing data","Did not find any data under key " + key);
    return (ByteVec)(ice instanceof ByteVec ? ice : ((Frame)ice).vecs()[0]);
  }
  static String [] getColumnNames(int ncols, String[] colNames) {
    if(colNames == null) { // no names, generate
      colNames = new String[ncols];
      for(int i=0; i < ncols; i++ )
        colNames[i] = "C" + String.valueOf(i+1);
    } else { // some or all names exist, fill in blanks
      HashSet<String> nameSet = new HashSet<>(Arrays.asList(colNames));
      colNames = Arrays.copyOf(colNames, ncols);
      for(int i=0; i < ncols; i++ ) {
        if (colNames[i] == null || colNames[i].equals("")) {
          String tmp = "C" + String.valueOf(i + 1);
          while (nameSet.contains(tmp)) // keep building name until unique
            tmp = tmp + tmp;
          colNames[i] = tmp;
        }
      }
    }
    return colNames;
  }
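
  // Illustrative behavior of getColumnNames (not from the original source):
  //   getColumnNames(3, null)                      -> {"C1", "C2", "C3"}
  //   getColumnNames(3, new String[]{"age", ""})   -> {"age", "C2", "C3"}   (array padded to ncols)
  //   getColumnNames(2, new String[]{"C2", ""})    -> {"C2", "C2C2"}        ("C2" is taken, so the generated name is doubled until unique)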

  // Same parse, as a backgroundable Job
  public static ParseDataset forkParseDataset(final Key dest, final Key[] keys, final ParseSetup setup, boolean deleteOnDone) {
    HashSet<String> conflictingNames = setup.checkDupColumnNames();
    for( String x : conflictingNames )
      if( !x.equals("") )
        throw new IllegalArgumentException("Found duplicate column name "+x);
    // Some quick sanity checks: no overwriting your input key, and a resource check.
    long totalParseSize=0;
    for( int i = 0; i < keys.length; ++i ) {
      if( dest.equals(keys[i]) )
        throw new IllegalArgumentException("Destination key "+dest+" must be different from all input keys");
      totalParseSize += getByteVec(keys[i]).length(); // estimate total raw input size in bytes
    }
    long memsz = H2O.CLOUD.memsz();
    if( totalParseSize > memsz*4 )
      throw new IllegalArgumentException("Total input file size of "+PrettyPrint.bytes(totalParseSize)+" is much larger than total cluster memory of "+PrettyPrint.bytes(memsz)+", please use either a larger cluster or smaller data.");

    if (H2O.GA != null)
      GAUtils.logParse(totalParseSize, keys.length, setup._number_columns);

    // Fire off the parse
    ParseDataset job = new ParseDataset(dest);
    new Frame(job.dest(),new String[0],new Vec[0]).delete_and_lock(job._key); // Write-Lock BEFORE returning
    for( Key k : keys ) Lockable.read_lock(k,job._key); // Read-Lock BEFORE returning
    ParserFJTask fjt = new ParserFJTask(job, keys, setup, deleteOnDone); // Fire off background parse
    job.start(fjt, totalParseSize, true);
    return job;
  }

  // Setup a private background parse job
  private ParseDataset(Key dest) {
    super(dest,"Parse");
  }

  // -------------------------------
  // Simple internal class doing background parsing, with trackable Job status
  public static class ParserFJTask extends water.H2O.H2OCountedCompleter {
    final ParseDataset _job;
    final Key[] _keys;
    final ParseSetup _setup;
    final boolean _deleteOnDone;

    public ParserFJTask( ParseDataset job, Key[] keys, ParseSetup setup, boolean deleteOnDone) {
      _job = job;
      _keys = keys;
      _setup = setup;
      _deleteOnDone = deleteOnDone;
    }
    @Override public void compute2() {
      parseAllKeys(_job, _keys, _setup, _deleteOnDone);
      tryComplete();
    }

    // Took a crash/NPE somewhere in the parser.  Attempt cleanup.
    @Override public boolean onExceptionalCompletion(Throwable ex, CountedCompleter caller){
      if( _job != null ) {
        _job.failed(ex);
        parseCleanup();
        _job._mfpt = null;
      }
      return true;
    }

    @Override public void onCompletion(CountedCompleter caller) {
      if (_job.isCancelledOrCrashed()) {
        parseCleanup();
        _job._mfpt = null;
      } else {
        _job._mfpt = null;
        _job.done();
      }
    }

    private void parseCleanup() {
      if( _job != null ) {
        Futures fs = new Futures();
        assert DKV.getGet(_job._key).isStopped();
        // Find & remove all partially-built output vecs & chunks
        if (_job._mfpt != null) _job._mfpt.onExceptionCleanup(fs);
        // Assume the input is corrupt - or already partially deleted after
        // parsing.  Nuke it all - no partial Vecs lying around.
        for (Key k : _keys) Keyed.remove(k, fs);
        Keyed.remove(_job._dest,fs);
        fs.blockForPending();
        DKV.put(_job._key, _job);
      }
    }
  }

  private static class CategoricalUpdateMap extends Iced {
    final int [][] map;
    public CategoricalUpdateMap(int[][] map){this.map = map;}
  }
  // --------------------------------------------------------------------------
  // Top-level parser driver
  private static void parseAllKeys(ParseDataset job, Key[] fkeys, ParseSetup setup, boolean deleteOnDone) {
    assert setup._number_columns > 0;
    if( setup._column_names != null &&
        ( (setup._column_names.length == 0) ||
          (setup._column_names.length == 1 && setup._column_names[0].isEmpty())) )
      setup._column_names = null; // FIXME: annoyingly front end sends column names as String[] {""} even if setup returned null
    if(setup._na_strings != null && setup._na_strings.length != setup._number_columns) setup._na_strings = null;
    if( fkeys.length == 0) { job.cancel();  return;  }

    job.update(0, "Ingesting files.");
    VectorGroup vg = getByteVec(fkeys[0]).group();
    MultiFileParseTask mfpt = job._mfpt = new MultiFileParseTask(vg,setup,job._key,fkeys,deleteOnDone);
    mfpt.doAll(fkeys);
    Log.trace("Done ingesting files.");
    if ( job.isCancelledOrCrashed()) return;

    final AppendableVec [] avs = mfpt.vecs();
    setup._column_names = getColumnNames(avs.length, setup._column_names);

    Frame fr = null;
    // Calculate enum domain
    // Filter down to columns with some enums
    int n = 0;
    int[] ecols2 = new int[avs.length];
    for( int i = 0; i < ecols2.length; ++i )
      if( avs[i].shouldBeEnum()  )
        ecols2[n++] = i;
    final int[] ecols = Arrays.copyOf(ecols2, n);
    // If we have any, go gather unified enum domains
    if( n > 0 ) {
      job.update(0, "Collecting categorical domains across nodes.");
      {
        GatherCategoricalDomainsTask gcdt = new GatherCategoricalDomainsTask(mfpt._eKey, ecols).doAllNodes();
        //Test domains for excessive length.
        List<String> offendingColNames = new ArrayList<>();
        for (int i = 0; i < ecols.length; i++) {
          if (gcdt.getDomainLength(i) < Categorical.MAX_CATEGORICAL_COUNT)
            avs[ecols[i]].setDomain(gcdt.getDomain(i));
          else
            offendingColNames.add(setup._column_names[ecols[i]]);
        }
        if (offendingColNames.size() > 0)
          throw new H2OParseException("Exceeded categorical limit on columns "+ offendingColNames+".   Consider reparsing these columns as a string.");
      }

      job.update(0, "Compressing data.");
      fr = new Frame(job.dest(), setup._column_names,AppendableVec.closeAll(avs));
      fr.update(job._key);
      // Update enums to the globally agreed numbering
      Vec[] evecs = new Vec[ecols.length];
      for( int i = 0; i < evecs.length; ++i ) evecs[i] = fr.vecs()[ecols[i]];

      job.update(0, "Unifying categoricals across nodes.");
      {
        // new CreateParse2GlobalCategoricalMaps(mfpt._eKey).doAll(evecs);
        // Using DTask since it starts and returns faster than an MRTask
        CreateParse2GlobalCategoricalMaps[] fcdt = new CreateParse2GlobalCategoricalMaps[H2O.CLOUD.size()];
        RPC[] rpcs = new RPC[H2O.CLOUD.size()];
        for (int i = 0; i < fcdt.length; i++){
          H2ONode[] nodes = H2O.CLOUD.members();
          fcdt[i] = new CreateParse2GlobalCategoricalMaps(mfpt._eKey, fr._key, ecols.length);
          rpcs[i] = new RPC<>(nodes[i], fcdt[i]).call();
        }
        for (RPC rpc : rpcs)
          rpc.get();

        new UpdateCategoricalChunksTask(mfpt._eKey, mfpt._chunk2ParseNodeMap).doAll(evecs);
        MultiFileParseTask._enums.remove(mfpt._eKey);
      }
      Log.trace("Done unifying categoricals across nodes.");
    } else {                    // No enums case
      job.update(0,"Compressing data.");
      fr = new Frame(job.dest(), setup._column_names,AppendableVec.closeAll(avs));
      Log.trace("Done closing all Vecs.");
    }
    // Check for job cancellation
    if ( job.isCancelledOrCrashed()) return;

    // SVMLight is sparse format, there may be missing chunks with all 0s, fill them in
    if (setup._parse_type == ParserType.SVMLight)
      new SVFTask(fr).doAllNodes();

    // Check for job cancellation
    if ( job.isCancelledOrCrashed()) return;

    // Log any errors
    if( mfpt._errors != null )
      for( String err : mfpt._errors )
        Log.warn(err);
    job.update(0,"Calculating data summary.");
    logParseResults(job, fr);
    // Release the frame for overwriting
    fr.update(job._key);
    Frame fr2 = DKV.getGet(fr._key);
    assert fr2._names.length == fr2.numCols();
    fr.unlock(job._key);
    // Remove CSV files from H2O memory
    if( deleteOnDone )
      for( Key k : fkeys )
        assert DKV.get(k) == null : "Input key "+k+" not deleted during parse";
  }
  private static class CreateParse2GlobalCategoricalMaps extends DTask {
    private final Key _parseCatMapsKey;
    private final Key _frKey;
    private final byte _priority;
    private final int _eColCnt;

    private CreateParse2GlobalCategoricalMaps(Key parseCatMapsKey, Key key, int eColCnt) {
      _parseCatMapsKey = parseCatMapsKey;
      _frKey = key;
      _eColCnt = eColCnt;
      _priority = nextThrPriority();
    }

    @Override public byte priority() {return _priority;}
    @Override public void compute2() {
      Frame _fr = DKV.getGet(_frKey);
      // get the node local category->ordinal maps for each column from initial parse pass
      if( !MultiFileParseTask._enums.containsKey(_parseCatMapsKey) ) {
        tryComplete();
        return;
      }
        final Categorical[] parseCatMaps = MultiFileParseTask._enums.get(_parseCatMapsKey);
        int[][] _nodeOrdMaps = new int[_eColCnt][];

        // create old_ordinal->new_ordinal map for each cat column
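        // Illustrative example (not from the original source): if this node's parse assigned
        // local ordinals {"dog"->0, "cat"->1} but the globally agreed, sorted domain is
        // {"cat", "dog"}, the map built below is {1, 0}: local ordinal 0 ("dog") maps to
        // global index 1, and local ordinal 1 ("cat") maps to global index 0.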
        int catColIdx = 0;
        for (int colIdx = 0; colIdx < parseCatMaps.length; colIdx++) {
          if (parseCatMaps[colIdx].size() != 0) {
            _nodeOrdMaps[catColIdx] = MemoryManager.malloc4(parseCatMaps[colIdx].maxId() + 1);
            Arrays.fill(_nodeOrdMaps[catColIdx], -1);
            //Bulk String->ValueString conversion is slightly faster, but consumes memory
            final ValueString[] unifiedDomain = ValueString.toValueString(_fr.vec(colIdx).domain());
            //final String[] unifiedDomain = _fr.vec(colIdx).domain();
            for (int i = 0; i < unifiedDomain.length; i++) {
              //final ValueString cat = new ValueString(unifiedDomain[i]);
              if (parseCatMaps[colIdx].containsKey(unifiedDomain[i])) {
                _nodeOrdMaps[catColIdx][parseCatMaps[colIdx].getTokenId(unifiedDomain[i])] = i;
              }
            }
            catColIdx++;
          }
        }

        // Store the local->global ordinal maps in DKV by node parse enum key and node index
        DKV.put(Key.make(_parseCatMapsKey.toString() + "parseCatMapNode" + H2O.SELF.index()), new CategoricalUpdateMap(_nodeOrdMaps));
      tryComplete();
    }
  }

  // --------------------------------------------------------------------------
  /** Task to update enum (categorical) values to match the global numbering scheme.
   *  Performs update in place so that values originally numbered using
   *  node-local unordered numbering will be numbered using global numbering.
   *  @author tomasnykodym
   */
  private static class UpdateCategoricalChunksTask extends MRTask<UpdateCategoricalChunksTask> {
    private final Key _parseCatMapsKey;
    private final int  [] _chunk2ParseNodeMap;

    private UpdateCategoricalChunksTask(Key parseCatMapsKey, int[] chunk2ParseNodeMap) {
      _parseCatMapsKey = parseCatMapsKey;
      _chunk2ParseNodeMap = chunk2ParseNodeMap;
    }

    @Override public void map(Chunk [] chks){
      CategoricalUpdateMap temp = DKV.getGet(Key.make(_parseCatMapsKey.toString() + "parseCatMapNode" + _chunk2ParseNodeMap[chks[0].cidx()]));
      //FIXME sanity check
      int[][] _parse2GlobalCatMaps = temp.map;

      //update the chunk with the new map
      final int cidx = chks[0].cidx();
      for(int i = 0; i < chks.length; ++i) {
        Chunk chk = chks[i];
        if (!(chk instanceof CStrChunk)) {
          for( int j = 0; j < chk._len; ++j){
            if( chk.isNA(j) )continue;
            final int old = (int) chk.at8(j);
            if (old < 0 || old >= _parse2GlobalCatMaps[i].length)
              chk.reportBrokenEnum(i, j, old, _parse2GlobalCatMaps[i], _fr.vec(i).domain().length);
            if(_parse2GlobalCatMaps[i][old] < 0)
              throw new H2OParseException("Error in unifying categorical values. This is typically "
                  +"caused by unrecognized characters in the data.\n The problem categorical value "
                  +"occurred in the " + PrettyPrint.withOrdinalIndicator(i+1)+ " categorical col, "
                  +PrettyPrint.withOrdinalIndicator(chk.start() + j) +" row.");
              //throw new H2OParseException(H2O.SELF.toString() + ": missing enum at col:" + i + ", row: " + (chk.start() + j) + ", val = " + old + ", chunk=" + chk.getClass().getSimpleName() + ", map = " + Arrays.toString(_parse2GlobalCatMaps[i]));
            chk.set(j, _parse2GlobalCatMaps[i][old]);
          }
        }
        chk.close(cidx, _fs);
      }
    }
    @Override public void postGlobal() {
      for (int i=0; i < H2O.CLOUD.size(); i++)
        DKV.remove(Key.make(_parseCatMapsKey.toString() + "parseCatMapNode" + i));
    }
  }
  private static class GatherCategoricalDomainsTask extends MRTask<GatherCategoricalDomainsTask> {
    private final Key _k;
    private final int[] _catColIdxs;
    private byte[][] _packedDomains;

    private GatherCategoricalDomainsTask(Key k, int[] ccols) {
      _k = k;
      _catColIdxs = ccols;
    }

    @Override
    public void setupLocal() {
      if (!MultiFileParseTask._enums.containsKey(_k)) return;
      _packedDomains = new byte[_catColIdxs.length][];
      final ValueString[][] _perColDomains = new ValueString[_catColIdxs.length][];
      final Categorical[] _colCats = MultiFileParseTask._enums.get(_k);
      int i = 0;
      for (int col : _catColIdxs) {
        _colCats[col].convertToUTF8(col + 1);
        _perColDomains[i] = _colCats[col].getColumnDomain();
        Arrays.sort(_perColDomains[i]);
        _packedDomains[i] = packDomain(_perColDomains[i]);
        i++;
      }
    }

    @Override
    public void reduce(final GatherCategoricalDomainsTask other) {
      if (_packedDomains == null) {
        _packedDomains = other._packedDomains;
      } else if (other._packedDomains != null) { // merge two packed domains
        H2OCountedCompleter[] domtasks = new H2OCountedCompleter[_catColIdxs.length];
        for (int i = 0; i < _catColIdxs.length; i++) {
          final int fi = i;
          final GatherCategoricalDomainsTask fOther = other;
          H2O.submitTask(domtasks[i] = new H2OCountedCompleter() {
            @Override
            public void compute2() {
              // merge sorted packed domains with duplicate removal
              final byte[] thisDom = _packedDomains[fi];
              final byte[] otherDom = fOther._packedDomains[fi];
              final int tLen = UnsafeUtils.get4(thisDom, 0), oLen = UnsafeUtils.get4(otherDom, 0);
              int tDomLen = UnsafeUtils.get4(thisDom, 4);
              int oDomLen = UnsafeUtils.get4(otherDom, 4);
              ValueString tCat = new ValueString(thisDom, 8, tDomLen);
              ValueString oCat = new ValueString(otherDom, 8, oDomLen);
              int ti = 0, oi = 0, tbi = 8, obi = 8, mbi = 4, mergeLen = 0;
              byte[] mergedDom = new byte[thisDom.length + otherDom.length];
              // merge
              while (ti < tLen && oi < oLen) {
                // compare thisDom to otherDom
                int x = tCat.compareTo(oCat);
                // this < or equal to other
                if (x <= 0) {
                  UnsafeUtils.set4(mergedDom, mbi, tDomLen); //Store str len
                  mbi += 4;
                  for (int j = 0; j < tDomLen; j++)
                    mergedDom[mbi++] = thisDom[tbi++];
                  tDomLen = UnsafeUtils.get4(thisDom, tbi);
                  tbi += 4;
                  tCat.set(thisDom, tbi, tDomLen);
                  ti++;
                  if (x == 0) { // this == other
                    obi += oDomLen;
                    oDomLen = UnsafeUtils.get4(otherDom, obi);
                    obi += 4;
                    oCat.set(otherDom, obi, oDomLen);
                    oi++;
                  }
                } else { // other < this
                  UnsafeUtils.set4(mergedDom, mbi, oDomLen); //Store str len
                  mbi += 4;
                  for (int j = 0; j < oDomLen; j++)
                    mergedDom[mbi++] = otherDom[obi++];
                  oDomLen = UnsafeUtils.get4(otherDom, obi);
                  obi += 4;
                  oCat.set(otherDom, obi, oDomLen);
                  oi++;
                }
                mergeLen++;
              }
              // merge remainder of longer list
              if (ti < tLen) {
                tbi -= 4;
                int remainder = thisDom.length - tbi;
                System.arraycopy(thisDom,tbi,mergedDom,mbi,remainder);
                mbi += remainder;
                mergeLen += tLen - ti;
              } else { //oi < oLen
                obi -= 4;
                int remainder = otherDom.length - obi;
                System.arraycopy(otherDom,obi,mergedDom,mbi,remainder);
                mbi += remainder;
                mergeLen += oLen - oi;
              }
              _packedDomains[fi]  = Arrays.copyOf(mergedDom, mbi);// reduce size
              UnsafeUtils.set4(_packedDomains[fi], 0, mergeLen);
              tryComplete();
            }
          });
        }
        for (int i = 0; i < _catColIdxs.length; i++) if (domtasks[i] != null) domtasks[i].join();
      }
    }

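    // Packed-domain layout assumed by packDomain(), getDomainLength(), getDomain() and the
    // sorted-merge in reduce() above (described here for clarity; inferred from the code):
    //   bytes [0,4)        int - number of strings in the domain
    //   then, per string:  int - UTF-8 byte length, followed by that many bytes
    // Strings are kept in sorted order so two packed domains can be merged in one linear pass.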
    private byte[] packDomain(ValueString[] domain) {
      int totStrLen =0;
      for(ValueString dom : domain)
        totStrLen += dom.length();
      final byte[] packedDom = MemoryManager.malloc1(4 + (domain.length << 2) + totStrLen, false);
      UnsafeUtils.set4(packedDom, 0, domain.length); //Store domain size
      int i = 4;
      for(ValueString dom : domain) {
        UnsafeUtils.set4(packedDom, i, dom.length()); //Store str len
        i += 4;
        byte[] buf = dom.getBuffer();
        for(int j=0; j < buf.length; j++) //Store str chars
          packedDom[i++] = buf[j];
      }
      return packedDom;
    }
    public int getDomainLength(int colIdx) {
      if (_packedDomains == null) return 0;
      else return UnsafeUtils.get4(_packedDomains[colIdx], 0);
    }

    public String[] getDomain(int colIdx) {
      if (_packedDomains == null) return null;
      final int strCnt = UnsafeUtils.get4(_packedDomains[colIdx], 0);
      final String[] res = new String[strCnt];
      int j = 4;
      for (int i=0; i < strCnt; i++) {
        final int strLen = UnsafeUtils.get4(_packedDomains[colIdx], j);
        j += 4;
        res[i] = new String(_packedDomains[colIdx], j, strLen, Charsets.UTF_8);
        j += strLen;
      }
      return res;
    }
  }

  // --------------------------------------------------------------------------
  // Run once on all nodes; fill in missing zero chunks
  private static class SVFTask extends MRTask<SVFTask> {
    private final Frame _f;
    private SVFTask( Frame f ) { _f = f; }
    @Override public void setupLocal() {
      if( _f.numCols() == 0 ) return;
      Vec v0 = _f.anyVec();
      ArrayList<RecursiveAction> rs = new ArrayList<>();
      for( int i = 0; i < v0.nChunks(); ++i ) {
        if( !v0.chunkKey(i).home() ) continue;
        final int fi = i;
        rs.add(new RecursiveAction() {
          @Override
          protected void compute() {
            // First find the nrows as the # rows of non-missing chunks; done on
            // locally-homed chunks only - to keep the data distribution.
            int nlines = 0;
            for( Vec vec : _f.vecs() ) {
              Value val = H2O.get(vec.chunkKey(fi)); // Local-get only
              if( val != null ) {
                nlines = ((Chunk)val.get())._len;
                break;
              }
            }
            final int fnlines = nlines;
            // Now fill in appropriate-sized zero chunks
            for(int j = 0; j < _f.numCols(); ++j) {
              Vec vec = _f.vec(j);
              Key k = vec.chunkKey(fi);
              Value val = H2O.get(k);   // Local-get only
              if( val == null )         // Missing?  Fill in w/zero chunk
                H2O.putIfMatch(k, new Value(k, new C0LChunk(0, fnlines)), null);
            }
          }
        });
      }
      ForkJoinTask.invokeAll(rs);
    }
    @Override public void reduce( SVFTask drt ) {}
  }

  // --------------------------------------------------------------------------
  // We want to do a standard MRTask with a collection of file-keys (so the
  // files are parsed in parallel across the cluster), but we want to throttle
  // the parallelism on each node.
  private static class MultiFileParseTask extends MRTask<MultiFileParseTask> {
    private final ParseSetup _parseSetup; // The expected column layout
    private final VectorGroup _vg;    // vector group of the target dataset
    private final int _vecIdStart;    // Start of available vector keys
    // Shared across all concurrent unrelated parses: a map from parse key to the
    // node-local Categorical arrays for each concurrent parse.
    private static NonBlockingHashMap<Key, Categorical[]> _enums = new NonBlockingHashMap<>();
    // The Key used to sort out *this* parse's Categorical[]
    private final Key _eKey = Key.make();
    // Eagerly delete Big Data
    private final boolean _deleteOnDone;
    // Mapping from Chunk# to node index holding the initial category mappings.
    // It is either self for all the non-parallel parses, or the Chunk-home for parallel parses.
    private int[] _chunk2ParseNodeMap;
    // Job Key, to unlock & remove raw parsed data; to report progress
    private final Key _jobKey;
    // A mapping of Key+ByteVec to rolling total Chunk counts.
    private final int[]  _fileChunkOffsets;

    // OUTPUT fields:
    FVecParseWriter[] _dout;
    String[] _errors;

    int _reservedKeys;
    MultiFileParseTask(VectorGroup vg,  ParseSetup setup, Key jobKey, Key[] fkeys, boolean deleteOnDone ) {
      _vg = vg; _parseSetup = setup;
      _vecIdStart = _vg.reserveKeys(_reservedKeys = _parseSetup._parse_type == ParserType.SVMLight ? 100000000 : setup._number_columns);
      _deleteOnDone = deleteOnDone;
      _jobKey = jobKey;

      // A mapping of Key+ByteVec to rolling total Chunk counts.
      _fileChunkOffsets = new int[fkeys.length];
      int len = 0;
      for( int i = 0; i < fkeys.length; ++i ) {
        _fileChunkOffsets[i] = len;
        len += getByteVec(fkeys[i]).nChunks();
      }

      // Mapping from Chunk# to cluster-node-number
      _chunk2ParseNodeMap = MemoryManager.malloc4(len);
      Arrays.fill(_chunk2ParseNodeMap, -1);
    }

    private AppendableVec [] _vecs;

    @Override public void postGlobal(){
      Log.trace("Begin file parse cleanup.");
      int n = _dout.length-1;
      while(_dout[n] == null && n != 0)--n;
      for(int i = 0; i <= n; ++i) {
        if (_dout[i] == null) {
          _dout[i] = _dout[n];
          n--;
          while (n > i && _dout[n] == null) n--;
        }
      }
      if(n < _dout.length-1)
        _dout = Arrays.copyOf(_dout,n+1);
      if(_dout.length == 1) {
        _vecs = _dout[0]._vecs;
        return;
      }
      int nCols = 0;
      for(FVecParseWriter dout:_dout)
        nCols = Math.max(dout._vecs.length,nCols);
      AppendableVec [] res = new AppendableVec[nCols];
      int nchunks = 0;
      for(FVecParseWriter dout:_dout)
        nchunks += dout.nChunks();
      long [] espc = MemoryManager.malloc8(nchunks);
      for(int i = 0; i < res.length; ++i) {
        res[i] = new AppendableVec(_vg.vecKey(_vecIdStart + i), espc, 0);
        res[i].setTypes(MemoryManager.malloc1(nchunks));
      }
      for(int i = 0; i < _dout.length; ++i)
        for(int j = 0; j < _dout[i]._vecs.length; ++j)
          res[j].setSubRange(_dout[i]._vecs[j]);
      if((res.length + _vecIdStart) < _reservedKeys) {
        Future f = _vg.tryReturnKeys(_vecIdStart + _reservedKeys, _vecIdStart + res.length);
        if (f != null) try { f.get(); } catch (InterruptedException e) { } catch (ExecutionException e) {}
      }
      _vecs = res;
      Log.trace("Finished file parse cleanup.");
    }
    private AppendableVec[] vecs(){ return _vecs; }

    @Override public void setupLocal() {
      _dout = new FVecParseWriter[_keys.length];
    }

    // Fetch out the node-local Categorical[] using _eKey and _enums hashtable
    private static Categorical[] enums(Key eKey, int ncols) {
      Categorical[] enums = _enums.get(eKey);
      if( enums != null ) return enums;
      enums = new Categorical[ncols];
      for( int i = 0; i < enums.length; ++i ) enums[i] = new Categorical();
      _enums.putIfAbsent(eKey, enums);
      return _enums.get(eKey); // Re-get in case we lost the insertion race
    }

    // Flag all chunk enums as being on local (self)
    private void chunksAreLocal( Vec vec, int chunkStartIdx, Key key ) {
      for(int i = 0; i < vec.nChunks(); ++i)
        _chunk2ParseNodeMap[chunkStartIdx + i] = H2O.SELF.index();
      // For Big Data, must delete data as eagerly as possible.
      Iced ice = DKV.get(key).get();
      if( ice==vec ) {
        if(_deleteOnDone) vec.remove();
      } else {
        Frame fr = (Frame)ice;
        if(_deleteOnDone) fr.delete(_jobKey,new Futures()).blockForPending();
        else if( fr._key != null ) fr.unlock(_jobKey);
      }
    }

    private FVecParseWriter makeDout(ParseSetup localSetup, int chunkOff, int nchunks) {
      AppendableVec [] avs = new AppendableVec[localSetup._number_columns];
      long [] espc = MemoryManager.malloc8(nchunks);
      for(int i = 0; i < avs.length; ++i)
        avs[i] = new AppendableVec(_vg.vecKey(i + _vecIdStart), espc, chunkOff);
      return localSetup._parse_type == ParserType.SVMLight
        ?new SVMLightFVecParseWriter(_vg, _vecIdStart,chunkOff, _parseSetup._chunk_size, avs)
        :new FVecParseWriter(_vg, chunkOff, enums(_eKey,localSetup._number_columns), localSetup._column_types, _parseSetup._chunk_size, avs);
    }

    // Called once per file
    @Override public void map( Key key ) {
      if (((Job)DKV.getGet(_jobKey)).isCancelledOrCrashed()) return;
      ParseSetup localSetup = new ParseSetup(_parseSetup);
      ByteVec vec = getByteVec(key);
      final int chunkStartIdx = _fileChunkOffsets[_lo];
      Log.trace("Begin a map stage of a file parse with start index " + chunkStartIdx + ".");

      byte[] zips = vec.getFirstBytes();
      ZipUtil.Compression cpr = ZipUtil.guessCompressionMethod(zips);

      if (localSetup._check_header == ParseSetup.HAS_HEADER) //check for header on local file
        localSetup._check_header = localSetup.parser(_jobKey).fileHasHeader(ZipUtil.unzipBytes(zips,cpr, localSetup._chunk_size), localSetup);

      // Parse the file
      try {
        switch( cpr ) {
        case NONE:
          if( _parseSetup._parse_type._parallelParseSupported ) {
            DistributedParse dp = new DistributedParse(_vg, localSetup, _vecIdStart, chunkStartIdx, this, key, vec.nChunks());
            addToPendingCount(1);
            dp.setCompleter(this);
            dp.asyncExec(vec);
            for( int i = 0; i < vec.nChunks(); ++i )
              _chunk2ParseNodeMap[chunkStartIdx + i] = vec.chunkKey(i).home_node().index();
          } else {
            InputStream bvs = vec.openStream(_jobKey);
            _dout[_lo] = streamParse(bvs, localSetup, makeDout(localSetup,chunkStartIdx,vec.nChunks()), bvs);
            chunksAreLocal(vec,chunkStartIdx,key);
          }
          break;
        case ZIP: {
          // Zipped file; no parallel decompression;
          InputStream bvs = vec.openStream(_jobKey);
          ZipInputStream zis = new ZipInputStream(bvs);
          ZipEntry ze = zis.getNextEntry(); // Get the *FIRST* entry
          // There is at least one entry in zip file and it is not a directory.
          if( ze != null && !ze.isDirectory() ) {
            _dout[_lo] = streamParse(zis,localSetup,makeDout(localSetup,chunkStartIdx,vec.nChunks()), bvs);
            // check for more files in archive
            ZipEntry ze2 = zis.getNextEntry();
            if( ze2 != null && !ze2.isDirectory() )
              Log.warn("Only single file zip archives are currently supported, only file: "+ze.getName()+" has been parsed.  Remaining files have been ignored.");
          } else zis.close();       // Confused: which zipped file to decompress
          chunksAreLocal(vec,chunkStartIdx,key);
          break;
        }
        case GZIP: {
          InputStream bvs = vec.openStream(_jobKey);
          // Zipped file; no parallel decompression;
          _dout[_lo] = streamParse(new GZIPInputStream(bvs),localSetup,makeDout(localSetup,chunkStartIdx,vec.nChunks()),bvs);
          // set this node as the one which processed all the chunks
          chunksAreLocal(vec,chunkStartIdx,key);
          break;
        }
        }
        Log.trace("Finished a map stage of a file parse with start index "+chunkStartIdx+".");
      } catch( IOException ioe ) {
        throw new RuntimeException(ioe);
      } catch (H2OParseException pe) {
        throw new H2OParseException(key,pe);
      }
    }

    // Reduce: combine errors from across files.
    // Roll-up other meta data
    @Override public void reduce( MultiFileParseTask mfpt ) {
      assert this != mfpt;
      Log.trace("Begin a reduce stage of a file parse.");

      // Collect & combine columns across files
      if( _dout == null ) _dout = mfpt._dout;
      else if(_dout != mfpt._dout) _dout = ArrayUtils.append(_dout,mfpt._dout);
      if( _chunk2ParseNodeMap == null ) _chunk2ParseNodeMap = mfpt._chunk2ParseNodeMap;
      else if(_chunk2ParseNodeMap != mfpt._chunk2ParseNodeMap) { // we're sharing global array!
        for( int i = 0; i < _chunk2ParseNodeMap.length; ++i ) {
          if( _chunk2ParseNodeMap[i] == -1 ) _chunk2ParseNodeMap[i] = mfpt._chunk2ParseNodeMap[i];
          else assert mfpt._chunk2ParseNodeMap[i] == -1 : Arrays.toString(_chunk2ParseNodeMap) + " :: " + Arrays.toString(mfpt._chunk2ParseNodeMap);
        }
      }
      _errors = ArrayUtils.append(_errors,mfpt._errors);
      Log.trace("Finished a reduce stage of a file parse.");
    }

    // ------------------------------------------------------------------------
    // Zipped file; no parallel decompression; decompress into local chunks,
    // parse local chunks; distribute chunks later.
    private FVecParseWriter streamParse( final InputStream is, final ParseSetup localSetup, FVecParseWriter dout, InputStream bvs) throws IOException {
      // All output into a fresh pile of NewChunks, one per column
      Parser p = localSetup.parser(_jobKey);
      // assume 2x inflation rate
      if( localSetup._parse_type._parallelParseSupported ) p.streamParseZip(is, dout, bvs);
      else                                            p.streamParse   (is, dout);
      // Parse all internal "chunks", until we drain the zip-stream dry.  Not
      // real chunks, just flipping between 32K buffers.  Fills up the single
      // very large NewChunk.
      dout.close(_fs);
      return dout;
    }

    // ------------------------------------------------------------------------
    private static class DistributedParse extends MRTask<DistributedParse> {
      private final ParseSetup _setup;
      private final int _vecIdStart;
      private final int _startChunkIdx; // for multifile parse, offset of the first chunk in the final dataset
      private final VectorGroup _vg;
      private FVecParseWriter _dout;
      private final Key _eKey;  // Parse-local-Enums key
      private final Key _jobKey;
      private transient final MultiFileParseTask _outerMFPT;
      private transient final Key _srckey; // Source/text file to delete on done
      private transient NonBlockingSetInt _visited;
      private transient long [] _espc;
      final int _nchunks;

      DistributedParse(VectorGroup vg, ParseSetup setup, int vecIdstart, int startChunkIdx, MultiFileParseTask mfpt, Key srckey, int nchunks) {
        super(mfpt);
        _vg = vg;
        _setup = setup;
        _vecIdStart = vecIdstart;
        _startChunkIdx = startChunkIdx;
        _outerMFPT = mfpt;
        _eKey = mfpt._eKey;
        _jobKey = mfpt._jobKey;
        _srckey = srckey;
        _nchunks = nchunks;
      }
      @Override public void setupLocal(){
        super.setupLocal();
        _visited = new NonBlockingSetInt();
        _espc = MemoryManager.malloc8(_nchunks);
      }
      @Override public void map( Chunk in ) {
        if (((Job)DKV.getGet(_jobKey)).isCancelledOrCrashed()) return;
        AppendableVec [] avs = new AppendableVec[_setup._number_columns];
        for(int i = 0; i < avs.length; ++i)
          avs[i] = new AppendableVec(_vg.vecKey(_vecIdStart + i), _espc, _startChunkIdx);
        // Break out the input & output vectors before the parse loop
        FVecParseReader din = new FVecParseReader(in);
        FVecParseWriter dout;
        Parser p;
        switch(_setup._parse_type) {
          case ARFF:
          case CSV:
            Categorical [] enums = enums(_eKey,_setup._number_columns);
            p = new CsvParser(_setup, _jobKey);
            dout = new FVecParseWriter(_vg,_startChunkIdx + in.cidx(), enums, _setup._column_types, _setup._chunk_size, avs); //TODO: use _setup._domains instead of enums
          break;
        case SVMLight:
          p = new SVMLightParser(_setup, _jobKey);
          dout = new SVMLightFVecParseWriter(_vg, _vecIdStart, in.cidx() + _startChunkIdx, _setup._chunk_size, avs);
          break;
        default:
          throw H2O.unimpl();
        }
        p.parseChunk(in.cidx(), din, dout);
        (_dout = dout).close(_fs);
        Job.update(in._len, _jobKey); // Record bytes parsed

        // remove parsed data right away
        freeMem(in);
      }

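      // Illustrative example for freeMem() below (not from the original source): chunk 5 is
      // marked once by the task that parsed chunk 5 and once by the task that parsed chunk 4
      // (which typically reads into chunk 5 to finish its last row); whichever mark arrives
      // second frees the chunk, provided its bytes are already persisted.
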
      /**
       * This marks parsed byteVec chunks as ready to be freed. If this is the second
       * time a chunk has been marked, it is freed. The reason two marks are required
       * is that each chunk parse typically needs to read the remaining bytes of the
       * current row from the next chunk.  Thus each task typically touches two chunks.
       *
       * @param in - chunk to be marked and possibly freed
       */
      private void freeMem(Chunk in) {
        int cidx = in.cidx();
        for(int i=0; i < 2; i++) {  // iterate over this chunk and the next one
          cidx += i;
          if (!_visited.add(cidx)) { // Second visit
            Value v = H2O.get(in.vec().chunkKey(cidx));
            if (v == null || !v.isPersisted()) return; // Not found, or not on disk somewhere
            v.freePOJO();           // Eagerly toss from memory
            v.freeMem();
          }
        }
      }
      @Override public void reduce(DistributedParse dp) { _dout.reduce(dp._dout); }

      @Override public void postGlobal() {
        super.postGlobal();
        _outerMFPT._dout[_outerMFPT._lo] = _dout;
        _dout = null;           // Reclaim GC eagerly
        // For Big Data, must delete data as eagerly as possible.
        Value val = DKV.get(_srckey);
        if( val == null ) return;
        Iced ice = val.get();
        if( ice instanceof ByteVec ) {
          if( _outerMFPT._deleteOnDone) ((ByteVec)ice).remove();
        } else {
          Frame fr = (Frame)ice;
          if( _outerMFPT._deleteOnDone) fr.delete(_outerMFPT._jobKey,new Futures()).blockForPending();
          else if( fr._key != null ) fr.unlock(_outerMFPT._jobKey);
        }
      }
    }

    // Find & remove all partially built output chunks & vecs
    private Futures onExceptionCleanup(Futures fs) {
      int nchunks = _chunk2ParseNodeMap.length;
      int ncols = _parseSetup._number_columns;
      for( int i = 0; i < ncols; ++i ) {
        Key vkey = _vg.vecKey(_vecIdStart + i);
        Keyed.remove(vkey,fs);
        for( int c = 0; c < nchunks; ++c )
          DKV.remove(Vec.chunkKey(vkey,c),fs);
      }
      cancel(true);
      return fs;
    }
  }

  // ------------------------------------------------------------------------
  // Log information about the dataset we just parsed.
  private static void logParseResults(ParseDataset job, Frame fr) {
    long numRows = fr.anyVec().length();
    Log.info("Parse result for " + job.dest() + " (" + Long.toString(numRows) + " rows):");
    // get all rollups started in parallel, otherwise this takes ages!
    Futures fs = new Futures();
    Vec[] vecArr = fr.vecs();
    for(Vec v:vecArr)  v.startRollupStats(fs);
    fs.blockForPending();

    int namelen = 0;
    for (String s : fr.names()) namelen = Math.max(namelen, s.length());
    String format = " %"+namelen+"s %7s %12.12s %12.12s %11s %8s %6s";
    Log.info(String.format(format, "ColV2", "type", "min", "max", "NAs", "constant", "cardinality"));
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

    for( int i = 0; i < vecArr.length; i++ ) {
      Vec v = vecArr[i];
      boolean isCategorical = v.isEnum();
      boolean isConstant = v.isConst();
      String CStr = String.format("%"+namelen+"s:", fr.names()[i]);
      String typeStr;
      String minStr;
      String maxStr;

      switch( v.get_type() ) {
        case Vec.T_BAD:   typeStr = "all_NA" ;  minStr = "";  maxStr = "";  break;
        case Vec.T_UUID:  typeStr = "UUID"   ;  minStr = "";  maxStr = "";  break;
        case Vec.T_STR :  typeStr = "string" ;  minStr = "";  maxStr = "";  break;
        case Vec.T_NUM :  typeStr = "numeric";  minStr = String.format("%g", v.min());  maxStr = String.format("%g", v.max());  break;
        case Vec.T_ENUM:  typeStr = "factor" ;  minStr = v.factor(0);  maxStr = v.factor(v.cardinality()-1); break;
        case Vec.T_TIME:  typeStr = "time"   ;  minStr = sdf.format(v.min());  maxStr = sdf.format(v.max());  break;
        default: throw H2O.unimpl();
      }

      long numNAs = v.naCnt();
      String naStr = (numNAs > 0) ? String.format("%d", numNAs) : "";
      String isConstantStr = isConstant ? "constant" : "";
      String numLevelsStr = isCategorical ? String.format("%d", v.domain().length) : "";

      boolean printLogSeparatorToStdout = false;
      boolean printColumnToStdout;
      {
        // Print information to stdout for this many leading columns.
        final int MAX_HEAD_TO_PRINT_ON_STDOUT = 10;

        // Print information to stdout for this many trailing columns.
        final int MAX_TAIL_TO_PRINT_ON_STDOUT = 10;

        if (vecArr.length <= (MAX_HEAD_TO_PRINT_ON_STDOUT + MAX_TAIL_TO_PRINT_ON_STDOUT)) {
          // For small numbers of columns, print them all.
          printColumnToStdout = true;
        } else if (i < MAX_HEAD_TO_PRINT_ON_STDOUT) {
          printColumnToStdout = true;
        } else if (i == MAX_HEAD_TO_PRINT_ON_STDOUT) {
          printLogSeparatorToStdout = true;
          printColumnToStdout = false;
        } else if ((i + MAX_TAIL_TO_PRINT_ON_STDOUT) < vecArr.length) {
          printColumnToStdout = false;
        } else {
          printColumnToStdout = true;
        }
      }

      if (printLogSeparatorToStdout)
        Log.info("Additional column information only sent to log file...");

      String s = String.format(format, CStr, typeStr, minStr, maxStr, naStr, isConstantStr, numLevelsStr);
      Log.info(s,printColumnToStdout);
    }
    Log.info(FrameUtils.chunkSummary(fr).toString());
  }
}



