All Downloads are FREE. Search and download functionalities are using the official Maven repository.

hex.grep.Grep Maven / Gradle / Ivy

package hex.grep;

import hex.ModelBuilder;
import hex.ModelCategory;
import water.*;
import water.fvec.ByteVec;
import water.fvec.Chunk;
import water.fvec.Vec;
import water.util.Log;

import java.util.Arrays;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

/**
 *  Grep model builder... building a trivial GrepModel
 */
public class Grep extends ModelBuilder {
  /** Creates a Grep builder for the given parameters and immediately runs
   *  the cheap validation pass, {@code init(false)}. */
  public Grep( GrepModel.GrepParameters parms ) {
    super(parms);
    init(false);
  }
  /** Supplies a fresh driver that performs the actual grep. */
  @Override protected GrepDriver trainModelImpl() {
    final GrepDriver driver = new GrepDriver();
    return driver;
  }
  /** Reports the model categories of this builder: only {@code Unknown}. */
  @Override public ModelCategory[] can_build() {
    final ModelCategory[] categories = { ModelCategory.Unknown };
    return categories;
  }
  /** This builder is flagged as experimental. */
  @Override public BuilderVisibility builderVisibility() {
    return BuilderVisibility.Experimental;
  }
  /** Grep is not a supervised algorithm. */
  @Override public boolean isSupervised() {
    return false;
  }

  // Compiled form of _parms._regex.  Assigned by init() when the regex
  // compiles; starts out null.
  Pattern _pattern = null;
  /** Initialize the ModelBuilder, validating all arguments and preparing the
   *  training frame.  This call is expected to be overridden in the subclasses
   *  and each subclass will start with "super.init();".  This call is made
   *  by the front-end whenever the GUI is clicked, and needs to be fast;
   *  heavy-weight prep needs to wait for the trainModel() call.
   *
   *  Validates the regex: a missing regex or one that fails to compile is
   *  reported as an error on the "_regex" field; a regex that compiles is
   *  cached in {@code _pattern}.  If a training frame is set, it must consist
   *  of exactly one Vec and that Vec must be a ByteVec (raw text); otherwise a
   *  single error is reported on the "_train" field.
   *
   *  @param expensive passed straight through to {@code super.init} */
  @Override public void init(boolean expensive) {
    super.init(expensive);
    if( _parms._regex == null ) {
      error("_regex", "regex is missing");
    } else {
      try { _pattern = Pattern.compile(_parms._regex); }
      // Report under the same field name as the "missing" case above
      catch( PatternSyntaxException pse ) { error("_regex", pse.getMessage()); }
    }
    if( _parms._train == null ) return;
    Vec[] vecs = _parms.train().vecs();
    // Short-circuit: vecs[0] is only inspected when there is exactly one Vec,
    // so an empty frame yields a validation error instead of an
    // ArrayIndexOutOfBoundsException, and the message is reported once.
    if( vecs.length != 1 || !(vecs[0] instanceof ByteVec) )
      error("_train","Frame must contain exactly 1 Vec (of raw text)");
  }

  // ----------------------
  /** Driver that runs the grep over the training text and stores the
   *  matches and their offsets in a GrepModel. */
  private class GrepDriver extends Driver {

    /** Re-validates with {@code init(true)}, creates and locks the
     *  destination model, runs the GrepGrep task over the first Vec of the
     *  training frame, copies the results into the model output and logs
     *  them.  The model, if it was created, is unlocked on the way out
     *  whether or not the run succeeded. */
    @Override public void computeImpl() {
      GrepModel grepModel = null;
      try {
        init(true);

        // Create the destination model and lock it for this job
        grepModel = new GrepModel(dest(), _parms, new GrepModel.GrepOutput(Grep.this));
        grepModel.delete_and_lock(_job);

        // Scan the text
        final GrepGrep scan = new GrepGrep(_pattern).doAll(train().vecs()[0]);

        // Keep only the first _cnt entries of each result array
        final int found = scan._cnt;
        grepModel._output._matches = Arrays.copyOf(scan._matches, found);
        grepModel._output._offsets = Arrays.copyOf(scan._offsets, found);

        // Log both result arrays, one per line
        final StringBuilder report = new StringBuilder("Grep: \n");
        report.append(Arrays.toString(grepModel._output._matches));
        report.append("\n");
        report.append(Arrays.toString(grepModel._output._offsets));
        report.append("\n");
        Log.info(report);
      } finally {
        if( grepModel != null ) {
          grepModel.unlock(_job);
        }
      }
    }
  }

  /** Read-only CharSequence view over the raw bytes of one chunk followed by
   *  the bytes of an optional second chunk, so a regex can be matched across
   *  the boundary between the two.  Each byte is exposed as one char. */
  private class ByteSeq implements CharSequence {
    private final byte _bs0[];    // Bytes of the first chunk
    private final byte _bs1[];    // Bytes of the second chunk, or null if there is none
    ByteSeq( Chunk chk0, Chunk chk1 ) { _bs0 = chk0.getBytes(); _bs1 = chk1==null ? null : chk1.getBytes(); }

    /** Returns the byte at {@code idx} as a char in the range 0..255.
     *  Indices below the first array's length read the first array, the rest
     *  read the second. */
    @Override public char charAt(int idx ) {
      final byte b = idx < _bs0.length ? _bs0[idx] : _bs1[idx-_bs0.length];
      // Mask to treat the byte as unsigned; a plain (char) cast sign-extends
      // bytes >= 0x80 into 0xFF80..0xFFFF.
      return (char)(b & 0xFF);
    }
    /** Combined length of both byte arrays. */
    @Override public int length( ) { return _bs0.length+(_bs1==null?0:_bs1.length); }
    @Override public ByteSeq subSequence( int start, int end ) { throw H2O.unimpl(); }
    @Override public String toString() { throw H2O.unimpl(); }

    /** Decodes bytes {@code [s,e)} of the combined sequence into a String,
     *  using the same String(byte[]...) decoding for every case.  The range
     *  may lie in the first array, in the second, or straddle the boundary
     *  between them.
     *  @param s start index, inclusive
     *  @param e end index, exclusive
     *  @return the decoded text */
    String str( int s, int e ) {
      final int len0 = _bs0.length;
      if( e <= len0 ) return new String(_bs0,s,e-s);          // Entirely in the first array
      if( s >= len0 ) return new String(_bs1,s-len0,e-s);     // Entirely in the second array
      // Straddles the boundary: stitch the tail of the first array to the
      // head of the second before decoding.
      final byte[] buf = new byte[e-s];
      final int head = len0-s;
      System.arraycopy(_bs0,s,buf,0,head);
      System.arraycopy(_bs1,0,buf,head,buf.length-head);
      return new String(buf);
    }
  }


  private class GrepGrep extends MRTask {
    // Regex to search for; supplied through the constructor
    private final Pattern _pattern;
    // Outputs, hopefully not too big for one machine!
    // _matches holds the matched text and _offsets the matching position
    // (chunk start plus match start, see map()).  The driver copies the
    // first _cnt entries of each array into the model output.
    // NOTE(review): _cnt is presumably maintained by add(), which is not
    // visible here - confirm.
    String[] _matches;
    long  [] _offsets;
    int _cnt;

    // Remember the compiled pattern to run over the chunks
    GrepGrep( Pattern pattern ) { _pattern = pattern; }
    /** Greps one chunk.  The chunk is viewed together with its following
     *  chunk (which may be null) so that a match beginning in this chunk can
     *  be followed past the chunk's end; matches that begin at or beyond the
     *  end of this chunk's own bytes stop the scan.  Every accepted match is
     *  recorded with {@code add} as its text plus its offset (chunk start +
     *  match start). */
    @Override public void map( Chunk chk ) {
      // Result holders; will lazy expand
      _offsets = new long  [1];
      _matches = new String[1];
      // We already checked that this is an instance of a ByteVec, which means
      // all the Chunks contain raw text as byte arrays.
      final ByteSeq text = new ByteSeq(chk,chk.nextChunk());
      final int ownLen = text._bs0.length;
      final Matcher matcher = _pattern.matcher(text);
      while( matcher.find() ) {
        final int from = matcher.start();
        if( from >= ownLen ) break;  // Match starts outside this chunk's own bytes
        add(text.str(from,matcher.end()),chk.start()+from);
      }
      // Whole chunk of work, done all at once
      _job.update(chk._len);
    }
    @Override public void reduce( GrepGrep gg1 ) {
      GrepGrep gg0 = this;
      if( gg0._cnt < gg1._cnt ) { gg0 = gg1; gg1 = this; } // Larger result on left
      for( int i=0; i




© 2015 - 2025 Weber Informatics LLC | Privacy Policy