All downloads are free. The search and download functionality uses the official Maven repository.

org.apache.mahout.h2obindings.H2OHdfs Maven / Gradle / Ivy

There is a newer version: 0.13.0
Show newest version
/*
 *  Licensed to the Apache Software Foundation (ASF) under one or more
 *  contributor license agreements.  See the NOTICE file distributed with
 *  this work for additional information regarding copyright ownership.
 *  The ASF licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package org.apache.mahout.h2obindings;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.mahout.h2obindings.drm.H2ODrm;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.SequentialAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import water.Futures;
import water.fvec.Frame;
import water.fvec.Vec;
import water.parser.ValueString;
import water.util.FrameUtils;

import java.io.File;
import java.io.IOException;
import java.net.URI;

/**
 * SequenceFile I/O class (on HDFS)
 */
public class H2OHdfs {
  /**
   * Predicate to check if a given filename is a SequenceFile.
   *
   * Reads the first three bytes of the file and compares them with the
   * characters 'S', 'E', 'Q'.
   *
   * @param filename Name of the file to check.
   * @return True if the first three bytes are "SEQ". False if they differ, if
   *         the file is shorter than three bytes, or if opening or reading
   *         the file fails with an IOException.
   */
  public static boolean isSeqfile(String filename) {
    FSDataInputStream fin = null;
    try {
      Configuration conf = new Configuration();
      Path path = new Path(filename);
      FileSystem fs = FileSystem.get(URI.create(filename), conf);
      byte[] magic = new byte[3];

      fin = fs.open(path);
      // readFully() either fills the whole buffer or throws (EOFException on
      // a file that is too short); a plain read() may return fewer bytes.
      fin.readFully(magic);

      return magic[0] == 'S' && magic[1] == 'E' && magic[2] == 'Q';
    } catch (IOException e) {
      // Any I/O failure is reported as "not a SequenceFile".
      return false;
    } finally {
      // Null-safe; guarantees the stream is released even when reading fails.
      IOUtils.closeStream(fin);
    }
  }

  /**
   * Create DRM from file.
   *
   * If {@link #isSeqfile(String)} reports the file as a SequenceFile, the DRM
   * is built by {@link #drmFromSeqfile(String, int)}. Otherwise the name is
   * wrapped in a {@link File} and handed to FrameUtils.parseFrame.
   *
   * @param filename Name of the file to load.
   * @param parMin Minimum number of data partitions in the DRM. Only passed on
   *               to the SequenceFile path.
   * @return DRM object created, or null if an IOException is raised while
   *         loading (drmFromSeqfile may also return null, see there).
   */
  public static H2ODrm drmFromFile(String filename, int parMin) {
    try {
      if (isSeqfile(filename)) {
        return drmFromSeqfile(filename, parMin);
      }
      return new H2ODrm(FrameUtils.parseFrame(null, new File(filename)));
    } catch (IOException e) {
      return null;
    }
  }

  /**
   * Create DRM from a SequenceFile whose values are VectorWritable.
   *
   * Called from drmFromFile once the format has been verified. The file is
   * read twice: the first pass determines the number of rows and columns, the
   * second pass copies the vector elements into the frame.
   *
   * Row placement depends on the key class:
   * Text keys are stored in record order and the key text is kept as the
   * row label; LongWritable keys are used directly as the row index; every
   * other key class is treated as IntWritable and used as the row index.
   *
   * The number of columns is taken from the first vector that reports a
   * non-zero size.
   *
   * @param filename Name of the sequence file.
   * @param parMin Minimum number of data partitions in the DRM.
   * @return DRM object created, or null if the value class is not
   *         VectorWritable or an IOException is raised while reading.
   */
  public static H2ODrm drmFromSeqfile(String filename, int parMin) {
    long rows = 0;
    int cols = 0;
    Frame frame = null;
    Vec labels = null;

    SequenceFile.Reader reader = null;
    try {
      Configuration conf = new Configuration();
      Path path = new Path(filename);
      FileSystem fs = FileSystem.get(URI.create(filename), conf);

      reader = new SequenceFile.Reader(fs, path, conf);

      if (reader.getValueClass() != VectorWritable.class) {
        System.out.println("ValueClass in file " + filename +
                           " must be VectorWritable, but found " +
                           reader.getValueClassName());
        return null;
      }

      Class<?> keyClass = reader.getKeyClass();
      boolean isStringKey = keyClass == Text.class;
      boolean isLongKey = keyClass == LongWritable.class;
      // NOTE(review): anything that is neither Text nor LongWritable is
      // assumed to be IntWritable; another key class fails with a
      // ClassCastException on the first record read below.
      boolean isIntKey = !isStringKey && !isLongKey;

      Writable key = (Writable) ReflectionUtils.newInstance(keyClass, conf);
      VectorWritable value = (VectorWritable)
        ReflectionUtils.newInstance(reader.getValueClass(), conf);

      // Remember where the records start so the file can be rewound.
      long start = reader.getPosition();

      // Pass 1: determine the frame dimensions.
      while (reader.next(key, value)) {
        if (cols == 0) {
          cols = Math.max(value.get().size(), cols);
        }
        if (isStringKey) {
          rows++;
        } else if (isLongKey) {
          rows = Math.max(((LongWritable) key).get() + 1, rows);
        } else {
          // Widen before adding one so a key of Integer.MAX_VALUE does not
          // wrap around to a negative row count.
          rows = Math.max(((IntWritable) key).get() + 1L, rows);
        }
      }
      reader.seek(start);

      frame = H2OHelper.emptyFrame(rows, cols, parMin, -1);
      Vec[] vecs = frame.vecs();
      Vec.Writer[] writers = new Vec.Writer[cols];
      for (int i = 0; i < writers.length; i++) {
        writers[i] = vecs[i].open();
      }

      Vec.Writer labelwriter = null;
      if (isStringKey) {
        labels = H2OHelper.makeEmptyStrVec(frame.anyVec());
        labelwriter = labels.open();
      }

      // Pass 2: copy every vector element into its column writer.
      long r = 0;
      while (reader.next(key, value)) {
        Vector v = value.get();
        if (isLongKey) {
          r = ((LongWritable) key).get();
        } else if (isIntKey) {
          r = ((IntWritable) key).get();
        }
        int size = v.size();
        for (int c = 0; c < size; c++) {
          writers[c].set(r, v.getQuick(c));
        }
        if (isStringKey) {
          // Text keys carry no index: label the row, then advance in
          // record order.
          labelwriter.set(r, key.toString());
          r++;
        }
      }

      Futures fus = new Futures();
      for (Vec.Writer w : writers) {
        w.close(fus);
      }
      if (labelwriter != null) {
        labelwriter.close(fus);
      }
      fus.blockForPending();
    } catch (java.io.IOException e) {
      return null;
    } finally {
      IOUtils.closeStream(reader);
    }
    return new H2ODrm(frame, labels);
  }

  /**
   * Create SequenceFile on HDFS from DRM object.
   *
   * Each row of the frame is written as one VectorWritable record. When the
   * DRM has a key (label) vector the record key is the row's label as Text;
   * otherwise it is the row index as IntWritable. Rows are materialised as
   * SequentialAccessSparseVector when H2OHelper.isSparse reports the frame as
   * sparse, and as DenseVector otherwise.
   *
   * @param filename Filename to create and store DRM data in.
   * @param drm DRM object storing Matrix data in memory.
   * @throws java.io.IOException if creating, writing or closing the
   *         SequenceFile fails.
   */
  public static void drmToFile(String filename, H2ODrm drm) throws java.io.IOException {
    Frame frame = drm.frame;
    Vec labels = drm.keys;
    Configuration conf = new Configuration();
    Path path = new Path(filename);
    FileSystem fs = FileSystem.get(URI.create(filename), conf);
    SequenceFile.Writer writer;
    boolean isSparse = H2OHelper.isSparse(frame);
    ValueString vstr = new ValueString();

    if (labels != null) {
      writer = SequenceFile.createWriter(fs, conf, path, Text.class, VectorWritable.class);
    } else {
      writer = SequenceFile.createWriter(fs, conf, path, IntWritable.class, VectorWritable.class);
    }

    boolean finished = false;
    try {
      // Loop-invariant frame properties, looked up once.
      int numCols = frame.numCols();
      Vec[] vecs = frame.vecs();
      long numRows = frame.anyVec().length();

      for (long r = 0; r < numRows; r++) {
        Vector v;
        if (isSparse) {
          v = new SequentialAccessSparseVector(numCols);
        } else {
          v = new DenseVector(numCols);
        }

        for (int c = 0; c < numCols; c++) {
          v.setQuick(c, vecs[c].at(r));
        }

        if (labels != null) {
          writer.append(new Text(labels.atStr(vstr, r).toString()), new VectorWritable(v));
        } else {
          // The row index is narrowed to int here because the key class is
          // IntWritable.
          writer.append(new IntWritable((int) r), new VectorWritable(v));
        }
      }
      finished = true;
    } finally {
      if (finished) {
        // Normal completion: let a failing close() reach the caller.
        writer.close();
      } else {
        // A write already failed: release the writer without masking the
        // original exception.
        IOUtils.closeStream(writer);
      }
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy