
// org.apache.mahout.h2obindings.H2OHdfs Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.mahout.h2obindings;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.mahout.h2obindings.drm.H2ODrm;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.SequentialAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import water.Futures;
import water.fvec.Frame;
import water.fvec.Vec;
import water.parser.ValueString;
import water.util.FrameUtils;
import java.io.File;
import java.io.IOException;
import java.net.URI;
/**
* SequenceFile I/O class (on HDFS)
*/
public class H2OHdfs {
/**
* Predicate to check if a given filename is a SequenceFile.
*
* Inspect the first three bytes to determine the format of the file.
*
* @param filename Name of the file to check.
* @return True if file is of SequenceFile format.
*/
public static boolean isSeqfile(String filename) {
try {
Configuration conf = new Configuration();
Path path = new Path(filename);
FileSystem fs = FileSystem.get(URI.create(filename), conf);
FSDataInputStream fin = fs.open(path);
byte seq[] = new byte[3];
fin.read(seq);
fin.close();
return seq[0] == 'S' && seq[1] == 'E' && seq[2] == 'Q';
} catch (IOException e) {
return false;
}
}
/**
* Create DRM from SequenceFile.
*
* Create a Mahout DRM backed on H2O from the specified SequenceFile.
*
* @param filename Name of the sequence file.
* @param parMin Minimum number of data partitions in the DRM.
* @return DRM object created.
*/
public static H2ODrm drmFromFile(String filename, int parMin) {
try {
if (isSeqfile(filename)) {
return drmFromSeqfile(filename, parMin);
} else {
return new H2ODrm(FrameUtils.parseFrame(null,new File(filename)));
}
} catch (IOException e) {
return null;
}
}
/**
* Internal method called from drmFromFile
if format verified.
*/
public static H2ODrm drmFromSeqfile(String filename, int parMin) {
long rows = 0;
int cols = 0;
Frame frame = null;
Vec labels = null;
SequenceFile.Reader reader = null;
try {
Configuration conf = new Configuration();
Path path = new Path(filename);
FileSystem fs = FileSystem.get(URI.create(filename), conf);
Vec.Writer writers[];
Vec.Writer labelwriter = null;
boolean isIntKey = false, isLongKey = false, isStringKey = false;
reader = new SequenceFile.Reader(fs, path, conf);
if (reader.getValueClass() != VectorWritable.class) {
System.out.println("ValueClass in file " + filename +
"must be VectorWritable, but found " +
reader.getValueClassName());
return null;
}
Writable key = (Writable)
ReflectionUtils.newInstance(reader.getKeyClass(), conf);
VectorWritable value = (VectorWritable)
ReflectionUtils.newInstance(reader.getValueClass(), conf);
long start = reader.getPosition();
if (reader.getKeyClass() == Text.class) {
isStringKey = true;
} else if (reader.getKeyClass() == LongWritable.class) {
isLongKey = true;
} else {
isIntKey = true;
}
while (reader.next(key, value)) {
if (cols == 0) {
Vector v = value.get();
cols = Math.max(v.size(), cols);
}
if (isLongKey) {
rows = Math.max(((LongWritable)(key)).get()+1, rows);
}
if (isIntKey) {
rows = Math.max(((IntWritable)(key)).get()+1, rows);
}
if (isStringKey) {
rows++;
}
}
reader.seek(start);
frame = H2OHelper.emptyFrame(rows, cols, parMin, -1);
writers = new Vec.Writer[cols];
for (int i = 0; i < writers.length; i++) {
writers[i] = frame.vecs()[i].open();
}
if (reader.getKeyClass() == Text.class) {
labels = H2OHelper.makeEmptyStrVec(frame.anyVec());
labelwriter = labels.open();
}
long r = 0;
while (reader.next(key, value)) {
Vector v = value.get();
if (isLongKey) {
r = ((LongWritable)(key)).get();
}
if (isIntKey) {
r = ((IntWritable)(key)).get();
}
for (int c = 0; c < v.size(); c++) {
writers[c].set(r, v.getQuick(c));
}
if (labels != null) {
labelwriter.set(r, (key).toString());
}
if (isStringKey) {
r++;
}
}
Futures fus = new Futures();
for (Vec.Writer w : writers) {
w.close(fus);
}
if (labelwriter != null) {
labelwriter.close(fus);
}
fus.blockForPending();
} catch (java.io.IOException e) {
return null;
} finally {
IOUtils.closeStream(reader);
}
return new H2ODrm(frame, labels);
}
/**
* Create SequenceFile on HDFS from DRM object.
*
* @param filename Filename to create and store DRM data in.
* @param drm DRM object storing Matrix data in memory.
*/
public static void drmToFile(String filename, H2ODrm drm) throws java.io.IOException {
Frame frame = drm.frame;
Vec labels = drm.keys;
Configuration conf = new Configuration();
Path path = new Path(filename);
FileSystem fs = FileSystem.get(URI.create(filename), conf);
SequenceFile.Writer writer;
boolean isSparse = H2OHelper.isSparse(frame);
ValueString vstr = new ValueString();
if (labels != null) {
writer = SequenceFile.createWriter(fs, conf, path, Text.class, VectorWritable.class);
} else {
writer = SequenceFile.createWriter(fs, conf, path, IntWritable.class, VectorWritable.class);
}
for (long r = 0; r < frame.anyVec().length(); r++) {
Vector v;
if (isSparse) {
v = new SequentialAccessSparseVector(frame.numCols());
} else {
v = new DenseVector(frame.numCols());
}
for (int c = 0; c < frame.numCols(); c++) {
v.setQuick(c, frame.vecs()[c].at(r));
}
if (labels != null) {
writer.append(new Text(labels.atStr(vstr, r).toString()), new VectorWritable(v));
} else {
writer.append(new IntWritable((int)r), new VectorWritable(v));
}
}
writer.close();
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy