org.apache.sysml.runtime.transform.BinAgent Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of systemml Show documentation
Show all versions of systemml Show documentation
Declarative Machine Learning
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.sysml.runtime.transform;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.nio.charset.CharacterCodingException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import org.apache.commons.lang.ArrayUtils;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.wink.json4j.JSONArray;
import org.apache.wink.json4j.JSONException;
import org.apache.wink.json4j.JSONObject;
import org.apache.sysml.lops.Lop;
import org.apache.sysml.runtime.io.IOUtilFunctions;
import org.apache.sysml.runtime.matrix.data.FrameBlock;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
import org.apache.sysml.runtime.matrix.data.Pair;
import org.apache.sysml.runtime.transform.MVImputeAgent.MVMethod;
import org.apache.sysml.runtime.transform.encode.Encoder;
import org.apache.sysml.runtime.transform.meta.TfMetaUtils;
import org.apache.sysml.runtime.util.UtilFunctions;
public class BinAgent extends Encoder
{
/** Serialization version identifier. */
private static final long serialVersionUID = 1917445005206076078L;
// Prefixes that tag the per-column min, max and bin-count strings which are
// emitted as map output and parsed back again when that output is merged.
public static final String MIN_PREFIX = "min";
public static final String MAX_PREFIX = "max";
public static final String NBINS_PREFIX = "nbins";
// number of bins per binned column (same index as the column list)
private int[] _numBins = null;
private double[] _min=null, _max=null; // min and max among non-missing values
private double[] _binWidths = null; // width of a bin for each attribute
//frame transform-apply attributes
// NOTE(review): not referenced in the visible part of this file; presumably
// per-column lower/upper bin boundaries -- confirm against the frame apply path.
private double[][] _binMins = null;
private double[][] _binMaxs = null;
/**
 * Creates a binning agent from a parsed transform specification. Delegates to
 * the four-argument constructor with {@code colsOnly} set to {@code false}.
 *
 * @param parsedSpec parsed transform specification
 * @param colnames   column names, forwarded to the delegate constructor
 * @param clen       number of columns, forwarded to the delegate constructor
 * @throws JSONException if the delegate constructor fails to read the specification
 * @throws IOException   if the delegate constructor fails
 */
public BinAgent(JSONObject parsedSpec, String[] colnames, int clen)
    throws JSONException, IOException
{
    // colsOnly=false: besides the column list, also set up bin counts and min/max buffers
    this(parsedSpec, colnames, clen, false);
}
/**
 * Creates a binning agent from a parsed transform specification.
 * <p>
 * If the specification has no binning entry, the agent is left without a
 * column list. With {@code colsOnly} only the list of binned column IDs is
 * initialized; otherwise the column list, the requested number of bins per
 * column and the min/max/bin-width buffers are set up.
 *
 * @param parsedSpec parsed transform specification
 * @param colnames   column names, used to resolve the binned columns when {@code colsOnly} is set
 * @param clen       number of columns, passed to the super constructor
 * @param colsOnly   if {@code true}, initialize the column list only
 * @throws JSONException if the binning entry cannot be read from the specification
 * @throws IOException   if the binned column IDs cannot be parsed
 */
public BinAgent(JSONObject parsedSpec, String[] colnames, int clen, boolean colsOnly)
    throws JSONException, IOException
{
    super( null, clen );
    if ( !parsedSpec.containsKey(TfUtils.TXMETHOD_BIN) )
        return;

    if( colsOnly ) {
        // typed list: toArray(new Integer[0]) must yield Integer[] for ArrayUtils.toPrimitive
        List<Integer> collist = TfMetaUtils.parseBinningColIDs(parsedSpec, colnames);
        initColList(ArrayUtils.toPrimitive(collist.toArray(new Integer[0])));
    }
    else
    {
        JSONObject obj = (JSONObject) parsedSpec.get(TfUtils.TXMETHOD_BIN);
        JSONArray attrs = (JSONArray) obj.get(TfUtils.JSON_ATTRS);
        JSONArray nbins = (JSONArray) obj.get(TfUtils.JSON_NBINS);
        initColList(attrs);

        _numBins = new int[attrs.size()];
        for(int i=0; i < _numBins.length; i++)
            _numBins[i] = UtilFunctions.toInt(nbins.get(i));

        // initialize internal transformation metadata; min/max start at the
        // extreme values so that the first observed value replaces them
        _min = new double[_colList.length];
        Arrays.fill(_min, Double.MAX_VALUE);
        _max = new double[_colList.length];
        Arrays.fill(_max, -Double.MAX_VALUE);
        _binWidths = new double[_colList.length];
    }
}
/**
 * Returns the number of bins per binned column.
 *
 * @return the internal array (not a copy); {@code null} if it was never initialized
 */
public int[] getNumBins() {
    return _numBins;
}

/**
 * Returns the minimum value per binned column.
 *
 * @return the internal array (not a copy); {@code null} if it was never initialized
 */
public double[] getMin() {
    return _min;
}

/**
 * Returns the bin width per binned column.
 *
 * @return the internal array (not a copy); {@code null} if it was never initialized
 */
public double[] getBinWidths() {
    return _binWidths;
}
/**
 * Updates the running per-column minimum and maximum from one input row.
 * Values that match one of the configured NA strings are skipped.
 *
 * @param words  the fields of one row; column IDs are 1-based indexes into this array
 * @param agents transform utilities that provide the NA strings
 */
public void prepare(String[] words, TfUtils agents) {
    if ( !isApplicable() )
        return;

    for (int pos = 0; pos < _colList.length; pos++) {
        // equi-width
        final String token = UtilFunctions.unquote(words[_colList[pos] - 1].trim());
        if (TfUtils.isNA(agents.getNAStrings(), token))
            continue;

        final double value = UtilFunctions.parseToDouble(token);
        if (value < _min[pos])
            _min[pos] = value;
        if (value > _max[pos])
            _max[pos] = value;
    }
}
/**
 * Wraps the current minimum of a binned column as map output.
 *
 * @param idx position of the column within the column list
 * @return a value whose word is {@link #MIN_PREFIX} followed by the minimum, with count -1
 * @throws CharacterCodingException declared for the {@code DistinctValue} construction
 */
private DistinctValue prepMinOutput(int idx) throws CharacterCodingException {
    final String word = MIN_PREFIX + String.valueOf(_min[idx]);
    return new DistinctValue(word, -1L);
}
/**
 * Wraps the current maximum of a binned column as map output.
 *
 * @param idx position of the column within the column list
 * @return a value whose word is {@link #MAX_PREFIX} followed by the maximum, with count -1
 * @throws CharacterCodingException declared for the {@code DistinctValue} construction
 */
private DistinctValue prepMaxOutput(int idx) throws CharacterCodingException {
    final String word = MAX_PREFIX + String.valueOf(_max[idx]);
    return new DistinctValue(word, -1L);
}
/**
 * Wraps the configured number of bins of a binned column as map output.
 *
 * @param idx position of the column within the column list
 * @return a value whose word is {@link #NBINS_PREFIX} followed by the bin count, with count -1
 * @throws CharacterCodingException declared for the {@code DistinctValue} construction
 */
private DistinctValue prepNBinsOutput(int idx) throws CharacterCodingException {
    // the bin count is rendered as a double (e.g. "5.0"), exactly as before
    final double bins = _numBins[idx];
    final String word = NBINS_PREFIX + String.valueOf(bins);
    return new DistinctValue(word, -1L);
}
/**
 * Method to output transformation metadata from the mappers.
 * This information is collected and merged by the reducers.
 * <p>
 * For every binned column three records are emitted under the key
 * {@code -colID}: the minimum, the maximum and the number of bins.
 *
 * @param out    collector that receives the records
 * @param taskID ID of the emitting task (not used by this method)
 * @param agents transform utilities (not used by this method)
 * @throws IOException if preparing or collecting a record fails; any
 *         exception raised in the loop is wrapped
 */
@Override
public void mapOutputTransformationMetadata(OutputCollector<IntWritable, DistinctValue> out, int taskID, TfUtils agents) throws IOException {
    if( !isApplicable() )
        return;

    try {
        for(int i=0; i < _colList.length; i++) {
            int colID = _colList[i];
            // negated column ID is the key under which the three records are emitted
            IntWritable iw = new IntWritable(-colID);

            out.collect(iw, prepMinOutput(i));
            out.collect(iw, prepMaxOutput(i));
            out.collect(iw, prepNBinsOutput(i));
        }
    } catch(Exception e) {
        throw new IOException(e);
    }
}
public ArrayList> mapOutputTransformationMetadata(int taskID, ArrayList> list, TfUtils agents) throws IOException {
if ( !isApplicable() )
return list;
try {
for(int i=0; i < _colList.length; i++) {
int colID = _colList[i];
Integer iw = -colID;
list.add( new Pair(iw, prepMinOutput(i)) );
list.add( new Pair(iw, prepMaxOutput(i)) );
list.add( new Pair(iw, prepNBinsOutput(i)) );
}
} catch(Exception e) {
throw new IOException(e);
}
return list;
}
/**
 * Writes the bin metadata of one column as a single line
 * {@code colID,min,max,binwidth,nbins} (fields joined by
 * {@code TfUtils.TXMTD_SEP}) to the column's file below
 * {@code tfMtdDir/Bin/}. An existing file is overwritten.
 *
 * @param colID    ID of the column
 * @param min      minimum value, already formatted
 * @param max      maximum value, already formatted
 * @param binwidth bin width, already formatted
 * @param nbins    number of bins, already formatted
 * @param tfMtdDir transform metadata directory
 * @param fs       file system to write to
 * @param agents   transform utilities, used to look up the column name
 * @throws IOException if the file cannot be created, written or closed
 */
private void writeTfMtd(int colID, String min, String max, String binwidth, String nbins, String tfMtdDir, FileSystem fs, TfUtils agents) throws IOException
{
    Path pt = new Path(tfMtdDir+"/Bin/"+ agents.getName(colID) + TfUtils.TXMTD_BIN_FILE_SUFFIX);
    BufferedWriter br = null;
    try {
        br = new BufferedWriter(new OutputStreamWriter(fs.create(pt,true)));
        br.write(colID + TfUtils.TXMTD_SEP + min + TfUtils.TXMTD_SEP + max + TfUtils.TXMTD_SEP + binwidth + TfUtils.TXMTD_SEP + nbins + "\n");
        // The buffered line is only flushed on close. Close explicitly on the
        // success path so that a failed flush/close is reported to the caller
        // instead of being swallowed by the silent close below.
        br.close();
    }
    finally {
        // no-op after a successful close; releases the stream on the error path
        IOUtilFunctions.closeSilently(br);
    }
}
/**
 * Method to merge map output transformation metadata.
 * <p>
 * Scans all records of one column, keeps the smallest minimum, the largest
 * maximum and the last seen bin count, derives the bin width as
 * {@code (max-min)/nbins} and writes the result to the metadata file of the
 * column.
 *
 * @param values    map output records of the column; each word starts with
 *                  {@link #MIN_PREFIX}, {@link #MAX_PREFIX} or {@link #NBINS_PREFIX}
 * @param outputDir transform metadata directory
 * @param colID     ID of the column
 * @param fs        file system to write to
 * @param agents    transform utilities, passed on to the metadata writer
 * @throws IOException      if the metadata file cannot be written
 * @throws RuntimeException if a record carries none of the known prefixes
 */
@Override
public void mergeAndOutputTransformationMetadata(Iterator<DistinctValue> values, String outputDir, int colID, FileSystem fs, TfUtils agents) throws IOException {
    double min = Double.MAX_VALUE;
    double max = -Double.MAX_VALUE;
    int nbins = 0;

    DistinctValue val = new DistinctValue();
    String w = null;
    double d;
    while(values.hasNext()) {
        // reset() is retained from the original code;
        // NOTE(review): looks redundant since val is reassigned right after -- confirm
        val.reset();
        val = values.next();
        w = val.getWord();

        if(w.startsWith(MIN_PREFIX)) {
            d = UtilFunctions.parseToDouble(w.substring( MIN_PREFIX.length() ));
            if ( d < min )
                min = d;
        }
        else if(w.startsWith(MAX_PREFIX)) {
            d = UtilFunctions.parseToDouble(w.substring( MAX_PREFIX.length() ));
            if ( d > max )
                max = d;
        }
        else if (w.startsWith(NBINS_PREFIX)) {
            nbins = (int) UtilFunctions.parseToLong( w.substring(NBINS_PREFIX.length() ) );
        }
        else
            throw new RuntimeException("BinAgent: Invalid prefix while merging map output: " + w);
    }

    // write merged metadata
    double binwidth = (max-min)/nbins;
    writeTfMtd(colID, Double.toString(min), Double.toString(max), Double.toString(binwidth), Integer.toString(nbins), outputDir, fs, agents);
}
/**
 * Writes the bin metadata (min, max, bin width, number of bins) of every
 * binned column, based on the locally collected minimum and maximum.
 *
 * @param outputDir transform metadata directory
 * @param fs        file system to write to
 * @param agents    transform utilities; provides the missing-value imputation agent
 * @throws IOException if a metadata file cannot be written
 */
public void outputTransformationMetadata(String outputDir, FileSystem fs, TfUtils agents) throws IOException {
    if( !isApplicable() )
        return;

    final MVImputeAgent imputer = agents.getMVImputeAgent();
    for (int pos = 0; pos < _colList.length; pos++) {
        final int colID = _colList[pos];

        // If the column is imputed with a constant, then adjust min and max based on the value of the constant.
        final boolean constantImputed = imputer.isApplicable(colID) != -1
            && imputer.getMethod(colID) == MVMethod.CONSTANT;
        if (constantImputed) {
            final double constant = UtilFunctions.parseToDouble( imputer.getReplacement(colID) );
            if (constant < _min[pos])
                _min[pos] = constant;
            if (constant > _max[pos])
                _max[pos] = constant;
        }

        final double width = (_max[pos] - _min[pos]) / _numBins[pos];
        writeTfMtd(colID,
            Double.toString(_min[pos]),
            Double.toString(_max[pos]),
            Double.toString(width),
            Integer.toString(_numBins[pos]),
            outputDir, fs, agents);
    }
}
// ------------------------------------------------------------------------------------------------
/**
 * Method to load transform metadata for all attributes.
 * <p>
 * For every binned column the metadata file below {@code txMtdDir/Bin/} is
 * read and its minimum, bin width and number of bins are stored in this agent.
 *
 * @param job     job configuration (not used by this method)
 * @param fs      file system to read from
 * @param txMtdDir transform metadata directory
 * @param agents  transform utilities, used to look up the column names
 * @throws IOException      if a metadata file cannot be read, is empty or has too few fields
 * @throws RuntimeException if {@code txMtdDir} is not a directory
 */
@Override
public void loadTxMtd(JobConf job, FileSystem fs, Path txMtdDir, TfUtils agents) throws IOException {
    if( !isApplicable() )
        return;

    if(fs.isDirectory(txMtdDir)) {
        for(int i=0; i<_colList.length;i++) {
            int colID = _colList[i];

            Path path = new Path( txMtdDir + "/Bin/" + agents.getName(colID) + TfUtils.TXMTD_BIN_FILE_SUFFIX);
            TfUtils.checkValidInputFile(fs, path, true);

            BufferedReader br = null;
            try {
                br = new BufferedReader(new InputStreamReader(fs.open(path)));
                // format: colID,min,max,binwidth,nbins
                String line = br.readLine();
                if( line == null )
                    throw new IOException("Empty binning metadata file: " + path);
                String[] fields = line.split(TfUtils.TXMTD_SEP);
                if( fields.length < 5 )
                    throw new IOException("Invalid binning metadata in " + path + ": " + line);

                double min = UtilFunctions.parseToDouble(fields[1]);
                // fields[2] (max) is not needed: the bin width is stored explicitly
                double binwidth = UtilFunctions.parseToDouble(fields[3]);
                int nbins = UtilFunctions.parseToInt(fields[4]);

                _numBins[i] = nbins;
                _min[i] = min;
                _binWidths[i] = binwidth; // (max-min)/nbins;
            }
            finally {
                IOUtilFunctions.closeSilently(br);
            }
        }
    }
    else {
        throw new RuntimeException("Path to transform metadata must be a directory: " + txMtdDir);
    }
}
/**
 * Builds the encoder on the given frame and then applies it.
 *
 * @param in  input frame
 * @param out output matrix, passed on to the apply step
 * @return the result of applying the encoder to {@code in}
 */
@Override
public MatrixBlock encode(FrameBlock in, MatrixBlock out) {
    // step 1: build; step 2: apply to the same input
    build(in);
    final MatrixBlock encoded = apply(in, out);
    return encoded;
}
/**
 * Build step for frame input. Not implemented: the body is empty.
 *
 * @param in input frame (ignored)
 */
@Override
public void build(FrameBlock in) {
    // TODO not implemented yet; intentionally a no-op
}
/**
 * Method to apply transformations.
 * <p>
 * Replaces, in place, the value of every binned column by its 1-based bin ID.
 * The bin ID is found by stepping from the column minimum in increments of
 * the bin width and is capped at the number of bins of the column.
 *
 * @param words the fields of one row; column IDs are 1-based indexes into this array
 * @return the given {@code words} array
 * @throws RuntimeException if a value of a binned column is not numeric; the
 *         original {@link NumberFormatException} is attached as the cause
 */
@Override
public String[] apply(String[] words) {
    if( !isApplicable() )
        return words;

    for(int i=0; i < _colList.length; i++) {
        int colID = _colList[i];
        try {
            double val = UtilFunctions.parseToDouble(words[colID-1]);
            int binid = 1;
            // upper boundary of the current bin
            double tmp = _min[i] + _binWidths[i];
            while(val > tmp && binid < _numBins[i]) {
                tmp += _binWidths[i];
                binid++;
            }
            words[colID-1] = Integer.toString(binid);
        }
        catch(NumberFormatException e) {
            // keep the parse failure as the cause so the stack trace is not lost
            throw new RuntimeException("Encountered \"" + words[colID-1] + "\" in column ID \"" + colID + "\", when expecting a numeric value. Consider adding \"" + words[colID-1] + "\" to na.strings, along with an appropriate imputation method.", e);
        }
    }
    return words;
}
@Override
public MatrixBlock apply(FrameBlock in, MatrixBlock out) {
for(int j=0; j<_colList.length; j++) {
int colID = _colList[j];
for( int i=0; i