All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.sysml.runtime.transform.BinAgent Maven / Gradle / Ivy

There is a newer version: 1.2.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.sysml.runtime.transform;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.nio.charset.CharacterCodingException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.commons.lang.ArrayUtils;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.wink.json4j.JSONArray;
import org.apache.wink.json4j.JSONException;
import org.apache.wink.json4j.JSONObject;
import org.apache.sysml.lops.Lop;
import org.apache.sysml.runtime.io.IOUtilFunctions;
import org.apache.sysml.runtime.matrix.data.FrameBlock;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
import org.apache.sysml.runtime.matrix.data.Pair;
import org.apache.sysml.runtime.transform.MVImputeAgent.MVMethod;
import org.apache.sysml.runtime.transform.encode.Encoder;
import org.apache.sysml.runtime.transform.meta.TfMetaUtils;
import org.apache.sysml.runtime.util.UtilFunctions;

public class BinAgent extends Encoder 
{	
	private static final long serialVersionUID = 1917445005206076078L;

	// Prefixes that tag the strings built by prepMinOutput/prepMaxOutput/prepNBinsOutput;
	// mergeAndOutputTransformationMetadata dispatches on them to tell the three kinds apart.
	public static final String MIN_PREFIX = "min";
	public static final String MAX_PREFIX = "max";
	public static final String NBINS_PREFIX = "nbins";

	// number of bins per binned column; indexed by position in _colList, like the arrays below
	private int[] _numBins = null;
	private double[] _min=null, _max=null;	// min and max among non-missing values
	private double[] _binWidths = null;		// width of a bin for each attribute
	
	//frame transform-apply attributes
	// NOTE(review): not referenced in the visible part of this file -- presumably
	// per-column bin boundaries used by the frame-based apply; confirm against the rest of the class.
	private double[][] _binMins = null;
	private double[][] _binMaxs = null;
	
	/**
	 * Creates a binning agent from a parsed transform specification.
	 * Delegates to the four-argument constructor with {@code colsOnly=false}.
	 * 
	 * @param parsedSpec parsed transform specification
	 * @param colnames column names, forwarded unchanged to the four-argument constructor
	 * @param clen number of columns, forwarded unchanged to the four-argument constructor
	 * @throws JSONException propagated from the four-argument constructor
	 * @throws IOException propagated from the four-argument constructor
	 */
	public BinAgent(JSONObject parsedSpec, String[] colnames, int clen) 
		throws JSONException, IOException 
	{
		this(parsedSpec, colnames, clen, false);
	}

	/**
	 * Creates a binning agent from a parsed transform specification.
	 * 
	 * If the specification has no entry for {@code TfUtils.TXMETHOD_BIN}, the
	 * constructor returns right after the superclass constructor and none of
	 * the binning arrays are allocated.
	 * 
	 * @param parsedSpec parsed transform specification
	 * @param colnames column names, used to resolve the binning column IDs when {@code colsOnly} is set
	 * @param clen number of columns, passed to the superclass constructor
	 * @param colsOnly if true, only the list of binned columns is initialized and
	 *                 {@code _numBins}, {@code _min}, {@code _max} and {@code _binWidths} stay null;
	 *                 if false, the bin counts are read from the specification and
	 *                 the min/max/width arrays are allocated
	 * @throws JSONException if the specification cannot be read
	 * @throws IOException declared for callers; see the helpers invoked here
	 */
	public BinAgent(JSONObject parsedSpec, String[] colnames, int clen, boolean colsOnly) 
		throws JSONException, IOException 
	{
		super( null, clen );		
		if ( !parsedSpec.containsKey(TfUtils.TXMETHOD_BIN) )
			return;
		
		if( colsOnly ) {
			// The list must be typed: with a raw List, toArray(new Integer[0]) is
			// typed Object[] and does not match ArrayUtils.toPrimitive(Integer[]).
			List<Integer> collist = TfMetaUtils.parseBinningColIDs(parsedSpec, colnames);
			initColList(ArrayUtils.toPrimitive(collist.toArray(new Integer[0])));
		}
		else 
		{
			JSONObject obj = (JSONObject) parsedSpec.get(TfUtils.TXMETHOD_BIN);		
			JSONArray attrs = (JSONArray) obj.get(TfUtils.JSON_ATTRS);
			JSONArray nbins = (JSONArray) obj.get(TfUtils.JSON_NBINS);
			initColList(attrs);
			
			// one bin count per listed attribute, in specification order
			_numBins = new int[attrs.size()];
			for(int i=0; i < _numBins.length; i++)
				_numBins[i] = UtilFunctions.toInt(nbins.get(i)); 
			
			// initialize internal transformation metadata; min/max start at the
			// opposite extremes so that the first observed value replaces them
			_min = new double[_colList.length];
			Arrays.fill(_min, Double.MAX_VALUE);
			_max = new double[_colList.length];
			Arrays.fill(_max, -Double.MAX_VALUE);
			
			_binWidths = new double[_colList.length];
		}
	}

	/** @return the internal array of bin counts (not a copy); null unless allocated by the constructor */
	public int[] getNumBins() { return _numBins; }
	/** @return the internal array of per-column minimum values (not a copy); null unless allocated by the constructor */
	public double[] getMin()  { return _min; }
	/** @return the internal array of per-column bin widths (not a copy); null unless allocated by the constructor */
	public double[] getBinWidths() { return _binWidths; }
	
	/**
	 * Folds one input row into the running per-column minimum and maximum.
	 * For every binned column the token is trimmed and unquoted; tokens that
	 * match one of the NA strings of {@code agents} are ignored.
	 * 
	 * @param words tokens of one row; column IDs are 1-based indexes into this array
	 * @param agents provides the NA strings
	 */
	public void prepare(String[] words, TfUtils agents) {
		if ( !isApplicable() )
			return;
		
		for(int idx=0; idx < _colList.length; idx++) {
			// equi-width binning only needs the value range of each column
			String token = UtilFunctions.unquote(words[_colList[idx]-1].trim());
			if( TfUtils.isNA(agents.getNAStrings(), token) )
				continue;
			
			double value = UtilFunctions.parseToDouble(token);
			if( value < _min[idx] )
				_min[idx] = value;
			if( value > _max[idx] )
				_max[idx] = value;
		}
	}
	
	/**
	 * Wraps the current minimum of the column at position {@code idx} into a
	 * DistinctValue whose word is {@code MIN_PREFIX} followed by the value.
	 */
	private DistinctValue prepMinOutput(int idx) throws CharacterCodingException {
		return new DistinctValue(MIN_PREFIX + Double.toString(_min[idx]), -1L);
	}
	
	/**
	 * Wraps the current maximum of the column at position {@code idx} into a
	 * DistinctValue whose word is {@code MAX_PREFIX} followed by the value.
	 */
	private DistinctValue prepMaxOutput(int idx) throws CharacterCodingException {
		return new DistinctValue(MAX_PREFIX + Double.toString(_max[idx]), -1L);
	}
	
	/**
	 * Wraps the bin count of the column at position {@code idx} into a
	 * DistinctValue whose word is {@code NBINS_PREFIX} followed by the count.
	 * The count is formatted as a double (e.g. "10.0"), which is the format
	 * the merge step parses.
	 */
	private DistinctValue prepNBinsOutput(int idx) throws CharacterCodingException {
		// int is widened to double on purpose: keeps the emitted text unchanged
		return new DistinctValue(NBINS_PREFIX + Double.toString(_numBins[idx]), -1L);
	}
	
	/**
	 * Emits the binning metadata gathered by this mapper: for each binned
	 * column, the minimum, the maximum and the bin count are collected under
	 * the negated column ID as key. Any failure is rethrown as IOException.
	 * 
	 * @param out collector receiving the (key, DistinctValue) pairs
	 * @param taskID not used by this implementation
	 * @param agents not used by this implementation
	 * @throws IOException wrapping any exception raised while emitting
	 */
	@Override
	public void mapOutputTransformationMetadata(OutputCollector out, int taskID, TfUtils agents) throws IOException {
		if( !isApplicable() )
			return;
		
		try { 
			int pos = 0;
			for( int colID : _colList ) {
				// binning metadata is keyed by the negated column ID
				IntWritable key = new IntWritable(-colID);
				out.collect(key, prepMinOutput(pos));
				out.collect(key, prepMaxOutput(pos));
				out.collect(key, prepNBinsOutput(pos));
				pos++;
			}
		} 
		catch(Exception ex) {
			throw new IOException(ex);
		}
	}
	
	public ArrayList> mapOutputTransformationMetadata(int taskID, ArrayList> list, TfUtils agents) throws IOException {
		if ( !isApplicable() )
			return list;
		
		try { 
			for(int i=0; i < _colList.length; i++) {
				int colID = _colList[i];
				Integer iw = -colID;
				
				list.add( new Pair(iw, prepMinOutput(i)) );
				list.add( new Pair(iw, prepMaxOutput(i)) );
				list.add( new Pair(iw, prepNBinsOutput(i)) );
			}
		} catch(Exception e) {
			throw new IOException(e);
		}
		return list;
	}

	/**
	 * Writes the binning metadata of one column as a single line
	 * {@code colID,min,max,binwidth,nbins} (fields joined by
	 * {@code TfUtils.TXMTD_SEP}) to the file
	 * {@code <tfMtdDir>/Bin/<column name><TfUtils.TXMTD_BIN_FILE_SUFFIX>},
	 * overwriting an existing file.
	 * 
	 * @param colID column ID, also used to look up the column name via {@code agents}
	 * @param min minimum value, already formatted
	 * @param max maximum value, already formatted
	 * @param binwidth bin width, already formatted
	 * @param nbins number of bins, already formatted
	 * @param tfMtdDir transform metadata directory
	 * @param fs file system to write to
	 * @param agents provides the column name
	 * @throws IOException if creating, writing or closing the file fails
	 */
	private void writeTfMtd(int colID, String min, String max, String binwidth, String nbins, String tfMtdDir, FileSystem fs, TfUtils agents) throws IOException 
	{
		Path pt = new Path(tfMtdDir+"/Bin/"+ agents.getName(colID) + TfUtils.TXMTD_BIN_FILE_SUFFIX);
		BufferedWriter br = null;
		try {
			br = new BufferedWriter(new OutputStreamWriter(fs.create(pt,true)));
			br.write(colID + TfUtils.TXMTD_SEP + min + TfUtils.TXMTD_SEP + max + TfUtils.TXMTD_SEP + binwidth + TfUtils.TXMTD_SEP + nbins + "\n");
			// Close explicitly: the buffered content is only flushed on close, and a
			// failure there must reach the caller instead of being swallowed below.
			br.close();
		}
		finally {
			// Safety net for the failure paths; closing an already closed
			// BufferedWriter has no effect.
			IOUtilFunctions.closeSilently(br);
		}
	}

	/** 
	 * Method to merge map output transformation metadata.
	 * 
	 * Scans all values of one column: words starting with {@code MIN_PREFIX}
	 * contribute to the overall minimum, words starting with
	 * {@code MAX_PREFIX} to the overall maximum, and words starting with
	 * {@code NBINS_PREFIX} set the bin count (the last one seen wins). The
	 * merged result, including the derived bin width (max-min)/nbins, is
	 * written via writeTfMtd.
	 * 
	 * Note: if no bin count is seen, nbins stays 0 and the written bin width
	 * is not a finite number.
	 * 
	 * @param values map outputs for this column
	 * @param outputDir transform metadata directory
	 * @param colID column ID passed on to writeTfMtd
	 * @param fs file system to write to
	 * @param agents provides the column name
	 * @throws IOException if writing the metadata file fails
	 * @throws RuntimeException if a word carries none of the known prefixes
	 */
	@Override
	public void mergeAndOutputTransformationMetadata(Iterator<DistinctValue> values, String outputDir, int colID, FileSystem fs, TfUtils agents) throws IOException {
		double min = Double.MAX_VALUE;
		double max = -Double.MAX_VALUE;
		int nbins = 0;
		
		DistinctValue val = new DistinctValue();
		String w = null;
		double d;
		while(values.hasNext()) {
			// NOTE(review): reset of the previously returned value is kept as in the
			// original; presumably relevant when the iterator reuses objects -- confirm
			val.reset();
			val = values.next();
			w = val.getWord();
			
			if(w.startsWith(MIN_PREFIX)) {
				d = UtilFunctions.parseToDouble(w.substring( MIN_PREFIX.length() ));
				if ( d < min )
					min = d;
			}
			else if(w.startsWith(MAX_PREFIX)) {
				d = UtilFunctions.parseToDouble(w.substring( MAX_PREFIX.length() ));
				if ( d > max )
					max = d;
			}
			else if (w.startsWith(NBINS_PREFIX)) {
				nbins = (int) UtilFunctions.parseToLong( w.substring(NBINS_PREFIX.length() ) );
			}
			else
				throw new RuntimeException("BinAgent: Invalid prefix while merging map output: " + w);
		}
		
		// write merged metadata
		double binwidth = (max-min)/nbins;
		writeTfMtd(colID, Double.toString(min), Double.toString(max), Double.toString(binwidth), Integer.toString(nbins), outputDir, fs, agents);
	}
	
	
	/**
	 * Writes the locally collected binning metadata of every binned column.
	 * If a column is imputed with a constant (per the MVImputeAgent of
	 * {@code agents}), its min/max are first widened to include that constant.
	 * 
	 * @param outputDir transform metadata directory
	 * @param fs file system to write to
	 * @param agents provides the imputation agent and the column names
	 * @throws IOException if writing a metadata file fails
	 */
	public void outputTransformationMetadata(String outputDir, FileSystem fs, TfUtils agents) throws IOException {
		if( !isApplicable() )
			return;
		
		MVImputeAgent imputer = agents.getMVImputeAgent();
		for(int idx=0; idx < _colList.length; idx++) {
			final int colID = _colList[idx];
			
			// a constant replacement value may lie outside the observed value range
			boolean constImputed = imputer.isApplicable(colID) != -1 
				&& imputer.getMethod(colID) == MVMethod.CONSTANT;
			if( constImputed ) {
				double cst = UtilFunctions.parseToDouble( imputer.getReplacement(colID) );
				if( cst < _min[idx] )
					_min[idx] = cst;
				if( cst > _max[idx] )
					_max[idx] = cst;
			}
			
			double width = (_max[idx] - _min[idx])/_numBins[idx];
			String minStr = Double.toString(_min[idx]);
			String maxStr = Double.toString(_max[idx]);
			String widthStr = Double.toString(width);
			String nbinsStr = Integer.toString(_numBins[idx]);
			writeTfMtd(colID, minStr, maxStr, widthStr, nbinsStr, outputDir, fs, agents);
		}
	}
	
	// ------------------------------------------------------------------------------------------------

	/**
	 * Method to load transform metadata for all attributes
	 * 
	 * For each binned column, reads the first line of
	 * {@code <txMtdDir>/Bin/<column name><TfUtils.TXMTD_BIN_FILE_SUFFIX>}
	 * (format: colID,min,max,binwidth,nbins) and stores min, bin width and
	 * bin count; the max field is not used.
	 * 
	 * @param job not used by this implementation
	 * @param fs file system to read from
	 * @param txMtdDir transform metadata directory
	 * @param agents provides the column names
	 * @throws IOException if a metadata file cannot be read, is empty, or has too few fields
	 * @throws RuntimeException if {@code txMtdDir} is not a directory
	 */
	@Override
	public void loadTxMtd(JobConf job, FileSystem fs, Path txMtdDir, TfUtils agents) throws IOException {
		if( !isApplicable() )
			return;
		
		if( !fs.isDirectory(txMtdDir) )
			throw new RuntimeException("Path to transform metadata must be a directory: " + txMtdDir);
		
		for(int i=0; i<_colList.length;i++) {
			int colID = _colList[i];
			
			Path path = new Path( txMtdDir + "/Bin/" + agents.getName(colID) + TfUtils.TXMTD_BIN_FILE_SUFFIX);
			TfUtils.checkValidInputFile(fs, path, true); 
				
			BufferedReader br = null;
			try {
				br = new BufferedReader(new InputStreamReader(fs.open(path)));
				// format: colID,min,max,binwidth,nbins
				String line = br.readLine();
				if( line == null )
					throw new IOException("Empty binning metadata file: " + path);
				String[] fields = line.split(TfUtils.TXMTD_SEP);
				if( fields.length < 5 )
					throw new IOException("Invalid binning metadata in " + path 
						+ ": expected 5 fields but found " + fields.length + " in \"" + line + "\"");
				
				// fields[2] (max) is not needed: the stored bin width already encodes (max-min)/nbins
				double min = UtilFunctions.parseToDouble(fields[1]);
				double binwidth = UtilFunctions.parseToDouble(fields[3]);
				int nbins = UtilFunctions.parseToInt(fields[4]);
				
				_numBins[i] = nbins;
				_min[i] = min;
				_binWidths[i] = binwidth;
			}
			finally {
				IOUtilFunctions.closeSilently(br);
			}
		}
	}
	

	/**
	 * Builds the encoder on the given frame and then applies it: calls
	 * build(in) followed by apply(in, out).
	 * 
	 * @param in input frame
	 * @param out output matrix, passed on to apply
	 * @return the result of apply(in, out)
	 */
	@Override
	public MatrixBlock encode(FrameBlock in, MatrixBlock out) {
		build(in);
		return apply(in, out);
	}

	/**
	 * Currently a no-op: no binning metadata is derived from the frame here.
	 * 
	 * @param in input frame (ignored)
	 */
	@Override
	public void build(FrameBlock in) {
		// TODO not implemented yet; intentionally left empty
	}
	
	/**
	 * Replaces, in place, the value of every binned column in {@code words}
	 * by its 1-based bin ID. Starting from the upper bound of the first bin
	 * (min + bin width), the bin ID is advanced while the value exceeds the
	 * current upper bound and the last bin has not been reached.
	 * 
	 * @param words tokens of one row; column IDs are 1-based indexes into this array
	 * @return the same {@code words} array
	 * @throws RuntimeException if a token of a binned column is not numeric
	 */
	@Override
	public String[] apply(String[] words) {
		if( !isApplicable() )
			return words;
	
		for(int idx=0; idx < _colList.length; idx++) {
			final int colID = _colList[idx];
			final int pos = colID-1;
			try {
				double value = UtilFunctions.parseToDouble(words[pos]);
				int bin = 1;
				// walk the upper bin boundaries; values beyond the last boundary stay in the last bin
				for( double upper = _min[idx] + _binWidths[idx]; 
					value > upper && bin < _numBins[idx]; 
					upper += _binWidths[idx] ) 
				{
					bin++;
				}
				words[pos] = Integer.toString(bin);
			} 
			catch(NumberFormatException ex) {
				throw new RuntimeException("Encountered \"" + words[colID-1] + "\" in column ID \"" + colID + "\", when expecting a numeric value. Consider adding \"" + words[colID-1] + "\" to na.strings, along with an appropriate imputation method.");
			}
		}
		
		return words;
	}

	@Override
	public MatrixBlock apply(FrameBlock in, MatrixBlock out) {
		for(int j=0; j<_colList.length; j++) {
			int colID = _colList[j];
			for( int i=0; i




© 2015 - 2024 Weber Informatics LLC | Privacy Policy