org.apache.sysml.runtime.transform.TfUtils Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of systemml Show documentation
Declarative Machine Learning
There is a newer version: 1.2.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.sysml.runtime.transform;

import java.io.BufferedReader;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Serializable;
import java.util.Arrays;
import java.util.regex.Pattern;

import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.ByteWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.Reader;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.wink.json4j.JSONException;
import org.apache.wink.json4j.JSONObject;

import org.apache.sysml.conf.ConfigurationManager;
import org.apache.sysml.parser.DataExpression;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
import org.apache.sysml.runtime.io.MatrixReader;
import org.apache.sysml.runtime.matrix.CSVReblockMR;
import org.apache.sysml.runtime.matrix.CSVReblockMR.OffsetCount;
import org.apache.sysml.runtime.matrix.mapred.MRJobConfiguration;
import org.apache.sysml.runtime.util.MapReduceTool;
import org.apache.sysml.runtime.util.UtilFunctions;
import org.apache.sysml.utils.JSONHelper;


@SuppressWarnings("deprecation")
public class TfUtils implements Serializable{
	
	private static final long serialVersionUID = 526252850872633125L;

	// Transformation agents. They are populated by createAgents(), injected via
	// setupAgents(), or (omit agent only) set by the (JobConf, boolean) constructor.
	private OmitAgent _oa = null;
	private MVImputeAgent _mia = null;
	private RecodeAgent _ra = null;	
	private BinAgent _ba = null;
	private DummycodeAgent _da = null;
	
	// Record/row counters; all of them are reset to 0 in init().
	private long _numRecordsInPartFile;		// Total number of records in the data file
	private long _numValidRecords;			// (_numRecordsInPartFile - #of omitted records)
	private long _numTransformedRows; 		// Number of rows after applying transformations
	private long _numTransformedColumns; 	// Number of columns after applying transformations

	// Description of the input data format.
	private String _headerLine = null;
	private boolean _hasHeader;
	// Compiled from the quoted (literal) form of _delimString in init().
	private Pattern _delim = null;
	private String _delimString = null;
	private String[] _NAstrings = null;
	// Unquoted tokens of _headerLine, filled in by parseColumnNames().
	private String[] _outputColumnNames = null;
	private long _numInputCols = -1;
	
	// Locations used by the transform job; values are supplied by the constructors.
	private String _tfMtdDir = null;
	private String _specFile = null;
	private String _offsetFile = null;
	private String _tmpDir = null;
	private String _outputPath = null;
	
	/**
	 * Checks that the given path exists and is not empty.
	 * 
	 * @param fs file system used to look up the path
	 * @param path file to check
	 * @param err if true, a failed check raises an exception; if false, it yields <code>false</code>
	 * @return true if the file exists and is non-empty; false otherwise (only when <code>err</code> is false)
	 * @throws IOException if <code>err</code> is true and the file does not exist;
	 *         an {@link EOFException} if <code>err</code> is true and the file is empty
	 */
	protected static boolean checkValidInputFile(FileSystem fs, Path path, boolean err)
			throws IOException {
		// missing file
		final boolean exists = fs.exists(path);
		if (!exists) {
			if (!err) {
				return false;
			}
			throw new IOException("File " + path.toString() + " does not exist on HDFS/LFS.");
		}

		// empty file
		final boolean empty = MapReduceTool.isFileEmpty(fs, path.toString());
		if (empty) {
			if (!err) {
				return false;
			}
			throw new EOFException("Empty input file " + path.toString() + ".");
		}

		return true;
	}
	
	/**
	 * Returns the fully qualified name of the file stored under the
	 * "map.input.file" property of the given job.
	 * 
	 * @param job job configuration to read the property from
	 * @return qualified path of the input file, as a string
	 * @throws IOException if the file system cannot be obtained
	 */
	public static String getPartFileName(JobConf job) throws IOException {
		final FileSystem fs = FileSystem.get(job);
		final Path inputPath = new Path(job.get("map.input.file"));
		return inputPath.makeQualified(fs).toString();
	}
	
	/**
	 * Tells whether the part file of the current task is the one registered
	 * under {@link MRJobConfiguration#TF_SMALLEST_FILE} in the job configuration.
	 * 
	 * @param job job configuration
	 * @return true if both qualified paths are equal as strings
	 * @throws IOException if the file system cannot be obtained
	 */
	public static boolean isPartFileWithHeader(JobConf job) throws IOException {
		final FileSystem fs = FileSystem.get(job);
		
		final String currentFile = getPartFileName(job);
		final Path smallest = new Path(job.get(MRJobConfiguration.TF_SMALLEST_FILE));
		final String smallestFile = smallest.makeQualified(fs).toString();
		
		return currentFile.equals(smallestFile);
	}
	
	/**
	 * Reads and parses the JSON transformation specification stored at the given path.
	 * 
	 * @param fs file system that holds the specification file
	 * @param specFile path of the specification file
	 * @return parsed specification
	 * @throws IOException if the file cannot be opened, read, or closed
	 */
	public static JSONObject readSpec(FileSystem fs, String specFile) throws IOException {
		BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(new Path(specFile))));
		try {
			return JSONHelper.parse(br);
		}
		finally {
			// always release the underlying stream, even when parsing fails
			br.close();
		}
	}
	
	/**
	 * Encodes the NA strings for shipping to workers through the JobConf by
	 * appending the NA-string separator followed by the literal "dummy".
	 * The extra trailing token is there to handle the case of empty strings.
	 * 
	 * @param na separator-delimited NA strings
	 * @return <code>na</code> with the separator and "dummy" appended
	 */
	public static String prepNAStrings(String na) {
		final String encoded = na + DataExpression.DELIM_NA_STRING_SEP + "dummy";
		return encoded;
	}
	
	/**
	 * Splits a separator-delimited list of NA strings into its tokens.
	 * The separator is matched literally and trailing empty tokens are kept;
	 * no token is stripped from the result.
	 * 
	 * @param na delimited NA strings, may be null
	 * @return all tokens, or null if <code>na</code> is null
	 */
	public static String[] parseNAStrings(String na) 
	{
		if ( na != null ) {
			final String literalSep = Pattern.quote(DataExpression.DELIM_NA_STRING_SEP);
			return na.split(literalSep, -1);
		}
		return null;
	}
	
	/**
	 * Parses the NA strings stored in the job configuration under
	 * {@link MRJobConfiguration#TF_NA_STRINGS}.
	 * 
	 * @param job job configuration
	 * @return parsed NA strings, or null if the property is not set
	 */
	public static String[] parseNAStrings(JobConf job) 
	{
		final String encoded = job.get(MRJobConfiguration.TF_NA_STRINGS);
		return parseNAStrings(encoded);
	}
	
	/**
	 * Instantiates all five transformation agents from the given specification.
	 * The dummycode agent additionally receives the number of input columns.
	 * 
	 * @param spec transformation specification
	 * @throws IOException
	 * @throws JSONException
	 */
	private void createAgents(JSONObject spec) throws IOException, JSONException {
		this._oa  = new OmitAgent(spec);
		this._mia = new MVImputeAgent(spec);
		this._ra  = new RecodeAgent(spec);
		this._ba  = new BinAgent(spec);
		this._da  = new DummycodeAgent(spec, this._numInputCols);
	}
	
	/**
	 * Replaces all transformation agents with the given instances.
	 * 
	 * @param oa omit agent
	 * @param mia missing value imputation agent
	 * @param ra recode agent
	 * @param ba bin agent
	 * @param da dummycode agent
	 */
	public void setupAgents(OmitAgent oa, MVImputeAgent mia, RecodeAgent ra, BinAgent ba, DummycodeAgent da)  {
		this._oa  = oa;
		this._mia = mia;
		this._ra  = ra;
		this._ba  = ba;
		this._da  = da;
	}
	
	/**
	 * Splits the header line on the delimiter (keeping trailing empty tokens)
	 * and stores the unquoted tokens as column names.
	 */
	private void parseColumnNames() {
		_outputColumnNames = _delim.split(_headerLine, -1);
		
		int pos = 0;
		for (String rawName : _outputColumnNames) {
			_outputColumnNames[pos] = UtilFunctions.unquote(rawName);
			pos++;
		}
	}
	
	/**
	 * Common initialization used by the constructors: resets all counters,
	 * records the input format and the job locations, parses the column
	 * names from the header, and creates the transformation agents.
	 * 
	 * @param headerLine header line of the input data
	 * @param hasHeader whether the input data has a header
	 * @param delim delimiter string; it is matched literally
	 * @param naStrings NA strings
	 * @param spec transformation specification
	 * @param numCols number of columns in the input data
	 * @param offsetFile offset file
	 * @param tmpPath temporary directory
	 * @param outputPath output path
	 * @throws IOException
	 * @throws JSONException
	 */
	private void init(String headerLine, boolean hasHeader, String delim, String[] naStrings, JSONObject spec, long numCols, String offsetFile, String tmpPath, String outputPath) throws IOException, JSONException
	{
		// counters
		_numRecordsInPartFile = 0;
		_numValidRecords = 0;
		_numTransformedRows = 0;
		_numTransformedColumns = 0;
		
		// locations
		_offsetFile = offsetFile;
		_tmpDir = tmpPath;
		_outputPath = outputPath;
		
		// input format
		_hasHeader = hasHeader;
		_headerLine = headerLine;
		_delimString = delim;
		_delim = Pattern.compile(Pattern.quote(delim));
		_NAstrings = naStrings;
		_numInputCols = numCols;
		
		// both steps rely on the fields assigned above
		parseColumnNames();
		createAgents(spec);
	}
	
	/**
	 * Lightweight constructor: only the NA strings, the specification file
	 * name, and the omit agent are initialized.
	 * 
	 * @param job job configuration
	 * @param minimal not read by this constructor
	 * @throws IOException
	 * @throws JSONException
	 */
	public TfUtils(JobConf job, boolean minimal) 
		throws IOException, JSONException 
	{
		final boolean localMode = InfrastructureAnalyzer.isLocalMode(job);
		if( !localMode ) {
			ConfigurationManager.setCachedJobConf(job);
		}
		
		this._NAstrings = parseNAStrings(job);
		this._specFile = job.get(MRJobConfiguration.TF_SPEC_FILE);
		
		final FileSystem fs = FileSystem.get(job);
		final JSONObject spec = readSpec(fs, this._specFile);
		
		this._oa = new OmitAgent(spec);
	}
	
	/**
	 * Builds a fully initialized instance from the properties of the given job.
	 * Called from GenTFMtdMapper, ApplyTf (Hadoop).
	 * 
	 * @param job job configuration
	 * @throws IOException
	 * @throws JSONException
	 */
	public TfUtils(JobConf job) 
		throws IOException, JSONException 
	{
		final boolean localMode = InfrastructureAnalyzer.isLocalMode(job);
		if( !localMode ) {
			ConfigurationManager.setCachedJobConf(job);
		}
		
		// input format
		final boolean withHeader = Boolean.parseBoolean(job.get(MRJobConfiguration.TF_HAS_HEADER));
		final String[] nas = TfUtils.parseNAStrings(job);
		final long inputCols = UtilFunctions.parseToLong( job.get(MRJobConfiguration.TF_NUM_COLS) ); // #of columns in input data
		
		// locations
		final String specPath = job.get(MRJobConfiguration.TF_SPEC_FILE);
		final String offsets = job.get(MRJobConfiguration.TF_OFFSETS_FILE);
		final String tmpLoc = job.get(MRJobConfiguration.TF_TMP_LOC);
		final String outLoc = FileOutputFormat.getOutputPath(job).toString();
		
		// transformation specification
		final FileSystem fs = FileSystem.get(job);
		final JSONObject spec = TfUtils.readSpec(fs, specPath);
		
		final String header = job.get(MRJobConfiguration.TF_HEADER);
		final String delimiter = job.get(MRJobConfiguration.TF_DELIM);
		init(header, withHeader, delimiter, nas, spec, inputCols, offsets, tmpLoc, outLoc);
	}
	
	/**
	 * Same as {@link #TfUtils(JobConf)}, and additionally records the
	 * transformation metadata directory. Called from GenTfMtdReducer.
	 * 
	 * @param job job configuration
	 * @param tfMtdDir transformation metadata directory
	 * @throws IOException
	 * @throws JSONException
	 */
	public TfUtils(JobConf job, String tfMtdDir) throws IOException, JSONException 
	{
		this(job);
		this._tfMtdDir = tfMtdDir;
	}
	
	// called from GenTFMtdReducer and ApplyTf (Spark)
	public TfUtils(String headerLine, boolean hasHeader, String delim, String[] naStrings, JSONObject spec, long ncol, String tfMtdDir, String offsetFile, String tmpPath) throws IOException, JSONException {
		init (headerLine, hasHeader, delim, naStrings, spec, ncol, offsetFile, tmpPath, null);
		_tfMtdDir = tfMtdDir;
	}
	
	/** Increments the number of valid records by one. */
	public void incrValid() {
		_numValidRecords += 1;
	}

	/** @return number of valid records counted so far */
	public long getValid() {
		return this._numValidRecords;
	}

	/** @return total number of records counted so far */
	public long getTotal() {
		return this._numRecordsInPartFile;
	}

	/** @return number of rows counted as transformed */
	public long getNumTransformedRows() {
		return this._numTransformedRows;
	}

	/** @return current value of the transformed-columns counter */
	public long getNumTransformedColumns() {
		return this._numTransformedColumns;
	}
	
	/** @return header line of the input data */
	public String getHeader() {
		return this._headerLine;
	}

	/** @return whether the input data has a header */
	public boolean hasHeader() {
		return this._hasHeader;
	}

	/** @return delimiter as a plain string */
	public String getDelimString() {
		return this._delimString;
	}

	/** @return delimiter as a compiled pattern */
	public Pattern getDelim() {
		return this._delim;
	}

	/** @return NA strings (the internal array, not a copy) */
	public String[] getNAStrings() {
		return this._NAstrings;
	}

	/** @return number of columns in the input data */
	public long getNumCols() {
		return this._numInputCols;
	}
	
	/** @return specification file */
	public String getSpecFile() {
		return this._specFile;
	}

	/** @return transformation metadata directory */
	public String getTfMtdDir() {
		return this._tfMtdDir;
	}

	/** @return offset file */
	public String getOffsetFile() {
		return this._offsetFile;
	}

	/** @return temporary directory */
	public String getTmpDir() {
		return this._tmpDir;
	}

	/** @return output path */
	public String getOutputPath() {
		return this._outputPath;
	}
	
	/**
	 * @param colID 1-based column ID
	 * @return name of the given column, as parsed from the header line
	 */
	public String getName(int colID) {
		final int index = colID - 1;
		return _outputColumnNames[index];
	}
	
	/** @param n new number of valid records */
	public void setValid(long n) {
		this._numValidRecords = n;
	}

	/** Increments the total number of records by one. */
	public void incrTotal() {
		_numRecordsInPartFile += 1;
	}

	/** @param n new total number of records */
	public void setTotal(long n) {
		this._numRecordsInPartFile = n;
	}
	
	/** @return omit agent */
	public OmitAgent getOmitAgent() {
		return this._oa;
	}

	/** @return missing value imputation agent */
	public MVImputeAgent getMVImputeAgent() {
		return this._mia;
	}

	/** @return recode agent */
	public RecodeAgent getRecodeAgent() {
		return this._ra;
	}

	/** @return bin agent */
	public BinAgent getBinAgent() {
		return this._ba;
	}

	/** @return dummycode agent */
	public DummycodeAgent getDummycodeAgent() {
		return this._da;
	}
	
	/**
	 * Tests whether the given string equals one of the configured NA strings.
	 * 
	 * @param w string to test
	 * @return true if <code>w</code> equals an NA string; false otherwise,
	 *         including when no NA strings are configured
	 */
	public boolean isNA(String w) {
		final String[] candidates = _NAstrings;
		if(candidates != null) {
			for(int i = 0; i < candidates.length; i++) {
				if(w.equals(candidates[i]))
					return true;
			}
		}
		return false;
	}
	
	/**
	 * Tokenizes the given line; see {@link #getWords(String)}.
	 * 
	 * @param line input line
	 * @return tokens of the line
	 */
	public String[] getWords(Text line)
	{
		final String content = line.toString();
		return getWords(content);
	}
	

	/**
	 * Trims the given line and splits it on the delimiter, keeping trailing
	 * empty tokens.
	 * 
	 * @param line input line
	 * @return tokens of the line
	 */
	public String[] getWords(String line) 
	{
		final Pattern delimiter = getDelim();
		final String trimmed = line.trim();
		return delimiter.split(trimmed, -1);
	}
	
	/**
	 * Feeds one input row into the construction of transformation metadata.
	 * Rows rejected by the omit agent are only counted towards the total;
	 * all other rows are passed to the impute, recode, and bin agents and
	 * are also counted as valid.
	 * 
	 * @param line input row
	 * @return tokens of the row
	 * @throws IOException
	 */
	public String[] prepareTfMtd(String line) throws IOException {
		final String[] tokens = getWords(line);
		
		final boolean omitted = getOmitAgent().omit(tokens, this);
		if(!omitted) {
			getMVImputeAgent().prepare(tokens, this);
			getRecodeAgent().prepare(tokens, this);
			getBinAgent().prepare(tokens, this);
			incrValid();
		}
		
		// every row counts towards the total, omitted or not
		incrTotal();
		return tokens;
	}
	
	/**
	 * Loads the transformation metadata using the cached job configuration,
	 * without reading from the local file system.
	 * 
	 * @throws IOException
	 */
	public void loadTfMetadata() throws IOException 
	{
		final JobConf cachedJob = ConfigurationManager.getCachedJobConf();
		final boolean fromLocalFS = false;
		loadTfMetadata(cachedJob, fromLocalFS);
	}
	
	/**
	 * Loads the transformation metadata into the impute, recode, bin, and
	 * dummycode agents.
	 * 
	 * @param job job configuration
	 * @param fromLocalFS if true, the metadata is read from the first local
	 *        cache file of the job via the local file system; otherwise it is
	 *        read from {@link #getTfMtdDir()} via the job's file system
	 * @throws IOException
	 */
	public void loadTfMetadata(JobConf job, boolean fromLocalFS) throws IOException
	{
		final Path mtdDir;
		final FileSystem fs;
		
		if(!fromLocalFS) {
			fs = FileSystem.get(job);
			mtdDir = new Path(getTfMtdDir());
		}
		else {
			// metadata must be read from local file system (e.g., distributed cache in the case of Hadoop)
			mtdDir = (DistributedCache.getLocalCacheFiles(job))[0];
			fs = FileSystem.getLocal(job);
		}
		
		// load transformation metadata 
		getMVImputeAgent().loadTxMtd(job, fs, mtdDir, this);
		getRecodeAgent().loadTxMtd(job, fs, mtdDir, this);
		getBinAgent().loadTxMtd(job, fs, mtdDir, this);
		
		// hand the recode maps and bin definitions over to the dummycoding agent,
		// as recoded and binned columns are typically dummycoded
		final DummycodeAgent dummycoder = getDummycodeAgent();
		dummycoder.setRecodeMaps( getRecodeAgent().getRecodeMaps() );
		final BinAgent binner = getBinAgent();
		dummycoder.setNumBins(binner.getBinList(), binner.getNumBins());
		dummycoder.loadTxMtd(job, fs, mtdDir, this);
	}
	
	/*public void loadTfMetadata () throws IOException
	{
		Path tfMtdDir = (DistributedCache.getLocalCacheFiles(_rJob))[0];
		FileSystem localFS = FileSystem.getLocal(_rJob);
		
		loadTfMetadata(_rJob, localFS, tfMtdDir);
		
		FileSystem fs;
		fs = FileSystem.get(_rJob);
		Path thisPath=new Path(_rJob.get("map.input.file")).makeQualified(fs);
		String thisfile=thisPath.toString();
			
		Path smallestFilePath=new Path(_rJob.get(MRJobConfiguration.TF_SMALLEST_FILE)).makeQualified(fs);
		if(thisfile.toString().equals(smallestFilePath.toString()))
			_partFileWithHeader=true;
		else
			_partFileWithHeader = false;
	}*/


	/**
	 * Constructs the dummycoded header, lets the dummycode agent generate its
	 * maps and column types in the temporary directory, and writes the header
	 * files there.
	 * 
	 * @return dummycoded header
	 * @throws IOException
	 */
	public String processHeaderLine() throws IOException 
	{
		final JobConf cachedJob = ConfigurationManager.getCachedJobConf();
		final FileSystem fs = FileSystem.get(cachedJob);
		
		final String dummycodedHeader = getDummycodeAgent().constructDummycodedHeader(getHeader(), getDelim());
		getDummycodeAgent().genDcdMapsAndColTypes(fs, getTmpDir(), (int) getNumCols(), this);
		
		// write header information (before and after transformation) to temporary path
		// these files are copied into txMtdPath, once the ApplyTf job is complete.
		DataTransform.generateHeaderFiles(fs, getTmpDir(), getHeader(), dummycodedHeader);

		// note: _numTransformedColumns is not updated here
		return dummycodedHeader;
	}

	/**
	 * Asks the omit agent whether the given row is to be omitted.
	 * 
	 * @param words tokens of the row
	 * @return the omit agent's decision, or false if there is no omit agent
	 */
	public boolean omit(String[] words) {
		final OmitAgent agent = getOmitAgent();
		return (agent != null) && agent.omit(words, this);
	}
	
	
	/**
	 * Applies the transformation metadata on a given row, using the regular
	 * (non-optimized) recode path.
	 * 
	 * @param words tokens of the row
	 * @return transformed tokens
	 */
	public String[] apply(String[] words) {
		final boolean optimizeMaps = false;
		return apply(words, optimizeMaps);
	}
	
	/**
	 * Applies the transformation metadata on a given row by running the
	 * impute, recode, bin, and dummycode agents in that order. Each call
	 * also increments the number of transformed rows.
	 * 
	 * @param words tokens of the row
	 * @param optimizeMaps if true, the recode agent's <code>cp_apply</code> is
	 *        used instead of <code>apply</code>
	 * @return transformed tokens
	 */
	public String[] apply ( String[] words, boolean optimizeMaps ) 
	{
		String[] row = getMVImputeAgent().apply(words, this);
		
		// cp_apply: specific case of transform() invoked from CP (to save boxing and unboxing)
		row = optimizeMaps 
			? getRecodeAgent().cp_apply(row, this) 
			: getRecodeAgent().apply(row, this);
		
		row = getBinAgent().apply(row, this);
		row = getDummycodeAgent().apply(row, this);
		
		_numTransformedRows += 1;
		
		return row;
	}
	
	public void check(String []words) throws DMLRuntimeException 
	{
		boolean checkEmptyString = ( getNAStrings() != null );
		if ( checkEmptyString ) 
		{
			final String msg = "When na.strings are provided, empty string \"\" is considered as a missing value, and it must be imputed appropriately. Encountered an unhandled empty string in column ID: ";
			for(int i=0; i