org.apache.sysml.runtime.util.MapReduceTool

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.sysml.runtime.util;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.text.SimpleDateFormat;
import java.util.Date;

import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.sysml.conf.ConfigurationManager;
import org.apache.sysml.parser.DataExpression;
import org.apache.sysml.parser.Expression.DataType;
import org.apache.sysml.parser.Expression.ValueType;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.io.IOUtilFunctions;
import org.apache.sysml.runtime.io.MatrixReader;
import org.apache.sysml.runtime.io.MatrixReaderFactory;
import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
import org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties;
import org.apache.sysml.runtime.matrix.data.FileFormatProperties;
import org.apache.sysml.runtime.matrix.data.InputInfo;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
import org.apache.sysml.runtime.matrix.data.NumItemsByEachReducerMetaData;
import org.apache.sysml.runtime.matrix.data.OutputInfo;
import org.apache.sysml.runtime.matrix.mapred.MRConfigurationNames;
import org.apache.sysml.runtime.matrix.sort.ReadWithZeros;
import org.apache.wink.json4j.JSONException;
import org.apache.wink.json4j.OrderedJSONObject;


public class MapReduceTool 
{
	private static final int MAX_DELETE_RETRIES = 10;
	
	private static final Log LOG = LogFactory.getLog(MapReduceTool.class.getName());
	private static JobConf _rJob = null; //cached job conf for read-only operations
	
	static{
		_rJob = ConfigurationManager.getCachedJobConf();
	}
	
	
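	/**
	 * Derives a per-task key from the MapReduce task attempt id by extracting the
	 * numeric task id between the "_m_"/"_r_" marker and the attempt suffix,
	 * with leading zeros removed.
	 * 
	 * @param job job configuration
	 * @param inMapper true if called from a mapper, false if called from a reducer
	 * @return unique key per task
	 */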
	public static String getUniqueKeyPerTask(JobConf job, boolean inMapper) {
		//TODO: investigate ID pattern, required for parallel jobs
		/*String nodePrefix = job.get(MRConfigurationNames.MR_TASK_ATTEMPT_ID);
		return String.valueOf(IDHandler.extractLongID(nodePrefix));*/
		
		String nodePrefix = job.get(MRConfigurationNames.MR_TASK_ATTEMPT_ID);
		int i;
		if (inMapper)
			i = nodePrefix.indexOf("_m_");
		else
			i = nodePrefix.indexOf("_r_");
		int j = nodePrefix.lastIndexOf("_");
		nodePrefix = nodePrefix.substring(i + 3, j);
		// remove all the leading 0s
		return String.valueOf(Long.parseLong(nodePrefix));
	}

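	/**
	 * Extracts the numeric task id (the second-to-last underscore-separated token)
	 * from the MapReduce task attempt id.
	 * 
	 * @param job job configuration
	 * @return unique task id
	 */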
	public static int getUniqueTaskId(JobConf job) {
		//TODO: investigate ID pattern, required for parallel jobs
		/*String nodePrefix = job.get(MRConfigurationNames.MR_TASK_ATTEMPT_ID); 
		return IDHandler.extractIntID(nodePrefix);*/
		
		String nodePrefix = job.get(MRConfigurationNames.MR_TASK_ATTEMPT_ID);
		int j = nodePrefix.lastIndexOf("_");
		int i=nodePrefix.lastIndexOf("_", j-1);
		nodePrefix = nodePrefix.substring(i+1, j);
		// System.out.println("nodePrefix = " + nodePrefix) ;
		return Integer.parseInt(nodePrefix);
	}

	public static String getGloballyUniqueName(JobConf job) {
		return job.get(MRConfigurationNames.MR_TASK_ATTEMPT_ID);
	}

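	/**
	 * Checks if the given file or directory exists on HDFS; returns false
	 * (and logs the error) if the check itself fails.
	 * 
	 * @param fname file name
	 * @return true if the path exists
	 */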
	public static boolean existsFileOnHDFS(String fname){
		try {
			return FileSystem.get(_rJob)
				.exists(new Path(fname));
		}
		catch(Exception ex) {
			LOG.error("Failed check existsFileOnHDFS.", ex);
		}
		return false;
	}
	
	public static boolean isDirectory(String fname) {
		try {
			return FileSystem.get(_rJob)
				.isDirectory(new Path(fname));
		}
		catch(Exception ex) {
			LOG.error("Failed check isDirectory.", ex);
		}
		return false;
	}
	
	public static FileStatus[] getDirectoryListing(String fname) {
		try {
			return FileSystem.get(_rJob)
				.listStatus(new Path(fname));
		}
		catch(Exception ex) {
			LOG.error("Failed listing of directory contents.", ex);
		}
		return new FileStatus[0];
	}

	public static void deleteFileWithMTDIfExistOnHDFS(String fname)  throws IOException {
		deleteFileIfExistOnHDFS(fname);
		deleteFileIfExistOnHDFS(fname + ".mtd");
	}
	
	public static void deleteFileIfExistOnHDFS(String dir) throws IOException {
		deleteFileIfExists(FileSystem.get(_rJob), new Path(dir));
	}

	public static void deleteFileIfExistOnHDFS(Path outpath, JobConf job) throws IOException {
		deleteFileIfExists(FileSystem.get(job), outpath);
	}
	
	public static void deleteFileIfExistOnLFS(Path outpath, JobConf job) throws IOException {
		deleteFileIfExists(FileSystem.getLocal(job), outpath);
	}
	
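	/**
	 * Recursively deletes the given path if it exists, retrying the delete
	 * up to MAX_DELETE_RETRIES times if it reports failure.
	 */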
	private static void deleteFileIfExists(FileSystem fs, Path outpath) throws IOException {
		if( fs.exists(outpath) ) {
			int retries = MAX_DELETE_RETRIES;
			while( !fs.delete(outpath, true) && retries > 0 ) {
				retries--;
			}
		}
	}

	public static boolean isHDFSFileEmpty(String dir) throws IOException {
		FileSystem fs = FileSystem.get(_rJob);
		return isFileEmpty(fs, dir);
	}

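	/**
	 * Returns true if the given path is an empty regular file, or a directory
	 * whose entries all have length zero.
	 * 
	 * @param fs file system
	 * @param dir path to check
	 * @return true if the file or directory is empty
	 * @throws IOException if the file status cannot be obtained
	 */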
	public static boolean isFileEmpty(FileSystem fs, String dir) throws IOException {
		Path pth = new Path(dir);
		FileStatus fstat = fs.getFileStatus(pth);

		if (fstat.isDirectory()) {
			// it is a directory
			FileStatus[] stats = fs.listStatus(pth);
			if (stats != null) {
				for (FileStatus stat : stats) {
					if (stat.getLen() > 0)
						return false;
				}
				return true;
			} else {
				return true;
			}
		} else {
			// it is a regular file: empty iff length is zero
			return (fstat.getLen() == 0);
		}
	}

	public static void renameFileOnHDFS(String originalDir, String newDir) throws IOException {
		Path originalpath = new Path(originalDir);
		
		deleteFileIfExistOnHDFS(newDir);
		Path newpath = new Path(newDir);
		
		FileSystem fs = FileSystem.get(_rJob);
		if (fs.exists(originalpath)) {
			fs.rename(originalpath, newpath);
		}
		else {
			throw new FileNotFoundException(originalDir);
		}
	}
	
	public static void mergeIntoSingleFile(String originalDir, String newFile) throws IOException {
		FileSystem fs = FileSystem.get(_rJob);
		FileUtil.copyMerge(fs, new Path(originalDir), fs, new Path(newFile), true, _rJob, null);
	}

	public static void copyFileOnHDFS(String originalDir, String newDir) throws IOException {
		Path originalPath = new Path(originalDir);
		Path newPath = new Path(newDir);
		boolean deleteSource = false;
		boolean overwrite = true;
		
		JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
		FileSystem fs = FileSystem.get(job);
		if (fs.exists(originalPath)) {
			FileUtil.copy(fs, originalPath, fs, newPath, deleteSource, overwrite, job);
		}
	}

	/**
	 * Returns the size of a file or directory on hdfs in bytes.
	 * 
	 * @param path file system path
	 * @return file size
	 * @throws IOException if IOException occurs
	 */
	public static long getFilesizeOnHDFS( Path path ) 
		throws IOException
	{
		FileSystem fs = FileSystem.get(_rJob);
		long ret = 0; //in bytes
		if( fs.isDirectory(path) )
			ret = fs.getContentSummary(path).getLength();
		else
			ret = fs.getFileStatus(path).getLen();
		//note: filestatus would return 0 on directories
		
		return ret;
	}
	
	private static BufferedReader setupInputFile ( String filename ) throws IOException {
		Path pt = new Path(filename);
		FileSystem fs = FileSystem.get(_rJob);
		return new BufferedReader(new InputStreamReader(fs.open(pt)));
	}
	
	public static double readDoubleFromHDFSFile(String filename) throws IOException {
		return (Double)readObjectFromHDFSFile(filename, ValueType.DOUBLE);
	}
	
	public static long readIntegerFromHDFSFile(String filename) throws IOException {
		return (Long)readObjectFromHDFSFile(filename, ValueType.INT);
	}
	
	public static boolean readBooleanFromHDFSFile(String filename) throws IOException {
		return (Boolean)readObjectFromHDFSFile(filename, ValueType.BOOLEAN);
	}
	
	public static String readStringFromHDFSFile(String filename) 
		throws IOException 
	{
		StringBuilder sb = new StringBuilder();
		try( BufferedReader br = setupInputFile(filename) ) {
			// handle multi-line strings in the HDFS file
			String line = null;
			while ( (line = br.readLine()) != null ) {
				sb.append(line);
				sb.append("\n");
			}
		}
		
		//return string without last character
		return sb.substring(0, sb.length()-1);
	}
	
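	/**
	 * Reads the first line of the given HDFS file and parses it according to
	 * the requested value type (boolean, long, double, or string).
	 * 
	 * @param filename file name
	 * @param vt value type
	 * @return parsed value as Boolean, Long, Double, or String
	 * @throws IOException if the file is empty or cannot be read
	 */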
	public static Object readObjectFromHDFSFile(String filename, ValueType vt) throws IOException {
		String line = null;
		try( BufferedReader br = setupInputFile(filename) ) {
			line = br.readLine();
		}
		if( line == null )
			throw new IOException("Empty file on hdfs: "+filename);
		
		switch( vt ) {
			case BOOLEAN: return Boolean.parseBoolean(line);
			case DOUBLE: return Double.parseDouble(line);
			case INT: return Long.parseLong(line);
			default: return line;
		}
	}
		
	private static BufferedWriter setupOutputFile ( String filename ) throws IOException {
		Path pt = new Path(filename);
		FileSystem fs = FileSystem.get(_rJob);
		return new BufferedWriter(new OutputStreamWriter(fs.create(pt, true)));
	}
	
	public static void writeDoubleToHDFS ( double d, String filename ) throws IOException {
		writeObjectToHDFS(d, filename);
	}
	
	public static void writeIntToHDFS ( long i, String filename ) throws IOException {
	    writeObjectToHDFS(i, filename);
	}
	
	public static void writeBooleanToHDFS ( boolean b, String filename ) throws IOException {
	    writeObjectToHDFS(b, filename);
	}
	
	public static void writeStringToHDFS ( String s, String filename ) throws IOException {
		writeObjectToHDFS(s, filename);
	}
	
	public static void writeObjectToHDFS ( Object obj, String filename ) throws IOException {
		try( BufferedWriter br = setupOutputFile(filename) ) {
			br.write(obj.toString());
		}
	}
	
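	/**
	 * Writes a dimensions file with one line per result whose unknown flag is set,
	 * in the format "index maxRows maxCols".
	 */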
	public static void writeDimsFile ( String filename, byte[] unknownFlags, long[] maxRows, long[] maxCols) throws IOException {
		try( BufferedWriter br = setupOutputFile(filename) ) {
	        StringBuilder line = new StringBuilder();
	        for ( int i=0; i < unknownFlags.length; i++ ) {
	        	if ( unknownFlags[i]  != (byte)0 ) {
	        		line.append(i);
	        		line.append(" " + maxRows[i]);
	        		line.append(" " + maxCols[i]);
	        		line.append("\n");
	        	}
	        }
	        br.write(line.toString());
        }
	}
	
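	/**
	 * Reads all dimensions files in the given directory and updates the matrix
	 * characteristics with the maximum observed rows and columns per result index.
	 * 
	 * @param dir directory of dimensions files
	 * @param stats matrix characteristics to update
	 * @return updated matrix characteristics
	 * @throws IOException if the path is not a directory or cannot be read
	 */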
	public static MatrixCharacteristics[] processDimsFiles(String dir, MatrixCharacteristics[] stats) 
		throws IOException 
	{
		Path pt=new Path(dir);
        FileSystem fs = FileSystem.get(_rJob);
		
        if ( !fs.exists(pt) )
        	return stats;
        
        FileStatus fstat = fs.getFileStatus(pt);
		
        if ( fstat.isDirectory() ) 
        {
			FileStatus[] files = fs.listStatus(pt);
			for ( int i=0; i < files.length; i++ ) {
				Path filePath = files[i].getPath();
				try( BufferedReader br = setupInputFile(filePath.toString()) ) {
					String line = "";
					while((line=br.readLine()) != null ) {
						String[] parts = line.split(" ");
						int resultIndex = Integer.parseInt(parts[0]);
						long maxRows = Long.parseLong(parts[1]);
						long maxCols = Long.parseLong(parts[2]);
						
						stats[resultIndex].setDimension( (stats[resultIndex].getRows() < maxRows ? maxRows : stats[resultIndex].getRows()), 
								                         (stats[resultIndex].getCols() < maxCols ? maxCols : stats[resultIndex].getCols()) );
					}
				}
			}
		}
		else 
		{
			throw new IOException(dir + " is expected to be a folder!");
		}

		return stats;
	}
	
	public static void writeMetaDataFile(String mtdfile, ValueType vt, MatrixCharacteristics mc, OutputInfo outinfo) 
		throws IOException {
		writeMetaDataFile(mtdfile, vt, null, DataType.MATRIX, mc, outinfo);
	}
	
	public static void writeMetaDataFile(String mtdfile, ValueType vt, ValueType[] schema, DataType dt, MatrixCharacteristics mc, OutputInfo outinfo) 
		throws IOException {
		writeMetaDataFile(mtdfile, vt, schema, dt, mc, outinfo, null);
	}

	public static void writeMetaDataFile(String mtdfile, ValueType vt, MatrixCharacteristics mc,  OutputInfo outinfo, FileFormatProperties formatProperties) 
		throws IOException {
		writeMetaDataFile(mtdfile, vt, null, DataType.MATRIX, mc, outinfo, formatProperties);
	}
	
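	/**
	 * Writes the metadata (.mtd) JSON file for a matrix or frame, including dimensions,
	 * block sizes, number of non-zeros, format, and optional format properties.
	 */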
	public static void writeMetaDataFile(String mtdfile, ValueType vt, ValueType[] schema, DataType dt, MatrixCharacteristics mc, 
			OutputInfo outinfo, FileFormatProperties formatProperties) 
		throws IOException 
	{
		Path pt = new Path(mtdfile);
		FileSystem fs = FileSystem.get(_rJob);
		try( BufferedWriter br = new BufferedWriter(new OutputStreamWriter(fs.create(pt,true))) ) {
			String mtd = metaDataToString(vt, schema, dt, mc, outinfo, formatProperties);
			br.write(mtd);
		} catch (Exception e) {
			throw new IOException("Error creating and writing metadata JSON file", e);
		}
	}

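	/**
	 * Writes the metadata (.mtd) JSON file for a scalar value in text cell format.
	 */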
	public static void writeScalarMetaDataFile(String mtdfile, ValueType vt) 
		throws IOException 
	{
		Path pt = new Path(mtdfile);
		FileSystem fs = FileSystem.get(_rJob);
		try( BufferedWriter br = new BufferedWriter(new OutputStreamWriter(fs.create(pt,true))) ) {
			String mtd = metaDataToString(vt, null, DataType.SCALAR, null, OutputInfo.TextCellOutputInfo, null);
			br.write(mtd);
		} 
		catch (Exception e) {
			throw new IOException("Error creating and writing metadata JSON file", e);
		}
	}

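	/**
	 * Serializes the metadata into an ordered JSON string containing data type,
	 * value type (or frame schema), dimensions and non-zeros, format type,
	 * optional CSV and description properties, author, and creation timestamp.
	 */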
	public static String metaDataToString(ValueType vt, ValueType[] schema, DataType dt, MatrixCharacteristics mc,
			OutputInfo outinfo, FileFormatProperties formatProperties) throws JSONException, DMLRuntimeException
	{
		OrderedJSONObject mtd = new OrderedJSONObject(); // maintain order in output file

		//handle data type and value types (incl schema for frames)
		mtd.put(DataExpression.DATATYPEPARAM, dt.toString().toLowerCase());
		if (schema == null) {
			mtd.put(DataExpression.VALUETYPEPARAM, vt.toString().toLowerCase());
		}	
		else {
			StringBuilder schemaSB = new StringBuilder();
			for(int i=0; i < schema.length; i++) {
				if( schema[i] == ValueType.UNKNOWN )
					schemaSB.append("*");
				else
					schemaSB.append(schema[i].toString());
				schemaSB.append(DataExpression.DEFAULT_DELIM_DELIMITER);
			}
			mtd.put(DataExpression.SCHEMAPARAM, schemaSB.toString());
		}
		
		//handle output dimensions
		if( !dt.isScalar() ) {
			mtd.put(DataExpression.READROWPARAM, mc.getRows());
			mtd.put(DataExpression.READCOLPARAM, mc.getCols());
			// handle output nnz and binary block configuration
			if( dt.isMatrix() ) {
				if (outinfo == OutputInfo.BinaryBlockOutputInfo ) {
					mtd.put(DataExpression.ROWBLOCKCOUNTPARAM, mc.getRowsPerBlock());
					mtd.put(DataExpression.COLUMNBLOCKCOUNTPARAM, mc.getColsPerBlock());
				}
				mtd.put(DataExpression.READNUMNONZEROPARAM, mc.getNonZeros());
			}
		}
			
		//handle format type and additional arguments	
		mtd.put(DataExpression.FORMAT_TYPE, OutputInfo.outputInfoToStringExternal(outinfo));
		if (outinfo == OutputInfo.CSVOutputInfo) {
			CSVFileFormatProperties csvProperties = (formatProperties==null) ?
				new CSVFileFormatProperties() : (CSVFileFormatProperties)formatProperties;
			mtd.put(DataExpression.DELIM_HAS_HEADER_ROW, csvProperties.hasHeader());
			mtd.put(DataExpression.DELIM_DELIMITER, csvProperties.getDelim());
		}

		if (formatProperties != null) {
			String description = formatProperties.getDescription();
			if (StringUtils.isNotEmpty(description)) {
				String jsonDescription = StringEscapeUtils.escapeJson(description);
				mtd.put(DataExpression.DESCRIPTIONPARAM, jsonDescription);
			}
		}

		String userName = System.getProperty("user.name");
		if (StringUtils.isNotEmpty(userName)) {
			mtd.put(DataExpression.AUTHORPARAM, userName);
		} else {
			mtd.put(DataExpression.AUTHORPARAM, "SystemML");
		}

		SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss z");
		mtd.put(DataExpression.CREATEDPARAM, sdf.format(new Date()));

		return mtd.toString(4); // indent with 4 spaces	
	}
	
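	/**
	 * Reads a matrix from HDFS in the given input format and converts it
	 * into a dense double[][] array.
	 */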
	public static double[][] readMatrixFromHDFS(String dir, InputInfo inputinfo, long rlen, long clen, int brlen, int bclen) 
		throws IOException, DMLRuntimeException
	{
		MatrixReader reader = MatrixReaderFactory.createMatrixReader(inputinfo);
		MatrixBlock mb = reader.readMatrixFromHDFS(dir, rlen, clen, brlen, bclen, rlen*clen);
		return DataConverter.convertToDoubleMatrix(mb);
	}
	
	public static double[] readColumnVectorFromHDFS(String dir, InputInfo inputinfo, long rlen, long clen, int brlen, int bclen) 
		throws IOException, DMLRuntimeException
	{
		MatrixReader reader = MatrixReaderFactory.createMatrixReader(inputinfo);
		MatrixBlock mb = reader.readMatrixFromHDFS(dir, rlen, clen, brlen, bclen, rlen*clen);
		return DataConverter.convertToDoubleVector(mb);
	}
	
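	/**
	 * Computes the median value from the sorted, partitioned output of a sort job,
	 * using the per-reducer item counts in the given metadata.
	 * 
	 * @param dir directory of the sorted, partitioned values
	 * @param metadata per-reducer item counts
	 * @return median value
	 * @throws IOException if the partition files cannot be read
	 */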
	public static double median(String dir, NumItemsByEachReducerMetaData metadata) throws IOException {
		long[] counts=metadata.getNumItemsArray();
		long[] ranges=new long[counts.length];
		ranges[0]=counts[0];
		for(int i=1; i<counts.length; i++)
			ranges[i]=ranges[i-1]+counts[i];
		long total=ranges[ranges.length-1];
		return pickValueWeight(dir, metadata, 0.5, total%2==0)[0];
	}
	
	public static double[] pickValueWeight(String dir, NumItemsByEachReducerMetaData metadata, double p, boolean average) 
		throws IOException
	{
		//compute cumulative ranges over the per-reducer item counts
		long[] counts=metadata.getNumItemsArray();
		long[] ranges=new long[counts.length];
		ranges[0]=counts[0];
		for(int i=1; i<counts.length; i++)
			ranges[i]=ranges[i-1]+counts[i];
		long total=ranges[ranges.length-1];
		
		//average two neighboring values only if requested and the total weight is even
		average = average && (total%2 == 0);
		
		//locate the partition and within-partition offset of the p-quantile
		int currentPart=0;
		double cum_weight = 0;
		long pos=(long)Math.ceil(total*p);
		while(ranges[currentPart]<pos) {
			cum_weight += counts[currentPart];
			currentPart++;
		}
		int offset;
		if(currentPart>0)
			offset=(int)(pos-ranges[currentPart-1]-1);
		else
			offset=(int)pos-1;
		
		FileSystem fs=FileSystem.get(_rJob);
		Path path=new Path(dir);
		FileStatus[] files=fs.listStatus(path);
		Path fileToRead=null;
		for(FileStatus file: files)
			if(file.getPath().toString().endsWith(Integer.toString(currentPart)))
			{
				fileToRead=file.getPath();
				break;
			}
		
		if(fileToRead==null)
			throw new RuntimeException("cannot read partition "+currentPart);
		
		int buffsz = 64 * 1024;
		DoubleWritable readKey=new DoubleWritable();
	    IntWritable readValue=new IntWritable();
	    FSDataInputStream currentStream = null;
		double ret = -1;
	    try {
			currentStream = fs.open(fileToRead, buffsz);
		    
			boolean contain0s=false;
			long numZeros=0;
			if(currentPart==metadata.getPartitionOfZero())
			{
				contain0s=true;
				numZeros=metadata.getNumberOfZero();
			}
		    ReadWithZeros reader=new ReadWithZeros(currentStream, contain0s, numZeros);
	
		    int numRead=0;
		    while(numRead<=offset)
			{
		    	reader.readNextKeyValuePairs(readKey, readValue);
				numRead+=readValue.get();
				cum_weight += readValue.get();
			}
		    
		    ret = readKey.get();
		    if(average) {
		    	if(numRead<=offset+1) {
		    		reader.readNextKeyValuePairs(readKey, readValue);
					cum_weight += readValue.get();
					ret = (ret+readKey.get())/2;
		    	}
		    }
		}
		finally {
			IOUtilFunctions.closeSilently(currentStream);
		}
	    return new double[] {ret, (average ? -1 : readValue.get()), (average ? -1 : cum_weight)};
	}

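	/**
	 * Creates the given directory on HDFS if it does not exist, converting the
	 * three-character octal permission string (e.g., "700") into an FsPermission.
	 */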
	public static void createDirIfNotExistOnHDFS(String dir, String permissions) 
		throws IOException
	{
		Path path = new Path(dir);
		try {
			FileSystem fs = FileSystem.get(_rJob);
			if( !fs.exists(path) ) 
			{
				char[] c = permissions.toCharArray();
				short sU = (short)((c[0]-48) * 64);
				short sG = (short)((c[1]-48) * 8);
				short sO = (short)((c[2]-48)); 
				short mode = (short)(sU + sG + sO);
				FsPermission perm = new FsPermission(mode);
				fs.mkdirs(path, perm);
			}	
		}
		catch (Exception ex){
			throw new IOException("Failed to create directory on HDFS", ex);
		}
		
		//NOTE: we depend on the configured umask; setting the umask in the job conf or via FsPermission has no effect.
		//Similarly, setting MRConfigurationNames.DFS_DATANODE_DATA_DIR_PERM has no effect either.
	}

	public static FSDataOutputStream getHDFSDataOutputStream(String filename, boolean overwrite) 
		throws IOException
	{
		FileSystem fs = FileSystem.get(_rJob);
		Path path = new Path(filename);
		return fs.create(path, overwrite);
	}
}