org.apache.sysml.runtime.matrix.MMCJMR Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of systemml Show documentation
Declarative Machine Learning
There is a newer version: 1.2.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */


package org.apache.sysml.runtime.matrix;

import java.util.HashSet;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.mapred.Counters.Group;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.sysml.conf.ConfigurationManager;
import org.apache.sysml.conf.DMLConfig;
import org.apache.sysml.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
import org.apache.sysml.runtime.instructions.MRInstructionParser;
import org.apache.sysml.runtime.instructions.MRJobInstruction;
import org.apache.sysml.runtime.instructions.mr.AggregateBinaryInstruction;
import org.apache.sysml.runtime.matrix.data.InputInfo;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
import org.apache.sysml.runtime.matrix.data.MatrixCell;
import org.apache.sysml.runtime.matrix.data.OutputInfo;
import org.apache.sysml.runtime.matrix.data.TaggedFirstSecondIndexes;
import org.apache.sysml.runtime.matrix.mapred.MMCJMRMapper;
import org.apache.sysml.runtime.matrix.mapred.MMCJMRReducerWithAggregator;
import org.apache.sysml.runtime.matrix.mapred.MRConfigurationNames;
import org.apache.sysml.runtime.matrix.mapred.MRJobConfiguration;
import org.apache.sysml.runtime.matrix.mapred.MRJobConfiguration.ConvertTarget;
import org.apache.sysml.runtime.matrix.mapred.MRJobConfiguration.MatrixChar_N_ReducerGroups;
import org.apache.sysml.yarn.DMLAppMasterUtils;
import org.apache.sysml.yarn.ropt.YarnClusterAnalyzer;


/*
 * inBlockRepresentation: indicates whether to use block representation or cell representation
 * inputs: input matrices; the inputs are indexed by 0, 1, 2, ... based on their position in this array
 * inputInfos: the input format information for the input matrices
 * rlen: the number of rows for each matrix
 * clen: the number of columns for each matrix
 * brlen: the number of rows per block
 * bclen: the number of columns per block
 * instructionsInMapper: in the Mapper, the set of unary operations that need to be performed on each input matrix
 * aggInstructionsInReducer: in the Reducer, right after sorting, the set of aggregate operations that need
 * 							to be performed on each input matrix
 * aggBinInstrction: the aggregate binary instruction for the MMCJ operation
 * numReducers: the number of reducers
 * replication: the replication factor for the output
 * output: the path for the output file
 * outputInfo: information about the output format
 */
public class MMCJMR 
{
	// Compile-time switch for choosing the reducer count: when true, the count is derived via
	// determineNumReducers(...); when false, it is delegated to MRJobConfiguration.setNumReducers(...).
	private static final boolean AUTOMATIC_CONFIG_NUM_REDUCERS = true;
	// Class logger; its trace level gates printing of the complete MR job instruction in runJob.
	private static final Log LOG = LogFactory.getLog(MMCJMR.class);

	/**
	 * Private no-arg constructor; being private, it keeps code outside this class
	 * from creating instances.
	 */
	private MMCJMR()
	{
		// intentionally empty: exists only to block instantiation
	}
	
	/**
	 * Sets up the MapReduce job for the MMCJ operation described by {@code aggBinInstrction}.
	 * <p>
	 * NOTE(review): the text of this method is damaged in this copy of the file. It breaks off
	 * in the middle of the {@code if(dim1.getRows()} condition below; everything after that point
	 * uses names that are never declared here ({@code realIndexes}, {@code resultIndexes},
	 * {@code resultDimsUnknown_Array}, {@code mapoutputIndexes}) and ends in
	 * {@code return ret.stats;}. That tail presumably belongs to the {@code commonSetup(...)}
	 * helper called near the top, whose header was lost -- restore both methods from the
	 * upstream source before changing any code here.
	 *
	 * @param inst the MR job instruction; in the visible code it is only used to print the
	 *             complete instruction when trace logging is enabled
	 * @param inputs input matrices, indexed 0, 1, 2, ... by their position in the array
	 * @param inputInfos input format information for the input matrices
	 * @param rlens number of rows of each matrix
	 * @param clens number of columns of each matrix
	 * @param brlens number of rows per block
	 * @param bclens number of columns per block
	 * @param instructionsInMapper unary operations to perform on each input matrix in the mapper
	 * @param aggInstructionsInReducer aggregate operations to perform in the reducer, right after sorting
	 * @param aggBinInstrction the aggregate binary instruction for the MMCJ operation
	 * @param numReducers the number of reducers
	 * @param replication the replication factor for the output
	 * @param output path of the output file
	 * @param outputinfo information about the output format
	 * @return the job result (the statement producing it is not present in this copy)
	 * @throws Exception declared by the signature; the throwing statements are not singled out here
	 */
	public static JobReturn runJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos, long[] rlens, long[] clens, 
			int[] brlens, int[] bclens, String instructionsInMapper, 
			String aggInstructionsInReducer, String aggBinInstrction, int numReducers, 
			int replication, String output, OutputInfo outputinfo) 
	throws Exception
	{
		JobConf job = new JobConf(MMCJMR.class);
		
		// TODO: check w/ yuanyuan. This job always runs in blocked mode, and hence derivation is not necessary.
		boolean inBlockRepresentation=MRJobConfiguration.deriveRepresentation(inputInfos);
		
		// by default, assume that dimensions of MMCJ's output are known at compile time
		byte resultDimsUnknown = (byte) 0;   
		// shared job configuration; yields the characteristics of the job's output(s)
		MatrixCharacteristics[] stats=commonSetup(job, inBlockRepresentation, inputs, inputInfos, rlens, clens, 
				brlens, bclens, instructionsInMapper, aggInstructionsInReducer, aggBinInstrction, numReducers, 
				replication, resultDimsUnknown, output, outputinfo);
		
		// Print the complete instruction
		if (LOG.isTraceEnabled())
			inst.printCompleteMRJobInstruction(stats);
		
		// Update resultDimsUnknown based on computed "stats"
		// There is always a single output
		// (a row or column count of -1 is treated as "dimension unknown")
		if ( stats[0].getRows() == -1 || stats[0].getCols() == -1 ) {
			resultDimsUnknown = (byte) 1;
			
			// if the dimensions are unknown, then setup done in commonSetup() must be updated
			// the single result index is the output of the parsed aggregate binary instruction
			byte[] resultIndexes=new byte[]{MRInstructionParser.parseSingleInstruction(aggBinInstrction).output};
			byte[] resultDimsUnknown_Array = new byte[]{resultDimsUnknown};
			//set up the multiple output files, and their format information
			MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, resultDimsUnknown_Array, new String[]{output}, new OutputInfo[]{outputinfo}, inBlockRepresentation);
		}

		// parse the instruction again (this time as AggregateBinaryInstruction) and look up the
		// characteristics of its two inputs from the job configuration
		AggregateBinaryInstruction ins=(AggregateBinaryInstruction) MRInstructionParser.parseSingleInstruction(aggBinInstrction);
		MatrixCharacteristics dim1 = MRJobConfiguration.getMatrixCharactristicsForBinAgg(job, ins.input1);
		MatrixCharacteristics dim2 = MRJobConfiguration.getMatrixCharactristicsForBinAgg(job, ins.input2);
		
		// clamp each block dimension to the matrix dimension when the block is larger than the matrix
		if(dim1.getRowsPerBlock()>dim1.getRows())
			dim1.setRowsPerBlock( (int) dim1.getRows() );
		if(dim1.getColsPerBlock()>dim1.getCols())
			dim1.setColsPerBlock( (int) dim1.getCols() );
		if(dim2.getRowsPerBlock()>dim2.getRows())
			dim2.setRowsPerBlock( (int) dim2.getRows() );
		if(dim2.getColsPerBlock()>dim2.getCols())
			dim2.setColsPerBlock( (int) dim2.getCols() );
	
		// per-block size estimates: 77 + 8 * rows * cols
		// (presumably 8 bytes per cell plus 77 bytes of fixed overhead -- TODO confirm)
		// NOTE(review): if getRowsPerBlock()/getColsPerBlock() return int, the product is evaluated
		// in int arithmetic before being widened to long -- confirm it cannot overflow
		long blockSize1=77+8*dim1.getRowsPerBlock()*dim1.getColsPerBlock();
		long blockSize2=77+8*dim2.getRowsPerBlock()*dim2.getColsPerBlock();
		long blockSizeResult=77+8*dim1.getRowsPerBlock()*dim2.getColsPerBlock();
		
		// -1 until computed; the code that assigns and uses it is missing from this copy
		long cacheSize = -1;
		//cache the first result
		// NOTE(review): the next statement is corrupted -- an if-condition is fused with an unrelated
		// assignment. The remainder of this method is missing, and what follows appears to be the
		// body of a different method (see the note in the method documentation).
		if(dim1.getRows() mapoutputIndexes=MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes,  instructionsInMapper, aggInstructionsInReducer, 
				aggBinInstrction, resultIndexes );
		
		//set up the multiple output files, and their format information
		MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, resultDimsUnknown_Array, new String[]{output}, new OutputInfo[]{outputinfo}, inBlockRepresentation);
		
		// configure mapper
		// map output: TaggedFirstSecondIndexes keys; values are MatrixBlock in block
		// representation and MatrixCell otherwise
		job.setMapperClass(MMCJMRMapper.class);
		job.setMapOutputKeyClass(TaggedFirstSecondIndexes.class);
		if(inBlockRepresentation)
			job.setMapOutputValueClass(MatrixBlock.class);
		else
			job.setMapOutputValueClass(MatrixCell.class);
		// key ordering and partitioning both come from TaggedFirstSecondIndexes
		job.setOutputKeyComparatorClass(TaggedFirstSecondIndexes.Comparator.class);
		job.setPartitionerClass(TaggedFirstSecondIndexes.FirstIndexPartitioner.class);
		
		//configure combiner
		//TODO: cannot set up combiner, because it will destroy the stable numerical algorithms 
		// for sum or for central moments 
		
	    //if(aggInstructionsInReducer!=null && !aggInstructionsInReducer.isEmpty())
	    //	job.setCombinerClass(MMCJMRCombiner.class);
		
		// derive the matrix characteristics and the number of reducer groups for this job
		MatrixChar_N_ReducerGroups ret=MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes, 
				instructionsInMapper, aggInstructionsInReducer, aggBinInstrction, null, resultIndexes, 
				mapoutputIndexes, true);
		
		//set up the number of reducers
		// automatic mode derives the count from the input dimensions, the requested number of
		// reducers and the number of reducer groups; otherwise MRJobConfiguration decides
		if( AUTOMATIC_CONFIG_NUM_REDUCERS ){
			int numRed = determineNumReducers(rlens, clens, numReducers, ret.numReducerGroups);
			job.setNumReduceTasks(numRed);
		}
		else
			MRJobConfiguration.setNumReducers(job, ret.numReducerGroups, numReducers);

		//configure reducer
		// note: the alternative MMCJMRReducer is not maintained
		job.setReducerClass(MMCJMRReducerWithAggregator.class);
		
		// hands back the computed characteristics (ret.stats), not a JobReturn -- consistent with
		// this tail belonging to the commonSetup(...) helper rather than to runJob
		return ret.stats;
	}
	
	/**
	 * Determine number of reducers based on configured number of reducers, number of results groups
	 * and input data divided by blocksize (as heuristic for useful degree of parallelism).
	 * 
	 * @param rlen array of numbers of rows
	 * @param clen array of numbers of columns
	 * @param defaultNumRed default number of reducers
	 * @param numRedGroups number of reducer groups
	 * @return number of reducers
	 */
	protected static int determineNumReducers( long[] rlen, long[] clen, int defaultNumRed, long numRedGroups )
	{
		//init return with default value
		int ret = defaultNumRed;
		
		//determine max output matrix size
		long maxNumRed = InfrastructureAnalyzer.getRemoteParallelReduceTasks();
		long blockSize = InfrastructureAnalyzer.getHDFSBlockSize()/(1024*1024);
		long maxSize = -1; //in MB
		for( int i=0; i