org.apache.sysml.runtime.matrix.GMR
Declarative Machine Learning
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.sysml.runtime.matrix;
import java.util.ArrayList;
import java.util.HashSet;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.Counters.Group;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.sysml.conf.ConfigurationManager;
import org.apache.sysml.conf.DMLConfig;
import org.apache.sysml.lops.Lop;
import org.apache.sysml.parser.Expression.DataType;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.controlprogram.ParForProgramBlock.PDataPartitionFormat;
import org.apache.sysml.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
import org.apache.sysml.runtime.instructions.Instruction;
import org.apache.sysml.runtime.instructions.InstructionUtils;
import org.apache.sysml.runtime.instructions.MRInstructionParser;
import org.apache.sysml.runtime.instructions.MRJobInstruction;
import org.apache.sysml.runtime.instructions.mr.IDistributedCacheConsumer;
import org.apache.sysml.runtime.instructions.mr.MRInstruction;
import org.apache.sysml.runtime.instructions.mr.PickByCountInstruction;
import org.apache.sysml.runtime.matrix.data.InputInfo;
import org.apache.sysml.runtime.matrix.data.MatrixIndexes;
import org.apache.sysml.runtime.matrix.data.OutputInfo;
import org.apache.sysml.runtime.matrix.data.TaggedMatrixBlock;
import org.apache.sysml.runtime.matrix.data.TaggedMatrixPackedCell;
import org.apache.sysml.runtime.matrix.mapred.GMRCombiner;
import org.apache.sysml.runtime.matrix.mapred.GMRMapper;
import org.apache.sysml.runtime.matrix.mapred.GMRReducer;
import org.apache.sysml.runtime.matrix.mapred.MRBaseForCommonInstructions;
import org.apache.sysml.runtime.matrix.mapred.MRConfigurationNames;
import org.apache.sysml.runtime.matrix.mapred.MRJobConfiguration;
import org.apache.sysml.runtime.matrix.mapred.MRJobConfiguration.ConvertTarget;
import org.apache.sysml.runtime.matrix.mapred.MRJobConfiguration.MatrixChar_N_ReducerGroups;
import org.apache.sysml.runtime.matrix.sort.PickFromCompactInputFormat;
import org.apache.sysml.runtime.util.MapReduceTool;
import org.apache.sysml.runtime.util.UtilFunctions;
import org.apache.sysml.yarn.DMLAppMasterUtils;
public class GMR
{
private static final Log LOG = LogFactory.getLog(GMR.class.getName());
private GMR() {
//prevent instantiation via private constructor
}
/**
* Execute job.
*
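* <p>A minimal invocation sketch (hypothetical paths, dimensions, and empty instruction
* strings; in SystemML this method is normally driven by the runtime while processing a
* compiled {@code MRJobInstruction} rather than called directly):</p>
* <pre>{@code
* String[]    inputs     = { "hdfs:/tmp/A" };
* InputInfo[] inputInfos = { InputInfo.BinaryBlockInputInfo };
* long[] rlens = { 10000 }, clens = { 1000 };
* int[] brlens = { 1000 }, bclens = { 1000 };
* boolean[] partitioned = { false };
* PDataPartitionFormat[] pformats = { PDataPartitionFormat.NONE };
* int[] psizes = { -1 };
* byte[] resultIndexes = { 0 };
* String[] outputs = { "hdfs:/tmp/A_out" };
* OutputInfo[] outputInfos = { OutputInfo.BinaryBlockOutputInfo };
* // mrJobInst: the compiled GMR job instruction produced by the SystemML compiler
* JobReturn ret = GMR.runJob(mrJobInst, inputs, inputInfos, rlens, clens, brlens, bclens,
*     partitioned, pformats, psizes, null, "", "", "", 10, 1, true,
*     resultIndexes, "scratch/_dimsUnknown", outputs, outputInfos);
* }</pre>
*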
* @param inst MR job instruction
* @param inputs input matrix file names; the inputs are indexed by 0, 1, 2, ... based on their position in this array
* @param inputInfos the input format information for the input matrices
* @param rlens array of number of rows
* @param clens array of number of columns
* @param brlens array of number of rows in block
* @param bclens array of number of columns in block
* @param partitioned boolean array of partitioned status
* @param pformats array of data partition formats
* @param psizes array of data partition sizes (currently unused by this job)
* @param recordReaderInstruction record reader instruction
* @param instructionsInMapper in Mapper, the set of unary operations that need to be performed on each input matrix
* @param aggInstructionsInReducer in Reducer, right after sorting, the set of aggregate operations
* that need to be performed on each input matrix
* @param otherInstructionsInReducer the mixed operations that need to be performed on matrices after the aggregate operations
* @param numReducers the number of reducers
* @param replication the replication factor for the output
* @param jvmReuse if true, reuse JVM
* @param resultIndexes the indexes of the result matrices that need to be output
* @param dimsUnknownFilePrefix file path prefix when dimensions unknown
* @param outputs the names for the output directories, one for each result index
* @param outputInfos output format information for the output matrices
* @return job return object
* @throws Exception if Exception occurs
*/
@SuppressWarnings({ "unchecked", "rawtypes" })
public static JobReturn runJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos, long[] rlens, long[] clens,
int[] brlens, int[] bclens,
boolean[] partitioned, PDataPartitionFormat[] pformats, int[] psizes,
String recordReaderInstruction, String instructionsInMapper, String aggInstructionsInReducer,
String otherInstructionsInReducer, int numReducers, int replication, boolean jvmReuse, byte[] resultIndexes, String dimsUnknownFilePrefix,
String[] outputs, OutputInfo[] outputInfos)
throws Exception
{
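//initialize the MR job configuration (the job jar is resolved from the GMR class)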
JobConf job = new JobConf(GMR.class);
job.setJobName("G-MR");
boolean inBlockRepresentation=MRJobConfiguration.deriveRepresentation(inputInfos);
//whether to use block representation or cell representation
MRJobConfiguration.setMatrixValueClass(job, inBlockRepresentation);
//added for handling recordreader instruction
String[] realinputs=inputs;
InputInfo[] realinputInfos=inputInfos;
long[] realrlens=rlens;
long[] realclens=clens;
int[] realbrlens=brlens;
int[] realbclens=bclens;
byte[] realIndexes=new byte[inputs.length];
for(byte b=0; b<realIndexes.length; b++)
realIndexes[b]=b;
//handle the record reader instruction (quantile/pick operations read a sorted, compact input)
if(recordReaderInstruction!=null && !recordReaderInstruction.isEmpty())
{
PickByCountInstruction ins=(PickByCountInstruction) PickByCountInstruction.parseInstruction(recordReaderInstruction);
PickFromCompactInputFormat.setKeyValueClasses(job, (Class<? extends WritableComparable>) inputInfos[ins.input1].inputKeyClass,
inputInfos[ins.input1].inputValueClass);
job.setInputFormat(PickFromCompactInputFormat.class);
PickFromCompactInputFormat.setZeroValues(job, (MetaDataNumItemsByEachReducer)inputInfos[ins.input1].metadata);
if(ins.isValuePick)
{
double[] probs=MapReduceTool.readColumnVectorFromHDFS(inputs[ins.input2], inputInfos[ins.input2], rlens[ins.input2],
clens[ins.input2], brlens[ins.input2], bclens[ins.input2]);
PickFromCompactInputFormat.setPickRecordsInEachPartFile(job, (MetaDataNumItemsByEachReducer) inputInfos[ins.input1].metadata, probs);
realinputs=new String[inputs.length-1];
realinputInfos=new InputInfo[inputs.length-1];
realrlens=new long[inputs.length-1];
realclens=new long[inputs.length-1];
realbrlens=new int[inputs.length-1];
realbclens=new int[inputs.length-1];
realIndexes=new byte[inputs.length-1];
byte realIndex=0;
for(byte i=0; i<inputs.length; i++)
{
if(i==ins.input2)
continue;
realinputs[realIndex]=inputs[i];
realinputInfos[realIndex]=inputInfos[i];
if(i==ins.input1)
{
realrlens[realIndex]=rlens[ins.input2];
realclens[realIndex]=clens[ins.input2];
realbrlens[realIndex]=1;
realbclens[realIndex]=1;
realIndexes[realIndex]=ins.output;
}else
{
realrlens[realIndex]=rlens[i];
realclens[realIndex]=clens[i];
realbrlens[realIndex]=brlens[i];
realbclens[realIndex]=bclens[i];
realIndexes[realIndex]=i;
}
realIndex++;
}
}else
{
//range pick: restrict to the records within the requested quantile range
PickFromCompactInputFormat.setRangePickPartFiles(job, (MetaDataNumItemsByEachReducer) inputInfos[ins.input1].metadata, ins.cst, 1-ins.cst);
realrlens[ins.input1]=UtilFunctions.getLengthForInterQuantile((MetaDataNumItemsByEachReducer)inputInfos[ins.input1].metadata, ins.cst);
realclens[ins.input1]=clens[ins.input1];
realbrlens[ins.input1]=1;
realbclens[ins.input1]=1;
realIndexes[ins.input1]=ins.output;
}
}
//set up distributed cache inputs (side inputs referenced by map/reduce instructions)
boolean resetDistCache = setupDistributedCache(job, instructionsInMapper, otherInstructionsInReducer, realinputs, realrlens, realclens);
//set up the input files and their format information
boolean[] distCacheOnly = getDistCacheOnlyInputs(realIndexes, recordReaderInstruction, instructionsInMapper, aggInstructionsInReducer, otherInstructionsInReducer);
MRJobConfiguration.setUpMultipleInputs(job, realIndexes, realinputs, realinputInfos, realbrlens, realbclens, distCacheOnly, true,
inBlockRepresentation? ConvertTarget.BLOCK: ConvertTarget.CELL);
MRJobConfiguration.setInputPartitioningInfo(job, pformats);
//set up the dimensions of input matrices
MRJobConfiguration.setMatricesDimensions(job, realIndexes, realrlens, realclens);
MRJobConfiguration.setDimsUnknownFilePrefix(job, dimsUnknownFilePrefix);
//set up the block size
MRJobConfiguration.setBlocksSizes(job, realIndexes, realbrlens, realbclens);
//set up unary instructions that will perform in the mapper
MRJobConfiguration.setInstructionsInMapper(job, instructionsInMapper);
//set up the aggregate instructions that will happen in the combiner and reducer
MRJobConfiguration.setAggregateInstructions(job, aggInstructionsInReducer);
//set up the instructions that will happen in the reducer, after the aggregation instructions
MRJobConfiguration.setInstructionsInReducer(job, otherInstructionsInReducer);
//set up the replication factor for the results
job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);
//set up map/reduce memory configurations (if in AM context)
DMLConfig config = ConfigurationManager.getDMLConfig();
DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);
//set up custom map/reduce configurations
MRJobConfiguration.setupCustomMRConfigurations(job, config);
//set up jvm reuse (incl. reuse of loaded dist cache matrices)
if( jvmReuse )
job.setNumTasksToExecutePerJvm(-1);
//set up what matrices are needed to pass from the mapper to reducer
HashSet<Byte> mapoutputIndexes=MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, instructionsInMapper, aggInstructionsInReducer,
otherInstructionsInReducer, resultIndexes);
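//propagate matrix characteristics through all instructions to derive output dims and reducer groups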
MatrixChar_N_ReducerGroups ret=MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes,
instructionsInMapper, aggInstructionsInReducer, null, otherInstructionsInReducer, resultIndexes, mapoutputIndexes, false);
MatrixCharacteristics[] stats=ret.stats;
//set up the number of reducers
MRJobConfiguration.setNumReducers(job, ret.numReducerGroups, numReducers);
// Print the complete instruction
if (LOG.isTraceEnabled())
inst.printCompleteMRJobInstruction(stats);
// Update resultDimsUnknown based on computed "stats"
byte[] dimsUnknown = new byte[resultIndexes.length];
for ( int i=0; i < resultIndexes.length; i++ ) {
if ( stats[i].getRows() == -1 || stats[i].getCols() == -1 ) {
dimsUnknown[i] = (byte)1;
}
else {
dimsUnknown[i] = (byte) 0;
}
}
//MRJobConfiguration.updateResultDimsUnknown(job,resultDimsUnknown);
//set up the multiple output files, and their format information
MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, dimsUnknown, outputs, outputInfos, inBlockRepresentation, true);
// configure mapper and the mapper output key value pairs
job.setMapperClass(GMRMapper.class);
if(numReducers==0)
{
job.setMapOutputKeyClass(Writable.class);
job.setMapOutputValueClass(Writable.class);
}else
{
job.setMapOutputKeyClass(MatrixIndexes.class);
if(inBlockRepresentation)
job.setMapOutputValueClass(TaggedMatrixBlock.class);
else
job.setMapOutputValueClass(TaggedMatrixPackedCell.class);
}
//set up combiner
if(numReducers!=0 && aggInstructionsInReducer!=null
&& !aggInstructionsInReducer.isEmpty())
{
job.setCombinerClass(GMRCombiner.class);
}
//configure reducer
job.setReducerClass(GMRReducer.class);
//job.setReducerClass(PassThroughReducer.class);
// By default, the job executes in "cluster" mode.
// Determine if we can optimize and run it in "local" mode.
MatrixCharacteristics[] inputStats = new MatrixCharacteristics[inputs.length];
for ( int i=0; i < inputs.length; i++ ) {
inputStats[i] = new MatrixCharacteristics(rlens[i], clens[i], brlens[i], bclens[i]);
}
//set unique working dir
MRJobConfiguration.setUniqueWorkingDir(job);
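//submit the job to the cluster and block until it completes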
RunningJob runjob=JobClient.runJob(job);
Group group=runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
for(int i=0; i<resultIndexes.length; i++)
stats[i].setNonZeros( group.getCounter( Integer.toString(i)) );
//cleanups
String dir = dimsUnknownFilePrefix + "/" + runjob.getID().toString() + "_dimsFile";
stats = MapReduceTool.processDimsFiles(dir, stats);
MapReduceTool.deleteFileIfExistOnHDFS(dir);
if( resetDistCache )
MRBaseForCommonInstructions.resetDistCache();
return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}
/**
* Set up the distributed cache for all matrices that map/reduce instructions consume as
* broadcast-like side inputs.
*
* @param job job configuration
* @param instMap instructions in mapper
* @param instRed instructions in reducer
* @param inputs input file names
* @param rlens array of number of rows
* @param clens array of number of columns
* @return true if the in-memory distributed cache needs to be reset after the job (local mode)
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
private static boolean setupDistributedCache(JobConf job, String instMap, String instRed, String[] inputs, long[] rlens, long[] clens)
throws DMLRuntimeException
{
//concatenate mapper and reducer instructions
String allInsts = (instMap!=null && !instMap.trim().isEmpty()) ? instMap : null;
if( allInsts==null )
allInsts = instRed;
else if( instRed!=null && !instRed.trim().isEmpty() )
allInsts = allInsts + Instruction.INSTRUCTION_DELIM + instRed;
//set up the distributed cache inputs (if any)
if( allInsts != null && !allInsts.trim().isEmpty()
&& InstructionUtils.isDistributedCacheUsed(allInsts) )
{
//get all indexes of distributed cache inputs
ArrayList<Byte> indexList = new ArrayList<>();
String[] inst = allInsts.split(Instruction.INSTRUCTION_DELIM);
for( String tmp : inst ) {
if( InstructionUtils.isDistributedCacheUsed(tmp) )
{
ArrayList<Byte> tmpindexList = new ArrayList<>();
MRInstruction mrinst = MRInstructionParser.parseSingleInstruction(tmp);
if( mrinst instanceof IDistributedCacheConsumer )
((IDistributedCacheConsumer)mrinst).addDistCacheIndex(tmp, tmpindexList);
//copy distinct indexes only (prevent redundant add to distcache)
for( Byte tmpix : tmpindexList )
if( !indexList.contains(tmpix) )
indexList.add(tmpix);
}
}
//construct index and path strings
ArrayList<String> pathList = new ArrayList<>(); // list of paths to be placed in Distributed cache
StringBuilder indexString = new StringBuilder(); // input indices to be placed in Distributed Cache (concatenated)
StringBuilder pathString = new StringBuilder(); // input paths to be placed in Distributed Cache (concatenated)
for( byte index : indexList )
{
if( pathList.size()>0 ) {
indexString.append(Instruction.INSTRUCTION_DELIM);
pathString.append(Instruction.INSTRUCTION_DELIM);
}
pathList.add( inputs[index] );
indexString.append(index);
pathString.append(inputs[index]);
}
//configure mr job with distcache indexes
MRJobConfiguration.setupDistCacheInputs(job, indexString.toString(), pathString.toString(), pathList);
//clean in-memory cache (prevent job interference in local mode)
if( InfrastructureAnalyzer.isLocalMode(job) ) {
MRBaseForCommonInstructions.resetDistCache();
return true;
}
}
return false;
}
/**
* Determine which indices are only used as inputs through distributed cache and hence would
* be redundant job inputs.
*
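* <p>Illustrative (hypothetical) example: for {@code realIndexes} {0, 1}, where input 1 is
* referenced exclusively through the distributed cache by a map-side instruction, the method
* returns {false, true}, and input 1 can then be dropped from the regular job inputs.</p>
*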
* @param realIndexes array of byte indexes
* @param inst1 record reader instruction (or null)
* @param inst2 instructions in mapper
* @param inst3 aggregate instructions in reducer
* @param inst4 other instructions in reducer
* @return boolean array indicating, per input index, whether that input is used only via distributed cache
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
private static boolean[] getDistCacheOnlyInputs(byte[] realIndexes, String inst1, String inst2, String inst3, String inst4)
throws DMLRuntimeException
{
boolean[] ret = new boolean[realIndexes.length];
String[] inst = new String[]{inst1, inst2, inst3, inst4};
//for all input indexes, check whether they appear as direct operands of any instruction
for( int i=0; i<realIndexes.length; i++ )
{
byte index = realIndexes[i];
String indexStr = index + Lop.DATATYPE_PREFIX + DataType.MATRIX.toString();
boolean distCacheOnly = true;
boolean use = false;
for( String linst : inst ) { //for all instruction categories
if( linst != null && !linst.trim().isEmpty() ) {
String[] alinst = linst.split(Instruction.INSTRUCTION_DELIM);
for( String tmp : alinst ) //for each individual instruction
{
boolean lcache = false;
if( InstructionUtils.isDistributedCacheUsed(tmp) ) {
MRInstruction mrinst = MRInstructionParser.parseSingleInstruction(tmp);
if( mrinst instanceof IDistributedCacheConsumer )
lcache = ((IDistributedCacheConsumer)mrinst).isDistCacheOnlyIndex( tmp, index );
}
distCacheOnly &= (lcache || !tmp.contains(indexStr));
use |= tmp.contains(indexStr);
}
}
}
//an input is dist-cache-only if it is used, but never as a direct operand
ret[i] = distCacheOnly && use;
}
return ret;
}
}