org.apache.sysml.runtime.matrix.MMRJMR Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of systemml Show documentation
Show all versions of systemml Show documentation
Declarative Machine Learning
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.sysml.runtime.matrix;
import java.util.HashSet;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.mapred.Counters.Group;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.sysml.conf.ConfigurationManager;
import org.apache.sysml.conf.DMLConfig;
import org.apache.sysml.runtime.instructions.MRJobInstruction;
import org.apache.sysml.runtime.matrix.data.InputInfo;
import org.apache.sysml.runtime.matrix.data.OutputInfo;
import org.apache.sysml.runtime.matrix.data.TaggedMatrixBlock;
import org.apache.sysml.runtime.matrix.data.TaggedMatrixCell;
import org.apache.sysml.runtime.matrix.data.TripleIndexes;
import org.apache.sysml.runtime.matrix.mapred.MMRJMRMapper;
import org.apache.sysml.runtime.matrix.mapred.MMRJMRReducer;
import org.apache.sysml.runtime.matrix.mapred.MRConfigurationNames;
import org.apache.sysml.runtime.matrix.mapred.MRJobConfiguration;
import org.apache.sysml.runtime.matrix.mapred.MRJobConfiguration.ConvertTarget;
import org.apache.sysml.runtime.matrix.mapred.MRJobConfiguration.MatrixChar_N_ReducerGroups;
import org.apache.sysml.yarn.DMLAppMasterUtils;
/*
* inBlockRepresentation: indicate whether to use block representation or cell representation
* inputs: input matrices, the inputs are indexed by 0, 1, 2, .. based on the position in this string
* inputInfos: the input format information for the input matrices
* rlens: the number of rows for each matrix
* clens: the number of columns for each matrix
* brlens: the number of rows per block, for each matrix
* bclens: the number of columns per block, for each matrix
* instructionsInMapper: in Mapper, the set of unary operations that need to be performed on each input matrix
* aggInstructionsInReducer: in Reducer, right after sorting, the set of aggregate operations that need
* to be performed on each input matrix
* aggBinInstrctions: the aggregate binary instructions for the MMRJ operation
* otherInstructionsInReducer: the mixed operations that need to be performed on matrices after the aggregate operations
* numReducers: the number of reducers
* replication: the replication factor for the output
* resultIndexes: the indexes of the result matrices that need to be output.
* outputs: the names for the output directories, one for each result index
* outputInfos: output format information for the output matrices
*/
public class MMRJMR
{
private static final Log LOG = LogFactory.getLog(MMRJMR.class.getName());
/**
 * Private no-arg constructor. This class is not meant to be instantiated;
 * callers use the static {@code runJob} method instead.
 */
private MMRJMR() {
//prevent instantiation via private constructor
}
public static JobReturn runJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos,
long[] rlens, long[] clens, int[] brlens, int[] bclens, String instructionsInMapper,
String aggInstructionsInReducer, String aggBinInstrctions, String otherInstructionsInReducer,
int numReducers, int replication, byte[] resultIndexes,
String[] outputs, OutputInfo[] outputInfos)
throws Exception
{
JobConf job = new JobConf(MMRJMR.class);
job.setJobName("MMRJ-MR");
if(numReducers<=0)
throw new Exception("MMRJ-MR has to have at least one reduce task!");
// TODO: check w/ yuanyuan. This job always runs in blocked mode, and hence derivation is not necessary.
boolean inBlockRepresentation=MRJobConfiguration.deriveRepresentation(inputInfos);
//whether use block representation or cell representation
MRJobConfiguration.setMatrixValueClass(job, inBlockRepresentation);
byte[] realIndexes=new byte[inputs.length];
for(byte b=0; b mapoutputIndexes=MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, instructionsInMapper, aggInstructionsInReducer,
aggBinInstrctions, resultIndexes );
MatrixChar_N_ReducerGroups ret=MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes,
instructionsInMapper, aggInstructionsInReducer, aggBinInstrctions, otherInstructionsInReducer,
resultIndexes, mapoutputIndexes, false);
MatrixCharacteristics[] stats=ret.stats;
//set up the number of reducers
MRJobConfiguration.setNumReducers(job, ret.numReducerGroups, numReducers);
// Print the complete instruction
if (LOG.isTraceEnabled())
inst.printCompleteMRJobInstruction(stats);
byte[] dimsUnknown = new byte[resultIndexes.length];
for ( int i=0; i < resultIndexes.length; i++ ) {
if ( stats[i].getRows() == -1 || stats[i].getCols() == -1 ) {
dimsUnknown[i] = (byte)1;
}
else {
dimsUnknown[i] = (byte) 0;
}
}
//set up the multiple output files, and their format information
MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, dimsUnknown, outputs, outputInfos, inBlockRepresentation);
// configure mapper
job.setMapperClass(MMRJMRMapper.class);
job.setMapOutputKeyClass(TripleIndexes.class);
if(inBlockRepresentation)
job.setMapOutputValueClass(TaggedMatrixBlock.class);
else
job.setMapOutputValueClass(TaggedMatrixCell.class);
job.setOutputKeyComparatorClass(TripleIndexes.Comparator.class);
job.setPartitionerClass(TripleIndexes.FirstTwoIndexesPartitioner.class);
//configure combiner
//TODO: cannot set up combiner, because it will destroy the stable numerical algorithms
// for sum or for central moments
// if(aggInstructionsInReducer!=null && !aggInstructionsInReducer.isEmpty())
// job.setCombinerClass(MMCJMRCombiner.class);
//configure reducer
job.setReducerClass(MMRJMRReducer.class);
// By default, the job executes in "cluster" mode.
// Determine if we can optimize and run it in "local" mode.
MatrixCharacteristics[] inputStats = new MatrixCharacteristics[inputs.length];
for ( int i=0; i < inputs.length; i++ ) {
inputStats[i] = new MatrixCharacteristics(rlens[i], clens[i], brlens[i], bclens[i]);
}
//set unique working dir
MRJobConfiguration.setUniqueWorkingDir(job);
RunningJob runjob=JobClient.runJob(job);
/* Process different counters */
Group group=runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
for(int i=0; i