/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.sysml.runtime.matrix;
import java.io.PrintWriter;
import java.util.HashSet;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.commons.math3.random.Well1024a;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.Counters.Group;
import org.apache.sysml.conf.ConfigurationManager;
import org.apache.sysml.conf.DMLConfig;
import org.apache.sysml.lops.Lop;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
import org.apache.sysml.runtime.instructions.MRInstructionParser;
import org.apache.sysml.runtime.instructions.MRJobInstruction;
import org.apache.sysml.runtime.instructions.mr.DataGenMRInstruction;
import org.apache.sysml.runtime.instructions.mr.MRInstruction;
import org.apache.sysml.runtime.instructions.mr.RandInstruction;
import org.apache.sysml.runtime.instructions.mr.SeqInstruction;
import org.apache.sysml.runtime.instructions.mr.MRInstruction.MRINSTRUCTION_TYPE;
import org.apache.sysml.runtime.matrix.data.InputInfo;
import org.apache.sysml.runtime.matrix.data.LibMatrixDatagen;
import org.apache.sysml.runtime.matrix.data.MatrixIndexes;
import org.apache.sysml.runtime.matrix.data.OutputInfo;
import org.apache.sysml.runtime.matrix.data.TaggedMatrixBlock;
import org.apache.sysml.runtime.matrix.mapred.GMRCombiner;
import org.apache.sysml.runtime.matrix.mapred.GMRReducer;
import org.apache.sysml.runtime.matrix.mapred.MRJobConfiguration;
import org.apache.sysml.runtime.matrix.mapred.DataGenMapper;
import org.apache.sysml.runtime.matrix.mapred.MRJobConfiguration.ConvertTarget;
import org.apache.sysml.runtime.matrix.mapred.MRJobConfiguration.MatrixChar_N_ReducerGroups;
import org.apache.sysml.runtime.util.MapReduceTool;
import org.apache.sysml.yarn.DMLAppMasterUtils;
import org.apache.sysml.yarn.ropt.YarnClusterAnalyzer;
/**
* Rand MapReduce job which creates random objects.
*
*/
public class DataGenMR
{
private static final Log LOG = LogFactory.getLog(DataGenMR.class.getName());
private DataGenMR() {
//prevent instantiation via private constructor
}
/**
* Starts a Rand MapReduce job which will produce one or more random objects.
*
* @param inst MR job instruction
* @param dataGenInstructions data generation instruction, one per random object
* @param instructionsInMapper instructions to execute in the mapper
* @param aggInstructionsInReducer aggregate instructions to execute in the combiner and reducer
* @param otherInstructionsInReducer instructions to execute in the reducer after aggregation
* @param numReducers number of reducers
* @param replication file replication factor
* @param resultIndexes result indexes for each random object
* @param dimsUnknownFilePrefix file path prefix for dimensions unknown at compile time
* @param outputs output file for each random object
* @param outputInfos output information for each random object
* @return matrix characteristics for each random object
* @throws Exception if an error occurred in the MapReduce phase
*/
public static JobReturn runJob(MRJobInstruction inst, String[] dataGenInstructions,
String instructionsInMapper, String aggInstructionsInReducer, String otherInstructionsInReducer,
int numReducers, int replication, byte[] resultIndexes, String dimsUnknownFilePrefix,
String[] outputs, OutputInfo[] outputInfos)
throws Exception
{
JobConf job = new JobConf(DataGenMR.class);
job.setJobName("DataGen-MR");
//whether to use block representation or cell representation
MRJobConfiguration.setMatrixValueClass(job, true);
byte[] realIndexes=new byte[dataGenInstructions.length];
for(byte b=0; b<dataGenInstructions.length; b++)
realIndexes[b]=b;
String[] inputs=new String[dataGenInstructions.length];
InputInfo[] inputInfos=new InputInfo[dataGenInstructions.length];
long[] rlens=new long[dataGenInstructions.length];
long[] clens=new long[dataGenInstructions.length];
int[] brlens=new int[dataGenInstructions.length];
int[] bclens=new int[dataGenInstructions.length];
FileSystem fs = FileSystem.get(job);
String dataGenInsStr="";
int numblocks=0;
int maxbrlen=-1, maxbclen=-1;
double maxsparsity=-1;
for(int i = 0; i < dataGenInstructions.length; i++)
{
dataGenInsStr=dataGenInsStr+Lop.INSTRUCTION_DELIMITOR+dataGenInstructions[i];
MRInstruction mrins = MRInstructionParser.parseSingleInstruction(dataGenInstructions[i]);
MRINSTRUCTION_TYPE mrtype = mrins.getMRInstructionType();
DataGenMRInstruction genInst = (DataGenMRInstruction) mrins;
rlens[i] = genInst.getRows();
clens[i] = genInst.getCols();
brlens[i] = genInst.getRowsInBlock();
bclens[i] = genInst.getColsInBlock();
maxbrlen = Math.max(maxbrlen, brlens[i]);
maxbclen = Math.max(maxbclen, bclens[i]);
if( mrtype == MRINSTRUCTION_TYPE.Rand )
{
RandInstruction randInst = (RandInstruction) mrins;
inputs[i] = LibMatrixDatagen.generateUniqueSeedPath(genInst.getBaseDir());
maxsparsity = Math.max(maxsparsity, randInst.getSparsity());
//write out per-block metadata (block indexes, dimensions, seed) consumed by the mappers
FSDataOutputStream fsOut = fs.create(new Path(inputs[i]));
PrintWriter pw = new PrintWriter(fsOut);
//for obj reuse and preventing repeated buffer re-allocations
StringBuilder sb = new StringBuilder();
//seed generation
Well1024a bigrand = LibMatrixDatagen.setupSeedsForRand(randInst.getSeed());
for(long r = 0; r < rlens[i]; r += brlens[i]) {
long curBlockRowSize = Math.min(brlens[i], (rlens[i] - r));
for(long c = 0; c < clens[i]; c += bclens[i]) {
long curBlockColSize = Math.min(bclens[i], (clens[i] - c));
sb.append((r / brlens[i]) + 1);
sb.append(',');
sb.append((c / bclens[i]) + 1);
sb.append(',');
sb.append(curBlockRowSize);
sb.append(',');
sb.append(curBlockColSize);
sb.append(',');
sb.append(bigrand.nextLong());
pw.println(sb.toString());
sb.setLength(0);
numblocks++;
}
}
pw.close();
fsOut.close();
inputInfos[i] = InputInfo.TextCellInputInfo;
} else if( mrtype == MRINSTRUCTION_TYPE.Seq ) {
SeqInstruction seqInst = (SeqInstruction) mrins;
inputs[i] = genInst.getBaseDir() + System.currentTimeMillis() + ".seqinput";
double from = seqInst.fromValue;
double to = seqInst.toValue;
double incr = seqInst.incrValue;
//handle default increment 1 for the special case of from>to
incr = LibMatrixDatagen.updateSeqIncr(from, to, incr);
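// Example (assuming updateSeqIncr only flips the default increment): seq(10, 1)
// arrives with the default incr=1; since from > to, it is corrected to incr=-1,
// producing the sequence 10, 9, ..., 1.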
// Correctness checks on (from, to, incr)
boolean neg = (from > to);
if ( incr == 0 )
throw new DMLRuntimeException("Invalid value for \"increment\" in seq().");
if (neg != (incr < 0) )
throw new DMLRuntimeException("Wrong sign for the increment in a call to seq()");
// Compute the number of rows in the sequence
long numrows = 1 + (long)Math.floor((to-from)/incr);
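// Example: seq(1, 10, 2) gives numrows = 1 + floor((10-1)/2) = 1 + 4 = 5,
// i.e., the column vector {1, 3, 5, 7, 9}.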
if ( rlens[i] > 0 ) {
if ( numrows != rlens[i] )
throw new DMLRuntimeException("Unexpected error while processing sequence instruction. Expected number of rows does not match given number: " + rlens[i] + " != " + numrows);
}
else {
rlens[i] = numrows;
}
if ( clens[i] >0 && clens[i] != 1)
throw new DMLRuntimeException("Unexpected error while processing sequence instruction. Number of columns (" + clens[i] + ") must be equal to 1.");
else
clens[i] = 1;
FSDataOutputStream fsOut = fs.create(new Path(inputs[i]));
PrintWriter pw = new PrintWriter(fsOut);
StringBuilder sb = new StringBuilder();
double temp = from;
double block_from, block_to;
for(long r = 0; r < rlens[i]; r += brlens[i]) {
long curBlockRowSize = Math.min(brlens[i], (rlens[i] - r));
// block (bid_i,bid_j) generates a sequence from the interval [block_from, block_to] (inclusive of both end points of the interval)
long bid_i = ((r / brlens[i]) + 1);
long bid_j = 1;
block_from = temp;
block_to = temp+(curBlockRowSize-1)*incr;
temp = block_to + incr; // next block starts from here
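// Example: for seq(1, 10, 2) with brlens[i]=3 (so rlens[i]=5), the first block
// covers [1.0, 5.0] (values 1, 3, 5) and the second covers [7.0, 9.0] (values 7, 9),
// so the written lines are "1,1,1.0,5.0,2.0" and "2,1,7.0,9.0,2.0".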
sb.append(bid_i);
sb.append(',');
sb.append(bid_j);
sb.append(',');
/*
// Need not include block size while generating seq()
sb.append(curBlockRowSize);
sb.append(',');
sb.append(1);
sb.append(',');*/
sb.append(block_from);
sb.append(',');
sb.append(block_to);
sb.append(',');
sb.append(incr);
pw.println(sb.toString());
//System.out.println("MapTask " + r + ": " + sb.toString());
sb.setLength(0);
numblocks++;
}
pw.close();
fsOut.close();
inputInfos[i] = InputInfo.TextCellInputInfo;
} else {
throw new DMLRuntimeException("Unexpected Data Generation Instruction Type: " + mrtype );
}
}
dataGenInsStr=dataGenInsStr.substring(1);//remove the leading instruction delimiter
RunningJob runjob;
MatrixCharacteristics[] stats;
try{
//set up the block size
MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);
//set up the input files and their format information
MRJobConfiguration.setUpMultipleInputs(job, realIndexes, inputs, inputInfos, brlens, bclens, false, ConvertTarget.BLOCK);
//set up the dimensions of input matrices
MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens);
MRJobConfiguration.setDimsUnknownFilePrefix(job, dimsUnknownFilePrefix);
//set up the rand Instructions
MRJobConfiguration.setRandInstructions(job, dataGenInsStr);
//set up unary instructions that will perform in the mapper
MRJobConfiguration.setInstructionsInMapper(job, instructionsInMapper);
//set up the aggregate instructions that will happen in the combiner and reducer
MRJobConfiguration.setAggregateInstructions(job, aggInstructionsInReducer);
//set up the instructions that will happen in the reducer, after the aggregation instructions
MRJobConfiguration.setInstructionsInReducer(job, otherInstructionsInReducer);
//set up the replication factor for the results
job.setInt("dfs.replication", replication);
//set up map/reduce memory configurations (if in AM context)
DMLConfig config = ConfigurationManager.getConfig();
DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);
//determine degree of parallelism (nmappers: 1<=n<=capacity)
//TODO use maxsparsity whenever we have a way of generating sparse rand data
int capacity = InfrastructureAnalyzer.getRemoteParallelMapTasks();
long dfsblocksize = InfrastructureAnalyzer.getHDFSBlockSize();
//correction max number of mappers on yarn clusters
if( InfrastructureAnalyzer.isYarnEnabled() )
capacity = (int)Math.max( capacity, YarnClusterAnalyzer.getNumCores() );
int nmappers = Math.max(Math.min((int)(8*maxbrlen*maxbclen*(long)numblocks/dfsblocksize), capacity), 1);
job.setNumMapTasks(nmappers);
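// Example: with 64 blocks of size 1000x1000 (8*1000*1000 = 8MB dense per block)
// and a 128MB HDFS block size, 8*1000*1000*64/134217728 = 3 (integer division),
// so 3 mappers are requested, assuming the cluster capacity is at least 3.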
//set up what matrices are needed to pass from the mapper to reducer
HashSet<Byte> mapoutputIndexes=MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, dataGenInsStr, instructionsInMapper, null, aggInstructionsInReducer, otherInstructionsInReducer, resultIndexes);
MatrixChar_N_ReducerGroups ret=MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes, dataGenInsStr,
instructionsInMapper, null, aggInstructionsInReducer, null, otherInstructionsInReducer,
resultIndexes, mapoutputIndexes, false);
stats=ret.stats;
//set up the number of reducers
MRJobConfiguration.setNumReducers(job, ret.numReducerGroups, numReducers);
// print the complete MRJob instruction
if (LOG.isTraceEnabled())
inst.printCompleteMRJobInstruction(stats);
// Update resultDimsUnknown based on computed "stats"
byte[] resultDimsUnknown = new byte[resultIndexes.length];
for ( int i=0; i < resultIndexes.length; i++ ) {
if ( stats[i].getRows() == -1 || stats[i].getCols() == -1 ) {
resultDimsUnknown[i] = (byte) 1;
}
else {
resultDimsUnknown[i] = (byte) 0;
}
}
boolean mayContainCtable = instructionsInMapper.contains("ctabletransform") || instructionsInMapper.contains("groupedagg");
//set up the multiple output files, and their format information
MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, resultDimsUnknown, outputs, outputInfos, true, mayContainCtable);
// configure mapper and the mapper output key value pairs
job.setMapperClass(DataGenMapper.class);
if(numReducers==0)
{
job.setMapOutputKeyClass(Writable.class);
job.setMapOutputValueClass(Writable.class);
}else
{
job.setMapOutputKeyClass(MatrixIndexes.class);
job.setMapOutputValueClass(TaggedMatrixBlock.class);
}
//set up combiner
if(numReducers!=0 && aggInstructionsInReducer!=null
&& !aggInstructionsInReducer.isEmpty())
job.setCombinerClass(GMRCombiner.class);
//configure reducer
job.setReducerClass(GMRReducer.class);
//job.setReducerClass(PassThroughReducer.class);
// By default, the job executes in "cluster" mode.
// Determine if we can optimize and run it in "local" mode.
MatrixCharacteristics[] inputStats = new MatrixCharacteristics[inputs.length];
for ( int i=0; i < inputs.length; i++ ) {
inputStats[i] = new MatrixCharacteristics(rlens[i], clens[i], brlens[i], bclens[i]);
}
//set unique working dir
MRJobConfiguration.setUniqueWorkingDir(job);
runjob=JobClient.runJob(job);
/* Process different counters */
Group group=runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
for(int i=0; i<resultIndexes.length; i++)
stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
//update unknown dimensions from the dims file written by the job
String dir = dimsUnknownFilePrefix + "/" + runjob.getID().toString() + "_dimsFile";
stats = MapReduceTool.processDimsFiles(dir, stats);
MapReduceTool.deleteFileIfExistOnHDFS(new Path(dir), job);
}
finally
{
//clean up the generated seed/sequence input files
for(String input : inputs)
MapReduceTool.deleteFileIfExistOnHDFS(new Path(input), job);
}
return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}
}