org.apache.sysml.runtime.matrix.SortMR Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of systemml Show documentation
Show all versions of systemml Show documentation
Declarative Machine Learning
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.sysml.runtime.matrix;
import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.Arrays;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Partitioner;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.Counters.Group;
import org.apache.sysml.lops.Lop;
import org.apache.sysml.lops.SortKeys;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
import org.apache.sysml.runtime.instructions.InstructionUtils;
import org.apache.sysml.runtime.instructions.MRJobInstruction;
import org.apache.sysml.runtime.instructions.mr.MRInstruction;
import org.apache.sysml.runtime.instructions.mr.UnaryInstruction;
import org.apache.sysml.runtime.io.IOUtilFunctions;
import org.apache.sysml.runtime.matrix.data.InputInfo;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
import org.apache.sysml.runtime.matrix.data.MatrixIndexes;
import org.apache.sysml.runtime.matrix.data.OutputInfo;
import org.apache.sysml.runtime.matrix.mapred.MRJobConfiguration;
import org.apache.sysml.runtime.matrix.mapred.MRJobConfiguration.ConvertTarget;
import org.apache.sysml.runtime.matrix.sort.CompactInputFormat;
import org.apache.sysml.runtime.matrix.sort.CompactOutputFormat;
import org.apache.sysml.runtime.matrix.sort.IndexSortComparable;
import org.apache.sysml.runtime.matrix.sort.IndexSortComparableDesc;
import org.apache.sysml.runtime.matrix.sort.IndexSortMapper;
import org.apache.sysml.runtime.matrix.sort.IndexSortReducer;
import org.apache.sysml.runtime.matrix.sort.IndexSortStitchupReducer;
import org.apache.sysml.runtime.matrix.sort.SamplingSortMRInputFormat;
import org.apache.sysml.runtime.matrix.sort.IndexSortStitchupMapper;
import org.apache.sysml.runtime.matrix.sort.ValueSortMapper;
import org.apache.sysml.runtime.matrix.sort.ValueSortReducer;
import org.apache.sysml.runtime.util.MapReduceTool;
/**
* TODO: fix issues in the sort-index mappers
*/
@SuppressWarnings("deprecation")
public class SortMR
{
// Logger for this class.
private static final Log LOG = LogFactory.getLog(SortMR.class.getName());
// Name of the counter group that is read from the finished job in this file.
public static final String NUM_VALUES_PREFIX="num.values.in";
// Job configuration key under which a non-empty combine instruction is stored.
public static final String COMBINE_INSTRUCTION = "combine.instruction";
// Job configuration key under which the sort instruction is stored.
public static final String SORT_INSTRUCTION = "sort.instruction";
// Job configuration key of the boolean "value is weight" flag.
public static final String VALUE_IS_WEIGHT="value.is.weight";
// NOTE(review): presumably a job configuration key for index-sort offsets;
// its use is not visible in this part of the file -- confirm.
public static final String SORT_INDEXES_OFFSETS = "sort.indexes.offsets";
// Job configuration key of the boolean flag derived from the sort instruction
// that marks a descending sort.
public static final String SORT_DECREASING = "sort.decreasing";
/**
 * Private constructor: keeps code outside this class from creating instances.
 */
private SortMR() {
    // intentionally empty
}
/**
* A partitioner that splits keys into roughly equal partitions
* in a global sorted order, using cut points read from the sort partition file.
*/
@SuppressWarnings({ "unchecked", "rawtypes" })
private static class TotalOrderPartitioner
implements Partitioner
{
private ArrayList splitPoints;
/**
 * Reads the cut points from the given sequence file.
 *
 * <p>Each record key is read into its own {@link DoubleWritable} instance, so the
 * returned list never holds the same key object twice.
 *
 * @param fs the file system
 * @param p the path to read
 * @param job the job config
 * @return the cut points, in the order they are stored in the file
 * @throws IOException if the sequence file reader cannot be opened
 * @throws RuntimeException wrapping any exception raised while reading the records
 */
private ArrayList readPartitions(FileSystem fs, Path p, JobConf job)
    throws IOException
{
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, p, job);
    ArrayList parts = new ArrayList();
    try
    {
        DoubleWritable key = new DoubleWritable();
        NullWritable value = NullWritable.get();
        while (reader.next(key, value)) {
            parts.add(key);
            // allocate a fresh key so the stored entry is not overwritten by the next read
            key = new DoubleWritable();
        }
    }
    catch (Exception e) {
        throw new RuntimeException(e);
    }
    finally {
        // single best-effort close; a second explicit close after this one is
        // redundant and could throw even though all records were read successfully
        IOUtilFunctions.closeSilently(reader);
    }
    return parts;
}
/**
 * Loads the split points of this partitioner from the sort partition file
 * whose name is obtained from the job configuration.
 *
 * @param job the job config
 * @throws IllegalArgumentException if an {@link IOException} occurs while
 *         accessing the file system or opening the partition file
 */
public void configure(JobConf job) {
    try {
        FileSystem fs = FileSystem.get(job);
        Path partFile = new Path(MRJobConfiguration.getSortPartitionFilename(job));
        splitPoints = readPartitions(fs, partFile, job);
    }
    catch (IOException ie) {
        // keep the cause so the underlying I/O failure stays visible
        throw new IllegalArgumentException("can't read partitions file", ie);
    }
}
/**
 * Determines the partition for a record: the index returned by
 * {@code findPartition} for the key, reduced modulo {@code numPartitions}.
 *
 * @param key the record key
 * @param value the record value (not used)
 * @param numPartitions the number of partitions
 * @return the partition index for the key
 */
public int getPartition(K key, V value, int numPartitions) {
    final int rawIndex = findPartition(key);
    return rawIndex % numPartitions;
}
private int findPartition(K key) {
int i=0;
for( ; i) outputInfo.outputKeyClass, outputInfo.outputValueClass);
//setup instructions and meta information
if(combineInst!=null&&!combineInst.trim().isEmpty())
job.set(COMBINE_INSTRUCTION, combineInst);
job.set(SORT_INSTRUCTION, sortInst);
job.setBoolean(VALUE_IS_WEIGHT, valueIsWeight);
boolean desc = getSortInstructionDescending(sortInst);
job.setBoolean(SORT_DECREASING, desc);
MRJobConfiguration.setBlockSize(job, (byte)0, brlen, bclen);
MRJobConfiguration.setInputInfo(job, (byte)0, inputInfo, brlen, bclen, ConvertTarget.CELL);
int partitionWith0=SamplingSortMRInputFormat.writePartitionFile(job, partitionFile);
//setup mapper/reducer/partitioner/output classes
if( getSortInstructionType(sortInst)==SortKeys.OperationTypes.Indexes ){
MRJobConfiguration.setInputInfo(job, (byte)0, inputInfo, brlen, bclen, ConvertTarget.CELL);
job.setOutputFormat(OutputInfo.BinaryBlockOutputInfo.outputFormatClass);
job.setMapperClass(IndexSortMapper.class);
job.setReducerClass(IndexSortReducer.class);
job.setMapOutputKeyClass( !desc ? IndexSortComparable.class : IndexSortComparableDesc.class);
job.setMapOutputValueClass(LongWritable.class);
job.setOutputKeyClass(MatrixIndexes.class);
job.setOutputValueClass(MatrixBlock.class);
}
else { //default case: SORT w/wo weights
MRJobConfiguration.setInputInfo(job, (byte)0, inputInfo, brlen, bclen, ConvertTarget.CELL);
job.setOutputFormat(CompactOutputFormat.class);
job.setMapperClass(ValueSortMapper.class);
job.setReducerClass(ValueSortReducer.class);
job.setOutputKeyClass(outputInfo.outputKeyClass); //double
job.setOutputValueClass(outputInfo.outputValueClass); //int
}
job.setPartitionerClass(TotalOrderPartitioner.class);
//setup distributed cache
DistributedCache.addCacheFile(partitionUri, job);
DistributedCache.createSymlink(job);
//setup replication factor
job.setInt("dfs.replication", replication);
MatrixCharacteristics[] s = new MatrixCharacteristics[1];
s[0] = new MatrixCharacteristics(rlen, clen, brlen, bclen);
// Print the complete instruction
if (LOG.isTraceEnabled())
inst.printCompleteMRJobInstruction(s);
//set unique working dir
MRJobConfiguration.setUniqueWorkingDir(job);
//run mr job
RunningJob runjob=JobClient.runJob(job);
Group group=runjob.getCounters().getGroup(NUM_VALUES_PREFIX);
numReducers=job.getNumReduceTasks();
//process final meta data
long[] counts=new long[numReducers];
long total=0;
for(int i=0; i