org.apache.sysml.runtime.matrix.SortMR Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of systemml Show documentation
Declarative Machine Learning
There is a newer version: 1.2.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */


package org.apache.sysml.runtime.matrix;

import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.Arrays;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Partitioner;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.Counters.Group;

import org.apache.sysml.lops.Lop;
import org.apache.sysml.lops.SortKeys;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
import org.apache.sysml.runtime.instructions.InstructionUtils;
import org.apache.sysml.runtime.instructions.MRJobInstruction;
import org.apache.sysml.runtime.instructions.mr.MRInstruction;
import org.apache.sysml.runtime.instructions.mr.UnaryInstruction;
import org.apache.sysml.runtime.io.IOUtilFunctions;
import org.apache.sysml.runtime.matrix.data.InputInfo;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
import org.apache.sysml.runtime.matrix.data.MatrixIndexes;
import org.apache.sysml.runtime.matrix.data.OutputInfo;
import org.apache.sysml.runtime.matrix.mapred.MRJobConfiguration;
import org.apache.sysml.runtime.matrix.mapred.MRJobConfiguration.ConvertTarget;
import org.apache.sysml.runtime.matrix.sort.CompactInputFormat;
import org.apache.sysml.runtime.matrix.sort.CompactOutputFormat;
import org.apache.sysml.runtime.matrix.sort.IndexSortComparable;
import org.apache.sysml.runtime.matrix.sort.IndexSortComparableDesc;
import org.apache.sysml.runtime.matrix.sort.IndexSortMapper;
import org.apache.sysml.runtime.matrix.sort.IndexSortReducer;
import org.apache.sysml.runtime.matrix.sort.IndexSortStitchupReducer;
import org.apache.sysml.runtime.matrix.sort.SamplingSortMRInputFormat;
import org.apache.sysml.runtime.matrix.sort.IndexSortStitchupMapper;
import org.apache.sysml.runtime.matrix.sort.ValueSortMapper;
import org.apache.sysml.runtime.matrix.sort.ValueSortReducer;
import org.apache.sysml.runtime.util.MapReduceTool;


/**
 * TODO: fix issues with the sortindex mappers
 */
@SuppressWarnings("deprecation")
public class SortMR 
{
    private static final Log LOG = LogFactory.getLog(SortMR.class.getName());
    
    public static final String NUM_VALUES_PREFIX="num.values.in";
    public static final String COMBINE_INSTRUCTION = "combine.instruction";
    public static final String SORT_INSTRUCTION = "sort.instruction";
    public static final String VALUE_IS_WEIGHT="value.is.weight";
    public static final String SORT_INDEXES_OFFSETS = "sort.indexes.offsets";
    public static final String SORT_DECREASING = "sort.decreasing";
    
    
  	/**
  	 * Private no-argument constructor; it has an empty body and exists only
  	 * so that this class cannot be instantiated from outside.
  	 */
  	private SortMR() {
		//prevent instantiation via private constructor
	}
  
  
  /**
   * A partitioner that assigns keys to roughly equal partitions in a global
   * sorted order, based on split points read from the sort partition file.
   */
  @SuppressWarnings({ "unchecked", "rawtypes" })
  private static class TotalOrderPartitioner 
                      implements Partitioner
  { 
	private ArrayList splitPoints;
    
    /**
     * Reads the partition cut points from the given sequence file.
     * Every record key is read as a {@link DoubleWritable} (record values
     * are {@link NullWritable}) and the keys are collected in file order.
     * The reader is closed exactly once, in the finally block.
     *
     * @param fs the file system
     * @param p the path of the sequence file to read
     * @param job the job config
     * @return the cut points, in the order in which they appear in the file
     * @throws IOException if the sequence file reader cannot be opened
     * @throws RuntimeException wrapping any failure while reading the records
     */
    private ArrayList readPartitions(FileSystem fs, Path p, JobConf job) 
    	throws IOException 
    {
    	SequenceFile.Reader reader = new SequenceFile.Reader(fs, p, job);
    	ArrayList parts = new ArrayList();
    	try 
    	{
    		NullWritable value = NullWritable.get();
    		//a fresh key object is needed per record because the list
    		//stores a reference to the key that was just filled
    		DoubleWritable key = new DoubleWritable();
    		while (reader.next(key, value)) {
    			parts.add(key);
    			key = new DoubleWritable();
    		}
    	} 
    	catch (Exception e) {
    		//kept as unchecked wrapping so callers observe the same exception types
    		throw new RuntimeException(e);
    	} 
    	finally {
    		//single point of cleanup; no second close after this block
    		IOUtilFunctions.closeSilently(reader);
    	}
    	
    	return parts;
    }

    /**
     * Initializes this partitioner by loading the split points from the
     * sort partition file whose name is taken from the job configuration.
     *
     * @param job the job config
     * @throws IllegalArgumentException if an IOException occurs while
     *         obtaining the file system or reading the partition file
     */
    public void configure(JobConf job) {
      try {
        final FileSystem fileSystem = FileSystem.get(job);
        final String partitionFilename = MRJobConfiguration.getSortPartitionFilename(job);
        splitPoints = readPartitions(fileSystem, new Path(partitionFilename), job);
      }
      catch (IOException ioe) {
        throw new IllegalArgumentException("can't read paritions file", ioe);
      }
    }

    /**
     * Returns the partition for the given key: the index computed by
     * {@code findPartition(key)}, taken modulo the number of partitions.
     *
     * @param key the key to partition on
     * @param value the value (not used)
     * @param numPartitions the total number of partitions
     * @return {@code findPartition(key) % numPartitions}
     */
    public int getPartition(K key, V value, int numPartitions) {
      final int partitionIndex = findPartition(key);
      return partitionIndex % numPartitions;
    }

	private int findPartition(K key) {
		int i=0;
		for( ; i) outputInfo.outputKeyClass, outputInfo.outputValueClass);
	    
	    //setup instructions and meta information
	    if(combineInst!=null&&!combineInst.trim().isEmpty())
	    	job.set(COMBINE_INSTRUCTION, combineInst);
	    job.set(SORT_INSTRUCTION, sortInst);
	    job.setBoolean(VALUE_IS_WEIGHT, valueIsWeight);
	    boolean desc = getSortInstructionDescending(sortInst);
	    job.setBoolean(SORT_DECREASING, desc);
	    MRJobConfiguration.setBlockSize(job, (byte)0, brlen, bclen);
	    MRJobConfiguration.setInputInfo(job, (byte)0, inputInfo, brlen, bclen, ConvertTarget.CELL);
	    int partitionWith0=SamplingSortMRInputFormat.writePartitionFile(job, partitionFile);
	    
	    //setup mapper/reducer/partitioner/output classes
	    if( getSortInstructionType(sortInst)==SortKeys.OperationTypes.Indexes ){
		    MRJobConfiguration.setInputInfo(job, (byte)0, inputInfo, brlen, bclen, ConvertTarget.CELL);
		    job.setOutputFormat(OutputInfo.BinaryBlockOutputInfo.outputFormatClass);
	    	job.setMapperClass(IndexSortMapper.class);
		    job.setReducerClass(IndexSortReducer.class);
		    job.setMapOutputKeyClass( !desc ? IndexSortComparable.class : IndexSortComparableDesc.class);
		    job.setMapOutputValueClass(LongWritable.class);		    
		    job.setOutputKeyClass(MatrixIndexes.class); 
		    job.setOutputValueClass(MatrixBlock.class);   
	    }
	    else { //default case: SORT w/wo weights
	    	MRJobConfiguration.setInputInfo(job, (byte)0, inputInfo, brlen, bclen, ConvertTarget.CELL);
	    	job.setOutputFormat(CompactOutputFormat.class);
		    job.setMapperClass(ValueSortMapper.class);
		    job.setReducerClass(ValueSortReducer.class);	
		    job.setOutputKeyClass(outputInfo.outputKeyClass); //double
		    job.setOutputValueClass(outputInfo.outputValueClass); //int
	    }
	    job.setPartitionerClass(TotalOrderPartitioner.class);
	    
	    
	    //setup distributed cache
	    DistributedCache.addCacheFile(partitionUri, job);
	    DistributedCache.createSymlink(job);
	    
	    //setup replication factor
	    job.setInt("dfs.replication", replication);
	    
		MatrixCharacteristics[] s = new MatrixCharacteristics[1];
		s[0] = new MatrixCharacteristics(rlen, clen, brlen, bclen);
		
		// Print the complete instruction
		if (LOG.isTraceEnabled())
			inst.printCompleteMRJobInstruction(s);
		
		//set unique working dir
		MRJobConfiguration.setUniqueWorkingDir(job);
		
		//run mr job
	    RunningJob runjob=JobClient.runJob(job);
		Group group=runjob.getCounters().getGroup(NUM_VALUES_PREFIX);
		numReducers=job.getNumReduceTasks();
		
		//process final meta data
		long[] counts=new long[numReducers];
		long total=0;
		for(int i=0; i