All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.mahout.utils.ConcatenateVectorsJob Maven / Gradle / Ivy

Go to download

Optional components of Mahout which generally support interaction with third party systems, formats, APIs, etc.

There is a newer version: 0.13.0
Show newest version
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.utils;

import java.io.IOException;

import com.google.common.base.Preconditions;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.apache.mahout.math.VectorWritable;

/**
 * Map-reduce job that combines two matrices A and B into (a1,a2,...aN,b1,b2,...bN).
 * Technically it works on Vector files, so it will also concatenate two vectors.
 * If either input is a NamedVector, the output is named: A.name takes precedence over B.name.
 * Concatenation or per-member combinations given a function object.
 *
 * Uses a clever hack which requires the two matrices to have a different number of columns.
 * Courtesy of Jake Mannix, https://issues.apache.org/jira/browse/MAHOUT-884
 * If the vectors are the same length, this will not concatenate them in the right order.
 *
 * TODO: generalize to multiple matrices, should the teeming masses so desire
 *
 * @deprecated as of 0.10.0
 */
@Deprecated
public class ConcatenateVectorsJob extends AbstractJob {

  /** Job-configuration key under which the dimension computed for matrix A is stored. */
  static final String MATRIXA_DIMS = "mahout.concatenatevectors.matrixA_dims";
  /** Job-configuration key under which the dimension computed for matrix B is stored. */
  static final String MATRIXB_DIMS = "mahout.concatenatevectors.matrixB_dims";

  /** Instantiated only through {@link #main(String[])}. */
  private ConcatenateVectorsJob() {}

  /**
   * Command-line entry point; hands the arguments to {@link ToolRunner}.
   *
   * @param args command-line arguments, see {@link #run(String[])}
   * @throws Exception if the job fails with an exception
   */
  public static void main(String[] args) throws Exception {
    ToolRunner.run(new ConcatenateVectorsJob(), args);
  }

  /**
   * Parses the command line, validates both inputs and runs the concatenation job.
   *
   * <p>Options: {@code --matrixA/-ma} and {@code --matrixB/-mb} (both required), the standard
   * output option, and the standard overwrite option. When overwrite is given, an existing
   * output directory is deleted before the job is submitted.
   *
   * @param args command-line arguments
   * @return 0 if the job completed successfully, -1 if argument parsing or the job failed
   * @throws Exception if reading the inputs or running the job fails
   */
  @Override
  public int run(String[] args) throws Exception {
    addOption("matrixA", "ma", "A (left) matrix directory", true);
    addOption("matrixB", "mb", "B (right) matrix directory", true);
    addOutputOption();
    // The built Option has to be registered; merely creating it leaves --overwrite unknown to the parser.
    addOption(DefaultOptionCreator.overwriteOption().create());

    if (parseArguments(args) == null) {
      return -1;
    }

    String nameA = getOption("matrixA");
    String nameB = getOption("matrixB");
    Path pathA = new Path(nameA);
    Path pathB = new Path(nameB);
    Path pathOutput = getOutputPath();

    Configuration configuration = getConf();
    FileSystem fs = FileSystem.get(configuration);

    Class<? extends Writable> keyClassA = getKeyClass(pathA, fs);
    Class<? extends Writable> keyClassB = getKeyClass(pathB, fs);

    Preconditions.checkArgument(keyClassA.equals(keyClassB), "All SequenceFiles must use same key class");

    int dimA = getDimensions(pathA);
    int dimB = getDimensions(pathB);

    // Only clear the output once both inputs have been validated, so a bad invocation destroys nothing.
    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
      FileSystem outputFs = pathOutput.getFileSystem(configuration);
      if (outputFs.exists(pathOutput)) {
        outputFs.delete(pathOutput, true);
      }
    }

    // Both inputs are handed to the job as a single comma-separated input path.
    Job concatenate = prepareJob(
      new Path(nameA + "," + nameB), pathOutput, Mapper.class, keyClassA, VectorWritable.class,
      ConcatenateVectorsReducer.class, keyClassA, VectorWritable.class);

    // Settings must go into the job's own configuration object to be visible to the job.
    configuration = concatenate.getConfiguration();
    configuration.set(MATRIXA_DIMS, Integer.toString(dimA));
    configuration.set(MATRIXB_DIMS, Integer.toString(dimB));
    // TODO: add reducer as combiner - need a system that can exercise combiners

    return concatenate.waitForCompletion(true) ? 0 : -1;
  }

  /**
   * Reads the key class from the first {@code part*} file found under {@code path}.
   *
   * @param path directory expected to contain {@code part*} SequenceFiles
   * @param fs file system used to glob for and open the file
   * @return the key class declared by the first matching SequenceFile
   * @throws IOException if the file cannot be listed or read
   * @throws IllegalArgumentException if no {@code part*} entry is found under {@code path}
   */
  private Class<? extends Writable> getKeyClass(Path path, FileSystem fs) throws IOException {
    // this works for both part* and a directory/ with part*.
    Path pathPattern = new Path(path, "part*");
    FileStatus[] paths = fs.globStatus(pathPattern);
    // globStatus may return null instead of an empty array, so guard both cases.
    Preconditions.checkArgument(paths != null && paths.length > 0,
        path.getName() + " is a file, should be a directory");

    Path file = paths[0].getPath();
    try (SequenceFile.Reader reader = new SequenceFile.Reader(fs, file, fs.getConf())) {
      return reader.getKeyClass().asSubclass(Writable.class);
    }
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy