All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.mahout.math.hadoop.stochasticsvd.ABtJob Maven / Gradle / Ivy

There is a newer version: 0.13.0
Show newest version
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.math.hadoop.stochasticsvd;

import java.io.Closeable;
import java.io.IOException;
import java.text.NumberFormat;
import java.util.ArrayDeque;
import java.util.Arrays;
import java.util.Deque;
import java.util.regex.Matcher;

import com.google.common.collect.Lists;
import org.apache.commons.lang3.Validate;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.IOUtils;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.iterator.sequencefile.PathType;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterator;
import org.apache.mahout.math.SequentialAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.math.hadoop.stochasticsvd.qr.QRFirstStep;

/**
 * Computes ABt products, then first step of QR which is pushed down to the
 * reducer.
 * 
 */
@SuppressWarnings("deprecation")
public final class ABtJob {

  public static final String PROP_BT_PATH = "ssvd.Bt.path";
  public static final String PROP_BT_BROADCAST = "ssvd.Bt.broadcast";

  private ABtJob() {
  }

  /**
   * So, here, i preload A block into memory.
   * 

* * A sparse matrix seems to be ideal for that but there are two reasons why i * am not using it: *

    *
  • 1) I don't know the full block height. so i may need to reallocate it * from time to time. Although this probably not a showstopper. *
  • 2) I found that RandomAccessSparseVectors seem to take much more memory * than the SequentialAccessSparseVectors. *
*

* */ public static class ABtMapper extends Mapper { private SplitPartitionedWritable outKey; private final Deque closeables = new ArrayDeque<>(); private SequenceFileDirIterator btInput; private Vector[] aCols; // private Vector[] yiRows; // private VectorWritable outValue = new VectorWritable(); private int aRowCount; private int kp; private int blockHeight; private SparseRowBlockAccumulator yiCollector; @Override protected void map(Writable key, VectorWritable value, Context context) throws IOException, InterruptedException { Vector vec = value.get(); int vecSize = vec.size(); if (aCols == null) { aCols = new Vector[vecSize]; } else if (aCols.length < vecSize) { aCols = Arrays.copyOf(aCols, vecSize); } if (vec.isDense()) { for (int i = 0; i < vecSize; i++) { extendAColIfNeeded(i, aRowCount + 1); aCols[i].setQuick(aRowCount, vec.getQuick(i)); } } else { for (Vector.Element vecEl : vec.nonZeroes()) { int i = vecEl.index(); extendAColIfNeeded(i, aRowCount + 1); aCols[i].setQuick(aRowCount, vecEl.get()); } } aRowCount++; } private void extendAColIfNeeded(int col, int rowCount) { if (aCols[col] == null) { aCols[col] = new SequentialAccessSparseVector(rowCount < 10000 ? 
10000 : rowCount, 1); } else if (aCols[col].size() < rowCount) { Vector newVec = new SequentialAccessSparseVector(rowCount << 1, aCols[col].getNumNondefaultElements() << 1); newVec.viewPart(0, aCols[col].size()).assign(aCols[col]); aCols[col] = newVec; } } @Override protected void cleanup(Context context) throws IOException, InterruptedException { try { // yiRows= new Vector[aRowCount]; int lastRowIndex = -1; while (btInput.hasNext()) { Pair btRec = btInput.next(); int btIndex = btRec.getFirst().get(); Vector btVec = btRec.getSecond().get(); Vector aCol; if (btIndex > aCols.length || (aCol = aCols[btIndex]) == null) { continue; } int j = -1; for (Vector.Element aEl : aCol.nonZeroes()) { j = aEl.index(); // outKey.setTaskItemOrdinal(j); // outValue.set(btVec.times(aEl.get())); // assign might work better // // with memory after all. // context.write(outKey, outValue); yiCollector.collect((long) j, btVec.times(aEl.get())); } if (lastRowIndex < j) { lastRowIndex = j; } } aCols = null; // output empty rows if we never output partial products for them // this happens in sparse matrices when last rows are all zeros // and is subsequently causing shorter Q matrix row count which we // probably don't want to repair there but rather here. 
Vector yDummy = new SequentialAccessSparseVector(kp); // outValue.set(yDummy); for (lastRowIndex += 1; lastRowIndex < aRowCount; lastRowIndex++) { // outKey.setTaskItemOrdinal(lastRowIndex); // context.write(outKey, outValue); yiCollector.collect((long) lastRowIndex, yDummy); } } finally { IOUtils.close(closeables); } } @Override protected void setup(final Context context) throws IOException, InterruptedException { int k = Integer.parseInt(context.getConfiguration().get(QRFirstStep.PROP_K)); int p = Integer.parseInt(context.getConfiguration().get(QRFirstStep.PROP_P)); kp = k + p; outKey = new SplitPartitionedWritable(context); String propBtPathStr = context.getConfiguration().get(PROP_BT_PATH); Validate.notNull(propBtPathStr, "Bt input is not set"); Path btPath = new Path(propBtPathStr); boolean distributedBt = context.getConfiguration().get(PROP_BT_BROADCAST) != null; if (distributedBt) { Path[] btFiles = HadoopUtil.getCachedFiles(context.getConfiguration()); // DEBUG: stdout //System.out.printf("list of files: " + btFiles); StringBuilder btLocalPath = new StringBuilder(); for (Path btFile : btFiles) { if (btLocalPath.length() > 0) { btLocalPath.append(Path.SEPARATOR_CHAR); } btLocalPath.append(btFile); } btInput = new SequenceFileDirIterator<>(new Path(btLocalPath.toString()), PathType.LIST, null, null, true, context.getConfiguration()); } else { btInput = new SequenceFileDirIterator<>(btPath, PathType.GLOB, null, null, true, context.getConfiguration()); } // TODO: how do i release all that stuff?? 
closeables.addFirst(btInput); OutputCollector yiBlockCollector = new OutputCollector() { @Override public void collect(LongWritable blockKey, SparseRowBlockWritable block) throws IOException { outKey.setTaskItemOrdinal((int) blockKey.get()); try { context.write(outKey, block); } catch (InterruptedException exc) { throw new IOException("Interrupted", exc); } } }; blockHeight = context.getConfiguration().getInt(BtJob.PROP_OUTER_PROD_BLOCK_HEIGHT, -1); yiCollector = new SparseRowBlockAccumulator(blockHeight, yiBlockCollector); closeables.addFirst(yiCollector); } } /** * QR first step pushed down to reducer. * */ public static class QRReducer extends Reducer { // hack: partition number formats in hadoop, copied. this may stop working // if it gets // out of sync with newer hadoop version. But unfortunately rules of forming // output file names are not sufficiently exposed so we need to hack it // if we write the same split output from either mapper or reducer. // alternatively, we probably can replace it by our own output file namnig // management // completely and bypass MultipleOutputs entirely. 
private static final NumberFormat NUMBER_FORMAT = NumberFormat.getInstance(); static { NUMBER_FORMAT.setMinimumIntegerDigits(5); NUMBER_FORMAT.setGroupingUsed(false); } private final Deque closeables = Lists.newLinkedList(); protected final SparseRowBlockWritable accum = new SparseRowBlockWritable(); protected int blockHeight; protected int lastTaskId = -1; protected OutputCollector qhatCollector; protected OutputCollector rhatCollector; protected QRFirstStep qr; @Override protected void setup(Context context) throws IOException, InterruptedException { blockHeight = context.getConfiguration().getInt(BtJob.PROP_OUTER_PROD_BLOCK_HEIGHT, -1); } protected void setupBlock(Context context, SplitPartitionedWritable spw) throws InterruptedException, IOException { IOUtils.close(closeables); qhatCollector = createOutputCollector(QJob.OUTPUT_QHAT, spw, context, DenseBlockWritable.class); rhatCollector = createOutputCollector(QJob.OUTPUT_RHAT, spw, context, VectorWritable.class); qr = new QRFirstStep(context.getConfiguration(), qhatCollector, rhatCollector); closeables.addFirst(qr); lastTaskId = spw.getTaskId(); } @Override protected void reduce(SplitPartitionedWritable key, Iterable values, Context context) throws IOException, InterruptedException { accum.clear(); for (SparseRowBlockWritable bw : values) { accum.plusBlock(bw); } if (key.getTaskId() != lastTaskId) { setupBlock(context, key); } long blockBase = key.getTaskItemOrdinal() * blockHeight; for (int k = 0; k < accum.getNumRows(); k++) { Vector yiRow = accum.getRows()[k]; key.setTaskItemOrdinal(blockBase + accum.getRowIndices()[k]); qr.collect(key, yiRow); } } private Path getSplitFilePath(String name, SplitPartitionedWritable spw, Context context) throws InterruptedException, IOException { String uniqueFileName = FileOutputFormat.getUniqueFile(context, name, ""); uniqueFileName = uniqueFileName.replaceFirst("-r-", "-m-"); uniqueFileName = uniqueFileName.replaceFirst("\\d+$", 
Matcher.quoteReplacement(NUMBER_FORMAT.format(spw.getTaskId()))); return new Path(FileOutputFormat.getWorkOutputPath(context), uniqueFileName); } /** * key doesn't matter here, only value does. key always gets substituted by * SPW. */ private OutputCollector createOutputCollector(String name, final SplitPartitionedWritable spw, Context ctx, Class valueClass) throws IOException, InterruptedException { Path outputPath = getSplitFilePath(name, spw, ctx); final SequenceFile.Writer w = SequenceFile.createWriter(FileSystem.get(outputPath.toUri(), ctx.getConfiguration()), ctx.getConfiguration(), outputPath, SplitPartitionedWritable.class, valueClass); closeables.addFirst(w); return new OutputCollector() { @Override public void collect(K key, V val) throws IOException { w.append(spw, val); } }; } @Override protected void cleanup(Context context) throws IOException, InterruptedException { IOUtils.close(closeables); } } public static void run(Configuration conf, Path[] inputAPaths, Path inputBtGlob, Path outputPath, int aBlockRows, int minSplitSize, int k, int p, int outerProdBlockHeight, int numReduceTasks, boolean broadcastBInput) throws ClassNotFoundException, InterruptedException, IOException { JobConf oldApiJob = new JobConf(conf); // MultipleOutputs // .addNamedOutput(oldApiJob, // QJob.OUTPUT_QHAT, // org.apache.hadoop.mapred.SequenceFileOutputFormat.class, // SplitPartitionedWritable.class, // DenseBlockWritable.class); // // MultipleOutputs // .addNamedOutput(oldApiJob, // QJob.OUTPUT_RHAT, // org.apache.hadoop.mapred.SequenceFileOutputFormat.class, // SplitPartitionedWritable.class, // VectorWritable.class); Job job = new Job(oldApiJob); job.setJobName("ABt-job"); job.setJarByClass(ABtJob.class); job.setInputFormatClass(SequenceFileInputFormat.class); FileInputFormat.setInputPaths(job, inputAPaths); if (minSplitSize > 0) { FileInputFormat.setMinInputSplitSize(job, minSplitSize); } FileOutputFormat.setOutputPath(job, outputPath); 
SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK); job.setMapOutputKeyClass(SplitPartitionedWritable.class); job.setMapOutputValueClass(SparseRowBlockWritable.class); job.setOutputKeyClass(SplitPartitionedWritable.class); job.setOutputValueClass(VectorWritable.class); job.setMapperClass(ABtMapper.class); job.setCombinerClass(BtJob.OuterProductCombiner.class); job.setReducerClass(QRReducer.class); job.getConfiguration().setInt(QJob.PROP_AROWBLOCK_SIZE, aBlockRows); job.getConfiguration().setInt(BtJob.PROP_OUTER_PROD_BLOCK_HEIGHT, outerProdBlockHeight); job.getConfiguration().setInt(QRFirstStep.PROP_K, k); job.getConfiguration().setInt(QRFirstStep.PROP_P, p); job.getConfiguration().set(PROP_BT_PATH, inputBtGlob.toString()); // number of reduce tasks doesn't matter. we don't actually // send anything to reducers. job.setNumReduceTasks(numReduceTasks); // broadcast Bt files if required. if (broadcastBInput) { job.getConfiguration().set(PROP_BT_BROADCAST, "y"); FileSystem fs = FileSystem.get(inputBtGlob.toUri(), conf); FileStatus[] fstats = fs.globStatus(inputBtGlob); if (fstats != null) { for (FileStatus fstat : fstats) { /* * new api is not enabled yet in our dependencies at this time, still * using deprecated one */ DistributedCache.addCacheFile(fstat.getPath().toUri(), conf); } } } job.submit(); job.waitForCompletion(false); if (!job.isSuccessful()) { throw new IOException("ABt job unsuccessful."); } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy