org.apache.mahout.cf.taste.hadoop.similarity.item.ItemSimilarityJob Maven / Gradle / Ivy
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.mahout.cf.taste.hadoop.similarity.item;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import com.google.common.base.Preconditions;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.cf.taste.hadoop.EntityEntityWritable;
import org.apache.mahout.cf.taste.hadoop.TasteHadoopUtils;
import org.apache.mahout.cf.taste.hadoop.preparation.PreparePreferenceMatrixJob;
import org.apache.mahout.cf.taste.similarity.precompute.SimilarItem;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.math.hadoop.similarity.cooccurrence.RowSimilarityJob;
import org.apache.mahout.math.hadoop.similarity.cooccurrence.measures.VectorSimilarityMeasures;
import org.apache.mahout.math.map.OpenIntLongHashMap;
/**
* Distributed precomputation of the item-item-similarities for Itembased Collaborative Filtering
*
* Preferences in the input file should look like {@code userID,itemID[,preferencevalue]}
*
*
* Preference value is optional to accommodate applications that have no notion of a preference value (that is, the user
* simply expresses a preference for an item, but no degree of preference).
*
*
*
* The preference value is assumed to be parseable as a {@code double}. The user IDs and item IDs are
* parsed as {@code long}s.
*
*
* Command line arguments specific to this class are:
*
*
* - --input (path): Directory containing one or more text files with the preference data
* - --output (path): output path where similarity data should be written
* - --similarityClassname (classname): Name of distributed similarity measure class to instantiate or a predefined
* similarity from {@link org.apache.mahout.math.hadoop.similarity.cooccurrence.measures.VectorSimilarityMeasures}
* - --maxSimilaritiesPerItem (integer): Maximum number of similarities considered per item (100)
* - --maxPrefs (integer): max number of preferences to consider per user or item, users or items with more
* preferences will be sampled down (500)
* - --minPrefsPerUser (integer): ignore users with less preferences than this (1)
* - --booleanData (boolean): Treat input data as having no pref values (false)
* - --threshold (double): discard item pairs with a similarity value below this
* - --randomSeed (long): use this seed for sampling
*
*
* General command line options are documented in {@link AbstractJob}.
*
* Note that because of how Hadoop parses arguments, all "-D" arguments must appear before all other arguments.
*/
public final class ItemSimilarityJob extends AbstractJob {
/** Configuration key under which {@code run} stores the path of the item ID index read by the mapper. */
public static final String ITEM_ID_INDEX_PATH_STR = ItemSimilarityJob.class.getName() + ".itemIDIndexPathStr";
/** Configuration key under which {@code run} stores the maximum number of similar items kept per item. */
public static final String MAX_SIMILARITIES_PER_ITEM = ItemSimilarityJob.class.getName() + ".maxSimilarItemsPerItem";
/** Default value of the {@code --maxSimilaritiesPerItem} option. */
private static final int DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM = 100;
/** Default value of the {@code --maxPrefs} option (applied to both rows and columns of the similarity job). */
private static final int DEFAULT_MAX_PREFS = 500;
/** Default value of the {@code --minPrefsPerUser} option. */
private static final int DEFAULT_MIN_PREFS_PER_USER = 1;
/**
 * Command line entry point: hands the arguments to a fresh {@link ItemSimilarityJob} via {@link ToolRunner}.
 *
 * @param args command line arguments, passed through unchanged
 * @throws Exception whatever the job run throws
 */
public static void main(String[] args) throws Exception {
  ItemSimilarityJob job = new ItemSimilarityJob();
  ToolRunner.run(job, args);
}
/**
 * Registers and parses the command line options, then runs up to three phases (each one guarded by
 * {@code shouldRunNextPhase}):
 *
 * 1. {@link PreparePreferenceMatrixJob} writes the prepared rating matrix into a temp directory,
 * 2. {@link RowSimilarityJob} computes the similarity matrix from the rating matrix,
 * 3. a final map/reduce job converts the similarity matrix into text output of item pairs.
 *
 * @param args command line arguments
 * @return 0 on success; -1 if argument parsing fails or the final job does not complete successfully;
 *         otherwise the non-zero status returned by a failing sub-job
 * @throws Exception if any of the phases throws
 */
@Override
public int run(String[] args) throws Exception {

  addInputOption();
  addOutputOption();
  addOption("similarityClassname", "s", "Name of distributed similarity measures class to instantiate, "
      + "alternatively use one of the predefined similarities (" + VectorSimilarityMeasures.list() + ')');
  addOption("maxSimilaritiesPerItem", "m", "try to cap the number of similar items per item to this number "
      + "(default: " + DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM + ')',
      String.valueOf(DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM));
  addOption("maxPrefs", "mppu", "max number of preferences to consider per user or item, "
      + "users or items with more preferences will be sampled down (default: " + DEFAULT_MAX_PREFS + ')',
      String.valueOf(DEFAULT_MAX_PREFS));
  addOption("minPrefsPerUser", "mp", "ignore users with less preferences than this "
      + "(default: " + DEFAULT_MIN_PREFS_PER_USER + ')', String.valueOf(DEFAULT_MIN_PREFS_PER_USER));
  addOption("booleanData", "b", "Treat input as without pref values", String.valueOf(Boolean.FALSE));
  addOption("threshold", "tr", "discard item pairs with a similarity value below this", false);
  addOption("randomSeed", null, "use this seed for sampling", false);

  Map<String, List<String>> parsedArgs = parseArguments(args);
  if (parsedArgs == null) {
    return -1;
  }

  String similarityClassName = getOption("similarityClassname");
  int maxSimilarItemsPerItem = Integer.parseInt(getOption("maxSimilaritiesPerItem"));
  int maxPrefs = Integer.parseInt(getOption("maxPrefs"));
  int minPrefsPerUser = Integer.parseInt(getOption("minPrefsPerUser"));
  // parseBoolean yields the primitive directly, avoiding the box/unbox round trip of Boolean.valueOf
  boolean booleanData = Boolean.parseBoolean(getOption("booleanData"));

  // threshold and seed are optional; fall back to RowSimilarityJob's "not set" sentinels
  double threshold = hasOption("threshold")
      ? Double.parseDouble(getOption("threshold")) : RowSimilarityJob.NO_THRESHOLD;
  long randomSeed = hasOption("randomSeed")
      ? Long.parseLong(getOption("randomSeed")) : RowSimilarityJob.NO_FIXED_RANDOM_SEED;

  Path similarityMatrixPath = getTempPath("similarityMatrix");
  Path prepPath = getTempPath("prepareRatingMatrix");

  AtomicInteger currentPhase = new AtomicInteger();

  if (shouldRunNextPhase(parsedArgs, currentPhase)) {
    int prepareStatus = ToolRunner.run(getConf(), new PreparePreferenceMatrixJob(), new String[] {
      "--input", getInputPath().toString(),
      "--output", prepPath.toString(),
      "--minPrefsPerUser", String.valueOf(minPrefsPerUser),
      "--booleanData", String.valueOf(booleanData),
      "--tempDir", getTempPath().toString(),
    });
    // do not continue with later phases on top of a failed preparation step
    if (prepareStatus != 0) {
      return prepareStatus;
    }
  }

  if (shouldRunNextPhase(parsedArgs, currentPhase)) {
    // NOTE(review): presumably getOption yields null when the option was not supplied; checking here
    // (only in the phase that needs it) gives a clear message instead of passing null as an argument
    Preconditions.checkArgument(similarityClassName != null,
        "--similarityClassname must be specified to compute the similarity matrix");

    int numberOfUsers = HadoopUtil.readInt(new Path(prepPath, PreparePreferenceMatrixJob.NUM_USERS), getConf());

    int similarityStatus = ToolRunner.run(getConf(), new RowSimilarityJob(), new String[] {
      "--input", new Path(prepPath, PreparePreferenceMatrixJob.RATING_MATRIX).toString(),
      "--output", similarityMatrixPath.toString(),
      "--numberOfColumns", String.valueOf(numberOfUsers),
      "--similarityClassname", similarityClassName,
      "--maxObservationsPerRow", String.valueOf(maxPrefs),
      "--maxObservationsPerColumn", String.valueOf(maxPrefs),
      "--maxSimilaritiesPerRow", String.valueOf(maxSimilarItemsPerItem),
      "--excludeSelfSimilarity", String.valueOf(Boolean.TRUE),
      "--threshold", String.valueOf(threshold),
      "--randomSeed", String.valueOf(randomSeed),
      "--tempDir", getTempPath().toString(),
    });
    // do not post-process a similarity matrix that was never (completely) written
    if (similarityStatus != 0) {
      return similarityStatus;
    }
  }

  if (shouldRunNextPhase(parsedArgs, currentPhase)) {
    Job mostSimilarItems = prepareJob(similarityMatrixPath, getOutputPath(), SequenceFileInputFormat.class,
        MostSimilarItemPairsMapper.class, EntityEntityWritable.class, DoubleWritable.class,
        MostSimilarItemPairsReducer.class, EntityEntityWritable.class, DoubleWritable.class, TextOutputFormat.class);
    Configuration mostSimilarItemsConf = mostSimilarItems.getConfiguration();
    // hand the item ID index location and the per-item cap over to the mapper
    mostSimilarItemsConf.set(ITEM_ID_INDEX_PATH_STR,
        new Path(prepPath, PreparePreferenceMatrixJob.ITEMID_INDEX).toString());
    mostSimilarItemsConf.setInt(MAX_SIMILARITIES_PER_ITEM, maxSimilarItemsPerItem);
    boolean succeeded = mostSimilarItems.waitForCompletion(true);
    if (!succeeded) {
      return -1;
    }
  }

  return 0;
}
/**
 * Maps one row of the similarity matrix (item index to similarity vector) to at most
 * {@code maxSimilarItemsPerItem} item pairs with their similarity value. Item indexes are translated
 * back to item IDs through the ID index map loaded in {@link #setup}.
 */
public static class MostSimilarItemPairsMapper
    extends Mapper<IntWritable,VectorWritable,EntityEntityWritable,DoubleWritable> {

  private OpenIntLongHashMap indexItemIDMap;
  private int maxSimilarItemsPerItem;

  /**
   * Reads the per-item cap and the item ID index from the job configuration.
   *
   * @throws IllegalArgumentException if the configured cap is missing or not positive
   */
  @Override
  protected void setup(Context ctx) {
    Configuration conf = ctx.getConfiguration();
    maxSimilarItemsPerItem = conf.getInt(MAX_SIMILARITIES_PER_ITEM, -1);
    // validate the cheap setting first so a misconfiguration fails before the index is loaded
    Preconditions.checkArgument(maxSimilarItemsPerItem > 0, "maxSimilarItemsPerItem must be greater then 0!");
    indexItemIDMap = TasteHadoopUtils.readIDIndexMap(conf.get(ITEM_ID_INDEX_PATH_STR), conf);
  }

  /**
   * Collects the most similar items of the row's item in a bounded queue and writes one
   * (pair, similarity) record per retained item.
   *
   * @param itemIDIndexWritable index of the item this row belongs to
   * @param similarityVector similarities of that item to other items, keyed by item index
   * @param ctx context the pairs are written to
   */
  @Override
  protected void map(IntWritable itemIDIndexWritable, VectorWritable similarityVector, Context ctx)
    throws IOException, InterruptedException {

    int itemIDIndex = itemIDIndexWritable.get();

    TopSimilarItemsQueue topKMostSimilarItems = new TopSimilarItemsQueue(maxSimilarItemsPerItem);
    for (Vector.Element element : similarityVector.get().nonZeroes()) {
      // presumably top() is the weakest entry currently held; it is overwritten in place
      // when the candidate has a higher similarity
      SimilarItem top = topKMostSimilarItems.top();
      double candidateSimilarity = element.get();
      if (candidateSimilarity > top.getSimilarity()) {
        top.set(indexItemIDMap.get(element.index()), candidateSimilarity);
        topKMostSimilarItems.updateTop();
      }
    }

    long itemID = indexItemIDMap.get(itemIDIndex);
    for (SimilarItem similarItem : topKMostSimilarItems.getTopItems()) {
      long otherItemID = similarItem.getItemID();
      // always put the smaller ID first so both directions of a pair share the same output key
      EntityEntityWritable pair =
          new EntityEntityWritable(Math.min(itemID, otherItemID), Math.max(itemID, otherItemID));
      ctx.write(pair, new DoubleWritable(similarItem.getSimilarity()));
    }
  }
}
/**
 * Writes each item pair exactly once, using the first similarity value received for that pair.
 */
public static class MostSimilarItemPairsReducer
    extends Reducer<EntityEntityWritable,DoubleWritable,EntityEntityWritable,DoubleWritable> {

  /**
   * Emits the pair together with the first of its values; any further values for the same pair are dropped.
   *
   * @param pair the item pair acting as key
   * @param values similarity values received for the pair
   * @param ctx context the deduplicated pair is written to
   */
  @Override
  protected void reduce(EntityEntityWritable pair, Iterable<DoubleWritable> values, Context ctx)
    throws IOException, InterruptedException {
    ctx.write(pair, values.iterator().next());
  }
}
}