
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.cf.taste.hadoop.similarity.item;

import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;

import com.google.common.base.Preconditions;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.cf.taste.hadoop.EntityEntityWritable;
import org.apache.mahout.cf.taste.hadoop.TasteHadoopUtils;
import org.apache.mahout.cf.taste.hadoop.preparation.PreparePreferenceMatrixJob;
import org.apache.mahout.cf.taste.similarity.precompute.SimilarItem;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.math.hadoop.similarity.cooccurrence.RowSimilarityJob;
import org.apache.mahout.math.hadoop.similarity.cooccurrence.measures.VectorSimilarityMeasures;
import org.apache.mahout.math.map.OpenIntLongHashMap;

/**
 * <p>Distributed precomputation of the item-item similarities for item-based collaborative filtering.</p>
 *
 * <p>Preferences in the input file should look like {@code userID,itemID[,preferencevalue]}.</p>
 *
 * <p>The preference value is optional, to accommodate applications that have no notion of a preference value
 * (that is, the user simply expresses a preference for an item, but no degree of preference).</p>
 *
 * <p>The preference value is assumed to be parseable as a {@code double}. The user IDs and item IDs are
 * parsed as {@code long}s.</p>
 *
 * <p>Command line arguments specific to this class are:</p>
 *
 * <ol>
 * <li>--input (path): Directory containing one or more text files with the preference data</li>
 * <li>--output (path): Output path where the similarity data should be written</li>
 * <li>--similarityClassname (classname): Name of the distributed similarity measure class to instantiate, or a
 * predefined similarity from
 * {@link org.apache.mahout.math.hadoop.similarity.cooccurrence.measures.VectorSimilarityMeasures}</li>
 * <li>--maxSimilaritiesPerItem (integer): Maximum number of similarities considered per item (default: 100)</li>
 * <li>--maxPrefs (integer): Maximum number of preferences to consider per user or item; users or items with more
 * preferences will be sampled down (default: 500)</li>
 * <li>--minPrefsPerUser (integer): Ignore users with fewer preferences than this (default: 1)</li>
 * <li>--booleanData (boolean): Treat input data as having no preference values (default: false)</li>
 * <li>--threshold (double): Discard item pairs with a similarity value below this</li>
 * </ol>
 *
 * <p>General command line options are documented in {@link AbstractJob}.</p>
 *
 * <p>Note that because of how Hadoop parses arguments, all "-D" arguments must appear before all other arguments.</p>
 */
public final class ItemSimilarityJob extends AbstractJob {

  public static final String ITEM_ID_INDEX_PATH_STR = ItemSimilarityJob.class.getName() + ".itemIDIndexPathStr";
  public static final String MAX_SIMILARITIES_PER_ITEM = ItemSimilarityJob.class.getName() + ".maxSimilarItemsPerItem";

  private static final int DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM = 100;
  private static final int DEFAULT_MAX_PREFS = 500;
  private static final int DEFAULT_MIN_PREFS_PER_USER = 1;

  public static void main(String[] args) throws Exception {
    ToolRunner.run(new ItemSimilarityJob(), args);
  }

  @Override
  public int run(String[] args) throws Exception {

    addInputOption();
    addOutputOption();
    addOption("similarityClassname", "s", "Name of distributed similarity measure class to instantiate, "
        + "alternatively use one of the predefined similarities (" + VectorSimilarityMeasures.list() + ')');
    addOption("maxSimilaritiesPerItem", "m", "try to cap the number of similar items per item to this number "
        + "(default: " + DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM + ')',
        String.valueOf(DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM));
    addOption("maxPrefs", "mppu", "max number of preferences to consider per user or item, "
        + "users or items with more preferences will be sampled down (default: " + DEFAULT_MAX_PREFS + ')',
        String.valueOf(DEFAULT_MAX_PREFS));
    addOption("minPrefsPerUser", "mp", "ignore users with fewer preferences than this "
        + "(default: " + DEFAULT_MIN_PREFS_PER_USER + ')', String.valueOf(DEFAULT_MIN_PREFS_PER_USER));
    addOption("booleanData", "b", "Treat input as without pref values", String.valueOf(Boolean.FALSE));
    addOption("threshold", "tr", "discard item pairs with a similarity value below this", false);
    addOption("randomSeed", null, "use this seed for sampling", false);

    Map<String,List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
      return -1;
    }

    String similarityClassName = getOption("similarityClassname");
    int maxSimilarItemsPerItem = Integer.parseInt(getOption("maxSimilaritiesPerItem"));
    int maxPrefs = Integer.parseInt(getOption("maxPrefs"));
    int minPrefsPerUser = Integer.parseInt(getOption("minPrefsPerUser"));
    boolean booleanData = Boolean.valueOf(getOption("booleanData"));

    double threshold = hasOption("threshold")
        ? Double.parseDouble(getOption("threshold")) : RowSimilarityJob.NO_THRESHOLD;
    long randomSeed = hasOption("randomSeed")
        ? Long.parseLong(getOption("randomSeed")) : RowSimilarityJob.NO_FIXED_RANDOM_SEED;

    Path similarityMatrixPath = getTempPath("similarityMatrix");
    Path prepPath = getTempPath("prepareRatingMatrix");

    AtomicInteger currentPhase = new AtomicInteger();

    // Phase 1: turn the text preference data into an item-by-user rating matrix
    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
      ToolRunner.run(getConf(), new PreparePreferenceMatrixJob(), new String[] {
        "--input", getInputPath().toString(),
        "--output", prepPath.toString(),
        "--minPrefsPerUser", String.valueOf(minPrefsPerUser),
        "--booleanData", String.valueOf(booleanData),
        "--tempDir", getTempPath().toString(),
      });
    }

    // Phase 2: compute the pairwise row similarities of the rating matrix
    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
      int numberOfUsers = HadoopUtil.readInt(new Path(prepPath, PreparePreferenceMatrixJob.NUM_USERS), getConf());

      ToolRunner.run(getConf(), new RowSimilarityJob(), new String[] {
        "--input", new Path(prepPath, PreparePreferenceMatrixJob.RATING_MATRIX).toString(),
        "--output", similarityMatrixPath.toString(),
        "--numberOfColumns", String.valueOf(numberOfUsers),
        "--similarityClassname", similarityClassName,
        "--maxObservationsPerRow", String.valueOf(maxPrefs),
        "--maxObservationsPerColumn", String.valueOf(maxPrefs),
        "--maxSimilaritiesPerRow", String.valueOf(maxSimilarItemsPerItem),
        "--excludeSelfSimilarity", String.valueOf(Boolean.TRUE),
        "--threshold", String.valueOf(threshold),
        "--randomSeed", String.valueOf(randomSeed),
        "--tempDir", getTempPath().toString(),
      });
    }

    // Phase 3: map the item indexes back to the original item IDs and write the text output
    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
      Job mostSimilarItems = prepareJob(similarityMatrixPath, getOutputPath(), SequenceFileInputFormat.class,
          MostSimilarItemPairsMapper.class, EntityEntityWritable.class, DoubleWritable.class,
          MostSimilarItemPairsReducer.class, EntityEntityWritable.class, DoubleWritable.class,
          TextOutputFormat.class);

      Configuration mostSimilarItemsConf = mostSimilarItems.getConfiguration();
      mostSimilarItemsConf.set(ITEM_ID_INDEX_PATH_STR,
          new Path(prepPath, PreparePreferenceMatrixJob.ITEMID_INDEX).toString());
      mostSimilarItemsConf.setInt(MAX_SIMILARITIES_PER_ITEM, maxSimilarItemsPerItem);
      boolean succeeded = mostSimilarItems.waitForCompletion(true);
      if (!succeeded) {
        return -1;
      }
    }

    return 0;
  }

  public static class MostSimilarItemPairsMapper
      extends Mapper<IntWritable,VectorWritable,EntityEntityWritable,DoubleWritable> {

    private OpenIntLongHashMap indexItemIDMap;
    private int maxSimilarItemsPerItem;

    @Override
    protected void setup(Context ctx) {
      Configuration conf = ctx.getConfiguration();
      maxSimilarItemsPerItem = conf.getInt(MAX_SIMILARITIES_PER_ITEM, -1);
      indexItemIDMap = TasteHadoopUtils.readIDIndexMap(conf.get(ITEM_ID_INDEX_PATH_STR), conf);

      Preconditions.checkArgument(maxSimilarItemsPerItem > 0, "maxSimilarItemsPerItem must be greater than 0!");
    }

    @Override
    protected void map(IntWritable itemIDIndexWritable, VectorWritable similarityVector, Context ctx)
      throws IOException, InterruptedException {

      int itemIDIndex = itemIDIndexWritable.get();

      // keep only the top k most similar items per row of the similarity matrix
      TopSimilarItemsQueue topKMostSimilarItems = new TopSimilarItemsQueue(maxSimilarItemsPerItem);

      for (Vector.Element element : similarityVector.get().nonZeroes()) {
        SimilarItem top = topKMostSimilarItems.top();
        double candidateSimilarity = element.get();
        if (candidateSimilarity > top.getSimilarity()) {
          top.set(indexItemIDMap.get(element.index()), candidateSimilarity);
          topKMostSimilarItems.updateTop();
        }
      }

      // emit each pair with the smaller item ID first, so both orderings collapse to one key
      long itemID = indexItemIDMap.get(itemIDIndex);
      for (SimilarItem similarItem : topKMostSimilarItems.getTopItems()) {
        long otherItemID = similarItem.getItemID();
        if (itemID < otherItemID) {
          ctx.write(new EntityEntityWritable(itemID, otherItemID),
              new DoubleWritable(similarItem.getSimilarity()));
        } else {
          ctx.write(new EntityEntityWritable(otherItemID, itemID),
              new DoubleWritable(similarItem.getSimilarity()));
        }
      }
    }
  }

  public static class MostSimilarItemPairsReducer
      extends Reducer<EntityEntityWritable,DoubleWritable,EntityEntityWritable,DoubleWritable> {
    @Override
    protected void reduce(EntityEntityWritable pair, Iterable<DoubleWritable> values, Context ctx)
      throws IOException, InterruptedException {
      // the similarity matrix is symmetric, so duplicate pairs carry the same value; write one
      ctx.write(pair, values.iterator().next());
    }
  }
}
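
A minimal usage sketch, not part of the Mahout source above: it drives the job programmatically via ToolRunner, mirroring what main() does with command-line arguments. The input/output paths, the option values, and the sample preference lines in the comment are hypothetical; SIMILARITY_COSINE is one of the predefined VectorSimilarityMeasures constants.

import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.cf.taste.hadoop.similarity.item.ItemSimilarityJob;

public class ItemSimilarityJobExample {
  public static void main(String[] args) throws Exception {
    // Hypothetical input file /tmp/prefs.csv with lines of the form
    // userID,itemID[,preferencevalue], e.g.:
    //   1,101,5.0
    //   1,102,3.0
    //   2,101,2.5
    //   2,103,4.0
    ToolRunner.run(new ItemSimilarityJob(), new String[] {
        "--input", "/tmp/prefs.csv",                  // hypothetical path
        "--output", "/tmp/item-similarities",         // hypothetical path
        "--similarityClassname", "SIMILARITY_COSINE", // predefined VectorSimilarityMeasures constant
        "--maxSimilaritiesPerItem", "50",
        "--minPrefsPerUser", "1",
        "--booleanData", "false",
        "--tempDir", "/tmp/itemsim-temp"              // hypothetical path
    });
  }
}

Since the final phase uses TextOutputFormat, the output is tab-separated text with one item pair and its similarity per line, the smaller item ID first (see MostSimilarItemPairsMapper above).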



