All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.codelibs.elasticsearch.taste.recommender.SamplingCandidateItemsStrategy Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.codelibs.elasticsearch.taste.recommender;

import java.util.Iterator;

import org.codelibs.elasticsearch.taste.common.FastIDSet;
import org.codelibs.elasticsearch.taste.common.LongPrimitiveArrayIterator;
import org.codelibs.elasticsearch.taste.common.LongPrimitiveIterator;
import org.codelibs.elasticsearch.taste.common.SamplingLongPrimitiveIterator;
import org.codelibs.elasticsearch.taste.common.iterator.FixedSizeSamplingIterator;
import org.codelibs.elasticsearch.taste.model.DataModel;
import org.codelibs.elasticsearch.taste.model.Preference;
import org.codelibs.elasticsearch.taste.model.PreferenceArray;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Preconditions;

/**
 * 

Returns all items that have not been rated by the user (3) and that were preferred by another user * (2) that has preferred at least one item (1) that the current user has preferred too.

* *

This strategy uses sampling to limit the number of items that are considered, by sampling three different * things, noted above:

* *
    *
  1. The items that the user has preferred
  2. *
  3. The users who also prefer each of those items
  4. *
  5. The items those users also prefer
  6. *
* *

There is a maximum associated with each of these three things; if the number of items or users exceeds * that max, it is sampled so that the expected number of items or users actually used in that part of the * computation is equal to the max.

* *

Three arguments control these three maxima. Each is a "factor" f, which establishes the max at * f * log2(n), where n is the number of users or items in the data. For example if factor #2 is 5, * which controls the number of users sampled per item, then 5 * log2(# users) is the maximum for this * part of the computation.

* *

Each can be set to not do any limiting with value {@link #NO_LIMIT_FACTOR}.

*/ public class SamplingCandidateItemsStrategy extends AbstractCandidateItemsStrategy { private static final Logger log = LoggerFactory .getLogger(SamplingCandidateItemsStrategy.class); /** * Default factor used if not otherwise specified, for all limits. (30). */ public static final int DEFAULT_FACTOR = 30; /** * Specify this value as a factor to mean no limit. */ public static final int NO_LIMIT_FACTOR = Integer.MAX_VALUE; private static final int MAX_LIMIT = Integer.MAX_VALUE; private static final double LOG2 = Math.log(2.0); private final int maxItems; private final int maxUsersPerItem; private final int maxItemsPerUser; /** * Defaults to using no limit ({@link #NO_LIMIT_FACTOR}) for all factors, except * {@code candidatesPerUserFactor} which defaults to {@link #DEFAULT_FACTOR}. * * @see #SamplingCandidateItemsStrategy(int, int, int, int, int) */ public SamplingCandidateItemsStrategy(final int numUsers, final int numItems) { this(DEFAULT_FACTOR, DEFAULT_FACTOR, DEFAULT_FACTOR, numUsers, numItems); } /** * @param itemsFactor factor controlling max items considered for a user * @param usersPerItemFactor factor controlling max users considered for each of those items * @param candidatesPerUserFactor factor controlling max candidate items considered from each of those users * @param numUsers number of users currently in the data * @param numItems number of items in the data */ public SamplingCandidateItemsStrategy(final int itemsFactor, final int usersPerItemFactor, final int candidatesPerUserFactor, final int numUsers, final int numItems) { Preconditions.checkArgument(itemsFactor > 0, "itemsFactor must be greater then 0!"); Preconditions.checkArgument(usersPerItemFactor > 0, "usersPerItemFactor must be greater then 0!"); Preconditions.checkArgument(candidatesPerUserFactor > 0, "candidatesPerUserFactor must be greater then 0!"); Preconditions.checkArgument(numUsers > 0, "numUsers must be greater then 0!"); Preconditions.checkArgument(numItems > 0, "numItems must be greater then 0!"); maxItems = computeMaxFrom(itemsFactor, numItems); maxUsersPerItem = computeMaxFrom(usersPerItemFactor, numUsers); maxItemsPerUser = computeMaxFrom(candidatesPerUserFactor, numItems); log.debug("maxItems {}, maxUsersPerItem {}, maxItemsPerUser {}", maxItems, maxUsersPerItem, maxItemsPerUser); } private static int computeMaxFrom(final int factor, final int numThings) { if (factor == NO_LIMIT_FACTOR) { return MAX_LIMIT; } final long max = (long) (factor * (1.0 + Math.log(numThings) / LOG2)); return max > MAX_LIMIT ? MAX_LIMIT : (int) max; } @Override protected FastIDSet doGetCandidateItems(final long[] preferredItemIDs, final DataModel dataModel) { LongPrimitiveIterator preferredItemIDsIterator = new LongPrimitiveArrayIterator( preferredItemIDs); if (preferredItemIDs.length > maxItems) { final double samplingRate = (double) maxItems / preferredItemIDs.length; // log.info("preferredItemIDs.length {}, samplingRate {}", preferredItemIDs.length, samplingRate); preferredItemIDsIterator = new SamplingLongPrimitiveIterator( preferredItemIDsIterator, samplingRate); } final FastIDSet possibleItemsIDs = new FastIDSet(); while (preferredItemIDsIterator.hasNext()) { final long itemID = preferredItemIDsIterator.nextLong(); final PreferenceArray prefs = dataModel .getPreferencesForItem(itemID); final int prefsLength = prefs.length(); if (prefsLength > maxUsersPerItem) { final Iterator sampledPrefs = new FixedSizeSamplingIterator( maxUsersPerItem, prefs.iterator()); while (sampledPrefs.hasNext()) { addSomeOf(possibleItemsIDs, dataModel.getItemIDsFromUser(sampledPrefs.next() .getUserID())); } } else { for (int i = 0; i < prefsLength; i++) { addSomeOf(possibleItemsIDs, dataModel.getItemIDsFromUser(prefs.getUserID(i))); } } } possibleItemsIDs.removeAll(preferredItemIDs); return possibleItemsIDs; } private void addSomeOf(final FastIDSet possibleItemIDs, final FastIDSet itemIDs) { if (itemIDs.size() > maxItemsPerUser) { final LongPrimitiveIterator it = new SamplingLongPrimitiveIterator( itemIDs.iterator(), (double) maxItemsPerUser / itemIDs.size()); while (it.hasNext()) { possibleItemIDs.add(it.nextLong()); } } else { possibleItemIDs.addAll(itemIDs); } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy