// Source listing: org.codelibs.elasticsearch.taste.recommender.SamplingCandidateItemsStrategy (Maven / Gradle / Ivy)
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.codelibs.elasticsearch.taste.recommender;

import java.util.Iterator;

import org.codelibs.elasticsearch.taste.common.FastIDSet;
import org.codelibs.elasticsearch.taste.common.LongPrimitiveArrayIterator;
import org.codelibs.elasticsearch.taste.common.LongPrimitiveIterator;
import org.codelibs.elasticsearch.taste.common.SamplingLongPrimitiveIterator;
import org.codelibs.elasticsearch.taste.common.iterator.FixedSizeSamplingIterator;
import org.codelibs.elasticsearch.taste.model.DataModel;
import org.codelibs.elasticsearch.taste.model.Preference;
import org.codelibs.elasticsearch.taste.model.PreferenceArray;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Preconditions;

/**
 * <p>Returns all items that have not been rated by the user <em>(3)</em> and that were preferred by another
 * user <em>(2)</em> that has preferred at least one item <em>(1)</em> that the current user has preferred too.</p>
 *
 * <p>This strategy uses sampling to limit the number of items that are considered, by sampling three different
 * things, noted above:</p>
 *
 * <ol>
 *   <li>The items that the user has preferred</li>
 *   <li>The users who also prefer each of those items</li>
 *   <li>The items those users also prefer</li>
 * </ol>
 *
 * <p>There is a maximum associated with each of these three things; if the number of items or users exceeds
 * that max, it is sampled so that the expected number of items or users actually used in that part of the
 * computation is equal to the max.</p>
 *
 * <p>Three arguments control these three maxima. Each is a "factor" f, which establishes the max at
 * f * (1 + log2(n)), where n is the number of users or items in the data. For example if factor #2 is 5,
 * which controls the number of users sampled per item, then 5 * (1 + log2(# users)) is the maximum for this
 * part of the computation.</p>
 *
 * <p>Each can be set to not do any limiting with value {@link #NO_LIMIT_FACTOR}.</p>
 */
public class SamplingCandidateItemsStrategy extends
        AbstractCandidateItemsStrategy {

    private static final Logger log = LoggerFactory
            .getLogger(SamplingCandidateItemsStrategy.class);

    /**
     * Default factor used if not otherwise specified, for all limits. (30).
     */
    public static final int DEFAULT_FACTOR = 30;

    /**
     * Specify this value as a factor to mean no limit.
     */
    public static final int NO_LIMIT_FACTOR = Integer.MAX_VALUE;

    /**
     * Upper bound for every computed maximum; also the value used when a
     * factor equals {@link #NO_LIMIT_FACTOR}.
     */
    private static final int MAX_LIMIT = Integer.MAX_VALUE;

    /** Natural logarithm of 2, used to convert {@link Math#log(double)} results to base 2. */
    private static final double LOG2 = Math.log(2.0);

    /** Threshold above which the user's preferred items are sampled. */
    private final int maxItems;

    /** Threshold above which the preferences of a single item are sampled. */
    private final int maxUsersPerItem;

    /** Threshold above which the items of a single co-preferring user are sampled. */
    private final int maxItemsPerUser;

    /**
     * Uses {@link #DEFAULT_FACTOR} for all three factors
     * ({@code itemsFactor}, {@code usersPerItemFactor} and
     * {@code candidatesPerUserFactor}).
     *
     * @param numUsers number of users currently in the data
     * @param numItems number of items in the data
     * @see #SamplingCandidateItemsStrategy(int, int, int, int, int)
     */
    public SamplingCandidateItemsStrategy(final int numUsers, final int numItems) {
        this(DEFAULT_FACTOR, DEFAULT_FACTOR, DEFAULT_FACTOR, numUsers, numItems);
    }

    /**
     * Computes the three sampling thresholds from the given factors and data
     * sizes and logs them at debug level.
     *
     * @param itemsFactor factor controlling max items considered for a user
     * @param usersPerItemFactor factor controlling max users considered for each of those items
     * @param candidatesPerUserFactor factor controlling max candidate items considered from each of those users
     * @param numUsers number of users currently in the data
     * @param numItems number of items in the data
     * @throws IllegalArgumentException if any argument is not greater than 0
     */
    public SamplingCandidateItemsStrategy(final int itemsFactor,
            final int usersPerItemFactor, final int candidatesPerUserFactor,
            final int numUsers, final int numItems) {
        Preconditions.checkArgument(itemsFactor > 0,
                "itemsFactor must be greater then 0!");
        Preconditions.checkArgument(usersPerItemFactor > 0,
                "usersPerItemFactor must be greater then 0!");
        Preconditions.checkArgument(candidatesPerUserFactor > 0,
                "candidatesPerUserFactor must be greater then 0!");
        Preconditions.checkArgument(numUsers > 0,
                "numUsers must be greater then 0!");
        Preconditions.checkArgument(numItems > 0,
                "numItems must be greater then 0!");
        maxItems = computeMaxFrom(itemsFactor, numItems);
        maxUsersPerItem = computeMaxFrom(usersPerItemFactor, numUsers);
        maxItemsPerUser = computeMaxFrom(candidatesPerUserFactor, numItems);
        log.debug("maxItems {}, maxUsersPerItem {}, maxItemsPerUser {}",
                maxItems, maxUsersPerItem, maxItemsPerUser);
    }

    /**
     * Derives a sampling threshold as {@code factor * (1 + log2(numThings))},
     * truncated towards zero and capped at {@link #MAX_LIMIT}.
     *
     * @param factor multiplier; {@link #NO_LIMIT_FACTOR} disables limiting
     * @param numThings number of users or items the threshold scales with
     * @return the threshold, or {@link #MAX_LIMIT} when no limit is requested
     */
    private static int computeMaxFrom(final int factor, final int numThings) {
        if (factor == NO_LIMIT_FACTOR) {
            return MAX_LIMIT;
        }
        // Computed in double and narrowed via long so large products are
        // clamped to MAX_LIMIT instead of wrapping around.
        final long max = (long) (factor * (1.0 + Math.log(numThings) / LOG2));
        return max > MAX_LIMIT ? MAX_LIMIT : (int) max;
    }

    /**
     * Collects candidate items by walking from the given preferred items to
     * the users that have preferences for them, and from those users to their
     * items, sampling at each step once the configured threshold is exceeded.
     * The preferred items themselves are removed from the result.
     *
     * @param preferredItemIDs IDs of the items already preferred
     * @param dataModel model queried for item preferences and user items
     * @return the candidate item IDs, excluding {@code preferredItemIDs}
     */
    @Override
    protected FastIDSet doGetCandidateItems(final long[] preferredItemIDs,
            final DataModel dataModel) {
        LongPrimitiveIterator preferredItemIDsIterator = new LongPrimitiveArrayIterator(
                preferredItemIDs);
        if (preferredItemIDs.length > maxItems) {
            // Too many preferred items: wrap the iterator so it samples at a
            // rate of maxItems / length.
            final double samplingRate = (double) maxItems
                    / preferredItemIDs.length;
            preferredItemIDsIterator = new SamplingLongPrimitiveIterator(
                    preferredItemIDsIterator, samplingRate);
        }
        final FastIDSet possibleItemsIDs = new FastIDSet();
        while (preferredItemIDsIterator.hasNext()) {
            final long itemID = preferredItemIDsIterator.nextLong();
            final PreferenceArray prefs = dataModel
                    .getPreferencesForItem(itemID);
            final int prefsLength = prefs.length();
            if (prefsLength > maxUsersPerItem) {
                // Typed iterator: a raw Iterator would yield Object from
                // next(), on which getUserID() cannot be called.
                final Iterator<Preference> sampledPrefs = new FixedSizeSamplingIterator<Preference>(
                        maxUsersPerItem, prefs.iterator());
                while (sampledPrefs.hasNext()) {
                    addSomeOf(possibleItemsIDs,
                            dataModel.getItemIDsFromUser(sampledPrefs.next()
                                    .getUserID()));
                }
            } else {
                for (int i = 0; i < prefsLength; i++) {
                    addSomeOf(possibleItemsIDs,
                            dataModel.getItemIDsFromUser(prefs.getUserID(i)));
                }
            }
        }
        possibleItemsIDs.removeAll(preferredItemIDs);
        return possibleItemsIDs;
    }

    /**
     * Adds the given user's items to the candidate set: all of them when the
     * set size is within {@code maxItemsPerUser}, otherwise only those yielded
     * by a sampling iterator with rate {@code maxItemsPerUser / size}.
     *
     * @param possibleItemIDs candidate set being accumulated (modified in place)
     * @param itemIDs items of one user to merge into the candidate set
     */
    private void addSomeOf(final FastIDSet possibleItemIDs,
            final FastIDSet itemIDs) {
        if (itemIDs.size() > maxItemsPerUser) {
            final LongPrimitiveIterator it = new SamplingLongPrimitiveIterator(
                    itemIDs.iterator(), (double) maxItemsPerUser
                            / itemIDs.size());
            while (it.hasNext()) {
                possibleItemIDs.add(it.nextLong());
            }
        } else {
            possibleItemIDs.addAll(itemIDs);
        }
    }

}