/*
 * LensKit, an open source recommender systems toolkit.
 * Copyright 2010-2014 LensKit Contributors.  See CONTRIBUTORS.md.
 * Work on LensKit has been funded by the National Science Foundation under
 * grants IIS 05-34939, 08-08692, 08-12148, and 10-17697.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc., 51
 * Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 */
package org.grouplens.lenskit.eval.data.pack;

import com.google.common.base.Function;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import org.grouplens.lenskit.cursors.Cursor;
import org.grouplens.lenskit.data.dao.packed.BinaryFormatFlag;
import org.grouplens.lenskit.data.dao.packed.BinaryRatingPacker;
import org.grouplens.lenskit.data.event.Rating;
import org.grouplens.lenskit.eval.AbstractTask;
import org.grouplens.lenskit.eval.TaskExecutionException;
import org.grouplens.lenskit.eval.data.CSVDataSource;
import org.grouplens.lenskit.eval.data.DataSource;
import org.grouplens.lenskit.eval.data.traintest.GenericTTDataSet;
import org.grouplens.lenskit.eval.data.traintest.TTDataSet;
import org.grouplens.lenskit.util.io.StagedWrite;
import org.grouplens.lenskit.util.io.UpToDateChecker;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.annotation.Nullable;
import java.io.File;
import java.io.IOException;
import java.util.EnumSet;
import java.util.List;
import java.util.UUID;

/**
 * Pack data sets into binary files.  This task can pack both plain data sources and
 * train-test data sets; for the latter, it packs both the training and the test data.
 * It returns a list of the packed data sets.
 *
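 * <p>A rough configuration sketch (the variable names here are illustrative,
 * not part of the API):
 * <pre>{@code
 * PackTask pack = new PackTask();
 * pack.addDataset(someDataSource);     // a plain DataSource to pack
 * pack.addDataset(someTrainTestSet);   // or a TTDataSet; train and test are both packed
 * pack.setIncludeTimestamps(true);     // timestamps are packed by default
 * // when the evaluator runs the task, it returns the list of packed data sets
 * }</pre>
 *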
 * @since 2.1
 * @author GroupLens Research
 */
public class PackTask extends AbstractTask<List<Object>> {
    private static final Logger logger = LoggerFactory.getLogger(PackTask.class);

    private List<TTDataSet> trainTestSets = Lists.newArrayList();
    private List<DataSource> dataSources = Lists.newArrayList();
    private Function<DataSource, File> packFileFunction = new DefaultOutputFunction();
    private EnumSet<BinaryFormatFlag> binaryFlags = EnumSet.of(BinaryFormatFlag.TIMESTAMPS);

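    /**
     * Add a train-test data set to be packed.  Both its training and test data are packed.
     * @param data The train-test data set.
     */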
    public void addDataset(TTDataSet data) {
        Preconditions.checkNotNull(data, "data source");
        trainTestSets.add(data);
    }

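    /**
     * Add a plain data source to be packed.
     * @param data The data source.
     */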
    public void addDataset(DataSource data) {
        Preconditions.checkNotNull(data, "data source");
        dataSources.add(data);
    }

    /**
     * Set whether to include timestamps.
     * @param include If {@code true} (the default), timestamps will be packed.
     */
    public void setIncludeTimestamps(boolean include) {
        if (include) {
            binaryFlags.add(BinaryFormatFlag.TIMESTAMPS);
        } else {
            binaryFlags.remove(BinaryFormatFlag.TIMESTAMPS);
        }
    }

    /**
     * Get whether to include timestamps.
     * @return Whether or not timestamps will be included.
     */
    public boolean getIncludeTimestamps() {
        return binaryFlags.contains(BinaryFormatFlag.TIMESTAMPS);
    }

    /**
     * Set the function to produce the default files for each data source.  The function will take
     * data sources and produce the files into which they should be packed.  The default function
     * packs CSV data sources into binary files next to their source files, and other data sources
 * into files with unique names in the {@code packed} directory under the project's data directory.
     *
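 * <p>For illustration only (the directory name below is hypothetical), a custom
 * function could pack every source into a single directory:
 * <pre>{@code
 * task.setOutputFile(new Function<DataSource, File>() {
 *     public File apply(DataSource source) {
 *         return new File("packed-data", source.getName() + ".pack");
 *     }
 * });
 * }</pre>
 *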
     * @param func The function.
     */
    public void setOutputFile(Function<DataSource, File> func) {
        Preconditions.checkNotNull(func, "output file function");
        packFileFunction = func;
    }

    @Override
    protected List<Object> perform() throws TaskExecutionException, InterruptedException {
        List<Object> results = Lists.newArrayList();
        for (TTDataSet data: trainTestSets) {
            // FIXME Pack the query data
            DataSource train = packDataSource(data.getTrainingData());
            DataSource test = packDataSource(data.getTestData());
            results.add(GenericTTDataSet.copyBuilder(data)
                                        .setTrain(train)
                                        .setTest(test)
                                        .build());
        }
        for (DataSource data: dataSources) {
            results.add(packDataSource(data));
        }
        return results;
    }

    private DataSource packDataSource(DataSource data) throws TaskExecutionException {
        File outFile = packFileFunction.apply(data);
        Preconditions.checkNotNull(outFile, "output file");

        PackedDataSource source = new PackedDataSource(data.getName(), outFile,
                                                       data.getPreferenceDomain());

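        // Skip repacking when the packed file is already up to date with respect to the source data.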
        UpToDateChecker check = new UpToDateChecker();
        check.addInput(data.lastModified());
        check.addOutput(outFile);
        if (check.isUpToDate()) {
            logger.info("{} is up to date", outFile);
            return source;
        }

        logger.info("packing {} to {}", data, outFile);
        StagedWrite stage = StagedWrite.begin(outFile);
        try {
            BinaryRatingPacker packer = BinaryRatingPacker.open(stage.getStagingFile(), binaryFlags);
            try {
                Cursor<Rating> ratings = data.getEventDAO().streamEvents(Rating.class);
                try {
                    packer.writeRatings(ratings);
                } finally {
                    ratings.close();
                }
            } finally {
                packer.close();
            }
            stage.commit();
        } catch (IOException ex) {
            logger.error("error packing {}: {}", outFile, ex);
            throw new TaskExecutionException("error packing " + outFile, ex);
        } finally {
            stage.close();
        }

        return source;
    }

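    /**
     * Default output file function: CSV data sources are packed into a {@code .pack}
     * file next to their source file; all other sources get a randomly named file in
     * the {@code packed} directory under the project's data directory.
     */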
    private class DefaultOutputFunction implements Function<DataSource, File> {
        @Nullable
        @Override
        public File apply(@Nullable DataSource input) {
            assert input != null;
            if (input instanceof CSVDataSource) {
                CSVDataSource csv = (CSVDataSource) input;
                File file = csv.getFile();
                String name = file.getName();
                return new File(file.getParentFile(), name + ".pack");
            } else {
                File dir = new File(getProject().getConfig().getDataDir());
                dir = new File(dir, "packed");
                UUID uuid = UUID.randomUUID();
                return new File(dir, uuid.toString() + ".pack");
            }
        }
    }
}