All downloads are free. The search and download functionality uses the official Maven repository.

com.arosbio.ml.testing.KFoldCV Maven / Gradle / Ivy

Go to download

Conformal AI package, including all data IO, transformations, machine learning models and predictor classes, but without any chemistry-dependent code.

There is a newer version: 2.0.0
Show newest version
/*
 * Copyright (C) Aros Bio AB.
 *
 * CPSign is an Open Source Software that is dual licensed to allow you to choose a license that best suits your requirements:
 *
 * 1) GPLv3 (GNU General Public License Version 3) with Additional Terms, including an attribution clause as well as a limitation to use the software for commercial purposes.
 *
 * 2) CPSign Proprietary License that allows you to use CPSign for commercial activities, such as in a revenue-generating operation or environment, or integrate CPSign in your proprietary software without worrying about disclosing the source code of your proprietary software, which is required if you choose to use the software under GPLv3 license. See arosbio.com/cpsign/commercial-license for details.
 */
package com.arosbio.ml.testing;

import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import com.arosbio.commons.CollectionUtils;
import com.arosbio.commons.GlobalConfig;
import com.arosbio.commons.TypeUtils;
import com.arosbio.commons.config.IntegerConfig;
import com.arosbio.data.Dataset;
import com.arosbio.data.splitting.FoldedSplitter;
import com.arosbio.ml.testing.utils.TestStrategiesUtils;
import com.arosbio.ml.testing.utils.TestTrainWrapper;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Range;

/**
 * k-fold cross-validation testing strategy.
 *
 * <p>The strategy is configured by the number of folds ({@code k}), whether records are
 * shuffled before splitting, whether the splits are stratified, how many times the whole
 * procedure is repeated and which RNG seed is used. The configuration can be set through the
 * constructors, the fluent {@code withXxx} methods, or by name using
 * {@link #setConfigParameters(Map)}.
 *
 * <p>Instances are mutable; use {@link #clone()} to get an independent copy.
 *
 * @author staffan
 * @author Aros bio
 */
public class KFoldCV implements TestingStrategy {

	/** Name of this strategy, returned by {@link #getName()} and passed on to the splitter. */
	public static final String STRATEGY_NAME = "KFoldCV";
	/** Number of folds used when nothing else is specified. */
	public static final int DEFAULT_K = 10;

	private boolean stratified = false;
	private boolean shuffle = true;
	private int numRepeat = 1;
	private int numFolds = DEFAULT_K;
	// The global seed is read once, when the instance is created
	private long rngSeed = GlobalConfig.getInstance().getRNGSeed();

	/**
	 * Default constructor, uses k=10 and the default global seed from {@link GlobalConfig}
	 */
	public KFoldCV() {
	}

	/**
	 * Set the k parameter for the number of folds, and use the default global seed from {@link GlobalConfig}
	 * @param k number of folds, {@code k >= 2}
	 * @throws IllegalArgumentException if {@code k < 2}
	 */
	public KFoldCV(int k) {
		if (k < 2) {
			throw new IllegalArgumentException("k cannot be less than 2 in k-fold CV");
		}
		this.numFolds = k;
	}

	/**
	 * Set the number of folds and an explicit RNG seed.
	 * @param k number of folds, {@code k >= 2}
	 * @param seed the RNG seed to use instead of the global default
	 * @throws IllegalArgumentException if {@code k < 2}
	 */
	public KFoldCV(int k, long seed) {
		this(k);
		rngSeed = seed;
	}

	/**
	 * Set the number of folds and whether the splits should be stratified, using the default
	 * global seed from {@link GlobalConfig}.
	 * @param k number of folds, {@code k >= 2}
	 * @param stratify {@code true} to request stratified splits
	 * @throws IllegalArgumentException if {@code k < 2}
	 */
	public KFoldCV(int k, boolean stratify) {
		this(k);
		this.stratified = stratify;
	}

	/**
	 * @return always {@code true}, a description is available from {@link #getDescription()}
	 */
	public boolean hasDescription() {
		return true;
	}

	/**
	 * @return a fixed, human readable description of the strategy
	 */
	public String getDescription() {
		return "k-fold cross-validation. Randomize the order of the data, splits the full dataset into k disjoint folds and iteratively use one fold for test-set and remaining (k-1) folds for training.";
	}

	/**
	 * @return {@link #STRATEGY_NAME}
	 */
	public String getName() {
		return STRATEGY_NAME;
	}

	/**
	 * @return {@code true} if stratified splitting is requested
	 */
	public boolean isStratified() {
		return stratified;
	}

	/**
	 * Set whether stratified splitting should be used.
	 * @param stratify {@code true} to request stratified splits
	 * @return the same instance
	 */
	public KFoldCV withStratified(boolean stratify) {
		this.stratified = stratify;
		return this;
	}

	/**
	 * @return {@code true} if shuffling is requested (the default)
	 */
	public boolean usesShuffle() {
		return shuffle;
	}

	/**
	 * Set whether the data should be shuffled. Note that
	 * {@link #getNumberOfSplitsAndValidate(Dataset)} rejects the combination of no shuffling
	 * and more than one repetition.
	 * @param shuffle {@code true} to shuffle
	 * @return the same instance
	 */
	public KFoldCV withShuffle(boolean shuffle) {
		this.shuffle = shuffle;
		return this;
	}

	/**
	 * @return the number of times the k-fold procedure is repeated, always at least 1
	 */
	public int getNumRepeat() {
		return numRepeat;
	}

	/**
	 * Set the number of repetitions. No exception is thrown for invalid input; any value
	 * below 1 is silently replaced by 1.
	 * @param numRepeat the requested number of repetitions
	 * @return the same instance
	 */
	public KFoldCV withNumRepeat(int numRepeat) {
		// Clamp instead of failing: values <= 0 end up as a single repetition
		this.numRepeat = Math.max(1, numRepeat);
		return this;
	}

	/**
	 * @return the number of folds (k)
	 */
	public int getNumFolds() {
		return numFolds;
	}

	/**
	 * Set the number of folds, must be {@code >= 2}
	 * @param numFolds the number of folds
	 * @return the same instance
	 * @throws IllegalArgumentException if {@code numFolds < 2}
	 */
	public KFoldCV withNumFolds(int numFolds) {
		if (numFolds < 2) {
			throw new IllegalArgumentException("Num folds in "+STRATEGY_NAME + " must be >=2, got '"+numFolds+'\'');
		}
		this.numFolds = numFolds;
		return this;
	}

	/**
	 * @return the RNG seed currently configured
	 */
	@Override
	public Long getSeed() {
		return rngSeed;
	}

	/**
	 * Set the RNG seed.
	 * @param seed the new seed
	 */
	@Override
	public void setSeed(long seed) {
		this.rngSeed = seed;
	}

	/**
	 * Fluent version of {@link #setSeed(long)}.
	 * @param seed the new seed
	 * @return the same instance
	 */
	public KFoldCV withSeed(long seed){
		this.rngSeed = seed;
		return this;
	}

	/**
	 * Validate the current settings against the given data and calculate the number of
	 * test-train splits that will be produced.
	 * @param data the data that should be split
	 * @return number of folds multiplied by the number of repetitions
	 * @throws IllegalArgumentException if there are fewer records than folds, or if shuffling
	 * is turned off while more than one repetition is requested
	 */
	@Override
	public int getNumberOfSplitsAndValidate(Dataset data) 
			throws IllegalArgumentException {
		if (data.getDataset().size() < numFolds) {
			throw new IllegalArgumentException("Cannot run k-Fold CV with more folds than records!");
		}
		if (! shuffle && numRepeat > 1) {
			throw new IllegalArgumentException("shuffle cannot be false if numRepeat > 1");
		}
		return numFolds * numRepeat;
	}

	/**
	 * Create the iterator of test-train splits for the given data, by configuring a
	 * {@link FoldedSplitter} with the current settings and wrapping it in a
	 * {@link TestTrainWrapper}.
	 * @param data the data that should be split
	 * @return an iterator over the splits
	 */
	@Override
	public Iterator getSplits(Dataset data) {
		return new TestTrainWrapper(new FoldedSplitter.Builder()
			.seed(rngSeed)
			.name(STRATEGY_NAME)
			.numFolds(numFolds)
			.shuffle(shuffle)
			.stratify(stratified)
			.numRepeat(numRepeat)
			.findLabelRange(false)
			.build(data));
	}

	/**
	 * @return a short text such as {@code "10-fold stratified cross-validation repeated 3 times"},
	 * where the "stratified" and "repeated" parts are only included when applicable
	 */
	@Override
	public String toString() {
		return String.format("%s-fold %scross-validation%s", numFolds,
				(stratified? "stratified ":""),
				(numRepeat>1? " repeated " + numRepeat + " times" : ""));
	}

	/** Accepted names for the number-of-folds parameter, matched ignoring case in {@link #setConfigParameters(Map)}. */
	public static final String[] K_PARAM_NAMES = new String[] {"numSplits","folds", "k"};

	/**
	 * @return the configurable parameters: number of folds (at least 2, default
	 * {@link #DEFAULT_K}), shuffle, number of repetitions and stratification
	 */
	@Override
	public List getConfigParameters() {
		return ImmutableList.of(
			new IntegerConfig.Builder(Arrays.asList(K_PARAM_NAMES), DEFAULT_K)
				.range(Range.atLeast(2)).description("Number of folds to use").build(),
			TestStrategiesUtils.shuffleParameter,
			TestStrategiesUtils.numRepParameter,
			TestStrategiesUtils.stratifiedParameter);
	}

	/**
	 * Configure the instance from named parameters. Entries with {@code null} values are
	 * dropped before processing, names are matched ignoring case and entries whose name is
	 * not recognized are skipped without error.
	 * @param params parameter names mapped to their new values
	 * @throws IllegalArgumentException if a value is rejected by the corresponding setter, or
	 * cannot be converted to the required type (the underlying exception is attached as cause)
	 */
	@Override
	public void setConfigParameters(Map params) throws IllegalArgumentException {
		// NOTE(review): the raw Map / Map.Entry types look like generic type arguments were lost
		// when this listing was produced - confirm against the original source before changing them
		Map noNullParams = CollectionUtils.dropNullValues(params);
		for (Map.Entry kv : noNullParams.entrySet()) {
			try {
				// Folds
				if (CollectionUtils.containsIgnoreCase(K_PARAM_NAMES, kv.getKey())) {
					withNumFolds(TypeUtils.asInt(kv.getValue()));
				} 
				// shuffle
				else if (CollectionUtils.containsIgnoreCase(TestStrategiesUtils.shuffleParamNames, kv.getKey())) {
					withShuffle(TypeUtils.asBoolean(kv.getValue()));
				} 

				// num reps
				else if (CollectionUtils.containsIgnoreCase(TestStrategiesUtils.numRepParamNames, kv.getKey())) {
					withNumRepeat(TypeUtils.asInt(kv.getValue()));
				} 

				// stratified
				else if (CollectionUtils.containsIgnoreCase(TestStrategiesUtils.stratifiedParamNames, kv.getKey())) {
					withStratified(TypeUtils.asBoolean(kv.getValue()));
				} 

			} catch (IllegalArgumentException e) {
				// Pass along
				throw e;
			} catch (Exception e) {
				// Keep the original exception as cause so the root problem is not lost
				throw new IllegalArgumentException("Invalid input for parameter " + kv.getKey() + ": " + kv.getValue(), e);
			}
		}

	}

	/**
	 * Create an independent copy carrying over all settings: stratification, shuffle, number of
	 * repetitions, number of folds and the RNG seed.
	 * @return a new instance with identical settings
	 */
	@Override
	public KFoldCV clone(){
		KFoldCV c = new KFoldCV();
		c.stratified = stratified;
		c.shuffle = shuffle;
		c.numRepeat = numRepeat;
		c.numFolds = numFolds;
		c.rngSeed = rngSeed;
		return c;
	}

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy