All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.github.chungkwong.classifier.validator.DataDivider Maven / Gradle / Ivy

The newest version!
/*
 * Copyright (C) 2018 Chan Chung Kwong
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package com.github.chungkwong.classifier.validator;
import java.util.*;
import java.util.stream.*;
/**
 * Utility being used to divide dataset
 * @author Chan Chung Kwong
 */
public class DataDivider{
	/**
	 * Partition dataset into train set and test set randomly.
	 * All sample are cached in RAM, so randomSplit should be used instead for
	 * large dataset.
	 * @param  the type of the data
	 * @param dataset to be divided
	 * @param trainRatio the ratio of data used for train
	 * @return split data set
	 */
	public static  SplitDataSet randomSplitInRam(DataSet dataset,double trainRatio){
		List> list=dataset.getSamples().collect(Collectors.toList());
		Collections.shuffle(list);//FIXME: all data are loaded to RAM
		int cut=(int)(list.size()*trainRatio);
		String name=dataset.getName()+"(train=random"+trainRatio+')';
		return new SplitDataSet<>(()->list.subList(0,cut).stream(),()->list.subList(cut,list.size()).stream(),name);
	}
	/**
	 * Partition dataset into train set and test set sequentially.
	 * All sample are cached in RAM, so randomSplit should be used instead for
	 * large dataset.
	 * @param  the type of the data
	 * @param dataset to be divided
	 * @param trainRatio the ratio of data used for train
	 * @return split data set
	 */
	public static  SplitDataSet sequentialSplitInRam(DataSet dataset,double trainRatio){
		List> list=dataset.getSamples().collect(Collectors.toList());
		int cut=(int)(list.size()*trainRatio);
		String name=dataset.getName()+"(train=first"+trainRatio+')';
		return new SplitDataSet<>(()->list.subList(0,cut).stream(),()->list.subList(cut,list.size()).stream(),name);
	}
	/**
	 * Partition dataset into train set and test set randomly
	 * @param  the type of the data
	 * @param dataset to be divided
	 * @param trainRatio the ratio of data used for train
	 * @return split data set
	 */
	public static  SplitDataSet randomSplit(DataSet dataset,double trainRatio){
		int count=(int)dataset.getSamples().count();
		BitSet bitSet=new BitSet(count);
		for(int i=0;i(()->getTrainStream(bitSet,dataset),()->getTestStream(bitSet,dataset),name);
	}
	private static  Stream> getTrainStream(BitSet bitSet,DataSet dataset){
		int[] index=new int[]{0};
		return dataset.getSamples().filter((sample)->bitSet.get(index[0]++));
	}
	private static  Stream> getTestStream(BitSet bitSet,DataSet dataset){
		int[] index=new int[]{0};
		return dataset.getSamples().filter((sample)->!bitSet.get(index[0]++));
	}
	/**
	 * Partition dataset into train set and test set sequentially
	 * @param  the type of the data
	 * @param dataset to be divided
	 * @param trainRatio the ratio of data used for train
	 * @return split data set
	 */
	public static  SplitDataSet sequentialSplit(DataSet dataset,double trainRatio){
		long cut=(int)(dataset.getSamples().count()*trainRatio);
		String name=dataset.getName()+"(train=first"+trainRatio+')';
		return new SplitDataSet<>(()->dataset.getSamples().limit(cut),()->dataset.getSamples().skip(cut),name);
	}
	/**
	 * Use the full dataset for both train and test
	 * @param  the type of the data
	 * @param dataset to be divided
	 * @return split data set
	 */
	public static  SplitDataSet noSplit(DataSet dataset){
		return new SplitDataSet<>(()->dataset.getSamples(),()->dataset.getSamples(),dataset.getName());
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy