
com.github.chungkwong.classifier.validator.DataDivider Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of text-classifier-collection Show documentation
Show all versions of text-classifier-collection Show documentation
A full fledged text classification toolkit for Java
The newest version!
/*
* Copyright (C) 2018 Chan Chung Kwong
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package com.github.chungkwong.classifier.validator;
import java.util.*;
import java.util.stream.*;
/**
* Utility being used to divide dataset
* @author Chan Chung Kwong
*/
public class DataDivider{
/**
* Partition dataset into train set and test set randomly.
* All sample are cached in RAM, so randomSplit should be used instead for
* large dataset.
* @param the type of the data
* @param dataset to be divided
* @param trainRatio the ratio of data used for train
* @return split data set
*/
public static SplitDataSet randomSplitInRam(DataSet dataset,double trainRatio){
List> list=dataset.getSamples().collect(Collectors.toList());
Collections.shuffle(list);//FIXME: all data are loaded to RAM
int cut=(int)(list.size()*trainRatio);
String name=dataset.getName()+"(train=random"+trainRatio+')';
return new SplitDataSet<>(()->list.subList(0,cut).stream(),()->list.subList(cut,list.size()).stream(),name);
}
/**
* Partition dataset into train set and test set sequentially.
* All sample are cached in RAM, so randomSplit should be used instead for
* large dataset.
* @param the type of the data
* @param dataset to be divided
* @param trainRatio the ratio of data used for train
* @return split data set
*/
public static SplitDataSet sequentialSplitInRam(DataSet dataset,double trainRatio){
List> list=dataset.getSamples().collect(Collectors.toList());
int cut=(int)(list.size()*trainRatio);
String name=dataset.getName()+"(train=first"+trainRatio+')';
return new SplitDataSet<>(()->list.subList(0,cut).stream(),()->list.subList(cut,list.size()).stream(),name);
}
/**
* Partition dataset into train set and test set randomly
* @param the type of the data
* @param dataset to be divided
* @param trainRatio the ratio of data used for train
* @return split data set
*/
public static SplitDataSet randomSplit(DataSet dataset,double trainRatio){
int count=(int)dataset.getSamples().count();
BitSet bitSet=new BitSet(count);
for(int i=0;i(()->getTrainStream(bitSet,dataset),()->getTestStream(bitSet,dataset),name);
}
private static Stream> getTrainStream(BitSet bitSet,DataSet dataset){
int[] index=new int[]{0};
return dataset.getSamples().filter((sample)->bitSet.get(index[0]++));
}
private static Stream> getTestStream(BitSet bitSet,DataSet dataset){
int[] index=new int[]{0};
return dataset.getSamples().filter((sample)->!bitSet.get(index[0]++));
}
/**
* Partition dataset into train set and test set sequentially
* @param the type of the data
* @param dataset to be divided
* @param trainRatio the ratio of data used for train
* @return split data set
*/
public static SplitDataSet sequentialSplit(DataSet dataset,double trainRatio){
long cut=(int)(dataset.getSamples().count()*trainRatio);
String name=dataset.getName()+"(train=first"+trainRatio+')';
return new SplitDataSet<>(()->dataset.getSamples().limit(cut),()->dataset.getSamples().skip(cut),name);
}
/**
* Use the full dataset for both train and test
* @param the type of the data
* @param dataset to be divided
* @return split data set
*/
public static SplitDataSet noSplit(DataSet dataset){
return new SplitDataSet<>(()->dataset.getSamples(),()->dataset.getSamples(),dataset.getName());
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy