/**
 * [FileStream.java]
 * 
 * @author Timm Jansen
 * @editor Yunsu Kim
 * 
 * Last Edited: 2013/06/27
 * Data Management and Data Exploration Group, RWTH Aachen University
 *
 *    Licensed under the Apache License, Version 2.0 (the "License");
 *    you may not use this file except in compliance with the License.
 *    You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *    Unless required by applicable law or agreed to in writing, software
 *    distributed under the License is distributed on an "AS IS" BASIS,
 *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *    See the License for the specific language governing permissions and
 *    limitations under the License.
 *    
 *    
 */

package moa.streams.clustering;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;

import com.github.javacliparser.FileOption;
import com.github.javacliparser.FlagOption;
import com.github.javacliparser.IntOption;
import com.github.javacliparser.ListOption;
import com.github.javacliparser.Option;
import com.yahoo.labs.samoa.instances.Instance;
import com.yahoo.labs.samoa.instances.Instances;
import com.yahoo.labs.samoa.instances.InstancesHeader;

import moa.core.InputStreamProgressMonitor;
import moa.core.InstanceExample;
import moa.core.ObjectRepository;
import moa.tasks.TaskMonitor;

public class FileStream extends ClusteringStream {

	@Override
	public String getPurposeString() {
		return "A stream read from an ARFF file. HINT: Visualization only works correctly with numerical 0-1 normalized attributes!";
	}

	private static final long serialVersionUID = 1L;


    String defaultfile = "KDDCup99.arff";

	public FileOption arffFileOption = new FileOption("arffFile", 'f',
			"ARFF file to load.", defaultfile, "arff", false);

	public IntOption classIndexOption = new IntOption(
			"classIndex",
			'c',
			"Class index of data. 0 for none or -1 for last attribute in file.",
			-1, -1, Integer.MAX_VALUE);

    public FlagOption normalizeOption = 
    		new FlagOption("normalize", 'n', 
    				"Numerical data will be normalized to 0-1 " +
    				"for the visualization to work. The complete arff file needs to be read upfront.");

    public ListOption removeAttributesOption = new ListOption("removeAttributes", 'r',
            "Attributes to remove. Enter a comma-separated list, " +
            "starting with 1 for the first attribute.", 
            new IntOption("removeAttribute", ' ', "Attribute to remove.",-1),
            new Option[0], ',');
	
    public FlagOption keepNonNumericalAttrOption = 
    		new FlagOption("keepNonNumericalAttr", 'K',
    		"Non-numerical attributes are filtered out by default " +
    		"(except the class attribute). " +
    		"Check to keep all attributes. This option is " +
    		"overridden by the manual attribute removal filter.");

	protected Instances instances;

	protected Reader fileReader;

	protected boolean hitEndOfFile;

    protected InstanceExample lastInstanceRead;

	protected int numInstancesRead;

	protected InputStreamProgressMonitor fileProgressMonitor;
	
	private Integer[] removeAttributes = null;

	private Instances filteredDataset = null;

	// per kept attribute: {min, max, diff = max - min}, filled by readMinMaxDiffValues()
	private ArrayList<Double[]> valuesMinMaxDiff = null;

	
	public FileStream(){
		//remove the numAtts option from ClusteringStream, as it is set internally by FileStream
		numAttsOption = null;
	}
	
	@Override
	public void prepareForUseImpl(TaskMonitor monitor,
			ObjectRepository repository) {
		restart();
	}

	public InstancesHeader getHeader() {
		return new InstancesHeader(this.filteredDataset);
	}

	public long estimatedRemainingInstances() {
		double progressFraction = this.fileProgressMonitor
				.getProgressFraction();
		if ((progressFraction > 0.0) && (this.numInstancesRead > 0)) {
			return (long) ((this.numInstancesRead / progressFraction) - this.numInstancesRead);
		}
		return -1;
	}

	public boolean hasMoreInstances() {
		return !this.hitEndOfFile;
	}

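	/**
	 * Returns the previously buffered instance and pre-fetches the next one,
	 * so that hasMoreInstances() reports the end of file one step ahead.
	 */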
    public InstanceExample nextInstance() {
        InstanceExample prevInstance = this.lastInstanceRead;
		this.hitEndOfFile = !readNextInstanceFromFile();
		return prevInstance;
	}

	public boolean isRestartable() {
		return true;
	}

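	/**
	 * (Re)opens the ARFF file, determines which attributes to remove,
	 * optionally scans the file once for min/max values, builds the
	 * filtered dataset header and pre-fetches the first instance.
	 */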
	public void restart() {
		try {
			if (fileReader != null) {
				fileReader.close();
			}
			InputStream fileStream = new FileInputStream(arffFileOption.getFile());
			fileProgressMonitor = new InputStreamProgressMonitor(fileStream);
			fileReader = new BufferedReader(new InputStreamReader(fileProgressMonitor));
            instances = new Instances(fileReader, 1, this.classIndexOption.getValue());
			if (classIndexOption.getValue() < 0) {
				instances.setClassIndex(instances.numAttributes() - 1);
			} else if (classIndexOption.getValue() > 0) {
				instances.setClassIndex(classIndexOption.getValue() - 1);
			}


			//use a HashSet to drop duplicates and attribute numbers that aren't valid
			HashSet<Integer> attributes = new HashSet<Integer>();
			Option[] rawAttributeList = removeAttributesOption.getList();
			for (int i = 0; i < rawAttributeList.length; i++) {
				int attribute = ((IntOption)rawAttributeList[i]).getValue();
				if(1 <= attribute && attribute <= instances.numAttributes())
					attributes.add(attribute-1);
				else
					System.out.println("Found invalid attribute removal description: " +
							"attribute option "+attribute
							+" will be ignored. FileStream only has "
							+instances.numAttributes()+" attributes.");
			}
			
			//remove all non-numeric attributes except the class attribute
			if(!keepNonNumericalAttrOption.isSet()){
				for (int i = 0; i < instances.numAttributes(); i++) {
					if(!instances.attribute(i).isNumeric() && i != instances.classIndex()){
						attributes.add(i);
					}
				}
			}
			
			//read min/max values in case we need to normalize
			if(normalizeOption.isSet())
				valuesMinMaxDiff = readMinMaxDiffValues(attributes);
			
			//convert the HashSet to a sorted array so attributes can be deleted in sequence
			removeAttributes = attributes.toArray(new Integer[0]);
			Arrays.sort(removeAttributes);
			
			//set updated number of attributes (class attribute included)
			numAttsOption = new IntOption("numAtts", 'a',"", instances.numAttributes() - removeAttributes.length);
			
			if(removeAttributes.length > 0){
				System.out.println("Removing the following attributes:");
				for (int i = 0; i < removeAttributes.length; i++) {
					System.out.println((removeAttributes[i]+1)+" "
							+instances.attribute(removeAttributes[i]).name());
				}
			}
            
			//create filtered dataset
			filteredDataset = new Instances(instances);
			for (int i = removeAttributes.length-1; i >= 0 ; i--) {
				filteredDataset.deleteAttributeAt(removeAttributes[i]);
			}

			this.numInstancesRead = 0;
			this.lastInstanceRead = null;
			this.hitEndOfFile = !readNextInstanceFromFile();
		} catch (IOException ioe) {
			throw new RuntimeException("ArffFileStream restart failed.", ioe);
		}
	}

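	/**
	 * Reads one instance from the file, applies attribute removal and
	 * optional 0-1 normalization, and buffers it in lastInstanceRead.
	 *
	 * @return true if an instance could be read, false at end of file
	 */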
	protected boolean readNextInstanceFromFile() {
		try {
			
			if (this.instances.readInstance(this.fileReader)) {
				Instance rawInstance = this.instances.instance(0);
				
				//delete the removed attributes from the raw instance before attaching the filtered dataset
				for (int i = removeAttributes.length-1; i >= 0 ; i--) {
					rawInstance.deleteAttributeAt(removeAttributes[i]);	
				}
				//set adjusted dataset for instance
				rawInstance.setDataset(filteredDataset);

				if (normalizeOption.isSet() && valuesMinMaxDiff != null) {
					for (int i = 0; i < rawInstance.numAttributes() ; i++) {
						if (valuesMinMaxDiff.get(i)[2] != 1 &&		// skip if diff == 1 (already normalized)
							valuesMinMaxDiff.get(i)[2] != 0 &&		// skip if diff == 0 (constant attribute, avoids division by zero)
							i != rawInstance.classIndex()) {		// the class label is not normalized
							// min-max normalization: v' = (v - min) / (max - min)
							double v = rawInstance.value(i);
							v = (v - valuesMinMaxDiff.get(i)[0]) / valuesMinMaxDiff.get(i)[2];
							rawInstance.setValue(i, v);
						}
					}
				}
				
                this.lastInstanceRead = new InstanceExample(rawInstance);
				this.instances.delete(); // keep instances clean
				this.numInstancesRead++;
				return true;
			}
			if (this.fileReader != null) {
				this.fileReader.close();
				this.fileReader = null;
			}
			return false;
		} catch (IOException ioe) {
			throw new RuntimeException(
					"ArffFileStream failed to read instance from stream.", ioe);
		}
	}
	
	/**
	 * @param ignoredAttributes Attributes that will be ignored
	 * @return A list with min/max and diff=max-min values per attribute of the arff file 
	 */
	protected ArrayList<Double[]> readMinMaxDiffValues(HashSet<Integer> ignoredAttributes) {
		ArrayList<Double[]> valuesMinMaxDiff = null;
		
		if(ignoredAttributes == null)
			ignoredAttributes = new HashSet<Integer>();
		
		try {
			InputStream fileStream = new FileInputStream(arffFileOption.getFile());
			InputStreamProgressMonitor fileProgressMonitor = new InputStreamProgressMonitor(fileStream);
			Reader fileReader = new BufferedReader(new InputStreamReader(fileProgressMonitor));
            Instances instances = new Instances(fileReader, 1, this.classIndexOption.getValue());

			valuesMinMaxDiff = new ArrayList<Double[]>();
			for (int i = 0; i < instances.numAttributes()-ignoredAttributes.size(); i++) {
				Double[] values =  {Double.POSITIVE_INFINITY,Double.NEGATIVE_INFINITY,0.0};
				valuesMinMaxDiff.add(values);
			}
			
			System.out.print("Reading arff file for normalization...");
			int counter = 0;
			while (instances.readInstance(fileReader)) {
				Instance instance = instances.instance(0);
				int a = 0;
				for (int i = 0; i < instances.numAttributes(); i++) {
					if(!ignoredAttributes.contains(i)){
						double value = instance.value(i);
						if(value < valuesMinMaxDiff.get(a)[0])
							valuesMinMaxDiff.get(a)[0] = value;
						if(value > valuesMinMaxDiff.get(a)[1])
							valuesMinMaxDiff.get(a)[1] = value;
						a++;
					}
				}
				instances.delete();

				//show some progress
				counter++;
				if(counter >= 10000){
					counter = 0;
					System.out.print(".");
				}
			}
			if (fileReader != null) {
				fileReader.close();
				fileReader = null;
			}
			System.out.println("done!");

			for (int i = 0; i < valuesMinMaxDiff.size(); i++) {
				valuesMinMaxDiff.get(i)[2]=valuesMinMaxDiff.get(i)[1]-valuesMinMaxDiff.get(i)[0];
			}

			return valuesMinMaxDiff;
		} catch (IOException ioe) {
			throw new RuntimeException(
					"ArffFileStream failed to read instance from stream.", ioe);
		}
	}	
	

	public void getDescription(StringBuilder sb, int indent) {
		// TODO Auto-generated method stub

	}

}
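
For orientation, here is a minimal sketch of driving this stream from code. It assumes MOA's usual AbstractOptionHandler.prepareForUse() lifecycle and the getData() accessor on InstanceExample; the file name and the demo class are hypothetical placeholders, not part of MOA.

import com.yahoo.labs.samoa.instances.Instance;
import moa.streams.clustering.FileStream;

public class FileStreamDemo {
    public static void main(String[] args) {
        FileStream stream = new FileStream();
        stream.arffFileOption.setValue("data.arff"); // hypothetical ARFF file path
        stream.normalizeOption.set();                // scan the file once, scale numeric attributes to 0-1
        stream.prepareForUse();                      // triggers prepareForUseImpl(), which calls restart()

        while (stream.hasMoreInstances()) {
            Instance inst = stream.nextInstance().getData();
            // feed inst into a clusterer or evaluator here
            System.out.println(inst);
        }
    }
}

Note that nextInstance() hands back the instance buffered by the previous read, which is why restart() pre-fetches the first instance before the loop starts.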