All Downloads are FREE. Search and download functionalities are using the official Maven repository.

weka.core.converters.TextDirectoryLoader Maven / Gradle / Ivy

/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */

/*
 * TextDirectoryLoader.java
 * Copyright (C) 2006-2012 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.core.converters;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.LinkedList;
import java.util.List;
import java.util.Vector;

import weka.core.Attribute;
import weka.core.CommandlineRunnable;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.RevisionUtils;
import weka.core.SerializedObject;
import weka.core.Utils;

/**
 * Loads all text files in a directory and uses the
 * subdirectory names as class labels. The content of the text files will be
 * stored in a String attribute, the filename can be stored as well.
 * <p>
 * Valid options are:
 * </p>
 *
 * <pre>
 * -D
 *  Enables debug output.
 *  (default: off)
 * </pre>
 *
 * <pre>
 * -F
 *  Stores the filename in an additional attribute.
 *  (default: off)
 * </pre>
 *
 * <pre>
 * -dir &lt;directory&gt;
 *  The directory to work on.
 *  (default: current directory)
 * </pre>
 *
 * <pre>
 * -charset &lt;charset name&gt;
 *  The character set to use, e.g. UTF-8.
 *  (default: use the default character set)
 * </pre>
 *
 * <pre>
 * -R
 *  Retain all string attribute values when reading incrementally.
 * </pre>
 *
* * * * Based on code from the TextDirectoryToArff tool: * * * @author Ashraf M. Kibriya (amk14 at cs.waikato.ac.nz) * @author Richard Kirkby (rkirkby at cs.waikato.ac.nz) * @author fracpete (fracpete at waikato dot ac dot nz) * @version $Revision: 12184 $ * @see Loader */ public class TextDirectoryLoader extends AbstractLoader implements BatchConverter, IncrementalConverter, OptionHandler, CommandlineRunnable { /** for serialization */ private static final long serialVersionUID = 2592118773712247647L; /** Holds the determined structure (header) of the data set. */ protected Instances m_structure = null; /** Holds the source of the data set. */ protected File m_sourceFile = new File(System.getProperty("user.dir")); /** whether to print some debug information */ protected boolean m_Debug = false; /** whether to include the filename as an extra attribute */ protected boolean m_OutputFilename = false; /** * The charset to use when loading text files (default is to just use the * default charset). */ protected String m_charSet = ""; /** * default constructor */ public TextDirectoryLoader() { // No instances retrieved yet setRetrieval(NONE); } /** * Returns a string describing this loader * * @return a description of the evaluator suitable for displaying in the * explorer/experimenter gui */ public String globalInfo() { return "Loads all text files in a directory and uses the subdirectory names " + "as class labels. The content of the text files will be stored in a " + "String attribute, the filename can be stored as well."; } /** * Lists the available options * * @return an enumeration of the available options */ @Override public Enumeration