
cc.mallet.classify.examples.DocumentClassifier Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of mallet Show documentation
Show all versions of mallet Show documentation
MALLET is a Java-based package for statistical natural language processing,
document classification, clustering, topic modeling, information extraction,
and other machine learning applications to text.
The newest version!
/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
http://www.cs.umass.edu/~mccallum/mallet
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org. For further
information, see the file `LICENSE' included with this distribution. */
/**
Takes a list of directory names as arguments (each directory
should contain all the text files for one class), performs a random train/test split,
trains a classifier, and outputs accuracy on the testing and training sets.
@author Andrew McCallum <a href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a>
*/
package cc.mallet.classify.examples;
import java.io.*;
import cc.mallet.classify.*;
import cc.mallet.pipe.*;
import cc.mallet.pipe.iterator.*;
import cc.mallet.types.*;
/**
 * Command-line example that builds a document classifier from directories of
 * text files: every directory named on the command line supplies the documents
 * of one class. The documents are converted to feature vectors, split into
 * training and testing halves, and a Naive Bayes classifier trained on the
 * first half is scored on both halves.
 */
public class DocumentClassifier
{
    /**
     * Runs the example end to end and prints the training and testing accuracy
     * to standard output.
     *
     * @param args directory names, one per class; each directory's name is used
     *             as the class label for the files it contains. If no directory
     *             is given, a usage message is printed to standard error and the
     *             method returns without training anything.
     */
    public static void main (String[] args)
    {
        // With no directories there are no documents to read, so training and
        // accuracy figures would be meaningless; fail fast with a usage hint.
        if (args == null || args.length == 0) {
            System.err.println ("Usage: java cc.mallet.classify.examples.DocumentClassifier"
                                + " <class-directory> [<class-directory> ...]");
            return;
        }

        // Create Java File objects for each of the arguments
        File[] directories = new File[args.length];
        for (int i = 0; i < args.length; i++) {
            directories[i] = new File (args[i]);
        }

        // Create the pipeline that will take as input {data = File, target = String for classname}
        // and turn them into {data = FeatureVector, target = Label}
        Pipe instancePipe = new SerialPipes (new Pipe[] {
            new Target2Label (),                                // Target String -> class label
            new Input2CharSequence (),                          // Data File -> String containing contents
            new CharSubsequence (CharSubsequence.SKIP_HEADER),  // Remove UseNet or email header
            new CharSequence2TokenSequence (),                  // Data String -> TokenSequence
            new TokenSequenceLowercase (),                      // TokenSequence words lowercased
            new TokenSequenceRemoveStopwords (),                // Remove stopwords from sequence
            new TokenSequence2FeatureSequence (),               // Replace each Token with a feature index
            new FeatureSequence2FeatureVector (),               // Collapse word order into a "feature vector"
            // NOTE(review): judging by its name this pipe echoes every instance's
            // input and target as it passes through -- confirm before relying on
            // (or removing) the extra console output.
            new PrintInputAndTarget (),
        });

        // Create an empty list of the training instances
        InstanceList ilist = new InstanceList (instancePipe);

        // Add all the files in the directories to the list of instances.
        // The Instance that goes into the beginning of the instancePipe
        // will have a File in the "data" slot, and a string from args[] in the "target" slot.
        ilist.addThruPipe (new FileIterator (directories, FileIterator.STARTING_DIRECTORIES));

        // Make a test/train split; ilists[0] will be for training; ilists[1] will be for testing
        InstanceList[] ilists = ilist.split (new double[] {.5, .5});

        // Create a classifier trainer, and use it to create a classifier.
        // The trainer is declared by its concrete class so no raw generic type is needed.
        NaiveBayesTrainer naiveBayesTrainer = new NaiveBayesTrainer ();
        Classifier classifier = naiveBayesTrainer.train (ilists[0]);

        System.out.println ("The training accuracy is "+ classifier.getAccuracy (ilists[0]));
        System.out.println ("The testing accuracy is "+ classifier.getAccuracy (ilists[1]));
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy