
cc.mallet.topics.tui.Vectors2Topics Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of mallet Show documentation
Show all versions of mallet Show documentation
MALLET is a Java-based package for statistical natural language processing,
document classification, clustering, topic modeling, information extraction,
and other machine learning applications to text.
The newest version!
/* Copyright (C) 2005 Univ. of Massachusetts Amherst, Computer Science Dept.
This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
http://www.cs.umass.edu/~mccallum/mallet
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org. For further
information, see the file `LICENSE' included with this distribution. */
package cc.mallet.topics.tui;
import cc.mallet.util.CommandOption;
import cc.mallet.util.Randoms;
import cc.mallet.types.InstanceList;
import cc.mallet.types.FeatureSequence;
import cc.mallet.topics.*;
import cc.mallet.pipe.iterator.DBInstanceIterator;
import java.io.*;
/** Perform topic analysis in the style of LDA and its variants.
* @author Andrew McCallum
*/
public class Vectors2Topics {
// Each CommandOption below both declares a command-line flag and holds its parsed
// value in a static field; CommandOption.process() fills them in from args in main().
// ---- Input options: where training instances come from ----
static CommandOption.String inputFile = new CommandOption.String
(Vectors2Topics.class, "input", "FILENAME", true, null,
"The filename from which to read the list of training instances. Use - for stdin. " +
"The instances must be FeatureSequence or FeatureSequenceWithBigrams, not FeatureVector", null);
// One instance list per language; used only by the polylingual branch of main().
static CommandOption.SpacedStrings languageInputFiles = new CommandOption.SpacedStrings
(Vectors2Topics.class, "language-inputs", "FILENAME [FILENAME ...]", true, null,
"Filenames for polylingual topic model. Each language should have its own file, " +
"with the same number of instances in each file. If a document is missing in " +
"one language, there should be an empty instance.", null);
// NOTE(review): declared but never read in the visible code — the only use is the
// commented-out setTestingInstances() call below.
static CommandOption.String testingFile = new CommandOption.String
(Vectors2Topics.class, "testing", "FILENAME", false, null,
"The filename from which to read the list of instances for empirical likelihood calculation. Use - for stdin. " +
"The instances must be FeatureSequence or FeatureSequenceWithBigrams, not FeatureVector", null);
// ---- Model serialization options (read/write binary model files) ----
static CommandOption.String outputModelFilename = new CommandOption.String
(Vectors2Topics.class, "output-model", "FILENAME", true, null,
"The filename in which to write the binary topic model at the end of the iterations. " +
"By default this is null, indicating that no file will be written.", null);
// Only supported by the plain-LDA branch; PAM and N-gram branches reject it explicitly.
static CommandOption.String inputModelFilename = new CommandOption.String
(Vectors2Topics.class, "input-model", "FILENAME", true, null,
"The filename from which to read the binary topic model to which the --input will be appended, " +
"allowing incremental training. " +
"By default this is null, indicating that no file will be read.", null);
static CommandOption.String inferencerFilename = new CommandOption.String
(Vectors2Topics.class, "inferencer-filename", "FILENAME", true, null,
"A topic inferencer applies a previously trained topic model to new documents. " +
"By default this is null, indicating that no file will be written.", null);
static CommandOption.String evaluatorFilename = new CommandOption.String
(Vectors2Topics.class, "evaluator-filename", "FILENAME", true, null,
"A held-out likelihood evaluator for new documents. " +
"By default this is null, indicating that no file will be written.", null);
// ---- Diagnostic/report output options (text and XML) ----
static CommandOption.String stateFile = new CommandOption.String
(Vectors2Topics.class, "output-state", "FILENAME", true, null,
"The filename in which to write the Gibbs sampling state after at the end of the iterations. " +
"By default this is null, indicating that no file will be written.", null);
static CommandOption.String topicKeysFile = new CommandOption.String
(Vectors2Topics.class, "output-topic-keys", "FILENAME", true, null,
"The filename in which to write the top words for each topic and any Dirichlet parameters. " +
"By default this is null, indicating that no file will be written.", null);
static CommandOption.String topicWordWeightsFile = new CommandOption.String
(Vectors2Topics.class, "topic-word-weights-file", "FILENAME", true, null,
"The filename in which to write unnormalized weights for every topic and word type. " +
"By default this is null, indicating that no file will be written.", null);
static CommandOption.String wordTopicCountsFile = new CommandOption.String
(Vectors2Topics.class, "word-topic-counts-file", "FILENAME", true, null,
"The filename in which to write a sparse representation of topic-word assignments. " +
"By default this is null, indicating that no file will be written.", null);
static CommandOption.String topicReportXMLFile = new CommandOption.String
(Vectors2Topics.class, "xml-topic-report", "FILENAME", true, null,
"The filename in which to write the top words for each topic and any Dirichlet parameters in XML format. " +
"By default this is null, indicating that no file will be written.", null);
static CommandOption.String topicPhraseReportXMLFile = new CommandOption.String
(Vectors2Topics.class, "xml-topic-phrase-report", "FILENAME", true, null,
"The filename in which to write the top words and phrases for each topic and any Dirichlet parameters in XML format. " +
"By default this is null, indicating that no file will be written.", null);
static CommandOption.String docTopicsFile = new CommandOption.String
(Vectors2Topics.class, "output-doc-topics", "FILENAME", true, null,
"The filename in which to write the topic proportions per document, at the end of the iterations. " +
"By default this is null, indicating that no file will be written.", null);
// Filters applied when printing per-document topic proportions.
static CommandOption.Double docTopicsThreshold = new CommandOption.Double
(Vectors2Topics.class, "doc-topics-threshold", "DECIMAL", true, 0.0,
"When writing topic proportions per document with --output-doc-topics, " +
"do not print topics with proportions less than this threshold value.", null);
static CommandOption.Integer docTopicsMax = new CommandOption.Integer
(Vectors2Topics.class, "doc-topics-max", "INTEGER", true, -1,
"When writing topic proportions per document with --output-doc-topics, " +
"do not print more than INTEGER number of topics. "+
"A negative value indicates that all topics should be printed.", null);
// ---- Sampler configuration: sizes, iterations, reporting cadence ----
static CommandOption.Integer numTopics = new CommandOption.Integer
(Vectors2Topics.class, "num-topics", "INTEGER", true, 10,
"The number of topics to fit.", null);
static CommandOption.Integer numThreads = new CommandOption.Integer
(Vectors2Topics.class, "num-threads", "INTEGER", true, 1,
"The number of threads for parallel training.", null);
static CommandOption.Integer numIterations = new CommandOption.Integer
(Vectors2Topics.class, "num-iterations", "INTEGER", true, 1000,
"The number of iterations of Gibbs sampling.", null);
// 0 is a sentinel meaning "seed from the clock" (see the Randoms construction in main).
static CommandOption.Integer randomSeed = new CommandOption.Integer
(Vectors2Topics.class, "random-seed", "INTEGER", true, 0,
"The random seed for the Gibbs sampler. Default is 0, which will use the clock.", null);
static CommandOption.Integer topWords = new CommandOption.Integer
(Vectors2Topics.class, "num-top-words", "INTEGER", true, 20,
"The number of most probable words to print for each topic after model estimation.", null);
static CommandOption.Integer showTopicsInterval = new CommandOption.Integer
(Vectors2Topics.class, "show-topics-interval", "INTEGER", true, 50,
"The number of iterations between printing a brief summary of the topics so far.", null);
// An interval of 0 disables periodic model/state checkpointing.
static CommandOption.Integer outputModelInterval = new CommandOption.Integer
(Vectors2Topics.class, "output-model-interval", "INTEGER", true, 0,
"The number of iterations between writing the model (and its Gibbs sampling state) to a binary file. " +
"You must also set the --output-model to use this option, whose argument will be the prefix of the filenames.", null);
static CommandOption.Integer outputStateInterval = new CommandOption.Integer
(Vectors2Topics.class, "output-state-interval", "INTEGER", true, 0,
"The number of iterations between writing the sampling state to a text file. " +
"You must also set the --output-state to use this option, whose argument will be the prefix of the filenames.", null);
// ---- Hyperparameter optimization ----
static CommandOption.Integer optimizeInterval = new CommandOption.Integer
(Vectors2Topics.class, "optimize-interval", "INTEGER", true, 0,
"The number of iterations between reestimating dirichlet hyperparameters.", null);
static CommandOption.Integer optimizeBurnIn = new CommandOption.Integer
(Vectors2Topics.class, "optimize-burn-in", "INTEGER", true, 200,
"The number of iterations to run before first estimating dirichlet hyperparameters.", null);
static CommandOption.Boolean useSymmetricAlpha = new CommandOption.Boolean
(Vectors2Topics.class, "use-symmetric-alpha", "true|false", false, false,
"Only optimize the concentration parameter of the prior over document-topic distributions. This may reduce the number of very small, poorly estimated topics, but may disperse common words over several topics.", null);
// ---- Model-family selection (mutually exclusive alternatives to plain LDA) ----
static CommandOption.Boolean useNgrams = new CommandOption.Boolean
(Vectors2Topics.class, "use-ngrams", "true|false", false, false,
"Rather than using LDA, use Topical-N-Grams, which models phrases.", null);
static CommandOption.Boolean usePAM = new CommandOption.Boolean
(Vectors2Topics.class, "use-pam", "true|false", false, false,
"Rather than using LDA, use Pachinko Allocation Model, which models topical correlations." +
"You cannot do this and also --use-ngrams.", null);
// ---- Dirichlet smoothing hyperparameters ----
static CommandOption.Double alpha = new CommandOption.Double
(Vectors2Topics.class, "alpha", "DECIMAL", true, 50.0,
"Alpha parameter: smoothing over topic distribution.",null);
static CommandOption.Double beta = new CommandOption.Double
(Vectors2Topics.class, "beta", "DECIMAL", true, 0.01,
"Beta parameter: smoothing over unigram distribution.",null);
// gamma/delta/delta1/delta2 are only consumed by the Topical-N-Grams branch.
static CommandOption.Double gamma = new CommandOption.Double
(Vectors2Topics.class, "gamma", "DECIMAL", true, 0.01,
"Gamma parameter: smoothing over bigram distribution",null);
static CommandOption.Double delta = new CommandOption.Double
(Vectors2Topics.class, "delta", "DECIMAL", true, 0.03,
"Delta parameter: smoothing over choice of unigram/bigram",null);
static CommandOption.Double delta1 = new CommandOption.Double
(Vectors2Topics.class, "delta1", "DECIMAL", true, 0.2,
"Topic N-gram smoothing parameter",null);
static CommandOption.Double delta2 = new CommandOption.Double
(Vectors2Topics.class, "delta2", "DECIMAL", true, 1000.0,
"Topic N-gram smoothing parameter",null);
// ---- PAM-specific topology options ----
static CommandOption.Integer pamNumSupertopics = new CommandOption.Integer
(Vectors2Topics.class, "pam-num-supertopics", "INTEGER", true, 10,
"When using the Pachinko Allocation Model (PAM) set the number of supertopics. " +
"Typically this is about half the number of subtopics, although more may help.", null);
static CommandOption.Integer pamNumSubtopics = new CommandOption.Integer
(Vectors2Topics.class, "pam-num-subtopics", "INTEGER", true, 20,
"When using the Pachinko Allocation Model (PAM) set the number of subtopics.", null);
/**
 * Entry point. Parses the command line, then dispatches to one of four
 * model families in priority order: PAM (--use-pam), Topical-N-Grams
 * (--use-ngrams), polylingual (--language-inputs), or the default
 * ParallelTopicModel LDA. Each branch trains the model and writes any
 * requested report/serialization files.
 *
 * @param args command-line arguments, parsed into the static CommandOptions above
 * @throws java.io.IOException if any of the requested output files cannot be written
 */
public static void main (String[] args) throws java.io.IOException
{
// Process the command-line options
CommandOption.setSummary (Vectors2Topics.class,
"A tool for estimating, saving and printing diagnostics for topic models, such as LDA.");
CommandOption.process (Vectors2Topics.class, args);
// ---- Branch 1: Pachinko Allocation Model ----
if (usePAM.value) {
InstanceList ilist = InstanceList.load (new File(inputFile.value));
System.out.println ("Data loaded.");
// Incremental training from a saved model is only implemented for plain LDA.
if (inputModelFilename.value != null)
throw new IllegalArgumentException ("--input-model not supported with --use-pam.");
PAM4L pam = new PAM4L(pamNumSupertopics.value, pamNumSubtopics.value);
// The hyperparameter-optimization interval is hard-coded to 50 here rather
// than taken from --optimize-interval.
pam.estimate (ilist, numIterations.value, /*optimizeModelInterval*/50,
showTopicsInterval.value,
outputModelInterval.value, outputModelFilename.value,
// seed 0 is the sentinel for "use the clock" (default Randoms constructor)
randomSeed.value == 0 ? new Randoms() : new Randoms(randomSeed.value));
pam.printTopWords(topWords.value, true);
if (stateFile.value != null)
pam.printState (new File(stateFile.value));
if (docTopicsFile.value != null) {
PrintWriter out = new PrintWriter (new FileWriter ((new File(docTopicsFile.value))));
pam.printDocumentTopics (out, docTopicsThreshold.value, docTopicsMax.value);
out.close();
}
// Serialize the trained model via standard Java serialization.
if (outputModelFilename.value != null) {
assert (pam != null);
try {
ObjectOutputStream oos = new ObjectOutputStream (new FileOutputStream (outputModelFilename.value));
oos.writeObject (pam);
oos.close();
} catch (Exception e) {
e.printStackTrace();
throw new IllegalArgumentException ("Couldn't write topic model to filename "+outputModelFilename.value);
}
}
}
// ---- Branch 2: Topical-N-Grams (phrase-aware topic model) ----
else if (useNgrams.value) {
InstanceList ilist = InstanceList.load (new File(inputFile.value));
System.out.println ("Data loaded.");
if (inputModelFilename.value != null)
throw new IllegalArgumentException ("--input-model not supported with --use-ngrams.");
TopicalNGrams tng = new TopicalNGrams(numTopics.value,
alpha.value,
beta.value,
gamma.value,
delta.value,
delta1.value,
delta2.value);
tng.estimate (ilist, numIterations.value, showTopicsInterval.value,
outputModelInterval.value, outputModelFilename.value,
randomSeed.value == 0 ? new Randoms() : new Randoms(randomSeed.value));
tng.printTopWords(topWords.value, true);
if (stateFile.value != null)
tng.printState (new File(stateFile.value));
if (docTopicsFile.value != null) {
PrintWriter out = new PrintWriter (new FileWriter ((new File(docTopicsFile.value))));
tng.printDocumentTopics (out, docTopicsThreshold.value, docTopicsMax.value);
out.close();
}
if (outputModelFilename.value != null) {
assert (tng != null);
try {
ObjectOutputStream oos = new ObjectOutputStream (new FileOutputStream (outputModelFilename.value));
oos.writeObject (tng);
oos.close();
} catch (Exception e) {
e.printStackTrace();
throw new IllegalArgumentException ("Couldn't write topic model to filename "+outputModelFilename.value);
}
}
}
// ---- Branch 3: polylingual topic model (one instance list per language) ----
else if (languageInputFiles.value != null) {
// Start a new polylingual topic model
PolylingualTopicModel topicModel = null;
int numLanguages = languageInputFiles.value.length;
InstanceList[] training = new InstanceList[ languageInputFiles.value.length ];
for (int i=0; i < training.length; i++) {
training[i] = InstanceList.load(new File(languageInputFiles.value[i]));
// Debug output: report which per-language lists loaded successfully.
if (training[i] != null) { System.out.println(i + " is not null"); }
else { System.out.println(i + " is null"); }
}
System.out.println ("Data loaded.");
// For historical reasons we currently only support FeatureSequence data,
// not the FeatureVector, which is the default for the input functions.
// Provide a warning to avoid ClassCastExceptions.
// NOTE: only the first language's first instance is checked here.
if (training[0].size() > 0 &&
training[0].get(0) != null) {
Object data = training[0].get(0).getData();
if (! (data instanceof FeatureSequence)) {
System.err.println("Topic modeling currently only supports feature sequences: use --keep-sequence option when importing data.");
System.exit(1);
}
}
topicModel = new PolylingualTopicModel (numTopics.value, alpha.value);
if (randomSeed.value != 0) {
topicModel.setRandomSeed(randomSeed.value);
}
topicModel.addInstances(training);
topicModel.setTopicDisplay(showTopicsInterval.value, topWords.value);
topicModel.setNumIterations(numIterations.value);
topicModel.setOptimizeInterval(optimizeInterval.value);
topicModel.setBurninPeriod(optimizeBurnIn.value);
// Periodic checkpointing only when an interval was explicitly requested.
if (outputStateInterval.value != 0) {
topicModel.setSaveState(outputStateInterval.value, stateFile.value);
}
if (outputModelInterval.value != 0) {
topicModel.setModelOutput(outputModelInterval.value, outputModelFilename.value);
}
topicModel.estimate();
if (topicKeysFile.value != null) {
topicModel.printTopWords(new File(topicKeysFile.value), topWords.value, false);
}
if (stateFile.value != null) {
topicModel.printState (new File(stateFile.value));
}
if (docTopicsFile.value != null) {
PrintWriter out = new PrintWriter (new FileWriter ((new File(docTopicsFile.value))));
topicModel.printDocumentTopics(out, docTopicsThreshold.value, docTopicsMax.value);
out.close();
}
// One serialized inferencer per language, written to "<name>.<languageIndex>".
if (inferencerFilename.value != null) {
try {
for (int language = 0; language < numLanguages; language++) {
ObjectOutputStream oos =
new ObjectOutputStream(new FileOutputStream(inferencerFilename.value + "." + language));
oos.writeObject(topicModel.getInferencer(language));
oos.close();
}
} catch (Exception e) {
// Non-fatal: inferencer output failures are reported but do not abort.
System.err.println(e.getMessage());
}
}
if (outputModelFilename.value != null) {
assert (topicModel != null);
try {
ObjectOutputStream oos =
new ObjectOutputStream (new FileOutputStream (outputModelFilename.value));
oos.writeObject (topicModel);
oos.close();
} catch (Exception e) {
e.printStackTrace();
throw new IllegalArgumentException ("Couldn't write topic model to filename "+outputModelFilename.value);
}
}
}
// ---- Branch 4 (default): standard LDA via ParallelTopicModel ----
else {
// Start a new LDA topic model
ParallelTopicModel topicModel = null;
// Either resume from a serialized model (--input-model) ...
if (inputModelFilename.value != null) {
try {
topicModel = ParallelTopicModel.read(new File(inputModelFilename.value));
} catch (Exception e) {
System.err.println("Unable to restore saved topic model " +
inputModelFilename.value + ": " + e);
System.exit(1);
}
/*
// Loading new data is optional if we are restoring a saved state.
if (inputFile.value != null) {
InstanceList instances = InstanceList.load (new File(inputFile.value));
System.out.println ("Data loaded.");
lda.addInstances(instances);
}
*/
}
// ... or build a fresh model from --input data.
else {
InstanceList training = null;
try {
// A "db:" prefix routes loading through the database-backed iterator
// instead of a serialized InstanceList file.
if (inputFile.value.startsWith("db:")) {
training = DBInstanceIterator.getInstances(inputFile.value.substring(3));
}
else {
training = InstanceList.load (new File(inputFile.value));
}
} catch (Exception e) {
System.err.println("Unable to restore instance list " +
inputFile.value + ": " + e);
System.exit(1);
}
System.out.println ("Data loaded.");
// Same FeatureSequence sanity check as the polylingual branch (first instance only).
if (training.size() > 0 &&
training.get(0) != null) {
Object data = training.get(0).getData();
if (! (data instanceof FeatureSequence)) {
System.err.println("Topic modeling currently only supports feature sequences: use --keep-sequence option when importing data.");
System.exit(1);
}
}
topicModel = new ParallelTopicModel (numTopics.value, alpha.value, beta.value);
if (randomSeed.value != 0) {
topicModel.setRandomSeed(randomSeed.value);
}
topicModel.addInstances(training);
}
// Common configuration for both the restored and the fresh model.
topicModel.setTopicDisplay(showTopicsInterval.value, topWords.value);
/*
if (testingFile.value != null) {
topicModel.setTestingInstances( InstanceList.load(new File(testingFile.value)) );
}
*/
topicModel.setNumIterations(numIterations.value);
topicModel.setOptimizeInterval(optimizeInterval.value);
topicModel.setBurninPeriod(optimizeBurnIn.value);
topicModel.setSymmetricAlpha(useSymmetricAlpha.value);
if (outputStateInterval.value != 0) {
topicModel.setSaveState(outputStateInterval.value, stateFile.value);
}
if (outputModelInterval.value != 0) {
topicModel.setSaveSerializedModel(outputModelInterval.value, outputModelFilename.value);
}
topicModel.setNumThreads(numThreads.value);
// Run the Gibbs sampler.
topicModel.estimate();
// ---- Post-training reports, each gated on its own output option ----
if (topicKeysFile.value != null) {
topicModel.printTopWords(new File(topicKeysFile.value), topWords.value, false);
}
if (topicReportXMLFile.value != null) {
PrintWriter out = new PrintWriter(topicReportXMLFile.value);
topicModel.topicXMLReport(out, topWords.value);
out.close();
}
if (topicPhraseReportXMLFile.value != null) {
PrintWriter out = new PrintWriter(topicPhraseReportXMLFile.value);
topicModel.topicPhraseXMLReport(out, topWords.value);
out.close();
}
if (stateFile.value != null) {
topicModel.printState (new File(stateFile.value));
}
if (docTopicsFile.value != null) {
PrintWriter out = new PrintWriter (new FileWriter ((new File(docTopicsFile.value))));
topicModel.printDocumentTopics(out, docTopicsThreshold.value, docTopicsMax.value);
out.close();
}
if (topicWordWeightsFile.value != null) {
topicModel.printTopicWordWeights(new File (topicWordWeightsFile.value));
}
if (wordTopicCountsFile.value != null) {
topicModel.printTypeTopicCounts(new File (wordTopicCountsFile.value));
}
// Serialization failures here are fatal (rethrown); inferencer/evaluator
// failures below are only logged.
if (outputModelFilename.value != null) {
assert (topicModel != null);
try {
ObjectOutputStream oos =
new ObjectOutputStream (new FileOutputStream (outputModelFilename.value));
oos.writeObject (topicModel);
oos.close();
} catch (Exception e) {
e.printStackTrace();
throw new IllegalArgumentException ("Couldn't write topic model to filename "+outputModelFilename.value);
}
}
if (inferencerFilename.value != null) {
try {
ObjectOutputStream oos =
new ObjectOutputStream(new FileOutputStream(inferencerFilename.value));
oos.writeObject(topicModel.getInferencer());
oos.close();
} catch (Exception e) {
System.err.println(e.getMessage());
}
}
if (evaluatorFilename.value != null) {
try {
ObjectOutputStream oos =
new ObjectOutputStream(new FileOutputStream(evaluatorFilename.value));
oos.writeObject(topicModel.getProbEstimator());
oos.close();
} catch (Exception e) {
System.err.println(e.getMessage());
}
}
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy