Please wait. This can take some minutes ...
Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance.
Project price only 1 $
You can buy this project and download/modify it how often you want.
org.apache.mahout.clustering.lda.cvb.InMemoryCollapsedVariationalBayes0 Maven / Gradle / Ivy
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.mahout.clustering.lda.cvb;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.commons.cli2.CommandLine;
import org.apache.commons.cli2.Group;
import org.apache.commons.cli2.Option;
import org.apache.commons.cli2.OptionException;
import org.apache.commons.cli2.builder.ArgumentBuilder;
import org.apache.commons.cli2.builder.DefaultOptionBuilder;
import org.apache.commons.cli2.builder.GroupBuilder;
import org.apache.commons.cli2.commandline.Parser;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.CommandLineUtil;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.RandomUtils;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.apache.mahout.common.iterator.sequencefile.PathFilters;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
import org.apache.mahout.math.DenseMatrix;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.DistributedRowMatrixWriter;
import org.apache.mahout.math.Matrix;
import org.apache.mahout.math.NamedVector;
import org.apache.mahout.math.SparseRowMatrix;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Runs the same algorithm as {@link CVB0Driver}, but sequentially, in memory. Memory requirements
* are currently: the entire corpus is read into RAM, two copies of the model (each of size
* numTerms * numTopics), and another matrix of size numDocs * numTopics is held in memory
* (to store p(topic|doc) for all docs).
*
* But if all this fits in memory, this can be significantly faster than an iterative MR job.
*/
public class InMemoryCollapsedVariationalBayes0 extends AbstractJob {
private static final Logger log = LoggerFactory.getLogger(InMemoryCollapsedVariationalBayes0.class);
private int numTopics;
private int numTerms;
private int numDocuments;
private double alpha;
private double eta;
//private int minDfCt;
//private double maxDfPct;
private boolean verbose = false;
private String[] terms; // of length numTerms;
private Matrix corpusWeights; // length numDocs;
private double totalCorpusWeight;
private double initialModelCorpusFraction;
private Matrix docTopicCounts;
private int numTrainingThreads;
private int numUpdatingThreads;
private ModelTrainer modelTrainer;
private InMemoryCollapsedVariationalBayes0() {
// only for main usage
}
public void setVerbose(boolean verbose) {
this.verbose = verbose;
}
public InMemoryCollapsedVariationalBayes0(Matrix corpus,
String[] terms,
int numTopics,
double alpha,
double eta,
int numTrainingThreads,
int numUpdatingThreads,
double modelCorpusFraction) {
//this.seed = seed;
this.numTopics = numTopics;
this.alpha = alpha;
this.eta = eta;
//this.minDfCt = 0;
//this.maxDfPct = 1.0f;
corpusWeights = corpus;
numDocuments = corpus.numRows();
this.terms = terms;
this.initialModelCorpusFraction = modelCorpusFraction;
numTerms = terms != null ? terms.length : corpus.numCols();
Map termIdMap = new HashMap<>();
if (terms != null) {
for (int t = 0; t < terms.length; t++) {
termIdMap.put(terms[t], t);
}
}
this.numTrainingThreads = numTrainingThreads;
this.numUpdatingThreads = numUpdatingThreads;
postInitCorpus();
initializeModel();
}
private void postInitCorpus() {
totalCorpusWeight = 0;
int numNonZero = 0;
for (int i = 0; i < numDocuments; i++) {
Vector v = corpusWeights.viewRow(i);
double norm;
if (v != null && (norm = v.norm(1)) != 0) {
numNonZero += v.getNumNondefaultElements();
totalCorpusWeight += norm;
}
}
String s = "Initializing corpus with %d docs, %d terms, %d nonzero entries, total termWeight %f";
log.info(String.format(s, numDocuments, numTerms, numNonZero, totalCorpusWeight));
}
private void initializeModel() {
TopicModel topicModel = new TopicModel(numTopics, numTerms, eta, alpha, RandomUtils.getRandom(), terms,
numUpdatingThreads, initialModelCorpusFraction == 0 ? 1 : initialModelCorpusFraction * totalCorpusWeight);
topicModel.setConf(getConf());
TopicModel updatedModel = initialModelCorpusFraction == 0
? new TopicModel(numTopics, numTerms, eta, alpha, null, terms, numUpdatingThreads, 1)
: topicModel;
updatedModel.setConf(getConf());
docTopicCounts = new DenseMatrix(numDocuments, numTopics);
docTopicCounts.assign(1.0 / numTopics);
modelTrainer = new ModelTrainer(topicModel, updatedModel, numTrainingThreads, numTopics, numTerms);
}
/*
private void inferDocuments(double convergence, int maxIter, boolean recalculate) {
for (int docId = 0; docId < corpusWeights.numRows() ; docId++) {
Vector inferredDocument = topicModel.infer(corpusWeights.viewRow(docId),
docTopicCounts.viewRow(docId));
// do what now?
}
}
*/
public void trainDocuments() {
trainDocuments(0);
}
public void trainDocuments(double testFraction) {
long start = System.nanoTime();
modelTrainer.start();
for (int docId = 0; docId < corpusWeights.numRows(); docId++) {
if (testFraction == 0 || docId % (1 / testFraction) != 0) {
Vector docTopics = new DenseVector(numTopics).assign(1.0 / numTopics); // docTopicCounts.getRow(docId)
modelTrainer.trainSync(corpusWeights.viewRow(docId), docTopics , true, 10);
}
}
modelTrainer.stop();
logTime("train documents", System.nanoTime() - start);
}
/*
private double error(int docId) {
Vector docTermCounts = corpusWeights.viewRow(docId);
if (docTermCounts == null) {
return 0;
} else {
Vector expectedDocTermCounts =
topicModel.infer(corpusWeights.viewRow(docId), docTopicCounts.viewRow(docId));
double expectedNorm = expectedDocTermCounts.norm(1);
return expectedDocTermCounts.times(docTermCounts.norm(1)/expectedNorm)
.minus(docTermCounts).norm(1);
}
}
private double error() {
long time = System.nanoTime();
double error = 0;
for (int docId = 0; docId < numDocuments; docId++) {
error += error(docId);
}
logTime("error calculation", System.nanoTime() - time);
return error / totalCorpusWeight;
}
*/
public double iterateUntilConvergence(double minFractionalErrorChange,
int maxIterations, int minIter) {
return iterateUntilConvergence(minFractionalErrorChange, maxIterations, minIter, 0);
}
public double iterateUntilConvergence(double minFractionalErrorChange,
int maxIterations, int minIter, double testFraction) {
int iter = 0;
double oldPerplexity = 0;
while (iter < minIter) {
trainDocuments(testFraction);
if (verbose) {
log.info("model after: {}: {}", iter, modelTrainer.getReadModel());
}
log.info("iteration {} complete", iter);
oldPerplexity = modelTrainer.calculatePerplexity(corpusWeights, docTopicCounts,
testFraction);
log.info("{} = perplexity", oldPerplexity);
iter++;
}
double newPerplexity = 0;
double fractionalChange = Double.MAX_VALUE;
while (iter < maxIterations && fractionalChange > minFractionalErrorChange) {
trainDocuments();
if (verbose) {
log.info("model after: {}: {}", iter, modelTrainer.getReadModel());
}
newPerplexity = modelTrainer.calculatePerplexity(corpusWeights, docTopicCounts,
testFraction);
log.info("{} = perplexity", newPerplexity);
iter++;
fractionalChange = Math.abs(newPerplexity - oldPerplexity) / oldPerplexity;
log.info("{} = fractionalChange", fractionalChange);
oldPerplexity = newPerplexity;
}
if (iter < maxIterations) {
log.info(String.format("Converged! fractional error change: %f, error %f",
fractionalChange, newPerplexity));
} else {
log.info(String.format("Reached max iteration count (%d), fractional error change: %f, error: %f",
maxIterations, fractionalChange, newPerplexity));
}
return newPerplexity;
}
public void writeModel(Path outputPath) throws IOException {
modelTrainer.persist(outputPath);
}
private static void logTime(String label, long nanos) {
log.info("{} time: {}ms", label, nanos / 1.0e6);
}
public static int main2(String[] args, Configuration conf) throws Exception {
DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
ArgumentBuilder abuilder = new ArgumentBuilder();
GroupBuilder gbuilder = new GroupBuilder();
Option helpOpt = DefaultOptionCreator.helpOption();
Option inputDirOpt = obuilder.withLongName("input").withRequired(true).withArgument(
abuilder.withName("input").withMinimum(1).withMaximum(1).create()).withDescription(
"The Directory on HDFS containing the collapsed, properly formatted files having "
+ "one doc per line").withShortName("i").create();
Option dictOpt = obuilder.withLongName("dictionary").withRequired(false).withArgument(
abuilder.withName("dictionary").withMinimum(1).withMaximum(1).create()).withDescription(
"The path to the term-dictionary format is ... ").withShortName("d").create();
Option dfsOpt = obuilder.withLongName("dfs").withRequired(false).withArgument(
abuilder.withName("dfs").withMinimum(1).withMaximum(1).create()).withDescription(
"HDFS namenode URI").withShortName("dfs").create();
Option numTopicsOpt = obuilder.withLongName("numTopics").withRequired(true).withArgument(abuilder
.withName("numTopics").withMinimum(1).withMaximum(1)
.create()).withDescription("Number of topics to learn").withShortName("top").create();
Option outputTopicFileOpt = obuilder.withLongName("topicOutputFile").withRequired(true).withArgument(
abuilder.withName("topicOutputFile").withMinimum(1).withMaximum(1).create())
.withDescription("File to write out p(term | topic)").withShortName("to").create();
Option outputDocFileOpt = obuilder.withLongName("docOutputFile").withRequired(true).withArgument(
abuilder.withName("docOutputFile").withMinimum(1).withMaximum(1).create())
.withDescription("File to write out p(topic | docid)").withShortName("do").create();
Option alphaOpt = obuilder.withLongName("alpha").withRequired(false).withArgument(abuilder
.withName("alpha").withMinimum(1).withMaximum(1).withDefault("0.1").create())
.withDescription("Smoothing parameter for p(topic | document) prior").withShortName("a").create();
Option etaOpt = obuilder.withLongName("eta").withRequired(false).withArgument(abuilder
.withName("eta").withMinimum(1).withMaximum(1).withDefault("0.1").create())
.withDescription("Smoothing parameter for p(term | topic)").withShortName("e").create();
Option maxIterOpt = obuilder.withLongName("maxIterations").withRequired(false).withArgument(abuilder
.withName("maxIterations").withMinimum(1).withMaximum(1).withDefault("10").create())
.withDescription("Maximum number of training passes").withShortName("m").create();
Option modelCorpusFractionOption = obuilder.withLongName("modelCorpusFraction")
.withRequired(false).withArgument(abuilder.withName("modelCorpusFraction").withMinimum(1)
.withMaximum(1).withDefault("0.0").create()).withShortName("mcf")
.withDescription("For online updates, initial value of |model|/|corpus|").create();
Option burnInOpt = obuilder.withLongName("burnInIterations").withRequired(false).withArgument(abuilder
.withName("burnInIterations").withMinimum(1).withMaximum(1).withDefault("5").create())
.withDescription("Minimum number of iterations").withShortName("b").create();
Option convergenceOpt = obuilder.withLongName("convergence").withRequired(false).withArgument(abuilder
.withName("convergence").withMinimum(1).withMaximum(1).withDefault("0.0").create())
.withDescription("Fractional rate of perplexity to consider convergence").withShortName("c").create();
Option reInferDocTopicsOpt = obuilder.withLongName("reInferDocTopics").withRequired(false)
.withArgument(abuilder.withName("reInferDocTopics").withMinimum(1).withMaximum(1)
.withDefault("no").create())
.withDescription("re-infer p(topic | doc) : [no | randstart | continue]")
.withShortName("rdt").create();
Option numTrainThreadsOpt = obuilder.withLongName("numTrainThreads").withRequired(false)
.withArgument(abuilder.withName("numTrainThreads").withMinimum(1).withMaximum(1)
.withDefault("1").create())
.withDescription("number of threads to train with")
.withShortName("ntt").create();
Option numUpdateThreadsOpt = obuilder.withLongName("numUpdateThreads").withRequired(false)
.withArgument(abuilder.withName("numUpdateThreads").withMinimum(1).withMaximum(1)
.withDefault("1").create())
.withDescription("number of threads to update the model with")
.withShortName("nut").create();
Option verboseOpt = obuilder.withLongName("verbose").withRequired(false)
.withArgument(abuilder.withName("verbose").withMinimum(1).withMaximum(1)
.withDefault("false").create())
.withDescription("print verbose information, like top-terms in each topic, during iteration")
.withShortName("v").create();
Group group = gbuilder.withName("Options").withOption(inputDirOpt).withOption(numTopicsOpt)
.withOption(alphaOpt).withOption(etaOpt)
.withOption(maxIterOpt).withOption(burnInOpt).withOption(convergenceOpt)
.withOption(dictOpt).withOption(reInferDocTopicsOpt)
.withOption(outputDocFileOpt).withOption(outputTopicFileOpt).withOption(dfsOpt)
.withOption(numTrainThreadsOpt).withOption(numUpdateThreadsOpt)
.withOption(modelCorpusFractionOption).withOption(verboseOpt).create();
try {
Parser parser = new Parser();
parser.setGroup(group);
parser.setHelpOption(helpOpt);
CommandLine cmdLine = parser.parse(args);
if (cmdLine.hasOption(helpOpt)) {
CommandLineUtil.printHelp(group);
return -1;
}
String inputDirString = (String) cmdLine.getValue(inputDirOpt);
String dictDirString = cmdLine.hasOption(dictOpt) ? (String)cmdLine.getValue(dictOpt) : null;
int numTopics = Integer.parseInt((String) cmdLine.getValue(numTopicsOpt));
double alpha = Double.parseDouble((String)cmdLine.getValue(alphaOpt));
double eta = Double.parseDouble((String)cmdLine.getValue(etaOpt));
int maxIterations = Integer.parseInt((String)cmdLine.getValue(maxIterOpt));
int burnInIterations = Integer.parseInt((String)cmdLine.getValue(burnInOpt));
double minFractionalErrorChange = Double.parseDouble((String) cmdLine.getValue(convergenceOpt));
int numTrainThreads = Integer.parseInt((String)cmdLine.getValue(numTrainThreadsOpt));
int numUpdateThreads = Integer.parseInt((String)cmdLine.getValue(numUpdateThreadsOpt));
String topicOutFile = (String)cmdLine.getValue(outputTopicFileOpt);
String docOutFile = (String)cmdLine.getValue(outputDocFileOpt);
//String reInferDocTopics = (String)cmdLine.getValue(reInferDocTopicsOpt);
boolean verbose = Boolean.parseBoolean((String) cmdLine.getValue(verboseOpt));
double modelCorpusFraction = Double.parseDouble((String)cmdLine.getValue(modelCorpusFractionOption));
long start = System.nanoTime();
if (conf.get("fs.default.name") == null) {
String dfsNameNode = (String)cmdLine.getValue(dfsOpt);
conf.set("fs.default.name", dfsNameNode);
}
String[] terms = loadDictionary(dictDirString, conf);
logTime("dictionary loading", System.nanoTime() - start);
start = System.nanoTime();
Matrix corpus = loadVectors(inputDirString, conf);
logTime("vector seqfile corpus loading", System.nanoTime() - start);
start = System.nanoTime();
InMemoryCollapsedVariationalBayes0 cvb0 =
new InMemoryCollapsedVariationalBayes0(corpus, terms, numTopics, alpha, eta,
numTrainThreads, numUpdateThreads, modelCorpusFraction);
logTime("cvb0 init", System.nanoTime() - start);
start = System.nanoTime();
cvb0.setVerbose(verbose);
cvb0.iterateUntilConvergence(minFractionalErrorChange, maxIterations, burnInIterations);
logTime("total training time", System.nanoTime() - start);
/*
if ("randstart".equalsIgnoreCase(reInferDocTopics)) {
cvb0.inferDocuments(0.0, 100, true);
} else if ("continue".equalsIgnoreCase(reInferDocTopics)) {
cvb0.inferDocuments(0.0, 100, false);
}
*/
start = System.nanoTime();
cvb0.writeModel(new Path(topicOutFile));
DistributedRowMatrixWriter.write(new Path(docOutFile), conf, cvb0.docTopicCounts);
logTime("printTopics", System.nanoTime() - start);
} catch (OptionException e) {
log.error("Error while parsing options", e);
CommandLineUtil.printHelp(group);
}
return 0;
}
private static String[] loadDictionary(String dictionaryPath, Configuration conf) {
if (dictionaryPath == null) {
return null;
}
Path dictionaryFile = new Path(dictionaryPath);
List> termList = new ArrayList<>();
int maxTermId = 0;
// key is word value is id
for (Pair record
: new SequenceFileIterable(dictionaryFile, true, conf)) {
termList.add(new Pair<>(record.getSecond().get(),
record.getFirst().toString()));
maxTermId = Math.max(maxTermId, record.getSecond().get());
}
String[] terms = new String[maxTermId + 1];
for (Pair pair : termList) {
terms[pair.getFirst()] = pair.getSecond();
}
return terms;
}
@Override
public Configuration getConf() {
return super.getConf();
}
private static Matrix loadVectors(String vectorPathString, Configuration conf)
throws IOException {
Path vectorPath = new Path(vectorPathString);
FileSystem fs = vectorPath.getFileSystem(conf);
List subPaths = new ArrayList<>();
if (fs.isFile(vectorPath)) {
subPaths.add(vectorPath);
} else {
for (FileStatus fileStatus : fs.listStatus(vectorPath, PathFilters.logsCRCFilter())) {
subPaths.add(fileStatus.getPath());
}
}
List> rowList = new ArrayList<>();
int numRows = Integer.MIN_VALUE;
int numCols = -1;
boolean sequentialAccess = false;
for (Path subPath : subPaths) {
for (Pair record
: new SequenceFileIterable(subPath, true, conf)) {
int id = record.getFirst().get();
Vector vector = record.getSecond().get();
if (vector instanceof NamedVector) {
vector = ((NamedVector)vector).getDelegate();
}
if (numCols < 0) {
numCols = vector.size();
sequentialAccess = vector.isSequentialAccess();
}
rowList.add(Pair.of(id, vector));
numRows = Math.max(numRows, id);
}
}
numRows++;
Vector[] rowVectors = new Vector[numRows];
for (Pair pair : rowList) {
rowVectors[pair.getFirst()] = pair.getSecond();
}
return new SparseRowMatrix(numRows, numCols, rowVectors, true, !sequentialAccess);
}
@Override
public int run(String[] strings) throws Exception {
return main2(strings, getConf());
}
public static void main(String[] args) throws Exception {
ToolRunner.run(new InMemoryCollapsedVariationalBayes0(), args);
}
}