// org.apache.mahout.clustering.classify.ClusterClassifier Maven / Gradle / Ivy
/* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.mahout.clustering.classify;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import com.google.common.io.Closeables;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.mahout.classifier.AbstractVectorClassifier;
import org.apache.mahout.classifier.OnlineLearner;
import org.apache.mahout.clustering.Cluster;
import org.apache.mahout.clustering.iterator.ClusterWritable;
import org.apache.mahout.clustering.iterator.ClusteringPolicy;
import org.apache.mahout.clustering.iterator.ClusteringPolicyWritable;
import org.apache.mahout.common.ClassUtils;
import org.apache.mahout.common.iterator.sequencefile.PathFilters;
import org.apache.mahout.common.iterator.sequencefile.PathType;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
/**
* This classifier works with any ClusteringPolicy and its associated Clusters.
* It is initialized with a policy and a list of compatible clusters and
* thereafter it can classify any new Vector into one or more of the clusters
* based upon the pdf() function which each cluster supports.
*
* In addition, it is an OnlineLearner and can be trained. Training amounts to
* asking the actual model to observe the vector and closing the classifier
* causes all the models to computeParameters.
*
* Because a ClusterClassifier implements Writable, it can be written to and
* read from a sequence file as a single entity. For sequential and MapReduce
* clustering in conjunction with a ClusterIterator, however, it utilizes an
* exploded file format. In this format, the iterator writes the policy to a
* single POLICY_FILE_NAME file in the clustersOut directory and the models are
* written to one or more part-n files so that multiple reducers may be
* employed to produce them.
*/
public class ClusterClassifier extends AbstractVectorClassifier implements OnlineLearner, Writable {

  /** Name of the file, inside a clusters directory, that holds the serialized policy. */
  private static final String POLICY_FILE_NAME = "_policy";

  /** The cluster models; the index in this list is the category index. */
  private List<Cluster> models;

  /** Fully qualified class name of the models, used to re-instantiate them in readFields. */
  private String modelClass;

  /** The policy that decides how vectors are classified and how models are updated on close. */
  private ClusteringPolicy policy;

  /**
   * The public constructor accepts a list of clusters to become the models.
   * The class of the first model is recorded as the model class used for
   * serialization, so the list must contain at least one element.
   *
   * @param models a non-empty List of Cluster
   * @param policy a ClusteringPolicy
   */
  public ClusterClassifier(List<Cluster> models, ClusteringPolicy policy) {
    this.models = models;
    modelClass = models.get(0).getClass().getName();
    this.policy = policy;
  }

  // needed for serialization/De-serialization
  public ClusterClassifier() {
  }

  // only used by MR ClusterIterator
  protected ClusterClassifier(ClusteringPolicy policy) {
    this.policy = policy;
  }

  /**
   * Classifies the instance by delegating to the clustering policy.
   *
   * @param instance the Vector to classify
   * @return the result of {@code policy.classify(instance, this)}
   */
  @Override
  public Vector classify(Vector instance) {
    return policy.classify(instance, this);
  }

  /**
   * Computes the normalized pdf of the first model for the two-model case.
   *
   * @param instance the Vector to classify
   * @return pdf0 / (pdf0 + pdf1), where pdfN is the pdf of model N for the instance
   * @throws IllegalStateException if the classifier does not hold exactly two models
   */
  @Override
  public double classifyScalar(Vector instance) {
    if (models.size() != 2) {
      throw new IllegalStateException(
          "classifyScalar requires exactly 2 models but found " + models.size());
    }
    double pdf0 = models.get(0).pdf(new VectorWritable(instance));
    double pdf1 = models.get(1).pdf(new VectorWritable(instance));
    return pdf0 / (pdf0 + pdf1);
  }

  /**
   * @return the number of models held by this classifier
   */
  @Override
  public int numCategories() {
    return models.size();
  }

  /**
   * Writes the classifier as a single entity: model count, model class name,
   * the policy, and then each model in list order.
   *
   * @param out the DataOutput to write to
   * @throws IOException if writing fails
   */
  @Override
  public void write(DataOutput out) throws IOException {
    out.writeInt(models.size());
    out.writeUTF(modelClass);
    new ClusteringPolicyWritable(policy).write(out);
    for (Cluster cluster : models) {
      cluster.write(out);
    }
  }

  /**
   * Reads the classifier in the format produced by {@link #write(DataOutput)},
   * replacing the current models, model class and policy.
   *
   * @param in the DataInput to read from
   * @throws IOException if reading fails
   */
  @Override
  public void readFields(DataInput in) throws IOException {
    int size = in.readInt();
    modelClass = in.readUTF();
    models = new ArrayList<>();
    ClusteringPolicyWritable clusteringPolicyWritable = new ClusteringPolicyWritable();
    clusteringPolicyWritable.readFields(in);
    policy = clusteringPolicyWritable.getValue();
    for (int i = 0; i < size; i++) {
      // every model is instantiated from the single recorded model class
      Cluster element = ClassUtils.instantiateAs(modelClass, Cluster.class);
      element.readFields(in);
      models.add(element);
    }
  }

  /**
   * Asks the model at the given index to observe the instance.
   *
   * @param actual the int index of a model
   * @param instance a data Vector
   */
  @Override
  public void train(int actual, Vector instance) {
    models.get(actual).observe(new VectorWritable(instance));
  }

  /**
   * Train the models given an additional weight. Unique to ClusterClassifier
   *
   * @param actual the int index of a model
   * @param data a data Vector
   * @param weight a double weighting factor
   */
  public void train(int actual, Vector data, double weight) {
    models.get(actual).observe(new VectorWritable(data), weight);
  }

  /**
   * Asks the model at the given index to observe the instance. The tracking
   * key and group key are not used by this implementation.
   *
   * @param trackingKey ignored
   * @param groupKey ignored
   * @param actual the int index of a model
   * @param instance a data Vector
   */
  @Override
  public void train(long trackingKey, String groupKey, int actual, Vector instance) {
    models.get(actual).observe(new VectorWritable(instance));
  }

  /**
   * Asks the model at the given index to observe the instance. The tracking
   * key is not used by this implementation.
   *
   * @param trackingKey ignored
   * @param actual the int index of a model
   * @param instance a data Vector
   */
  @Override
  public void train(long trackingKey, int actual, Vector instance) {
    models.get(actual).observe(new VectorWritable(instance));
  }

  /**
   * Closes the classifier by delegating to {@code policy.close(this)}.
   */
  @Override
  public void close() {
    policy.close(this);
  }

  /**
   * @return the live list of models (not a copy)
   */
  public List<Cluster> getModels() {
    return models;
  }

  /**
   * @return the clustering policy
   */
  public ClusteringPolicy getPolicy() {
    return policy;
  }

  /**
   * Writes the classifier in the exploded format: the policy goes to the
   * policy file under {@code path} and each model is written to its own
   * {@code part-NNNNN} sequence file keyed by its index.
   *
   * @param path the directory to write into
   * @throws IOException if any file cannot be written
   */
  public void writeToSeqFiles(Path path) throws IOException {
    writePolicy(policy, path);
    Configuration config = new Configuration();
    FileSystem fs = FileSystem.get(path.toUri(), config);
    ClusterWritable cw = new ClusterWritable();
    for (int i = 0; i < models.size(); i++) {
      try (SequenceFile.Writer writer = new SequenceFile.Writer(fs, config,
          new Path(path, "part-" + String.format(Locale.ENGLISH, "%05d", i)), IntWritable.class,
          ClusterWritable.class)) {
        Cluster cluster = models.get(i);
        cw.setValue(cluster);
        Writable key = new IntWritable(i);
        writer.append(key, cw);
      }
    }
  }

  /**
   * Reads the classifier from the exploded format written by
   * {@link #writeToSeqFiles(Path)}: all cluster files under {@code path} are
   * loaded and configured with {@code conf}, then the policy file is read.
   * The classifier's state is only replaced once everything has been loaded.
   *
   * @param conf the Configuration passed to each loaded cluster's configure method
   * @param path the directory to read from
   * @throws IOException if the files cannot be read
   * @throws IllegalStateException if no clusters are found under {@code path}
   */
  public void readFromSeqFiles(Configuration conf, Path path) throws IOException {
    Configuration config = new Configuration();
    List<Cluster> clusters = new ArrayList<>();
    for (ClusterWritable cw : new SequenceFileDirValueIterable<ClusterWritable>(path, PathType.LIST,
        PathFilters.logsCRCFilter(), config)) {
      Cluster cluster = cw.getValue();
      cluster.configure(conf);
      clusters.add(cluster);
    }
    if (clusters.isEmpty()) {
      // fail with a meaningful message instead of an IndexOutOfBoundsException below
      throw new IllegalStateException("No clusters found in " + path);
    }
    ClusteringPolicy loadedPolicy = readPolicy(path);
    this.models = clusters;
    this.modelClass = clusters.get(0).getClass().getName();
    this.policy = loadedPolicy;
  }

  /**
   * Reads the clustering policy from the policy file under the given directory.
   *
   * @param path the directory containing the policy file
   * @return the ClusteringPolicy stored in the file
   * @throws IOException if the file cannot be read or contains no policy record
   */
  public static ClusteringPolicy readPolicy(Path path) throws IOException {
    Path policyPath = new Path(path, POLICY_FILE_NAME);
    Configuration config = new Configuration();
    FileSystem fs = FileSystem.get(policyPath.toUri(), config);
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, policyPath, config);
    try {
      Text key = new Text();
      ClusteringPolicyWritable cpw = new ClusteringPolicyWritable();
      if (!reader.next(key, cpw)) {
        throw new IOException("No clustering policy found in " + policyPath);
      }
      return cpw.getValue();
    } finally {
      // always release the reader; close failures on the read side are swallowed as before
      Closeables.close(reader, true);
    }
  }

  /**
   * Writes the clustering policy to the policy file under the given directory.
   *
   * @param policy the ClusteringPolicy to write
   * @param path the directory to write the policy file into
   * @throws IOException if the file cannot be written or closed
   */
  public static void writePolicy(ClusteringPolicy policy, Path path) throws IOException {
    Path policyPath = new Path(path, POLICY_FILE_NAME);
    Configuration config = new Configuration();
    FileSystem fs = FileSystem.get(policyPath.toUri(), config);
    // try-with-resources closes the writer even when append fails
    try (SequenceFile.Writer writer = new SequenceFile.Writer(fs, config, policyPath, Text.class,
        ClusteringPolicyWritable.class)) {
      writer.append(new Text(), new ClusteringPolicyWritable(policy));
    }
  }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy