org.apache.mahout.vectorizer.encoders.AdaptiveWordValueEncoder Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.vectorizer.encoders;

import com.google.common.base.Charsets;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multiset;
import org.apache.mahout.math.Vector;

/**
 * Encodes words into vectors much as does WordValueEncoder while maintaining
 * an adaptive dictionary of values seen so far.  This allows weighting of terms
 * without a pre-scan of all of the data.
 *
 * <p>Each word passed to {@link #addToVector(String, double, Vector)} is counted in an
 * internal multiset, and the weight of a word is the negative log of its smoothed
 * relative frequency among the words counted so far.
 */
public class AdaptiveWordValueEncoder extends WordValueEncoder {

  /** Occurrence counts of every word recorded by {@link #addToVector(String, double, Vector)}. */
  private final Multiset<String> dictionary;

  /**
   * Creates an encoder with an empty dictionary.
   *
   * @param name The name passed through to the superclass constructor.
   */
  public AdaptiveWordValueEncoder(String name) {
    super(name);
    dictionary = HashMultiset.create();
  }

  /**
   * Adds a value to a vector, recording the value in the adaptive dictionary first.
   *
   * <p>The word is counted before delegating to the superclass, so any weight derived
   * from the dictionary during that call already reflects this occurrence.
   *
   * @param originalForm The original form of the value as a string.
   * @param weight       The caller-supplied weight, passed unchanged to the superclass.
   * @param data         The vector to which the value should be added.
   */
  @Override
  public void addToVector(String originalForm, double weight, Vector data) {
    dictionary.add(originalForm);
    super.addToVector(originalForm, weight, data);
  }

  /**
   * Combines a caller-supplied weight with the adaptive weight of a word.
   *
   * @param originalForm The bytes of the word.
   * @param w            The caller-supplied weight.
   * @return {@code w} multiplied by {@link #weight(byte[])} of the word.
   */
  @Override
  protected double getWeight(byte[] originalForm, double w) {
    return w * weight(originalForm);
  }

  /**
   * Computes the adaptive weight of a word from the counts recorded so far.
   *
   * @param originalForm The bytes of the word; decoded as UTF-8 to look the word up in
   *                     the dictionary (NOTE(review): presumably the superclass encodes
   *                     strings as UTF-8 as well -- confirm).
   * @return The negative natural log of the word's smoothed relative frequency.
   */
  @Override
  protected double weight(byte[] originalForm) {
    // the counts here are adjusted so that every observed value has an extra 0.5 count
    // as does a hypothetical unobserved value.  This smooths our estimates a bit and
    // allows the first word seen to have a non-zero weight of -log(1.5 / 2)
    double thisWord = dictionary.count(new String(originalForm, Charsets.UTF_8)) + 0.5;
    // total occurrences + 0.5 for each distinct word + 0.5 for the hypothetical unseen word
    double allWords = dictionary.size() + dictionary.elementSet().size() * 0.5 + 0.5;
    return -Math.log(thisWord / allWords);
  }

  /**
   * Returns the dictionary of word counts.
   *
   * @return The encoder's own multiset instance, not a copy; changes made to it
   *         affect subsequently computed weights.
   */
  public Multiset<String> getDictionary() {
    return dictionary;
  }
}