// org.apache.mahout.vectorizer.encoders.TextValueEncoder (Maven / Gradle / Ivy artifact listing)

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.vectorizer.encoders;

import com.google.common.base.Charsets;
import com.google.common.base.Splitter;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.Lists;
import com.google.common.collect.Multiset;
import org.apache.mahout.math.Vector;

import java.util.Collection;
import java.util.regex.Pattern;

/**
 * Encodes text that is tokenized on runs of non-word characters (the regex {@code \W+}, so
 * letters, digits and underscore are kept together as a token).  Each word is encoded using a
 * settable encoder which is by default a StaticWordValueEncoder which gives all
 * words the same weight.
 *
 * <p>Words are accumulated in an internal counter by the {@code addText} methods and only
 * written to a vector when {@link #flush(double, Vector)} is called, so pending counts are
 * instance state shared by every caller of the same encoder object.
 *
 * @see LuceneTextValueEncoder
 */
public class TextValueEncoder extends FeatureVectorEncoder {

  /** Natural log of 2, used to convert natural logarithms to base 2. */
  private static final double LOG_2 = Math.log(2.0);

  /** Splits on one or more non-word characters and drops empty tokens. */
  private static final Splitter ON_NON_WORD = Splitter.on(Pattern.compile("\\W+")).omitEmptyStrings();

  /** Encoder applied to each individual word; replaceable through {@link #setWordEncoder}. */
  private FeatureVectorEncoder wordEncoder;

  /** Occurrence count of every word added since the last flush. */
  private final Multiset<String> counts;

  /**
   * Creates an encoder whose per-word encoder is a StaticWordValueEncoder with the same name.
   *
   * @param name The name handed to the superclass and to the default word encoder.
   */
  public TextValueEncoder(String name) {
    // NOTE(review): the literal 2 is presumably the number of probes -- confirm against
    // the FeatureVectorEncoder constructor.
    super(name, 2);
    wordEncoder = new StaticWordValueEncoder(name);
    counts = HashMultiset.create();
  }

  /**
   * Adds a value to a vector after tokenizing it by splitting on non-word characters.
   * Equivalent to calling {@link #addText(byte[])} followed by {@link #flush(double, Vector)},
   * so any words already pending from earlier {@code addText} calls are flushed as well.
   *
   * @param originalForm The original form of the value as UTF-8 encoded bytes.
   * @param weight       The weight applied to every word on top of its count-based weight.
   * @param data         The vector to which the value should be added.
   */
  @Override
  public void addToVector(byte[] originalForm, double weight, Vector data) {
    addText(originalForm);
    flush(weight, data);
  }

  /**
   * Adds text to the internal word counter, but delays converting it to vector
   * form until flush is called.
   *
   * @param originalForm  The original text encoded as UTF-8
   */
  public void addText(byte[] originalForm) {
    addText(new String(originalForm, Charsets.UTF_8));
  }

  /**
   * Adds text to the internal word counter, but delays converting it to vector
   * form until flush is called.
   *
   * @param text  The original text as already-decoded characters
   */
  public void addText(CharSequence text) {
    for (String word : tokenize(text)) {
      counts.add(word);
    }
  }

  /**
   * Adds all of the tokens that we counted up to a vector and then resets the counter.
   *
   * @param weight The weight multiplied into every word's count-based weight.
   * @param data   The vector to which the counted words should be added.
   */
  public void flush(double weight, Vector data) {
    for (String word : counts.elementSet()) {
      // weight words by log_2(1 + tf) times whatever other weight we are given;
      // log1p keeps a word seen once at weight 1 rather than 0
      wordEncoder.addToVector(word, weight * Math.log1p(counts.count(word)) / LOG_2, data);
    }
    counts.clear();
  }

  /**
   * Always returns 0; this encoder does not compute a hash for the text as a whole.
   */
  @Override
  protected int hashForProbe(byte[] originalForm, int dataSize, String name, int probe) {
    return 0;
  }

  /**
   * Returns one hash per token found in the UTF-8 decoded input.
   *
   * <p>NOTE(review): each hash comes from this class's own {@code hashForProbe}, which always
   * returns 0, so the result is a list of zeros with one entry per token.  Confirm whether
   * delegating to the word encoder was intended.
   *
   * @param originalForm The original text encoded as UTF-8.
   * @param dataSize     Passed through to {@code hashForProbe}.
   * @param name         Passed through to {@code hashForProbe}.
   * @param probe        Passed through to {@code hashForProbe}.
   * @return The hashes, in token order.
   */
  @Override
  protected Iterable<Integer> hashesForProbe(byte[] originalForm, int dataSize, String name, int probe) {
    Collection<Integer> hashes = Lists.newArrayList();
    for (String word : tokenize(new String(originalForm, Charsets.UTF_8))) {
      hashes.add(hashForProbe(bytesForString(word), dataSize, name, probe));
    }
    return hashes;
  }

  /**
   * Tokenizes a string using the simplest method.  This should be over-ridden for more subtle
   * tokenization.
   *
   * @param originalForm The text to split.
   * @return The non-empty tokens obtained by splitting on runs of non-word characters.
   * @see LuceneTextValueEncoder
   */
  protected Iterable<String> tokenize(CharSequence originalForm) {
    return ON_NON_WORD.split(originalForm);
  }

  /**
   * Converts a value into a form that would help a human understand the internals of how the value
   * is being interpreted.  Here that is a bracketed, comma-separated list of what the word encoder
   * reports for each token, in token order.
   *
   * @param originalForm The original form of the value as a string.
   * @return A string that a human can read.
   */
  @Override
  public String asString(String originalForm) {
    StringBuilder r = new StringBuilder();
    r.append('[');
    for (String word : tokenize(originalForm)) {
      // anything beyond the opening bracket means an earlier word was already written
      if (r.length() > 1) {
        r.append(", ");
      }
      r.append(wordEncoder.asString(word));
    }
    r.append(']');
    return r.toString();
  }

  /**
   * Replaces the encoder used for each individual word.
   *
   * @param wordEncoder The encoder that subsequent flush and asString calls will delegate to.
   */
  public final void setWordEncoder(FeatureVectorEncoder wordEncoder) {
    this.wordEncoder = wordEncoder;
  }
}