smile.feature.extraction.BagOfWords Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of smile-core Show documentation
smile-core
There is a newer version: 4.0.0
/*
 * Copyright (c) 2010-2021 Haifeng Li. All rights reserved.
 *
 * Smile is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Smile is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Smile.  If not, see <https://www.gnu.org/licenses/>.
 */

package smile.feature.extraction;

import java.util.Arrays;
import java.util.function.Function;
import java.util.HashMap;
import java.util.Map;

import smile.data.DataFrame;
import smile.data.Tuple;
import smile.data.transform.Transform;
import smile.data.type.DataTypes;
import smile.data.type.StructField;
import smile.data.type.StructType;
import smile.sort.QuickSort;

/**
 * The bag-of-words feature of text used in natural language
 * processing and information retrieval. In this model, a text
 * (such as a sentence or a document) is represented as an
 * unordered collection of words, disregarding grammar and
 * even word order.
 * <p>
 * Each feature word becomes one integer output field named
 * {@code "BoW_" + word}. Its value is the number of occurrences of the
 * word among the tokens of the text or, in binary mode, 1 if the word
 * occurs at least once and 0 otherwise.
 *
 * @author Haifeng Li
 */
public class BagOfWords implements Transform {
    /**
     * The tokenizer of text, which may include additional processing
     * such as filtering stop words, converting to lowercase, stemming, etc.
     */
    private final Function<String, String[]> tokenizer;
    /**
     * The feature words.
     */
    private final String[] words;
    /**
     * The mapping from feature words to their indices in the feature vector.
     */
    private final Map<String, Integer> featureIndex;
    /**
     * True to check if feature words appear in a document instead of their
     * frequencies.
     */
    private final boolean binary;
    /**
     * The schema of output space.
     */
    private final StructType schema;
    /**
     * The input text fields. It is null when the model is created without
     * columns, in which case only {@link #apply(String)} is usable.
     */
    private final String[] columns;

    /**
     * Constructor of a frequency-counting model that is not bound to any
     * data frame columns. Such a model supports {@link #apply(String)} only.
     * @param tokenizer the tokenizer of text, which may include additional processing
     *                  such as filtering stop words, converting to lowercase, stemming, etc.
     * @param words the list of feature words. The feature words should be unique in the list.
     * @throws IllegalArgumentException if a word appears more than once in the list.
     */
    public BagOfWords(Function<String, String[]> tokenizer, String[] words) {
        this(null, tokenizer, words, false);
    }

    /**
     * Constructor.
     * @param columns the input text fields in a data frame.
     * @param tokenizer the tokenizer of text, which may include additional processing
     *                  such as filtering stop words, converting to lowercase, stemming, etc.
     * @param words the list of feature words. The feature words should be unique in the list.
     *              Note that this constructor doesn't learn the features, but just uses
     *              them as attributes.
     * @param binary true to check if feature words appear in a document
     *               instead of their frequencies.
     * @throws IllegalArgumentException if a word appears more than once in the list.
     */
    public BagOfWords(String[] columns, Function<String, String[]> tokenizer, String[] words, boolean binary) {
        this.columns = columns;
        this.tokenizer = tokenizer;
        this.binary = binary;
        this.words = words;
        this.featureIndex = new HashMap<>();
        for (int i = 0; i < words.length; i++) {
            // putIfAbsent returns the previously assigned index if the word was already seen.
            if (this.featureIndex.putIfAbsent(words[i], i) != null) {
                throw new IllegalArgumentException("Duplicated word:" + words[i]);
            }
        }

        StructField[] fields = Arrays.stream(words)
                .map(word -> new StructField("BoW_" + word, DataTypes.IntegerType))
                .toArray(StructField[]::new);
        this.schema = new StructType(fields);
    }

    /**
     * Returns the feature words.
     * @return the feature words. This is the internal array, not a copy.
     */
    public String[] features() {
        return words;
    }

    /**
     * Learns a vocabulary dictionary of top-k frequent tokens in the raw documents.
     * The returned model counts word frequencies (it is not binary).
     * @param data training data.
     * @param tokenizer the tokenizer of text, which may include additional processing
     *                  such as filtering stop words, converting to lowercase, stemming, etc.
     * @param k the limit of vocabulary size. If fewer than k distinct tokens
     *          are found, all of them are used.
     * @param columns the text columns.
     * @return the model.
     * @throws IllegalArgumentException if k is negative.
     */
    public static BagOfWords fit(DataFrame data, Function<String, String[]> tokenizer, int k, String... columns) {
        // Fail fast: otherwise a negative k only surfaces as a
        // NegativeArraySizeException after all documents have been tokenized.
        if (k < 0) {
            throw new IllegalArgumentException("Invalid vocabulary size limit: " + k);
        }

        Map<String, Integer> frequency = new HashMap<>();
        for (String column : columns) {
            for (String text : data.column(column).toStringArray()) {
                for (String word : tokenizer.apply(text)) {
                    frequency.merge(word, 1, Integer::sum);
                }
            }
        }

        String[] features = new String[frequency.size()];
        int[] count = new int[frequency.size()];
        int i = 0;
        for (Map.Entry<String, Integer> entry : frequency.entrySet()) {
            features[i] = entry.getKey();
            // Negate the counts so that sorting on them puts the most frequent words first.
            count[i++] = -entry.getValue();
        }

        QuickSort.sort(count, features);
        return new BagOfWords(columns, tokenizer, Arrays.copyOf(features, Math.min(k, features.length)), false);
    }

    /**
     * Returns the bag-of-words features of a tuple. The tokens of all
     * input text fields are accumulated into a single feature vector.
     * @param x a tuple that contains the input text fields.
     * @return the tuple of bag-of-words features.
     * @throws IllegalStateException if the model was created without input columns.
     */
    @Override
    public Tuple apply(Tuple x) {
        if (columns == null) {
            throw new IllegalStateException("The input text columns are not specified");
        }

        int[] bag = new int[featureIndex.size()];
        for (String column : columns) {
            count(x.getString(column), bag);
        }

        return Tuple.of(bag, schema);
    }

    /**
     * Returns the bag-of-words features of a document.
     * @param text a document.
     * @return the feature vector.
     */
    public int[] apply(String text) {
        int[] bag = new int[featureIndex.size()];
        count(text, bag);
        return bag;
    }

    /**
     * Tokenizes a text and accumulates its feature words into the bag.
     * Tokens that are not feature words are ignored.
     * @param text the text to tokenize.
     * @param bag the feature vector to update in place.
     */
    private void count(String text, int[] bag) {
        for (String word : tokenizer.apply(text)) {
            Integer index = featureIndex.get(word);
            if (index != null) {
                if (binary) {
                    bag[index] = 1;
                } else {
                    bag[index]++;
                }
            }
        }
    }
}