All Downloads are FREE. Search and download functionalities are using the official Maven repository.

smile.feature.Bag Maven / Gradle / Ivy

The newest version!
/*******************************************************************************
 * Copyright (c) 2010 Haifeng Li
 *   
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *  
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/

package smile.feature;

import java.util.HashMap;
import java.util.Map;

/**
 * The bag-of-words feature of text used in natural language
 * processing and information retrieval. In this model, a text (such as a
 * sentence or a document) is represented as an unordered collection of words,
 * disregarding grammar and even word order. This is a generic implementation.
 * Thus, it can be used for sequences of any (discrete) data types with limited
 * number of values.
 * 
 * @author Haifeng Li
 */
public class Bag<T> {
    /**
     * The mapping from feature words to their column indices in the
     * generated feature vector.
     */
    private final Map<T, Integer> features;

    /**
     * True to record only whether a feature word appears in a document
     * instead of counting its frequency.
     */
    private final boolean binary;

    /**
     * Constructor. The resulting bag reports word frequencies.
     * @param features the list of feature objects.
     */
    public Bag(T[] features) {
        this(features, false);
    }

    /**
     * Constructor.
     * @param features the list of feature objects. The feature objects should
     * be unique in the list; if an object is repeated, only its first
     * occurrence is assigned an index. Note that the Bag class doesn't learn
     * the features, but just uses them as attributes.
     * @param binary true to check if feature objects appear in a collection
     * instead of counting their frequencies.
     */
    public Bag(T[] features, boolean binary) {
        this.binary = binary;
        this.features = new HashMap<T, Integer>();
        // k only advances for first occurrences, so indices stay dense
        // (0..size-1) even when the input contains duplicates.
        for (int i = 0, k = 0; i < features.length; i++) {
            if (!this.features.containsKey(features[i])) {
                this.features.put(features[i], k++);
            }
        }
    }
    
    /**
     * Returns the bag-of-words features of a document. The features are real-valued
     * for the convenience of most learning algorithms although they take only integer
     * or binary values.
     * @param x the document as a sequence of words. Words that are not
     * feature objects of this bag are ignored.
     * @return an array with one entry per distinct feature object. Each entry
     * is the number of occurrences of that feature in x or, in binary mode,
     * 1.0 if the feature occurs at least once and 0.0 otherwise.
     */
    public double[] feature(T[] x) {
        double[] bag = new double[features.size()];

        for (T word : x) {
            Integer f = features.get(word);
            if (f == null) {
                // Not a known feature word; it contributes nothing.
                continue;
            }

            if (binary) {
                bag[f] = 1.0;
            } else {
                bag[f]++;
            }
        }

        return bag;
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy