All downloads are free. Search and download functionality uses the official Maven repository.

org.datacleaner.components.machinelearning.impl.VectorNGramFeatureModifierBuilder Maven / Gradle / Ivy

There is a newer version: 5.8.1
Show newest version
/**
 * DataCleaner (community edition)
 * Copyright (C) 2014 Free Software Foundation, Inc.
 *
 * This copyrighted material is made available to anyone wishing to use, modify,
 * copy, or redistribute it subject to the terms and conditions of the GNU
 * Lesser General Public License, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 * for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this distribution; if not, write to:
 * Free Software Foundation, Inc.
 * 51 Franklin Street, Fifth Floor
 * Boston, MA  02110-1301  USA
 */
package org.datacleaner.components.machinelearning.impl;

import java.util.Set;

import org.datacleaner.components.machinelearning.api.MLFeatureModifier;
import org.datacleaner.components.machinelearning.api.MLFeatureModifierBuilder;
import org.datacleaner.components.machinelearning.api.MLTrainingConstraints;

import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multiset;

/**
 * Builder that collects the character n-grams observed in record values and
 * creates a {@link VectorNGramFeatureModifier} from them.
 *
 * <p>
 * Each value handed to {@link #addRecordValue(Object)} is split into parts by
 * {@link VectorNGramFeatureModifier#split(Object)}. Every substring of exactly
 * {@code gramLength} characters inside a part is counted in a multiset. When
 * {@link #build()} is called the collected grams are passed through
 * {@link MLFeatureUtils#sanitizeFeatureVectorSet} together with the configured
 * {@link MLTrainingConstraints}.
 * </p>
 */
public class VectorNGramFeatureModifierBuilder implements MLFeatureModifierBuilder {

    /** Number of characters in each gram. */
    private final int gramLength;

    /** Occurrence counts of every gram seen so far; mutated only while holding the lock on {@code this}. */
    private final Multiset<String> grams;

    /** Constraints handed to the sanitizing step in {@link #getGrams()}. */
    private final MLTrainingConstraints constraints;

    /**
     * Creates a builder that uses {@code new MLTrainingConstraints(-1, true)} as
     * its constraints.
     *
     * @param gramLength
     *            the number of characters in each gram
     */
    public VectorNGramFeatureModifierBuilder(int gramLength) {
        this(new MLTrainingConstraints(-1, true), gramLength);
    }

    /**
     * Creates a builder with explicit training constraints.
     *
     * @param constraints
     *            the constraints applied when the final gram set is produced
     * @param gramLength
     *            the number of characters in each gram
     */
    public VectorNGramFeatureModifierBuilder(MLTrainingConstraints constraints, int gramLength) {
        this.gramLength = gramLength;
        this.constraints = constraints;
        this.grams = HashMultiset.create();
    }

    /**
     * Splits the value into parts and registers every {@code gramLength}-sized
     * substring of each part. Parts shorter than {@code gramLength} contribute
     * no grams.
     *
     * @param value
     *            the record value to extract grams from
     */
    @Override
    public void addRecordValue(Object value) {
        final Iterable<String> parts = VectorNGramFeatureModifier.split(value);
        for (String part : parts) {
            // slide a window of gramLength characters across the part
            for (int index = 0; index + gramLength <= part.length(); index++) {
                final String gram = part.substring(index, index + gramLength);
                // the multiset is shared state; guard each addition
                synchronized (this) {
                    grams.add(gram);
                }
            }
        }
    }

    /**
     * Creates the feature modifier from the gram length and the sanitized set
     * of collected grams.
     *
     * @return a new {@link VectorNGramFeatureModifier}
     */
    @Override
    public MLFeatureModifier build() {
        return new VectorNGramFeatureModifier(gramLength, getGrams());
    }

    /**
     * Produces the set of grams to use, obtained by passing the collected
     * multiset and the constraints to
     * {@link MLFeatureUtils#sanitizeFeatureVectorSet}.
     *
     * <p>
     * NOTE(review): the multiset is read here without taking the lock used in
     * {@link #addRecordValue(Object)}; presumably callers finish adding values
     * before building - confirm.
     * </p>
     *
     * @return the sanitized set of grams
     */
    protected Set<String> getGrams() {
        return MLFeatureUtils.sanitizeFeatureVectorSet(grams, constraints);
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy