org.deeplearning4j.bagofwords.vectorizer.TextVectorizer Maven / Gradle / Ivy

/*******************************************************************************
 * Copyright (c) 2015-2018 Skymind, Inc.
 *
 * This program and the accompanying materials are made available under the
 * terms of the Apache License, Version 2.0 which is available at
 * https://www.apache.org/licenses/LICENSE-2.0.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 *
 * SPDX-License-Identifier: Apache-2.0
 ******************************************************************************/

package org.deeplearning4j.bagofwords.vectorizer;

import org.deeplearning4j.core.datasets.vectorizer.Vectorizer;
import org.deeplearning4j.models.word2vec.VocabWord;
import org.deeplearning4j.models.word2vec.wordstore.VocabCache;
import org.deeplearning4j.text.invertedindex.InvertedIndex;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.dataset.DataSet;

import java.io.File;
import java.io.InputStream;
import java.util.List;

/**
 * Vectorizes text
 * @author Adam Gibson
 */
public interface TextVectorizer extends Vectorizer {


    /**
     * Sampling for building mini batches
     * @return the sampling
     */
    //double sample();

    /**
     * For word vectors, this is the batch size for how to partition documents
     * in to workloads
     * @return the batchsize for partitioning documents in to workloads
     */
    //int batchSize();

    /**
     * The vocab sorted in descending order
     * @return the vocab sorted in descending order
     */
    VocabCache getVocabCache();


    /**
     * Text coming from an input stream considered as one document
     * @param is the input stream to read from
     * @param label the label to assign
     * @return a dataset with a applyTransformToDestination of weights(relative to impl; could be word counts or tfidf scores)
     */
    DataSet vectorize(InputStream is, String label);

    /**
     * Vectorizes the passed in text treating it as one document
     * @param text the text to vectorize
     * @param label the label of the text
     * @return a dataset with a transform of weights(relative to impl; could be word counts or tfidf scores)
     */
    DataSet vectorize(String text, String label);

    /**
     * Train the model
     */
    void fit();

    /**
     *
     * @param input the text to vectorize
     * @param label the label of the text
     * @return {@link DataSet} with a applyTransformToDestination of
     *          weights(relative to impl; could be word counts or tfidf scores)
     */
    DataSet vectorize(File input, String label);


    /**
     * Transforms the matrix
     * @param text text to transform
     * @return {@link INDArray}
     */
    INDArray transform(String text);

    /**
     * Transforms the matrix
     * @param tokens
     * @return
     */
    INDArray transform(List tokens);

    /**
     * Returns the number of words encountered so far
     * @return the number of words encountered so far
     */
    long numWordsEncountered();

    /**
     * Inverted index
     * @return the inverted index for this vectorizer
     */
    InvertedIndex getIndex();
}