All downloads are free. Search and download functionality uses the official Maven repository.

org.deeplearning4j.text.documentiterator.BasicLabelAwareIterator Maven / Gradle / Ivy

There is a newer version: 1.0.0-M2.1
Show newest version
/*******************************************************************************
 * Copyright (c) 2015-2018 Skymind, Inc.
 *
 * This program and the accompanying materials are made available under the
 * terms of the Apache License, Version 2.0 which is available at
 * https://www.apache.org/licenses/LICENSE-2.0.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 *
 * SPDX-License-Identifier: Apache-2.0
 ******************************************************************************/

package org.deeplearning4j.text.documentiterator;

import lombok.NonNull;
import org.deeplearning4j.text.documentiterator.interoperability.DocumentIteratorConverter;
import org.deeplearning4j.text.sentenceiterator.SentenceIterator;
import org.deeplearning4j.text.sentenceiterator.interoperability.SentenceIteratorConverter;
import org.deeplearning4j.text.sentenceiterator.labelaware.LabelAwareSentenceIterator;

import java.util.concurrent.atomic.AtomicLong;

/**
 * A simple {@link LabelAwareIterator} for building sentence/label pairs for ParagraphVectors/Doc2Vec.
 * You provide a SentenceIterator, a DocumentIterator (plain or label-aware) or another
 * LabelAwareIterator to the {@link Builder}; the resulting instance delegates all iteration
 * calls to that backend iterator and exposes the associated {@link LabelsSource}.
 *
 * Instances can only be created through {@link Builder#build()}.
 *
 * @author [email protected]
 */
public class BasicLabelAwareIterator implements LabelAwareIterator {
    // this counter is used for dumb labels generation
    // NOTE(review): nothing in this class reads or updates it; kept because it is protected
    // and may be used by subclasses.
    protected AtomicLong documentPosition = new AtomicLong(0);

    // labels source handed out by getLabelsSource(); assigned by Builder.build()
    protected LabelsSource generator;

    // iterator that actually produces the documents; assigned by Builder.build()
    protected transient LabelAwareIterator backendIterator;

    // instances are created via Builder only
    private BasicLabelAwareIterator() {

    }

    /**
     * Checks whether there are more LabelledDocuments available.
     *
     * @return whatever the backend iterator reports from its own {@code hasNextDocument()}
     */
    @Override
    public boolean hasNextDocument() {
        return backendIterator.hasNextDocument();
    }

    /**
     * Returns the next LabelledDocument.
     *
     * @return the document returned by the backend iterator's {@code nextDocument()}
     */
    @Override
    public LabelledDocument nextDocument() {
        return backendIterator.nextDocument();
    }

    /**
     * Resets this iterator by resetting the backend iterator.
     */
    @Override
    public void reset() {
        backendIterator.reset();
    }

    /**
     * Returns the LabelsSource instance associated with this iterator.
     *
     * @return the LabelsSource that was configured on the Builder; may be null if the builder
     *         was created from a LabelAwareIterator whose own {@code getLabelsSource()} returned null
     */
    @Override
    public LabelsSource getLabelsSource() {
        return generator;
    }

    /**
     * Same as {@link #hasNextDocument()}.
     *
     * @return true if the backend iterator has more documents
     */
    @Override
    public boolean hasNext() {
        return hasNextDocument();
    }

    /**
     * Same as {@link #nextDocument()}.
     *
     * @return the next LabelledDocument from the backend iterator
     */
    @Override
    public LabelledDocument next() {
        return nextDocument();
    }

    /**
     * No-op. Note that the backend iterator is not shut down by this call.
     */
    @Override
    public void shutdown() {
        // no-op
    }

    /**
     * No-op: removal is not supported and the call is silently ignored.
     */
    @Override
    public void remove() {
        // no-op
    }

    /**
     * Builder for {@link BasicLabelAwareIterator}. Each public constructor accepts one kind of
     * source iterator and adapts it to a {@link LabelAwareIterator}.
     */
    public static class Builder {
        private String labelTemplate = "DOC_";

        private LabelAwareIterator labelAwareIterator;
        // default labels source; it is the instance handed to the converters created in the constructors below
        private LabelsSource generator = new LabelsSource(labelTemplate);

        /**
         * This method should stay protected, since it's only viable for testing purposes.
         * No backend iterator is set by this constructor.
         */
        protected Builder() {

        }

        /**
         * We assume that each sentence in this iterator is separate document/paragraph
         *
         * @param iterator source of sentences; wrapped into a SentenceIteratorConverter
         *                 together with this builder's LabelsSource
         */
        public Builder(@NonNull SentenceIterator iterator) {
            this.labelAwareIterator = new SentenceIteratorConverter(iterator, generator);
        }

        /**
         * We assume that each inputStream in this iterator is separate document/paragraph
         *
         * @param iterator source of documents; wrapped into a DocumentIteratorConverter
         *                 together with this builder's LabelsSource
         */
        public Builder(@NonNull DocumentIterator iterator) {
            this.labelAwareIterator = new DocumentIteratorConverter(iterator, generator);
        }

        /**
         * We assume that each sentence in this iterator is separate document/paragraph.
         * Labels will be converted into LabelledDocument format
         *
         * @param iterator label-aware source of sentences; wrapped into a SentenceIteratorConverter
         *                 together with this builder's LabelsSource
         */
        public Builder(@NonNull LabelAwareSentenceIterator iterator) {
            this.labelAwareIterator = new SentenceIteratorConverter(iterator, generator);
        }

        /**
         * We assume that each inputStream in this iterator is separate document/paragraph
         * Labels will be converted into LabelledDocument format
         *
         * @param iterator label-aware source of documents; wrapped into a DocumentIteratorConverter
         *                 together with this builder's LabelsSource
         */
        public Builder(@NonNull LabelAwareDocumentIterator iterator) {
            this.labelAwareIterator = new DocumentIteratorConverter(iterator, generator);
        }


        /**
         * Uses the given LabelAwareIterator directly as the backend, without any conversion.
         * The builder's LabelsSource is replaced with the one reported by the iterator.
         *
         * @param iterator iterator to delegate to
         */
        public Builder(@NonNull LabelAwareIterator iterator) {
            this.labelAwareIterator = iterator;
            this.generator = iterator.getLabelsSource();
        }

        /**
         * Label template will be used for sentence labels generation. I.e. if provided template is "DOCUMENT_", all documents/paragraphs will have their labels starting from "DOCUMENT_0" to "DOCUMENT_X", where X is the total number of documents - 1
         *
         * The template is forwarded to the current LabelsSource when there is one; if the builder
         * was created from a LabelAwareIterator that reported no LabelsSource, only the builder's
         * own template value is updated.
         *
         * @param template label prefix to use
         * @return this builder
         */
        public Builder setLabelTemplate(@NonNull String template) {
            this.labelTemplate = template;
            // generator may be null after Builder(LabelAwareIterator) if that iterator has no LabelsSource
            if (this.generator != null) {
                this.generator.setTemplate(template);
            }
            return this;
        }

        /**
         * Replaces the LabelsSource that the built iterator will return from
         * {@link BasicLabelAwareIterator#getLabelsSource()}.
         *
         * NOTE(review): a converter created in one of the constructors above was handed the
         * LabelsSource that existed at construction time; this setter does not pass the new
         * source to it, so the converter presumably keeps using the earlier one - confirm
         * against the converter implementations.
         *
         * @param source LabelsSource to expose from the built iterator
         * @return this builder
         */
        public Builder setLabelsSource(@NonNull LabelsSource source) {
            this.generator = source;
            return this;
        }

        /**
         * Creates the iterator from the current builder state.
         *
         * @return a new BasicLabelAwareIterator using this builder's backend iterator and LabelsSource
         */
        public BasicLabelAwareIterator build() {
            BasicLabelAwareIterator iterator = new BasicLabelAwareIterator();
            iterator.generator = this.generator;
            iterator.backendIterator = this.labelAwareIterator;

            return iterator;
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy