All downloads are free. The search and download functionality uses the official Maven repository.

org.deeplearning4j.text.sentenceiterator.FileSentenceIterator Maven / Gradle / Ivy

There is a newer version: 1.0.0-M2.1
Show newest version
/*
 *  ******************************************************************************
 *  *
 *  *
 *  * This program and the accompanying materials are made available under the
 *  * terms of the Apache License, Version 2.0 which is available at
 *  * https://www.apache.org/licenses/LICENSE-2.0.
 *  *
 *  *  See the NOTICE file distributed with this work for additional
 *  *  information regarding copyright ownership.
 *  * Unless required by applicable law or agreed to in writing, software
 *  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 *  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 *  * License for the specific language governing permissions and limitations
 *  * under the License.
 *  *
 *  * SPDX-License-Identifier: Apache-2.0
 *  *****************************************************************************
 */

package org.deeplearning4j.text.sentenceiterator;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.LineIterator;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.Arrays;
import java.util.Iterator;
import java.util.Queue;
import java.util.zip.GZIPInputStream;

/**
 * Sentence iterator that reads text line by line from a single file, or from
 * every file found recursively under a directory. Each line is one sentence.
 *
 * <p>Files whose absolute path ends with {@code .gz} are decompressed on the
 * fly and decoded as UTF-8; all other files are read with
 * {@link FileUtils#lineIterator(File)}.
 *
 * <p>Lines are buffered in batches (see {@link #CACHE_BATCH_SIZE}) so that a
 * batch never mixes lines from two different files; {@link #currentFile}
 * therefore identifies the file the buffered lines came from.
 *
 * <p>No method of this class is synchronized. The fields are {@code volatile}
 * and the buffer is a concurrent deque, but compound operations such as
 * "check the buffer, then refill it" are not atomic, so concurrent callers
 * must coordinate externally.
 */
public class FileSentenceIterator extends BaseSentenceIterator {

    /** Upper bound on the number of lines moved into {@link #cache} per refill. */
    private static final int CACHE_BATCH_SIZE = 100000;

    /** Files that have not been opened yet. */
    protected volatile Iterator<File> fileIterator;
    /** Lines already read from {@link #currentFile} but not yet handed out. */
    protected volatile Queue<String> cache;
    /** Reader over {@link #currentFile}; {@code null} while no file is open. */
    protected volatile LineIterator currLineIterator;
    /** The file or directory this iterator was created for. */
    protected volatile File file;
    /** The file most recently opened for reading; {@code null} before the first read. */
    protected volatile File currentFile;

    /**
     * Takes a single file or directory
     *
     * @param preProcessor the sentence pre processor, or {@code null} to return lines unchanged
     * @param file         the file or folder to iterate over
     */
    public FileSentenceIterator(SentencePreProcessor preProcessor, File file) {
        super(preProcessor);
        this.file = file;
        cache = new java.util.concurrent.ConcurrentLinkedDeque<>();
        fileIterator = newFileIterator();
    }

    /**
     * Creates an iterator without a sentence pre processor.
     *
     * @param dir the file or folder to iterate over
     */
    public FileSentenceIterator(File dir) {
        this(null, dir);
    }


    /**
     * Returns the next line, run through the pre processor when one is set.
     *
     * @return the next sentence, or {@code null} when nothing is left to read
     * @throws RuntimeException if a file cannot be opened, read or closed
     */
    @Override
    public String nextSentence() {
        if (cache.isEmpty()) {
            fillCache();
        }

        String sentence = cache.poll();
        if (sentence != null && preProcessor != null) {
            sentence = preProcessor.preProcess(sentence);
        }
        return sentence;
    }

    /**
     * Reports whether more input may be available.
     *
     * <p>This check never opens a file, so it returns {@code true} whenever
     * unopened files remain. If all of those files turn out to be empty, the
     * following {@link #nextSentence()} call returns {@code null}.
     *
     * @return {@code true} if buffered lines, unread lines in the open file,
     *         or unopened files remain
     */
    @Override
    public boolean hasNext() {
        LineIterator lines = currLineIterator;
        return !cache.isEmpty() || (lines != null && lines.hasNext()) || fileIterator.hasNext();
    }


    /**
     * Restarts iteration from the first file.
     *
     * <p>Buffered lines are discarded and the reader of the partially consumed
     * file is closed, so no sentence from the previous pass is returned after
     * the reset. {@link #currentFile} is left untouched until the next read.
     *
     * @throws RuntimeException if closing the open file fails
     */
    @Override
    public void reset() {
        try {
            closeCurrentLineIterator();
        } catch (IOException e) {
            throw new RuntimeException("Failed to close " + currentFile, e);
        }
        cache.clear();
        fileIterator = newFileIterator();
    }


    /** Builds the iterator over the files to read: a recursive listing for a directory, else the file itself. */
    private Iterator<File> newFileIterator() {
        File root = file;
        if (root.isDirectory()) {
            return FileUtils.iterateFiles(root, null, true);
        }
        return Arrays.asList(root).iterator();
    }

    /** Moves up to {@link #CACHE_BATCH_SIZE} lines of a single file into the buffer. */
    private void fillCache() {
        LineIterator lines = currLineIterator;
        if (lines == null || !lines.hasNext()) {
            nextLineIter();
            lines = currLineIterator;
        }
        if (lines == null) {
            return;
        }

        for (int read = 0; read < CACHE_BATCH_SIZE && lines.hasNext(); read++) {
            String line = lines.nextLine();
            if (line == null) {
                break;
            }
            cache.add(line);
        }

        // Release the file handle as soon as the file is fully consumed instead
        // of holding it until another file is opened (which never happens for
        // the last file).
        if (!lines.hasNext()) {
            try {
                closeCurrentLineIterator();
            } catch (IOException e) {
                throw new RuntimeException("Failed to close " + currentFile, e);
            }
        }
    }

    /**
     * Opens the next file that has at least one line, closing the previous
     * reader first. Empty files are skipped so that one empty file in the
     * middle of a directory does not end the iteration early.
     */
    private void nextLineIter() {
        try {
            while (fileIterator.hasNext()) {
                File next = fileIterator.next();
                currentFile = next;
                closeCurrentLineIterator();

                LineIterator opened = openLineIterator(next);
                currLineIterator = opened;
                if (opened.hasNext()) {
                    return;
                }
            }
        } catch (IOException e) {
            throw new RuntimeException("Failed to advance to " + currentFile, e);
        }
    }

    /** Creates a line reader for {@code source}, decompressing it when the path ends with {@code .gz}. */
    private LineIterator openLineIterator(File source) throws IOException {
        if (!source.getAbsolutePath().endsWith(".gz")) {
            // NOTE(review): this overload does not name a charset, while the
            // gzip branch below decodes as UTF-8 — confirm whether both
            // branches should use UTF-8.
            return FileUtils.lineIterator(source);
        }

        FileInputStream raw = new FileInputStream(source);
        try {
            return IOUtils.lineIterator(new BufferedInputStream(new GZIPInputStream(raw)), "UTF-8");
        } catch (IOException | RuntimeException e) {
            // The wrapping streams were never handed to a LineIterator, so the
            // raw stream has to be closed here or it leaks.
            try {
                raw.close();
            } catch (IOException closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
    }

    /**
     * Closes the current line reader, if any, and clears the field.
     *
     * <p>Declared with {@code throws IOException} so that callers compile
     * whether or not {@code LineIterator.close()} declares that exception.
     */
    private void closeCurrentLineIterator() throws IOException {
        LineIterator open = currLineIterator;
        if (open != null) {
            currLineIterator = null;
            open.close();
        }
    }


}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy