All downloads are free. The search and download functionality uses the official Maven repository.

eu.fbk.utils.data.dataset.LabeledSentences Maven / Gradle / Ivy

The newest version!
package eu.fbk.utils.data.dataset;

import eu.fbk.utils.data.DatasetMetaInfo;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.Closeable;
import java.io.IOException;
import java.io.LineNumberReader;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Map;

/**
 * Dataset in which every line is a sentence preceded by a label.
 * <p>
 * Line format (fields are separated by a single tab character):
 * <pre>
 * LABEL&lt;TAB&gt;SENTENCE
 * </pre>
 * Lines with fewer than two tab-separated fields are considered malformed: a warning is
 * logged and the line is skipped. Any fields after the second one are ignored.
 * <p>
 * The dataset can be consumed either in one go via {@link #readAll()} or line by line via
 * {@link #readNext()}; the two access paths use independent readers.
 *
 * @author Yaroslav Nechaev ([email protected])
 */
public class LabeledSentences extends Dataset implements Closeable {
    static final Logger logger = LoggerFactory.getLogger(LabeledSentences.class);

    /** Reader backing {@link #readNext()}; replaced by {@link #reopen()}. */
    LineNumberReader reader;

    /**
     * Creates the dataset and opens the reader used by {@link #readNext()}.
     *
     * @param info meta information handed to the parent {@code Dataset}
     * @throws URISyntaxException propagated from the parent constructor / reader creation
     * @throws IOException        propagated from the parent constructor / reader creation
     */
    public LabeledSentences(DatasetMetaInfo info) throws URISyntaxException, IOException {
        super(info);
        reader = getReader();
    }

    /** Intentionally empty: the data is read on demand by {@link #readAll()} and {@link #readNext()}. */
    @Override
    public void parse() {
    }

    /**
     * Reads the whole dataset through a fresh reader, independent of the one used by
     * {@link #readNext()}. Malformed lines are skipped.
     * <p>
     * An I/O failure is logged rather than thrown; the sentences parsed up to that point
     * are still returned.
     *
     * @return all successfully parsed sentences, never {@code null}
     */
    public ArrayList<Sentence> readAll() {
        ArrayList<Sentence> sentences = new ArrayList<>();
        // A dedicated reader (deliberately shadowing the field) keeps the position of
        // readNext() untouched and is closed automatically.
        try (LineNumberReader reader = getReader()) {
            String line;
            while ((line = reader.readLine()) != null) {
                Sentence sentence = Sentence.fromString(line);
                if (sentence == null) {
                    // Malformed line, already reported by fromString()
                    continue;
                }
                sentences.add(sentence);
            }

            logger.info("Parsed {} sentences", sentences.size());
        } catch (IOException e) {
            logger.error("Can't parse the input file", e);
        }
        return sentences;
    }

    /**
     * Closes the current reader and opens a new one, so that {@link #readNext()} starts
     * over. Failures are logged, not thrown.
     */
    public void reopen() {
        try {
            try {
                close();
            } finally {
                // Reopen even if close() failed, otherwise the dataset would be left
                // holding a reader that can no longer be used.
                reader = getReader();
            }
        } catch (IOException e) {
            logger.error(e.getMessage(), e);
        }
    }

    /**
     * Reads the next well-formed sentence from the current reader. Malformed lines are
     * skipped (consistently with {@link #readAll()}), so {@code null} is only returned at
     * the end of the input or after an I/O failure, which is logged.
     *
     * @return the next sentence, or {@code null} if there is nothing more to read
     */
    public Sentence readNext() {
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                Sentence sentence = Sentence.fromString(line);
                if (sentence != null) {
                    return sentence;
                }
                // Malformed line: keep going instead of signalling a premature end of data
            }
        } catch (IOException e) {
            logger.error("Can't read from the target file", e);
        }
        return null;
    }

    /**
     * Closes the reader used by {@link #readNext()}.
     *
     * @throws IOException if closing the underlying reader fails
     */
    @Override
    public void close() throws IOException {
        reader.close();
    }

    /** A single dataset entry: a label and the sentence it is attached to. */
    public static class Sentence {
        public String label;
        public String sentence;

        /**
         * @param label    label of the sentence
         * @param sentence text of the sentence
         */
        public Sentence(String label, String sentence) {
            this.label = label;
            this.sentence = sentence;
        }

        /**
         * Parses a single dataset line of the form {@code LABEL<TAB>SENTENCE}.
         *
         * @param line raw line, must not be {@code null}
         * @return the parsed sentence, or {@code null} (after logging a warning) if the
         * line has fewer than two tab-separated fields
         */
        public static Sentence fromString(String line) {
            String[] elements = line.split("\t");
            if (elements.length < 2) {
                logger.warn("Not enough data. You've probably supplied a dataset with different format");
                return null;
            }
            // Only the first two fields are used; anything after them is ignored
            return new Sentence(elements[0], elements[1]);
        }
    }

    /**
     * Variant of {@link LabeledSentences} whose {@link #readNext()} replaces labels
     * according to a mapping. Labels without a mapping are left untouched.
     * {@link #readAll()} is inherited unchanged and does not apply the mapping.
     */
    public static class RemappedLabeledSentences extends LabeledSentences {
        private Map<String, String> mappings;

        /**
         * @param mappings original label to replacement label
         * @param info     meta information handed to the parent dataset
         * @throws URISyntaxException propagated from the parent constructor
         * @throws IOException        propagated from the parent constructor
         */
        public RemappedLabeledSentences(Map<String, String> mappings, DatasetMetaInfo info) throws URISyntaxException, IOException {
            super(info);
            this.mappings = mappings;
        }

        /**
         * Reads the next sentence and rewrites its label if a mapping exists for it.
         *
         * @return the next sentence with its label remapped where applicable, or
         * {@code null} if there is nothing more to read
         */
        @Override
        public Sentence readNext() {
            Sentence sentence = super.readNext();
            if (sentence == null) {
                return null;
            }
            if (mappings.containsKey(sentence.label)) {
                sentence.label = mappings.get(sentence.label);
            }
            return sentence;
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy