All downloads are free. Search and download functionality uses the official Maven repository.

eu.fbk.utils.data.dataset.bow.NGramMapping Maven / Gradle / Ivy

The newest version!
package eu.fbk.utils.data.dataset.bow;

import eu.fbk.utils.core.Stopwatch;
import eu.fbk.utils.data.DatasetMetaInfo;

import java.io.IOException;
import java.io.LineNumberReader;
import java.net.URISyntaxException;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;

/**
 * A mapping between an ngram and it's corresponding index (thewikimachine version)
 * The format is
 * ngram index weight
 *
 * @author Yaroslav Nechaev ([email protected])
 */
public class NGramMapping extends FeatureMapping {
    public NGramMapping(DatasetMetaInfo info) throws URISyntaxException {
        super(info);
    }

    @Override
    public void parse() {
        //Parse the input file
        features = new HashMap<>();
        int index = 0;
        try (LineNumberReader reader = getReader()) {
            String line;
            Stopwatch watch = Stopwatch.start();
            double maxValue = 0.0d;
            while ((line = reader.readLine()) != null) {
                String[] elements = line.split("\t");
                //Sanitizing the the word
                Feature feature = new Feature();
                feature.index = index;
                String ngram = elements[2].replace(' ', '_').toLowerCase();
                feature.weight = 1.0d / Double.valueOf(elements[0]);
                if (Double.isNaN(feature.weight)) {
                    feature.weight = 0.0d;
                }
                if (feature.weight > maxValue) {
                    maxValue = feature.weight;
                }
                features.put(ngram, feature);
                index++;
                if (index % 1000000 == 0) {
                    logger.info(String.format("Parsed %2dm bow (%.2f seconds)",
                            index / 1000000,
                            (double) watch.click() / 1000
                    ));
                }
            }
            logger.info("Rescaling weights...");
            for (Feature feature : features.values()) {
                feature.weight = Math.log(1 + maxValue * feature.weight);
            }
            logger.info("Parsing finished with " + features.size() + " bow");
        } catch (IOException e) {
            logger.error("Can't parse the input file: " + e.getClass().getSimpleName() + " " + e.getMessage());
        }
    }

    public Map getRawMap() {
        return features;
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy