All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.wikibrain.wikidata.WikidataDumpParser Maven / Gradle / Ivy

There is a newer version: 0.9.1
Show newest version
package org.wikibrain.wikidata;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.LineIterator;
import org.apache.commons.lang3.StringEscapeUtils;
import org.wikibrain.core.lang.Language;
import org.wikibrain.core.lang.LanguageInfo;
import org.wikibrain.core.lang.LanguageSet;
import org.wikibrain.core.model.RawPage;
import org.wikibrain.parser.DumpSplitter;
import org.wikibrain.parser.xml.PageXmlParser;
import org.wikibrain.utils.WpIOUtils;

import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.Iterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


/**
 * Iterates over the entities stored in a line-oriented Wikidata JSON dump.
 *
 * <p>The dump is read one line at a time. Lines that contain only the array
 * delimiters {@code [} or {@code ]} and blank lines are skipped, a trailing
 * {@code ,} separator is removed, and the remainder of each line is handed to
 * a {@link WikidataParser}. Lines that fail to parse are logged at WARN level
 * and skipped rather than aborting the iteration.</p>
 *
 * <p>Every call to {@link #iterator()} reopens the file and starts from the
 * beginning.</p>
 *
 * @author Shilad Sen
 */
public class WikidataDumpParser implements Iterable<WikidataEntity> {
    public static final Logger LOG = LoggerFactory.getLogger(WikidataDumpParser.class);

    private final WikidataParser wdParser;
    // Stored at construction; not consulted by any code visible in this class.
    private final LanguageSet languages;
    private final File file;

    /**
     * Creates a parser for the given dump file using {@link LanguageSet#ALL}.
     *
     * @param file the dump file to read
     */
    public WikidataDumpParser(File file) {
        this(file, LanguageSet.ALL);
    }

    /**
     * Creates a parser for the given dump file.
     *
     * @param file      the dump file to read
     * @param languages the language set associated with this parser
     */
    public WikidataDumpParser(File file, LanguageSet languages) {
        this.file = file;
        this.languages = languages;
        this.wdParser = new WikidataParser();
    }

    /**
     * Opens the dump file and returns a fresh iterator over its entities.
     *
     * @return a new iterator positioned at the start of the file
     * @throws IllegalArgumentException if the file cannot be opened
     */
    @Override
    public Iterator<WikidataEntity> iterator() {
        return new IteratorImpl();
    }

    /**
     * Single-pass iterator that lazily parses one entity per dump line.
     *
     * <p>The underlying reader is closed once the end of the file has been
     * reached. An iterator that is abandoned before it is exhausted keeps its
     * reader open.</p>
     */
    public class IteratorImpl implements Iterator<WikidataEntity> {
        private final LineIterator iterImpl;
        // Look-ahead slot: the next entity to return, or null if none is queued.
        private WikidataEntity buff;
        // True once the line source has been exhausted and released.
        private boolean closed = false;

        /**
         * Opens the enclosing parser's file for reading.
         *
         * @throws IllegalArgumentException if the file cannot be opened
         */
        public IteratorImpl() {
            try {
                this.iterImpl = new LineIterator(WpIOUtils.openBufferedReader(file));
            } catch (IOException e) {
                throw new IllegalArgumentException(e);
            }
        }

        /**
         * Reports whether another entity is available, reading ahead if needed.
         *
         * @return {@code true} if {@link #next()} will return an entity
         */
        @Override
        public boolean hasNext() {
            if (buff == null) {
                fillBuff();
            }
            return (buff != null);
        }

        /**
         * Reads lines until one parses into an entity or the input runs out.
         * Does nothing if an entity is already queued.
         */
        private void fillBuff() {
            if (buff != null) {
                return;
            }
            // try to queue up the next article
            while (buff == null && iterImpl.hasNext()) {
                String line = iterImpl.next();
                String json = line.trim();
                if (json.equals("[") || json.equals("]")) {
                    continue;
                }
                // Trim before testing for the separator so that trailing
                // whitespace after the comma does not hide it.
                if (json.endsWith(",")) {
                    json = json.substring(0, json.length() - 1).trim();
                }
                if (json.isEmpty()) {
                    continue;
                }
                try {
                    buff = wdParser.parse(json);
                } catch (Exception e) {
                    // Best effort: report the bad line and move on to the next one.
                    LOG.warn("parsing of " + file + " failed for line '" + line  + "':", e);
                }
            }
            if (buff == null && !closed) {
                // Nothing left to read: release the underlying reader.
                closed = true;
                LineIterator.closeQuietly(iterImpl);
            }
        }

        /**
         * Returns the next entity in the dump.
         *
         * @return the next successfully parsed entity
         * @throws java.util.NoSuchElementException if no entities remain
         */
        @Override
        public WikidataEntity next() {
            fillBuff();
            if (buff == null) {
                throw new java.util.NoSuchElementException();
            }
            WikidataEntity next = buff;
            buff = null;
            return next;
        }

        /**
         * Not supported; the dump is read-only.
         *
         * @throws UnsupportedOperationException always
         */
        @Override
        public void remove() {
            throw new UnsupportedOperationException();
        }
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy