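// Download-site page text captured together with this source file (not part of the program):
// All Downloads are FREE. Search and download functionalities are using the official Maven repository.
//
// org.wikibrain.wikidata.WikidataDumpLoader Maven / Gradle / Ivy
//
// There is a newer version: 0.9.1
// Show newest version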
package org.wikibrain.wikidata;

import gnu.trove.map.TIntIntMap;
import gnu.trove.set.TIntSet;
import gnu.trove.set.hash.TIntHashSet;
import org.apache.commons.cli.*;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.LineIterator;
import org.wikibrain.conf.ConfigurationException;
import org.wikibrain.conf.Configurator;
import org.wikibrain.conf.DefaultOptionBuilder;
import org.wikibrain.core.WikiBrainException;
import org.wikibrain.core.cmd.Env;
import org.wikibrain.core.cmd.EnvBuilder;
import org.wikibrain.core.dao.DaoException;
import org.wikibrain.core.dao.MetaInfoDao;
import org.wikibrain.core.dao.UniversalPageDao;
import org.wikibrain.core.lang.Language;
import org.wikibrain.core.lang.LanguageSet;
import org.wikibrain.download.FileDownloader;
import org.wikibrain.parser.WpParseException;
import org.wikibrain.utils.ParallelForEach;
import org.wikibrain.utils.Procedure;
import org.wikibrain.utils.WpIOUtils;
import org.wikibrain.utils.WpThreadUtils;

import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.sql.SQLException;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Loads the contents of a Wikidata JSON dump into the {@link WikidataDao}.
 *
 * <p>The dump is read line by line. Each line that contains a JSON object is
 * parsed into a {@link WikidataEntity}, pruned against the configured
 * languages, and saved when {@link #keepEntity(WikidataEntity)} accepts it.
 * Progress and error counts are recorded through the {@link MetaInfoDao}
 * under {@code WikidataEntity.class}.
 */
public class WikidataDumpLoader {
    private static final Logger LOG = LoggerFactory.getLogger(WikidataDumpLoader.class);

    /** Number of JSON lines handed to the parser so far; shared by all worker threads. */
    private final AtomicInteger counter = new AtomicInteger();

    private final MetaInfoDao metaDao;
    private final WikidataDao wikidataDao;
    private final UniversalPageDao universalPageDao;
    private final LanguageSet languages;
    private final WikidataParser wdParser = new WikidataParser();

    /** Universal ids known to the universal page dao for the configured languages. */
    private final TIntSet universalIds;

    /** When true, entities with at least one label are kept even if their id is unknown. */
    private boolean keepAllLabeledEntities = false;

    /**
     * Creates a loader and pre-computes the set of universal ids to keep.
     *
     * @param wikidataDao dao that receives the entities that are kept
     * @param metaDao     dao used to count processed records and errors
     * @param upDao       dao queried for the universal-to-local id maps
     * @param langs       languages passed to the id lookup and to entity pruning
     * @throws DaoException if the universal-to-local id maps cannot be retrieved
     */
    public WikidataDumpLoader(WikidataDao wikidataDao, MetaInfoDao metaDao, UniversalPageDao upDao, LanguageSet langs) throws DaoException {
        this.wikidataDao = wikidataDao;
        this.metaDao = metaDao;
        this.languages = langs;
        this.universalPageDao = upDao;

        // Build up the set of universal ids from the keys of every per-language map.
        // Iterating the values directly avoids a raw-typed local variable.
        this.universalIds = new TIntHashSet();
        for (TIntIntMap langMap : universalPageDao.getAllUnivToLocalIdsMap(languages).values()) {
            universalIds.addAll(langMap.keys());
        }
    }

    /**
     * Reads the dump one line at a time and processes the lines in parallel.
     *
     * <p>Every line that is processed without an exception increments the
     * record count for {@code WikidataEntity.class}; this includes lines that
     * {@link #save(File, String)} skips or entities it decides not to keep.
     * Parse and dao failures are logged, counted as errors, and do not abort
     * the load.
     *
     * @param file dump file to read; opened through {@code WpIOUtils.openBufferedReader}
     * @throws IOException if the file cannot be opened or the reader cannot be closed
     */
    public void load(final File file) throws IOException {
        final LineIterator lines = new LineIterator(WpIOUtils.openBufferedReader(file));
        try {
            ParallelForEach.iterate(
                    lines,
                    WpThreadUtils.getMaxThreads(),
                    1000,               // NOTE(review): presumably the work-queue size -- confirm against ParallelForEach
                    new Procedure<String>() {
                        @Override
                        public void call(String page) {
                            try {
                                save(file, page);
                                metaDao.incrementRecords(WikidataEntity.class);
                            } catch (WpParseException e) {
                                LOG.warn("parsing of " + file.getPath() + " failed:", e);
                                metaDao.incrementErrorsQuietly(WikidataEntity.class);
                            } catch (DaoException e) {
                                // A dao failure is a storage problem, not a parse problem.
                                LOG.warn("saving entity from " + file.getPath() + " failed:", e);
                                metaDao.incrementErrorsQuietly(WikidataEntity.class);
                            }
                        }
                    },
                    Integer.MAX_VALUE   // NOTE(review): presumably disables ParallelForEach's own progress logging -- confirm
            );
        } finally {
            // Release the underlying reader even when the iteration fails.
            lines.close();
        }
    }

    /**
     * Parses one line of the dump and stores the resulting entity if it should be kept.
     *
     * @param file dump file the line came from (not used by the current implementation)
     * @param json one raw line of the dump
     * @throws WpParseException if the line cannot be parsed
     * @throws DaoException     if the entity cannot be saved
     */
    private void save(File file, String json) throws WpParseException, DaoException {
        // Lines without an object (for example the enclosing array brackets) carry no entity.
        if (!json.contains("{")) {
            return;
        }
        json = json.trim();
        // Drop the trailing comma that separates entries so the line parses on its own.
        if (json.endsWith(",")) {
            json = json.substring(0, json.length() - 1);
        }
        // Capture the incremented value once; re-reading the counter could report
        // a different number when several threads are running.
        int processed = counter.incrementAndGet();
        if (processed % 100000 == 0) {
            LOG.info("processing wikidata entity " + processed);
        }
        WikidataEntity entity = wdParser.parse(json);
        // TODO: check if others use prune's boolean?
        entity.prune(languages);

        if (keepEntity(entity)) {
            wikidataDao.save(entity);
        }
    }

    /**
     * Decides whether an entity is stored.
     *
     * @param entity parsed (and already pruned) entity
     * @return true for properties, for entities whose id is a known universal id,
     *         and -- when {@code keepAllLabeledEntities} is set -- for entities with a label
     */
    private boolean keepEntity(WikidataEntity entity) {
        return entity.getType() == WikidataEntity.Type.PROPERTY
                || universalIds.contains(entity.getId())
                || (keepAllLabeledEntities && !entity.getLabels().isEmpty());
    }

    /**
     * @param keepAllLabeledEntities if true, entities that have at least one label
     *                               are kept even when their id is not a known universal id
     */
    public void setKeepAllLabeledEntities(boolean keepAllLabeledEntities) {
        this.keepAllLabeledEntities = keepAllLabeledEntities;
    }

    /**
     * Command-line entry point.
     *
     * <p>Accepts {@code -d/--drop-tables}, {@code -k/--keep-labeled}, the standard
     * environment options, and at most one positional argument naming the dump
     * file. Without a positional argument the most recent dump is downloaded into
     * the configured {@code download.path} directory unless it is already there.
     */
    public static void main(String[] args) throws ClassNotFoundException, SQLException, IOException, ConfigurationException, DaoException, WikiBrainException, java.text.ParseException, InterruptedException {
        Options options = new Options();
        options.addOption(
                new DefaultOptionBuilder()
                        .withLongOpt("drop-tables")
                        .withDescription("drop and recreate all tables")
                        .create("d"));
        options.addOption(
                new DefaultOptionBuilder()
                        .withLongOpt("keep-labeled")
                        .withDescription("keep all labeled entities")
                        .create("k"));
        EnvBuilder.addStandardOptions(options);

        CommandLineParser parser = new PosixParser();
        CommandLine cmd;
        try {
            cmd = parser.parse(options, args);
        } catch (ParseException e) {
            System.err.println("Invalid option usage: " + e.getMessage());
            new HelpFormatter().printHelp("WikidataDumpLoader", options);
            return;
        }

        Env env = new EnvBuilder(cmd).build();
        Configurator conf = env.getConfigurator();
        File path;
        if (cmd.getArgList().isEmpty()) {
            WikidataDumpHelper helper = new WikidataDumpHelper();

            // Fetch the file (if necessary) to the standard path
            String downloadDir = conf.getConf().get().getString("download.path");
            File dest = FileUtils.getFile(downloadDir, helper.getMostRecentFile());
            if (!dest.isFile()) {
                dest.getParentFile().mkdirs();
                File tmp = File.createTempFile("wikibrain-wikidata", "json");
                try {
                    // createTempFile leaves an empty file behind; remove it so the
                    // download starts from a path that does not exist yet.
                    FileUtils.deleteQuietly(tmp);
                    URL url = new URL(helper.getMostRecentUrl());
                    FileDownloader downloader = new FileDownloader();
                    downloader.download(url, tmp);
                    if (dest.isFile()) {
                        throw new IllegalStateException(
                                "destination file " + dest + " appeared while the dump was being downloaded");
                    }
                    FileUtils.moveFile(tmp, dest);
                } finally {
                    // No-op after a successful move; removes a partial download on failure.
                    FileUtils.deleteQuietly(tmp);
                }
            }
            path = dest;
        } else if (cmd.getArgList().size() == 1) {
            path = new File(cmd.getArgList().get(0).toString());
        } else {
            System.err.println("Invalid option usage:");
            new HelpFormatter().printHelp("WikidataDumpLoader", options);
            return;
        }

        WikidataDao wdDao = conf.get(WikidataDao.class);
        UniversalPageDao upDao = conf.get(UniversalPageDao.class);
        MetaInfoDao metaDao = conf.get(MetaInfoDao.class);
        LanguageSet langs = conf.get(LanguageSet.class);

        WikidataDumpLoader loader = new WikidataDumpLoader(wdDao, metaDao, upDao, langs);

        if (cmd.hasOption("d")) {
            wdDao.clear();
            // NOTE(review): load() records counts under WikidataEntity.class while this
            // clears WikidataStatement.class -- confirm which class is intended.
            metaDao.clear(WikidataStatement.class);
        }
        if (cmd.hasOption("k")) {
            loader.setKeepAllLabeledEntities(true);
        }
        wdDao.beginLoad();
        metaDao.beginLoad();
        loader.load(path);

        LOG.info("building indexes");
        wdDao.endLoad();
        metaDao.endLoad();
        LOG.info("finished");
    }
}




// © 2015 - 2025 Weber Informatics LLC | Privacy Policy