All downloads are free. Search and download functionality uses the official Maven repository.

eu.fbk.twm.wiki.xmldump.WikiDataClassExtractor Maven / Gradle / Ivy

/*
 * Copyright (2013) Fondazione Bruno Kessler (http://www.fbk.eu/)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package eu.fbk.twm.wiki.xmldump;

import eu.fbk.twm.utils.CommandLineWithLogger;
import eu.fbk.twm.utils.Defaults;
import eu.fbk.twm.utils.ExtractorParameters;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.OptionBuilder;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import org.codehaus.jackson.map.ObjectMapper;

import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Extracts a fixed set of WikiData claims from an XML dump and writes them to a
 * tab-separated text file.
 * <p>
 * Only content pages whose title has the form {@code Q<digits>} are processed. The page
 * text is parsed as JSON, and every entry of its {@code "claims"} array whose property ID
 * is registered in {@link #types} yields one output line:
 * <pre>
 * &lt;digits of the title&gt; TAB &lt;property label&gt; TAB &lt;value&gt;
 * </pre>
 * Writes to the output file are serialized by synchronizing on this instance.
 */
public class WikiDataClassExtractor extends AbstractWikipediaExtractor {
	/**
	 * Matches item page titles such as {@code Q42}; group 1 is the numeric part.
	 * Compiled once for all instances.
	 */
	static final Pattern q = Pattern.compile("^Q([0-9]+)$");

	/** Shared JSON parser, so that a new mapper is not built for every page. */
	private static final ObjectMapper MAPPER = new ObjectMapper();

	/** Explicit output encoding, so the result does not depend on the platform default charset. */
	private static final String OUTPUT_ENCODING = "UTF-8";

	/** Path of the output file, as passed to {@link #start(String, String)}. */
	private String outputFile;

	/** Output sink; {@code null} until {@link #start(String, String)} opened it successfully. */
	BufferedWriter writer = null;

	/** Property ID to output label; filled in the constructor and only read afterwards. */
	final HashMap<Integer, String> types = new HashMap<Integer, String>();

	Logger logger = Logger.getLogger(WikiDataClassExtractor.class.getName());

	/**
	 * Creates the extractor and registers the property IDs to extract.
	 *
	 * @param numThreads forwarded to the superclass constructor
	 * @param numPages   forwarded to the superclass constructor
	 * @param locale     forwarded to the superclass constructor
	 */
	public WikiDataClassExtractor(int numThreads, int numPages, Locale locale) {
		super(numThreads, numPages, locale);
		types.put(279, "subclass_of");
		types.put(361, "part_of");
		types.put(31, "instance_of");
		types.put(646, "freebase");
		types.put(508, "bncf");
		logger.info(types);
	}

	/**
	 * Opens the output file (UTF-8) and starts processing the dump.
	 * If the output file cannot be opened, the error is logged and nothing is processed.
	 *
	 * @param fileName   dump file, handed to {@code startProcess}
	 * @param outputFile file that receives the extracted claims; an existing file is overwritten
	 */
	public void start(String fileName, String outputFile) {
		this.outputFile = outputFile;
		if (outputFile == null) {
			logger.error("No output file specified");
			return;
		}

		try {
			writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outputFile), OUTPUT_ENCODING));
		} catch (IOException e) {
			// Keep the cause: the bare message alone is often not enough to diagnose the failure.
			logger.error("Unable to open output file " + outputFile, e);
			return;
		}

		startProcess(fileName);
	}

	/**
	 * Intentionally does nothing: this extractor is started through
	 * {@link #start(String, String)}.
	 */
	@Override
	public void start(ExtractorParameters extractorParameters) {
		// No-op.
	}

	/** Disambiguation pages are ignored by this extractor. */
	@Override
	public void disambiguationPage(String text, String title, int wikiID) {
		// No-op.
	}

	/** Category pages are ignored by this extractor. */
	@Override
	public void categoryPage(String text, String title, int wikiID) {
		// No-op.
	}

	/** Template pages are ignored by this extractor. */
	@Override
	public void templatePage(String text, String title, int wikiID) {
		// No-op.
	}

	/** Redirect pages are ignored by this extractor. */
	@Override
	public void redirectPage(String text, String title, int wikiID) {
		// No-op.
	}

	/** Portal pages are ignored by this extractor. */
	@Override
	public void portalPage(String text, String title, int wikiID) {
		// No-op.
	}

	/** Project pages are ignored by this extractor. */
	@Override
	public void projectPage(String text, String title, int wikiID) {
		// No-op.
	}

	/** File pages are ignored by this extractor. */
	@Override
	public void filePage(String text, String title, int wikiID) {
		// No-op.
	}

	/**
	 * Extracts the registered claims of one item page and appends them to the output file.
	 * <p>
	 * The page is skipped (and the reason logged) when the title is not of the form
	 * {@code Q<digits>}, when the text cannot be parsed as a JSON object, or when the
	 * claims do not have the expected structure.
	 *
	 * @param text   page text, expected to be a JSON object with a {@code "claims"} array
	 * @param title  page title
	 * @param wikiID page ID (not used)
	 */
	@Override
	public void contentPage(String text, String title, int wikiID) {

		Matcher m = q.matcher(title);
		if (!m.find()) {
			logger.trace("Invalid title: " + title);
			return;
		}
		String id = m.group(1);
		logger.trace("ID: " + id);

		Map<?, ?> pageData;
		try {
			pageData = MAPPER.readValue(text, Map.class);
		} catch (Exception e) {
			// Best effort: an unparsable page must not stop the whole run, but leave a trace.
			logger.debug("Skipping " + title + ": page text is not a JSON object", e);
			return;
		}

		String lines;
		try {
			lines = extractClaims(id, pageData);
		} catch (RuntimeException e) {
			// Missing keys, short arrays or unexpected element types: drop the whole page.
			logger.debug("Skipping " + title + ": unexpected claims structure", e);
			return;
		}

		if (lines.length() == 0) {
			// Nothing to write, so do not take the lock at all.
			return;
		}

		synchronized (this) {
			if (writer == null) {
				logger.warn("Output file is not open, dropping claims of " + title);
				return;
			}
			try {
				writer.write(lines);
			} catch (IOException e) {
				logger.warn("Unable to write claims of " + title, e);
			}
		}
	}

	/**
	 * Builds the output lines for all registered claims of one page.
	 * <p>
	 * Each claim is expected to be a map whose {@code "m"} entry is a list of the form
	 * {@code [kind, propertyId, dataType, value]}. Only claims with kind {@code "value"},
	 * a property ID registered in {@link #types} and data type
	 * {@code "wikibase-entityid"} or {@code "string"} produce a line.
	 *
	 * @param id       numeric part of the page title, used as first column
	 * @param pageData parsed page JSON
	 * @return the concatenated output lines, possibly empty
	 * @throws RuntimeException if the data does not have the expected structure
	 */
	private String extractClaims(String id, Map<?, ?> pageData) {
		StringBuilder sb = new StringBuilder();

		List<?> claims = (List<?>) pageData.get("claims");
		for (Object c : claims) {
			List<?> mainSnak = (List<?>) ((Map<?, ?>) c).get("m");

			// If it is not a value
			if (!"value".equals(mainSnak.get(0))) {
				continue;
			}

			// If it is not a good property
			Integer propID = (Integer) mainSnak.get(1);
			String label = types.get(propID);
			if (label == null) {
				continue;
			}

			Object dataType = mainSnak.get(2);
			Object value;
			if ("wikibase-entityid".equals(dataType)) {
				// The value is a reference to another item: emit its numeric ID.
				Map<?, ?> entity = (Map<?, ?>) mainSnak.get(3);
				value = (Integer) entity.get("numeric-id");
			} else if ("string".equals(dataType)) {
				value = mainSnak.get(3);
			} else {
				continue;
			}

			sb.append(id).append("\t").append(label).append("\t").append(value).append("\n");
		}

		return sb.toString();
	}

	/**
	 * Runs the superclass end-of-process handling, then closes the output file
	 * if it was opened. A failure to close is logged, not propagated.
	 */
	@Override
	public void endProcess() {
		super.endProcess();
		if (writer == null) {
			return;
		}
		try {
			writer.close();
		} catch (IOException e) {
			logger.error("Unable to close output file " + outputFile, e);
		}
	}

	/**
	 * Command line entry point.
	 * <p>
	 * Options: {@code -w} dump file (required), {@code -o} output file (required),
	 * {@code -t} number of threads, {@code -p} number of pages.
	 *
	 * @param args command line arguments
	 * @throws IOException declared for compatibility with existing callers
	 */
	public static void main(String[] args) throws IOException {

		CommandLineWithLogger commandLineWithLogger = new CommandLineWithLogger();

		commandLineWithLogger.addOption(OptionBuilder.withArgName("filename").hasArg().withDescription("WikiData xml dump file").isRequired().withLongOpt("wikipedia-dump").create("w"));
		commandLineWithLogger.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("Number of threads (default " + Defaults.DEFAULT_THREADS_NUMBER
				+ ")").withLongOpt("num-threads").create("t"));
		commandLineWithLogger.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("Number of pages").withLongOpt("num-pages").create("p"));
		commandLineWithLogger.addOption(OptionBuilder.withArgName("filename").hasArg().withDescription("Output file").isRequired().withLongOpt("output").create("o"));

		CommandLine commandLine = null;
		try {
			commandLine = commandLineWithLogger.getCommandLine(args);
			PropertyConfigurator.configure(commandLineWithLogger.getLoggerProps());
		} catch (Exception e) {
			// NOTE(review): presumably CommandLineWithLogger has already reported the problem - confirm.
			System.exit(1);
		}

		int numThreads = Defaults.DEFAULT_THREADS_NUMBER;
		if (commandLine.hasOption("num-threads")) {
			numThreads = Integer.parseInt(commandLine.getOptionValue("num-threads"));
		}

		int numPages = Integer.MAX_VALUE;
		if (commandLine.hasOption("p")) {
			numPages = Integer.parseInt(commandLine.getOptionValue("p"));
		}

		String wikidataFile = commandLine.getOptionValue("w");
		String outputFile = commandLine.getOptionValue("o");

		WikiDataClassExtractor wd = new WikiDataClassExtractor(numThreads, numPages, Locale.ENGLISH);
		wd.start(wikidataFile, outputFile);

	}
}