All downloads are free. The search and download features use the official Maven repository.

eu.fbk.twm.wiki.xmldump.DBpediaAllMappingsExtractor Maven / Gradle / Ivy

/*
 * Copyright (2013) Fondazione Bruno Kessler (http://www.fbk.eu/)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package eu.fbk.twm.wiki.xmldump;

import eu.fbk.twm.utils.CharacterTable;
import eu.fbk.twm.utils.CommandLineWithLogger;
import eu.fbk.twm.utils.ExtractorParameters;
import eu.fbk.twm.utils.WikipediaExtractor;
import eu.fbk.twm.wiki.xmldump.util.WikiTemplate;
import eu.fbk.twm.wiki.xmldump.util.WikiTemplateParser;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.OptionBuilder;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;

import java.io.*;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class DBpediaAllMappingsExtractor extends AbstractWikipediaExtractor implements WikipediaExtractor {
	/**
	 * Define a static logger variable so that it references the
	 * Logger instance named DBpediaMappingsExtractor.
	 */
	static Logger logger = Logger.getLogger(DBpediaAllMappingsExtractor.class.getName());

	private PrintWriter dbpediaWriter;
	private static Pattern pt = Pattern.compile("^Mapping.([a-zA-Z0-9]+):(.*)");
	private HashMap cache = new HashMap<>();
	private HashMap redirect = new HashMap<>();
	private static Pattern redirectPattern = Pattern.compile("#REDIRECT\\s+\\[\\[Mapping.{0,3}:(.*)\\]\\]");
	private HashMap> ignore = new HashMap<>();
	private HashSet added = new HashSet<>();

	public DBpediaAllMappingsExtractor() {
		super(1, 10000, new Locale("en"));
	}

	public void start(String in, String out, String manual) {
		//todo: rename

		cache = new HashMap<>();
		redirect = new HashMap<>();

		if (manual != null) {
			try {
				BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(manual), "utf-8"));
				String line;
				while ((line = reader.readLine()) != null) {
					line = line.trim();
					if (line.length() == 0) {
						continue;
					}
					if (line.startsWith("#")) {
						continue;
					}
					if (line.startsWith(Character.toString(CharacterTable.HYPHEN_MINUS))) {
						String[] parts = line.split("\t");
						String lang = parts[0].substring(1).trim();
						String infoboxName = parts[1].trim();
						if (ignore.get(lang) == null) {
							ignore.put(lang, new HashSet());
						}
						ignore.get(lang).add(infoboxName);
					}
					else {
						added.add(line);
					}
				}
				reader.close();
			} catch (Exception e) {
				logger.error(e.getMessage());
			}
		}

		logger.info("To ignore: " + ignore.size());
		logger.debug(ignore);
		logger.info("To add: " + added.size());
		logger.debug(added);

		try {
			dbpediaWriter = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(out), "UTF-8")));
		} catch (IOException e) {
			logger.error(e);
		}
		startProcess(in);
	}

	@Override
	public void contentPage(String text, String title, int wikiID) {

		try {
			title = URLDecoder.decode(title, "UTF-8");
		} catch (Exception e) {
			return;
		}

		Matcher mt = pt.matcher(title);
		if (!mt.find()) {
			logger.debug("Pattern not found: " + title);
			return;
		}

		String infoboxName = mt.group(2);
		String lang = mt.group(1);

		if (ignore.get(lang) != null && ignore.get(lang).contains(infoboxName)) {
			logger.debug("Infobox ignored: " + infoboxName);
			return;
		}

		logger.debug("Title: " + title + " | Lang: " + lang + " | Infobox: " + infoboxName);

		text = text.replaceAll("", "");
		text = text.trim();

		Matcher m = redirectPattern.matcher(text);
		if (m.find()) {
			String to = m.group(1);
			redirect.put(lang + "\t" + infoboxName, lang + "\t" + to.trim().replace(CharacterTable.SPACE, CharacterTable.LOW_LINE));
			return;
		}

		ArrayList listOfTemplates = WikiTemplateParser.parse(text, false);
		for (WikiTemplate t : listOfTemplates) {

			if (!t.isRoot) {
				continue;
			}

			StringBuffer toPrint = new StringBuffer();
//			toPrint.append(lang).append("\t");
//			toPrint.append(infoboxName).append("\t");

			// Conditional mapping
			if (t.getFirstPart().equals("ConditionalMapping")) {
				logger.trace("Conditional");
				ArrayList conditions = WikiTemplateParser.parse(t.getContent(), false);
				for (WikiTemplate tCond : conditions) {
					if (!tCond.getFirstPart().equals("Condition")) {
						continue;
					}
					HashMap pMap = tCond.getHashMapOfParts();

					logger.trace("Property: " + pMap.get("templateProperty"));
					logger.trace("Operator: " + pMap.get("operator"));
					logger.trace("Value: " + pMap.get("value"));
					logger.trace("Mapping: " + pMap.get("mapping"));

					ArrayList mappings = WikiTemplateParser.parse(pMap.get("mapping"), false);
					for (WikiTemplate tMap : mappings) {
						if (tMap.isRoot && tMap.getFirstPart().equals("TemplateMapping")) {
							String mapToClass = tMap.getHashMapOfParts().get("mapToClass");
							if (mapToClass != null) {
								toPrint.append(pMap.get("operator"));
								toPrint.append("|");
								if (pMap.get("templateProperty") != null) {
									toPrint.append(pMap.get("templateProperty"));
								}
								toPrint.append("|");
								if (pMap.get("value") != null) {
									toPrint.append(pMap.get("value"));
								}
								toPrint.append("|");
								toPrint.append(mapToClass);
								toPrint.append("\t");
							}
						}

					}
				}

			}

			// Simple mapping
			else if (t.getFirstPart().equals("TemplateMapping")) {
				String mapToClass = t.getHashMapOfParts().get("mapToClass");
				if (mapToClass != null) {
					toPrint.append(mapToClass);
				}
			}

			else {
				continue;
			}

			cache.put(lang + "\t" + infoboxName, toPrint.toString());

			StringBuffer finalPrint = new StringBuffer();
			finalPrint.append(lang).append("\t");
			finalPrint.append(infoboxName).append("\t");
			finalPrint.append(toPrint);

			String out = new String(finalPrint);
			out = out.trim();
			if (out.length() > 0) {
				logger.debug(finalPrint);
				synchronized (this) {
					dbpediaWriter.append(out).append("\n");
				}
			}

		}
	}

	@Override
	public void filePage(String text, String title, int wikiID) {
		//To change body of implemented methods use File | Settings | File Templates.
	}

	@Override
	public void start(ExtractorParameters extractorParameters) {
		//To change body of implemented methods use File | Settings | File Templates.
	}

	@Override
	public void disambiguationPage(String text, String title, int wikiID) {
		//To change body of implemented methods use File | Settings | File Templates.
	}

	@Override
	public void categoryPage(String text, String title, int wikiID) {
		//To change body of implemented methods use File | Settings | File Templates.
	}

	@Override
	public void templatePage(String text, String title, int wikiID) {
		//To change body of implemented methods use File | Settings | File Templates.
	}

	@Override
	public void redirectPage(String text, String title, int wikiID) {
		//To change body of implemented methods use File | Settings | File Templates.
	}

	@Override
	public void portalPage(String text, String title, int wikiID) {
		//To change body of implemented methods use File | Settings | File Templates.
	}

	@Override
	public void projectPage(String text, String title, int wikiID) {
		//To change body of implemented methods use File | Settings | File Templates.
	}

	public void endProcess() {

		if (redirect.size() > 0) {
			dbpediaWriter.append("\n");
			for (String s : redirect.keySet()) {
				StringBuffer finalPrint = new StringBuffer();
				finalPrint.append(s).append("\t");

				String to = cache.get(redirect.get(s));
				if (to == null) {
					continue;
				}
				finalPrint.append(to);

				String out = new String(finalPrint);
				out = out.trim();
				if (out.length() > 0) {
					logger.debug(finalPrint);
					synchronized (this) {
						dbpediaWriter.append(out).append("\n");
					}
				}

			}
		}

		if (added.size() > 0) {
			dbpediaWriter.append("\n");
			for (String line : added) {
				dbpediaWriter.append(line).append("\n");
			}
		}

		dbpediaWriter.flush();
		dbpediaWriter.close();
	}

	public static void main(String args[]) throws IOException {
		CommandLineWithLogger commandLineWithLogger = new CommandLineWithLogger();

		commandLineWithLogger.addOption(OptionBuilder.withDescription("Input file").hasArg().withArgName("file").withLongOpt("input").isRequired().create("i"));
		commandLineWithLogger.addOption(OptionBuilder.withDescription("Output file").hasArg().withArgName("file").withLongOpt("output").isRequired().create("o"));
		commandLineWithLogger.addOption(OptionBuilder.withDescription("Manual mappings file").hasArg().withArgName("file").withLongOpt("manual").create("m"));

		CommandLine commandLine = null;
		try {
			commandLine = commandLineWithLogger.getCommandLine(args);
			System.out.println(commandLineWithLogger.getLoggerProps());
			PropertyConfigurator.configure(commandLineWithLogger.getLoggerProps());
		} catch (Exception e) {
			System.exit(1);
		}

		String xin = commandLine.getOptionValue("input");
		String xout = commandLine.getOptionValue("output");
		String manual = commandLine.getOptionValue("manual");

		logger.debug("Debug message");

		DBpediaAllMappingsExtractor extractor = new DBpediaAllMappingsExtractor();
		extractor.start(xin, xout, manual);
	}

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy