
// eu.fbk.twm.wiki.xmldump.DBpediaAllMappingsExtractor Maven / Gradle / Ivy
/*
* Copyright (2013) Fondazione Bruno Kessler (http://www.fbk.eu/)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package eu.fbk.twm.wiki.xmldump;
import eu.fbk.twm.utils.CharacterTable;
import eu.fbk.twm.utils.CommandLineWithLogger;
import eu.fbk.twm.utils.ExtractorParameters;
import eu.fbk.twm.utils.WikipediaExtractor;
import eu.fbk.twm.wiki.xmldump.util.WikiTemplate;
import eu.fbk.twm.wiki.xmldump.util.WikiTemplateParser;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.OptionBuilder;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import java.io.*;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class DBpediaAllMappingsExtractor extends AbstractWikipediaExtractor implements WikipediaExtractor {
/**
* Define a static logger variable so that it references the
* Logger instance named DBpediaMappingsExtractor
.
*/
static Logger logger = Logger.getLogger(DBpediaAllMappingsExtractor.class.getName());
private PrintWriter dbpediaWriter;
private static Pattern pt = Pattern.compile("^Mapping.([a-zA-Z0-9]+):(.*)");
private HashMap cache = new HashMap<>();
private HashMap redirect = new HashMap<>();
private static Pattern redirectPattern = Pattern.compile("#REDIRECT\\s+\\[\\[Mapping.{0,3}:(.*)\\]\\]");
private HashMap> ignore = new HashMap<>();
private HashSet added = new HashSet<>();
public DBpediaAllMappingsExtractor() {
super(1, 10000, new Locale("en"));
}
public void start(String in, String out, String manual) {
//todo: rename
cache = new HashMap<>();
redirect = new HashMap<>();
if (manual != null) {
try {
BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(manual), "utf-8"));
String line;
while ((line = reader.readLine()) != null) {
line = line.trim();
if (line.length() == 0) {
continue;
}
if (line.startsWith("#")) {
continue;
}
if (line.startsWith(Character.toString(CharacterTable.HYPHEN_MINUS))) {
String[] parts = line.split("\t");
String lang = parts[0].substring(1).trim();
String infoboxName = parts[1].trim();
if (ignore.get(lang) == null) {
ignore.put(lang, new HashSet());
}
ignore.get(lang).add(infoboxName);
}
else {
added.add(line);
}
}
reader.close();
} catch (Exception e) {
logger.error(e.getMessage());
}
}
logger.info("To ignore: " + ignore.size());
logger.debug(ignore);
logger.info("To add: " + added.size());
logger.debug(added);
try {
dbpediaWriter = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(out), "UTF-8")));
} catch (IOException e) {
logger.error(e);
}
startProcess(in);
}
@Override
public void contentPage(String text, String title, int wikiID) {
try {
title = URLDecoder.decode(title, "UTF-8");
} catch (Exception e) {
return;
}
Matcher mt = pt.matcher(title);
if (!mt.find()) {
logger.debug("Pattern not found: " + title);
return;
}
String infoboxName = mt.group(2);
String lang = mt.group(1);
if (ignore.get(lang) != null && ignore.get(lang).contains(infoboxName)) {
logger.debug("Infobox ignored: " + infoboxName);
return;
}
logger.debug("Title: " + title + " | Lang: " + lang + " | Infobox: " + infoboxName);
text = text.replaceAll("", "");
text = text.trim();
Matcher m = redirectPattern.matcher(text);
if (m.find()) {
String to = m.group(1);
redirect.put(lang + "\t" + infoboxName, lang + "\t" + to.trim().replace(CharacterTable.SPACE, CharacterTable.LOW_LINE));
return;
}
ArrayList listOfTemplates = WikiTemplateParser.parse(text, false);
for (WikiTemplate t : listOfTemplates) {
if (!t.isRoot) {
continue;
}
StringBuffer toPrint = new StringBuffer();
// toPrint.append(lang).append("\t");
// toPrint.append(infoboxName).append("\t");
// Conditional mapping
if (t.getFirstPart().equals("ConditionalMapping")) {
logger.trace("Conditional");
ArrayList conditions = WikiTemplateParser.parse(t.getContent(), false);
for (WikiTemplate tCond : conditions) {
if (!tCond.getFirstPart().equals("Condition")) {
continue;
}
HashMap pMap = tCond.getHashMapOfParts();
logger.trace("Property: " + pMap.get("templateProperty"));
logger.trace("Operator: " + pMap.get("operator"));
logger.trace("Value: " + pMap.get("value"));
logger.trace("Mapping: " + pMap.get("mapping"));
ArrayList mappings = WikiTemplateParser.parse(pMap.get("mapping"), false);
for (WikiTemplate tMap : mappings) {
if (tMap.isRoot && tMap.getFirstPart().equals("TemplateMapping")) {
String mapToClass = tMap.getHashMapOfParts().get("mapToClass");
if (mapToClass != null) {
toPrint.append(pMap.get("operator"));
toPrint.append("|");
if (pMap.get("templateProperty") != null) {
toPrint.append(pMap.get("templateProperty"));
}
toPrint.append("|");
if (pMap.get("value") != null) {
toPrint.append(pMap.get("value"));
}
toPrint.append("|");
toPrint.append(mapToClass);
toPrint.append("\t");
}
}
}
}
}
// Simple mapping
else if (t.getFirstPart().equals("TemplateMapping")) {
String mapToClass = t.getHashMapOfParts().get("mapToClass");
if (mapToClass != null) {
toPrint.append(mapToClass);
}
}
else {
continue;
}
cache.put(lang + "\t" + infoboxName, toPrint.toString());
StringBuffer finalPrint = new StringBuffer();
finalPrint.append(lang).append("\t");
finalPrint.append(infoboxName).append("\t");
finalPrint.append(toPrint);
String out = new String(finalPrint);
out = out.trim();
if (out.length() > 0) {
logger.debug(finalPrint);
synchronized (this) {
dbpediaWriter.append(out).append("\n");
}
}
}
}
@Override
public void filePage(String text, String title, int wikiID) {
//To change body of implemented methods use File | Settings | File Templates.
}
@Override
public void start(ExtractorParameters extractorParameters) {
//To change body of implemented methods use File | Settings | File Templates.
}
@Override
public void disambiguationPage(String text, String title, int wikiID) {
//To change body of implemented methods use File | Settings | File Templates.
}
@Override
public void categoryPage(String text, String title, int wikiID) {
//To change body of implemented methods use File | Settings | File Templates.
}
@Override
public void templatePage(String text, String title, int wikiID) {
//To change body of implemented methods use File | Settings | File Templates.
}
@Override
public void redirectPage(String text, String title, int wikiID) {
//To change body of implemented methods use File | Settings | File Templates.
}
@Override
public void portalPage(String text, String title, int wikiID) {
//To change body of implemented methods use File | Settings | File Templates.
}
@Override
public void projectPage(String text, String title, int wikiID) {
//To change body of implemented methods use File | Settings | File Templates.
}
public void endProcess() {
if (redirect.size() > 0) {
dbpediaWriter.append("\n");
for (String s : redirect.keySet()) {
StringBuffer finalPrint = new StringBuffer();
finalPrint.append(s).append("\t");
String to = cache.get(redirect.get(s));
if (to == null) {
continue;
}
finalPrint.append(to);
String out = new String(finalPrint);
out = out.trim();
if (out.length() > 0) {
logger.debug(finalPrint);
synchronized (this) {
dbpediaWriter.append(out).append("\n");
}
}
}
}
if (added.size() > 0) {
dbpediaWriter.append("\n");
for (String line : added) {
dbpediaWriter.append(line).append("\n");
}
}
dbpediaWriter.flush();
dbpediaWriter.close();
}
public static void main(String args[]) throws IOException {
CommandLineWithLogger commandLineWithLogger = new CommandLineWithLogger();
commandLineWithLogger.addOption(OptionBuilder.withDescription("Input file").hasArg().withArgName("file").withLongOpt("input").isRequired().create("i"));
commandLineWithLogger.addOption(OptionBuilder.withDescription("Output file").hasArg().withArgName("file").withLongOpt("output").isRequired().create("o"));
commandLineWithLogger.addOption(OptionBuilder.withDescription("Manual mappings file").hasArg().withArgName("file").withLongOpt("manual").create("m"));
CommandLine commandLine = null;
try {
commandLine = commandLineWithLogger.getCommandLine(args);
System.out.println(commandLineWithLogger.getLoggerProps());
PropertyConfigurator.configure(commandLineWithLogger.getLoggerProps());
} catch (Exception e) {
System.exit(1);
}
String xin = commandLine.getOptionValue("input");
String xout = commandLine.getOptionValue("output");
String manual = commandLine.getOptionValue("manual");
logger.debug("Debug message");
DBpediaAllMappingsExtractor extractor = new DBpediaAllMappingsExtractor();
extractor.start(xin, xout, manual);
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy