// ![JAR search and dependency download from the Maven repository](/logo.png)
// eu.fbk.twm.wiki.xmldump.DBpediaClassExtractor Maven / Gradle / Ivy
/*
* Copyright (2013) Fondazione Bruno Kessler (http://www.fbk.eu/)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package eu.fbk.twm.wiki.xmldump;
import eu.fbk.twm.index.PageClassIndexer;
import eu.fbk.twm.utils.*;
import eu.fbk.twm.wiki.xmldump.util.WikiTemplate;
import eu.fbk.twm.wiki.xmldump.util.WikiTemplateParser;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.OptionBuilder;
import org.apache.log4j.PropertyConfigurator;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Locale;
public class DBpediaClassExtractor extends AbstractWikipediaExtractor {
PageClassIndexer classWriter = null;
HashMap mappings = null;
DBpediaOntology ontology = null;
String page = null;
public DBpediaClassExtractor(int numThreads, Locale locale) {
super(numThreads, Integer.MAX_VALUE, locale);
}
public void start(String xmlFile, String outputDir, String mappingFile, String owlFile) {
start(xmlFile, outputDir, mappingFile, owlFile, null, null);
}
public void start(String xmlFile, String outputDir, String mappingFile, String owlFile, String addendaFile) {
start(xmlFile, outputDir, mappingFile, owlFile, addendaFile, null);
}
public void start(String xmlFile, String outputDir, String mappingFile, String owlFile, String addendaFile, String page) {
try {
logger.info("Loading mappings");
String lang = getLocale().toLanguageTag();
logger.info("Language: " + lang);
mappings = DBpediaMapping.loadFromFile(mappingFile).get(lang);
ontology = new AirpediaOntology(owlFile, addendaFile);
classWriter = new PageClassIndexer(outputDir, true);
this.page = page;
} catch (Exception e) {
logger.error(e.getMessage());
return;
}
startProcess(xmlFile);
}
@Override
public void start(ExtractorParameters extractorParameters) {
//To change body of implemented methods use File | Settings | File Templates.
}
@Override
public void disambiguationPage(String text, String title, int wikiID) {
//To change body of implemented methods use File | Settings | File Templates.
}
@Override
public void categoryPage(String text, String title, int wikiID) {
//To change body of implemented methods use File | Settings | File Templates.
}
@Override
public void templatePage(String text, String title, int wikiID) {
//To change body of implemented methods use File | Settings | File Templates.
}
@Override
public void redirectPage(String text, String title, int wikiID) {
//To change body of implemented methods use File | Settings | File Templates.
}
@Override
public void portalPage(String text, String title, int wikiID) {
//To change body of implemented methods use File | Settings | File Templates.
}
@Override
public void projectPage(String text, String title, int wikiID) {
//To change body of implemented methods use File | Settings | File Templates.
}
@Override
public void filePage(String text, String title, int wikiID) {
//To change body of implemented methods use File | Settings | File Templates.
}
@Override
public void contentPage(String text, String title, int wikiID) {
if (page != null && !page.equals(title)) {
return;
}
if (page != null) {
logger.debug(String.format("Found: %s", page));
}
if (mappings != null && mappings.size() > 0) {
ArrayList listOfTemplates = WikiTemplateParser.parse(text, false);
HashSet mapClasses = new HashSet<>();
for (WikiTemplate t : listOfTemplates) {
String fp = t.getFirstPart();
if (fp == null) {
continue;
}
String name = fp.toLowerCase();
name = name.replace(CharacterTable.SPACE, CharacterTable.LOW_LINE);
logger.trace(String.format("%s has template %s", title, name));
DBpediaMapping mapping = mappings.get(name);
if (mapping != null) {
logger.trace(String.format("Mapping found: %s", mapping));
if (mapping.getdBpediaMappingConditions().size() == 0) {
String mapClass = mapping.getMapping();
ArrayList nodes = ontology.getHistoryFromName(mapClass);
if (nodes == null) {
logger.trace("Nodes are null");
break; // If nodes is null, I don't want to go on!
}
logger.trace(String.format("%s ---> %s", title, nodes));
for (DBpediaOntologyNode n : nodes) {
mapClasses.add(n.className);
}
break; // Avoid multiple mappings in one page
}
HashMap parts = t.getHashMapOfParts(false, true);
String mapClass = mapping.applyConditions(parts);
ArrayList nodes = ontology.getHistoryFromName(mapClass);
if (nodes == null) {
logger.trace("Nodes are null");
break; // If nodes is null, I don't want to go on!
}
logger.trace(String.format("Conditional: %s [%s] --> %s", title, name, nodes));
for (DBpediaOntologyNode n : nodes) {
mapClasses.add(n.className);
}
break; // Avoid multiple mappings in one page
}
}
if (mapClasses.size() > 0) {
logger.debug(String.format("%s ---> %s", title, mapClasses));
synchronized (this) {
classWriter.add(title, mapClasses);
}
}
logger.trace(mapClasses);
}
if (page != null) {
logger.info(String.format("Page %s found, exiting", page));
System.exit(0);
}
}
@Override
public void endProcess() {
super.endProcess();
try {
classWriter.close();
} catch (Exception e) {
logger.error(e.getMessage());
}
}
public static void main(String args[]) throws IOException {
CommandLineWithLogger commandLineWithLogger = new CommandLineWithLogger();
commandLineWithLogger.addOption(OptionBuilder.withDescription("Mapping file").isRequired().hasArg().withArgName("file").withLongOpt("map-file").create("m"));
commandLineWithLogger.addOption(OptionBuilder.withDescription("XML dump").isRequired().hasArg().withArgName("file").withLongOpt("xml-dump").create("x"));
commandLineWithLogger.addOption(OptionBuilder.withDescription("Base folder").isRequired().hasArg().withArgName("folder").withLongOpt("base-folder").create("b"));
commandLineWithLogger.addOption(OptionBuilder.withDescription("DBpedia ontology file").isRequired().hasArg().withArgName("file").withLongOpt("ontology").create("o"));
commandLineWithLogger.addOption(OptionBuilder.withDescription("DBpedia addenda file").hasArg().withArgName("file").withLongOpt("addenda").create("a"));
commandLineWithLogger.addOption(OptionBuilder.withDescription("Number of threads (default=1)").hasArg().withArgName("num").withLongOpt("num-threads").create("t"));
commandLineWithLogger.addOption(OptionBuilder.withDescription("Check page").hasArg().withArgName("page").withLongOpt("page").create("p"));
CommandLine commandLine = null;
try {
commandLine = commandLineWithLogger.getCommandLine(args);
PropertyConfigurator.configure(commandLineWithLogger.getLoggerProps());
} catch (Exception e) {
System.exit(1);
}
String mappingFile = commandLine.getOptionValue("map-file");
String xmlFile = commandLine.getOptionValue("xml-dump");
String baseFolder = commandLine.getOptionValue("base-folder");
String owlFile = commandLine.getOptionValue("ontology");
String addendaFile = commandLine.getOptionValue("addenda");
String page = commandLine.getOptionValue("page");
Integer numThreads = 1;
if (commandLine.hasOption("num-threads")) {
numThreads = Integer.parseInt(commandLine.getOptionValue("num-threads"));
}
if (!baseFolder.endsWith(File.separator)) {
baseFolder += File.separator;
}
ExtractorParameters extractorParameters = new ExtractorParameters(xmlFile, baseFolder, true);
DBpediaClassExtractor dBpediaClassExtractor = new DBpediaClassExtractor(numThreads, extractorParameters.getLocale());
dBpediaClassExtractor.start(xmlFile, extractorParameters.getWikipediaDBPediaClassesIndexName(), mappingFile, owlFile, addendaFile, page);
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy