
org.dbpedia.extraction.scripts.MapObjectUris.scala Maven / Gradle / Ivy
package org.dbpedia.extraction.scripts
import org.dbpedia.extraction.util.ConfigUtils.parseLanguages
import org.dbpedia.extraction.util.RichFile.wrapFile
import scala.collection.mutable.{Set,HashMap,MultiMap}
import java.io.File
import scala.Console.err
import org.dbpedia.extraction.util.SimpleWorkers
import org.dbpedia.extraction.util.Language
/**
* Maps old object URIs in triple files to new object URIs:
* - read one or more triple files that contain the URI mapping:
* - the predicate is ignored
* - read one or more files that need their object URI changed:
* - the predicate is ignored
* - literal values and quads without mapping for object URI are copied unchanged
*
* Redirects SHOULD be resolved in the following datasets:
*
* disambiguations
* infobox-properties
* mappingbased-properties
* page-links
* persondata
* topical-concepts
*
* Redirects seem to be so rare in categories that it doesn't make sense to resolve these:
*
* article-categories
* skos-categories
*
* The following datasets DO NOT have object URIs that can be redirected:
*
* category-labels
* external-links
* flickr-wrappr-links
* geo-coordinates
* homepages
* images
* infobox-property-definitions
* infobox-test
* instance-types
* iri-same-as-uri
* labels
* specific-mappingbased-properties
*
* Maybe we should resolve redirects in interlanguage-links, but we would have to integrate
* redirect resolution into interlanguage link resolution. We're pretty strict when we generate
* interlanguage-links-same-as. If we resolve redirects in interlanguage-links, we would probably
* gain a few interlanguage-links (tenths of a percent), but we would not eliminate errors.
*
* Example call:
* ../run MapObjectUris /data/dbpedia transitive-redirects .nt.gz infobox-properties,mappingbased-properties,... -redirected .nt.gz,.nq.gz 10000-
*
* The following should be redirected (as of 2012-07-11)
* disambiguations,infobox-properties,mappingbased-properties,page-links,persondata,topical-concepts
* (specific-mappingbased-properties is not necessary, it has only literal values)
*
* TODO: merge with CanonicalizeUris?
*/
object MapObjectUris {
private def split(arg: String): Array[String] = {
arg.split(",").map(_.trim).filter(_.nonEmpty)
}
def main(args: Array[String]): Unit = {
require(args != null && args.length >= 7,
"need at least seven args: " +
/*0*/ "base dir, " +
/*1*/ "comma-separated names of datasets mapping old URIs to new URIs (e.g. 'transitive-redirects'), "+
/*2*/ "mapping file suffix (e.g. '.nt.gz', '.ttl', '.ttl.bz2'), " +
/*3*/ "comma-separated names of input datasets (e.g. 'infobox-properties,mappingbased-properties'), "+
/*4*/ "output dataset name extension (e.g. '-redirected'), "+
/*5*/ "comma-separated input/output file suffixes (e.g. '.nt.gz,.nq.bz2', '.ttl', '.ttl.bz2'), " +
/*6*/ "languages or article count ranges (e.g. 'en,fr' or '10000-')")
val baseDir = new File(args(0))
val mappings = split(args(1))
require(mappings.nonEmpty, "no mapping datasets")
// Suffix of mapping files, for example ".nt", ".ttl.gz", ".nt.bz2" and so on.
// This script works with .nt, .ttl, .nq or .tql files, using IRIs or URIs.
val mappingSuffix = args(2)
require(mappingSuffix.nonEmpty, "no mapping file suffix")
val inputs = split(args(3))
require(inputs.nonEmpty, "no input datasets")
val extension = args(4)
require(extension.nonEmpty, "no result name extension")
// Suffixes of input/output files, for example ".nt", ".ttl.gz", ".nt.bz2" and so on.
// This script works with .nt, .ttl, .nq or .tql files, using IRIs or URIs.
val suffixes = split(args(5))
require(suffixes.nonEmpty, "no input/output file suffixes")
// Use all remaining args as keys or comma or whitespace separated lists of keys
val languages = parseLanguages(baseDir, args.drop(6))
require(languages.nonEmpty, "no languages")
// We really want to saturate CPUs and disk, so we use 50% more workers than CPUs
val workers = SimpleWorkers(1.5, 1.0) { language: Language =>
val finder = new DateFinder(baseDir, language)
// Redirects can have only one target, so we don't really need a MultiMap here.
// But CanonicalizeUris also uses a MultiMap... TODO: Make this configurable.
val map = new HashMap[String, Set[String]] with MultiMap[String, String]
for (mappping <- mappings) {
var count = 0
QuadReader.readQuads(finder, mappping + mappingSuffix, auto = true) { quad =>
if (quad.datatype != null) throw new IllegalArgumentException(language.wikiCode+": expected object uri, found object literal: "+quad)
// TODO: this wastes a lot of space. Storing the part after ...dbpedia.org/resource/ would
// be enough. Also, the fields of the Quad are derived by calling substring() on the whole
// line, which means that the character array for the whole line is kept in memory, which
// basically means that the whole redirects file is kept in memory. We should
// - only store the resource title in the map
// - use new String(quad.subject), new String(quad.value) to cut the link to the whole line
// - maybe use an index of titles as in ProcessInterLanguageLinks to avoid storing duplicate titles
map.addBinding(quad.subject, quad.value)
count += 1
}
err.println(language.wikiCode+": found "+count+" mappings")
}
for (input <- inputs; suffix <- suffixes) {
QuadMapper.mapQuads(finder, input + suffix, input + extension + suffix, required = false) { quad =>
if (quad.datatype != null) List(quad) // just copy quad with literal values. TODO: make this configurable
else map.get(quad.value) match {
case Some(uris) => for (uri <- uris) yield quad.copy(value = uri) // change object URI
case None => List(quad) // just copy quad without mapping for object URI. TODO: make this configurable
}
}
}
}
workers.start()
for (language <- languages) workers.process(language)
workers.stop()
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy