// com.bigdata.rdf.util.VocabBuilder (Maven / Gradle / Ivy listing header)
/*
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
[email protected]
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Created on Apr 30, 2012
*/
package com.bigdata.rdf.util;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.zip.GZIPInputStream;
import java.util.zip.ZipInputStream;
import org.apache.log4j.Logger;
import org.openrdf.model.Statement;
import org.openrdf.model.URI;
import org.openrdf.model.Value;
import org.openrdf.model.vocabulary.RDF;
import org.openrdf.rio.RDFFormat;
import org.openrdf.rio.RDFHandlerException;
import org.openrdf.rio.RDFParser;
import org.openrdf.rio.RDFParserFactory;
import org.openrdf.rio.RDFParserRegistry;
import org.openrdf.rio.helpers.RDFHandlerBase;
import com.bigdata.rdf.ServiceProviderHook;
import com.bigdata.rdf.rio.IRDFParserOptions;
import com.bigdata.rdf.rio.RDFParserOptions;
import com.bigdata.rdf.vocab.VocabularyDecl;
/**
* Utility class scans some RDF data, builds up a distribution over the distinct
* predicates, and generates a {@link VocabularyDecl} for the source data. This
* can be used to optimize the data density over source data sources.
*
* TODO This could generate one decl per file and then wrap them into a
* Vocabulary.
*
* @author bryan
*/
public class VocabBuilder {
private static final Logger log = Logger.getLogger(VocabBuilder.class);
private final IRDFParserOptions parserOptions;
// map reporting predicate frequency
private final Map preds = new LinkedHashMap();
// map reporting type frequency
private final Map types = new LinkedHashMap();
private VocabBuilder() {
parserOptions = new RDFParserOptions();
parserOptions.setStopAtFirstError(false);
parserOptions.setVerifyData(false);
}
@SuppressWarnings("deprecation")
protected void loadFiles(final int depth, final File file,
final String baseURI, final RDFFormat rdfFormat,
final FilenameFilter filter) throws IOException {
if (file.isDirectory()) {
if (log.isDebugEnabled())
log.debug("loading directory: " + file);
final File[] files = (filter != null ? file.listFiles(filter)
: file.listFiles());
for (int i = 0; i < files.length; i++) {
final File f = files[i];
loadFiles(depth + 1, f, baseURI, rdfFormat, filter);
}
return;
}
final String n = file.getName();
RDFFormat fmt = RDFFormat.forFileName(n);
if (fmt == null && n.endsWith(".zip")) {
fmt = RDFFormat.forFileName(n.substring(0, n.length() - 4));
}
if (fmt == null && n.endsWith(".gz")) {
fmt = RDFFormat.forFileName(n.substring(0, n.length() - 3));
}
if (fmt == null) // fallback
fmt = rdfFormat;
final RDFParserFactory rdfParserFactory = RDFParserRegistry
.getInstance().get(fmt);
if (rdfParserFactory == null) {
throw new RuntimeException("Parser factory not found: source="
+ file + ", fmt=" + fmt);
}
final RDFParser rdfParser = rdfParserFactory.getParser();
rdfParser.setVerifyData(parserOptions.getVerifyData());
rdfParser.setStopAtFirstError(parserOptions.getStopAtFirstError());
rdfParser.setDatatypeHandling(parserOptions.getDatatypeHandling());
rdfParser.setPreserveBNodeIDs(parserOptions.getPreserveBNodeIDs());
rdfParser.setRDFHandler(new AddStatementHandler());
InputStream is = null;
try {
is = new FileInputStream(file);
if (n.endsWith(".gz")) {
is = new GZIPInputStream(is);
} else if (n.endsWith(".zip")) {
is = new ZipInputStream(is);
}
/*
* Obtain a buffered reader on the input stream.
*/
final Reader reader = new BufferedReader(new InputStreamReader(is));
try {
// baseURI for this file.
final String s = baseURI != null ? baseURI : file.toURI()
.toString();
rdfParser.parse(reader, s);
return;
} catch (Exception ex) {
log.warn("Could not process file " + file + ": " + ex.getStackTrace());
} finally {
reader.close();
}
} finally {
if (is != null)
is.close();
}
}
private class AddStatementHandler extends RDFHandlerBase {
public AddStatementHandler() {
}
public void handleStatement(final Statement stmt)
throws RDFHandlerException {
final URI p = stmt.getPredicate();
// A. Count number of occurrences for predicate at hand
UriFrequency predFrequency = preds.get(p);
if (predFrequency == null) {
preds.put(p, predFrequency = new UriFrequency(p));
if (log.isDebugEnabled())
log.debug("New " + p + " : total=" + preds.size());
}
predFrequency.cnt++;
// B. For typing statements, also count occurrence of the type
if (stmt.getPredicate().equals(RDF.TYPE)) {
final Value o = stmt.getObject();
if (o instanceof URI) {
UriFrequency typeFrequency = types.get((URI)o);
if (typeFrequency == null) {
types.put((URI)o, typeFrequency = new UriFrequency((URI)o));
if (log.isDebugEnabled())
log.debug("New " + (URI)o + " : total=" + types.size());
}
typeFrequency.cnt++;
}
}
}
}
/**
* @param args
* The file(s) to read.
*
* @throws IOException
*/
public static void main(final String[] args) {
final boolean generate = true;
final int minFreq = 10;
final VocabBuilder v = new VocabBuilder();
final String baseURI = null; // unless overridden.
final RDFFormat rdfFormat = RDFFormat.RDFXML; // default
for (String file : args) {
if (log.isInfoEnabled())
log.info("file: " + file);
try {
v.loadFiles(0/* depth */, new File(file), baseURI, rdfFormat,
filter);
} catch (IOException ex) {
log.error("Could not read: file=" + file, ex);
}
}
// sort predicates
final int predsFrequencySize = v.preds.size();
final UriFrequency[] predsFrequency = v.preds.values().toArray(new UriFrequency[predsFrequencySize]);
if (log.isInfoEnabled())
log.info("Sorting " + predsFrequency.length + " predicate items from "
+ args.length + " files");
Arrays.sort(predsFrequency);
// sort types
final int typesFrequencySize = v.types.size();
final UriFrequency[] typesFrequency = v.types.values().toArray(new UriFrequency[typesFrequencySize]);
if (log.isInfoEnabled())
log.info("Sorting " + typesFrequency.length + " types items from " + args.length + " files");
Arrays.sort(typesFrequency);
if (!generate) {
// show predicates on the console
for (int i = 0; i < predsFrequencySize; i++) {
final UriFrequency prefFrequency = predsFrequency[i];
if (prefFrequency.cnt < minFreq)
break;
System.out.println("" + i + "\t" + prefFrequency.cnt + "\t" + prefFrequency.uri);
}
// show types on the console
for (int i = 0; i < typesFrequencySize; i++) {
final UriFrequency typeFrequency = typesFrequency[i];
if (typeFrequency.cnt < minFreq)
break;
System.out.println("" + i + "\t" + typeFrequency.cnt + "\t" + typeFrequency.uri);
}
} else {
/*
* Generate VocabularyDecl file.
*/
final String className = "MyVocabularyDecl";
System.out.println("import java.util.Arrays;");
System.out.println("import java.util.Collections;");
System.out.println("import java.util.Iterator;");
System.out.println("import org.openrdf.model.URI;");
System.out.println("import org.openrdf.model.impl.URIImpl;");
System.out.println("import com.bigdata.rdf.vocab.VocabularyDecl;");
System.out.println("public class "+className+" implements VocabularyDecl {");
System.out.println("static private final URI[] uris = new URI[] {");
System.out.println("// frequencies of predicates in dataset");
for (int i = 0; i < predsFrequencySize; i++) {
final UriFrequency predFrequency = predsFrequency[i];
if (predFrequency.cnt < minFreq)
break;
System.out.println("new URIImpl(\"" + predFrequency.uri + "\"), // rank="
+ i + ", count=" + predFrequency.cnt);
}
System.out.println("// frequencies of types in dataset");
for (int i = 0; i < typesFrequencySize; i++) {
final UriFrequency typeFrequency = typesFrequency[i];
if (typeFrequency.cnt < minFreq)
break;
System.out.println("new URIImpl(\"" + typeFrequency.uri + "\"), // rank="
+ i + ", count=" + typeFrequency.cnt);
}
System.out.println("};"); // end uris.
System.out.println("public " + className + "() {}");
System.out
.println("public Iterator values() {\n"
+ "return Collections.unmodifiableList(Arrays.asList(uris)).iterator();\n"
+ "}");
System.out.println("}"); // end class
}
}
/**
* A vocabulary item together with its frequency count.
*/
private static class UriFrequency implements Comparable {
/**
* The uri.
*/
final URI uri;
/**
* The #of instances of that predicate.
*/
int cnt;
public UriFrequency(final URI uri) {
this.uri = uri;
}
/**
* Place into descending order by count.
*/
@Override
public int compareTo(final UriFrequency arg0) {
return arg0.cnt - cnt;
}
}
/**
* Note: The filter is chosen to select RDF data files and to allow the data
* files to use owl, ntriples, etc as their file extension. gzip and zip
* extensions are also supported.
*/
final private static FilenameFilter filter = new FilenameFilter() {
public boolean accept(final File dir, final String name) {
if (new File(dir, name).isDirectory()) {
if(dir.isHidden()) {
// Skip hidden files.
return false;
}
// if(dir.getName().equals(".svn")) {
//
// // Skip .svn files.
// return false;
//
// }
// visit subdirectories.
return true;
}
// if recognizable as RDF.
boolean isRDF = RDFFormat.forFileName(name) != null
|| (name.endsWith(".zip") && RDFFormat.forFileName(name
.substring(0, name.length() - 4)) != null)
|| (name.endsWith(".gz") && RDFFormat.forFileName(name
.substring(0, name.length() - 3)) != null);
if (log.isDebugEnabled())
log.debug("dir=" + dir + ", name=" + name + " : isRDF=" + isRDF);
return isRDF;
}
};
/**
* Force the load of the various integration/extension classes.
*
* @see
* Class loader problems
*/
static {
ServiceProviderHook.forceLoad();
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy (web-listing footer)