// org.apache.any23.rdf.RDFUtils Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.any23.rdf;
import org.apache.any23.util.MathUtils;
import org.apache.any23.util.StringUtils;
import org.eclipse.rdf4j.model.BNode;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.Literal;
import org.eclipse.rdf4j.model.Resource;
import org.eclipse.rdf4j.model.Statement;
import org.eclipse.rdf4j.model.Value;
import org.eclipse.rdf4j.model.ValueFactory;
import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
import org.eclipse.rdf4j.model.vocabulary.RDF;
import org.eclipse.rdf4j.rio.RDFFormat;
import org.eclipse.rdf4j.rio.RDFParser;
import org.eclipse.rdf4j.rio.RDFParserRegistry;
import org.eclipse.rdf4j.rio.RDFWriter;
import org.eclipse.rdf4j.rio.Rio;
import org.eclipse.rdf4j.rio.helpers.BasicParserSettings;
import org.eclipse.rdf4j.rio.helpers.StatementCollector;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.xml.datatype.DatatypeConfigurationException;
import javax.xml.datatype.DatatypeFactory;
import javax.xml.datatype.XMLGregorianCalendar;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.Writer;
import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Collection;
import java.util.Date;
import java.util.GregorianCalendar;
import java.util.Locale;
import java.util.Optional;
import java.util.TimeZone;
/**
* Basic class providing a set of utility methods when dealing with RDF.
*
* @author Michele Mostarda ([email protected])
* @author Davide Palmisano ([email protected])
* @author Jacek Grzebyta ([email protected])
*/
public class RDFUtils {
// Counter used to build identifiers for generated nodes; it is read and incremented by the makeIRI overloads.
// NOTE(review): plain static int with an unsynchronized read-then-increment - confirm callers are single-threaded.
private static int nodeId = 0;
// Shared factory used by the value-creating helpers of this class.
private static final ValueFactory valueFactory = SimpleValueFactory.getInstance();
// Class logger; used at trace level by isAbsoluteIRI.
private static final Logger LOG = LoggerFactory.getLogger(RDFUtils.class);
// Zero-length array handed to toArray(...) when converting collected statements.
private static final Statement[] EMPTY_STATEMENTS = new Statement[0];
// Utility class: the private constructor prevents instantiation.
private RDFUtils() {
}
/**
 * Fixes typical errors in an absolute IRI, such as unescaped spaces.
 *
 * <p>
 * The input is first cleaned by {@link #fixIRIWithException(String)}. The cleaned value must look like
 * {@code scheme:/...}; when it is an {@code http}/{@code https} IRI made only of an authority (host plus an
 * optional numeric port) a trailing slash is appended.
 *
 * @param uri
 *            an absolute IRI, possibly containing typical syntax errors
 *
 * @return the fixed absolute IRI
 *
 * @throws IllegalArgumentException
 *             if the IRI cannot be fixed or is not absolute
 */
public static String fixAbsoluteIRI(String uri) {
    String fixed = fixIRIWithException(uri);
    if (!fixed.matches("[a-zA-Z0-9]+:/.*")) {
        throw new IllegalArgumentException("not a absolute org.eclipse.rdf4j.model.IRI: " + uri);
    }
    // Add a trailing slash if the IRI has only an authority but no path.
    // The port is "[0-9]+": the quantifier must sit outside the character class, otherwise only
    // one-character ports match and e.g. "http://host:8080" never receives its slash.
    if (fixed.matches("https?://[a-zA-Z0-9.-]+(:[0-9]+)?")) {
        fixed = fixed + "/";
    }
    return fixed;
}
/**
 * Converts a textual date into an XML Schema compliant date-time string, given the pattern needed to parse
 * it.
 *
 * <p>
 * Parsing and rendering are both done in UTC, so the result does not depend on the JVM default time zone:
 * a text without zone information keeps its wall-clock fields, while a text carrying an offset is converted
 * to the corresponding UTC instant. The returned value always uses the {@code Z} zone designator.
 *
 * @param dateToBeParsed
 *            the String containing the date.
 * @param format
 *            the pattern as described in {@link java.text.SimpleDateFormat}
 *
 * @return a {@link String} representing the date
 *
 * @throws java.text.ParseException
 *             if there is an error parsing the given date.
 * @throws javax.xml.datatype.DatatypeConfigurationException
 *             if there is a serious configuration error.
 */
public static String getXSDDate(String dateToBeParsed, String format)
        throws ParseException, DatatypeConfigurationException {
    final TimeZone utc = TimeZone.getTimeZone("UTC");
    SimpleDateFormat simpleDateFormat = new SimpleDateFormat(format, Locale.ROOT);
    // Interpret zone-less input as UTC instead of the platform zone; this also avoids DST gaps
    // silently shifting the parsed wall-clock time.
    simpleDateFormat.setTimeZone(utc);
    Date date = simpleDateFormat.parse(dateToBeParsed);
    GregorianCalendar gc = new GregorianCalendar(utc, Locale.ROOT);
    gc.setTime(date);
    XMLGregorianCalendar xml = DatatypeFactory.newInstance().newXMLGregorianCalendar(gc);
    // The calendar is already in UTC; setting the offset explicitly guarantees the "Z" suffix.
    xml.setTimezone(0);
    return xml.toString();
}
/**
 * Prints a date to the XSD datetime format.
 *
 * <p>
 * The date is rendered with the pattern {@code yyyy-MM-dd'T'HH:mm:ssZ} (in the JVM default time zone, as no
 * zone is set on the formatter) and the RFC 822 offset {@code +hhmm} is rewritten as {@code +hh:mm}.
 *
 * @param date
 *            date to be printed.
 *
 * @return the string representation of the input date.
 */
public static String toXSDDateTime(Date date) {
    SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.ROOT);
    StringBuilder sb = new StringBuilder(simpleDateFormat.format(date));
    // The offset always ends with two minute digits, so place the colon relative to the end.
    // A fixed index (22) is only right for four-digit years and corrupts the offset otherwise.
    sb.insert(sb.length() - 2, ':');
    return sb.toString();
}
/**
 * Tries to fix a potentially broken relative or absolute IRI.
 *
 * <p>
 * The following rules are applied in order:
 * <ol>
 * <li>leading and trailing whitespace is trimmed;</li>
 * <li>every space is replaced with {@code %20} and every line feed is removed;</li>
 * <li>one leading and one trailing backslash or double quote is removed;</li>
 * <li>the value is rejected if it consists of a scheme only ({@code ^[a-zA-Z0-9]+:/?/?$});</li>
 * <li>the value is rejected if it starts with {@code javascript:};</li>
 * <li>everything from the first {@code >} is truncated (left-overs of broken markup);</li>
 * <li>the value is rejected if any of the characters {@code < > [ ] | * { } " \} is still present.</li>
 * </ol>
 *
 * @param unescapedIRI
 *            IRI string to be fixed.
 *
 * @return the fixed IRI string.
 *
 * @throws IllegalArgumentException
 *             if the input is {@code null} or matches one of the rejection rules above.
 */
public static String fixIRIWithException(String unescapedIRI) {
    if (unescapedIRI == null) {
        throw new IllegalArgumentException("org.eclipse.rdf4j.model.IRI was null");
    }
    // Remove starting and ending whitespace
    String escapedIRI = unescapedIRI.trim();
    // Replace space with %20
    escapedIRI = escapedIRI.replace(" ", "%20");
    // Strip linebreaks
    escapedIRI = escapedIRI.replace("\n", "");
    // Remove starting '\' or '"'
    if (escapedIRI.startsWith("\\") || escapedIRI.startsWith("\"")) {
        escapedIRI = escapedIRI.substring(1);
    }
    // Remove ending '\' or '"'
    if (escapedIRI.endsWith("\\") || escapedIRI.endsWith("\"")) {
        escapedIRI = escapedIRI.substring(0, escapedIRI.length() - 1);
    }
    // Reject a value that is only a protocol: ^[a-zA-Z0-9]+:/?/?$
    if (escapedIRI.matches("^[a-zA-Z0-9]+:/?/?$")) {
        throw new IllegalArgumentException("no authority in org.eclipse.rdf4j.model.IRI: " + unescapedIRI);
    }
    // Reject javascript: pseudo-links. String.matches() must match the WHOLE input, so a regex
    // such as "^javascript:" would only catch the bare scheme; a prefix test is what is intended.
    if (escapedIRI.startsWith("javascript:")) {
        throw new IllegalArgumentException("org.eclipse.rdf4j.model.IRI starts with javascript: " + unescapedIRI);
    }
    // Truncate ">.*$" from the end (Neko didn't quite manage to fix broken markup)
    escapedIRI = escapedIRI.replaceAll(">.*$", "");
    // Reject the value if any forbidden character appears ANYWHERE in it (a whole-string
    // matches() against a single character class would only catch one-character inputs).
    final String forbidden = "<>[]|*{}\"\\";
    for (int i = 0; i < escapedIRI.length(); i++) {
        if (forbidden.indexOf(escapedIRI.charAt(i)) >= 0) {
            throw new IllegalArgumentException("Invalid character in org.eclipse.rdf4j.model.IRI: " + unescapedIRI);
        }
    }
    return escapedIRI;
}
/**
 * Builds an {@link org.eclipse.rdf4j.model.IRI} from its full string form by delegating to the shared value
 * factory.
 *
 * @param iri
 *            the string the {@link org.eclipse.rdf4j.model.IRI} is created from
 *
 * @return the {@link org.eclipse.rdf4j.model.IRI} produced by the value factory
 */
public static org.eclipse.rdf4j.model.IRI iri(String iri) {
    final org.eclipse.rdf4j.model.IRI created = valueFactory.createIRI(iri);
    return created;
}
/**
 * Builds an {@link org.eclipse.rdf4j.model.IRI} from a namespace and a local name by delegating to the
 * shared value factory.
 *
 * @param namespace
 *            the namespace part of the {@link org.eclipse.rdf4j.model.IRI}
 * @param localName
 *            the local name to associate with the namespace
 *
 * @return the {@link org.eclipse.rdf4j.model.IRI} produced by the value factory
 */
public static org.eclipse.rdf4j.model.IRI iri(String namespace, String localName) {
    final org.eclipse.rdf4j.model.IRI created = valueFactory.createIRI(namespace, localName);
    return created;
}
/**
 * Wraps a string into a {@link Literal} using the shared value factory.
 *
 * @param s
 *            the label of the {@link org.eclipse.rdf4j.model.Literal}
 *
 * @return the {@link org.eclipse.rdf4j.model.Literal} produced by the value factory
 */
public static Literal literal(String s) {
    final Literal result = valueFactory.createLiteral(s);
    return result;
}
/**
 * Wraps a {@code boolean} into a {@link Literal} using the shared value factory.
 *
 * @param b
 *            the boolean value of the {@link org.eclipse.rdf4j.model.Literal}
 *
 * @return the {@link org.eclipse.rdf4j.model.Literal} produced by the value factory
 */
public static Literal literal(boolean b) {
    final Literal result = valueFactory.createLiteral(b);
    return result;
}
/**
 * Wraps a {@code byte} into a {@link Literal} using the shared value factory.
 *
 * @param b
 *            the byte value of the {@link org.eclipse.rdf4j.model.Literal}
 *
 * @return the {@link org.eclipse.rdf4j.model.Literal} produced by the value factory
 */
public static Literal literal(byte b) {
    final Literal result = valueFactory.createLiteral(b);
    return result;
}
/**
 * Wraps a {@code short} into a {@link Literal} using the shared value factory.
 *
 * @param s
 *            the short value of the {@link org.eclipse.rdf4j.model.Literal}
 *
 * @return the {@link org.eclipse.rdf4j.model.Literal} produced by the value factory
 */
public static Literal literal(short s) {
    final Literal result = valueFactory.createLiteral(s);
    return result;
}
/**
 * Wraps an {@code int} into a {@link Literal} using the shared value factory.
 *
 * @param i
 *            the int value of the {@link org.eclipse.rdf4j.model.Literal}
 *
 * @return the {@link org.eclipse.rdf4j.model.Literal} produced by the value factory
 */
public static Literal literal(int i) {
    final Literal result = valueFactory.createLiteral(i);
    return result;
}
/**
 * Wraps a {@code long} into a {@link Literal} using the shared value factory.
 *
 * @param l
 *            the long value of the {@link org.eclipse.rdf4j.model.Literal}
 *
 * @return the {@link org.eclipse.rdf4j.model.Literal} produced by the value factory
 */
public static Literal literal(long l) {
    final Literal result = valueFactory.createLiteral(l);
    return result;
}
/**
 * Wraps a {@code float} into a {@link Literal} using the shared value factory.
 *
 * @param f
 *            the float value of the {@link org.eclipse.rdf4j.model.Literal}
 *
 * @return the {@link org.eclipse.rdf4j.model.Literal} produced by the value factory
 */
public static Literal literal(float f) {
    final Literal result = valueFactory.createLiteral(f);
    return result;
}
/**
 * Wraps a {@code double} into a {@link Literal} using the shared value factory.
 *
 * @param d
 *            the double value of the {@link org.eclipse.rdf4j.model.Literal}
 *
 * @return the {@link org.eclipse.rdf4j.model.Literal} produced by the value factory
 */
public static Literal literal(double d) {
    final Literal result = valueFactory.createLiteral(d);
    return result;
}
/**
 * Creates a {@link Literal} from a label and an optional language tag.
 *
 * @param s
 *            the literal's label
 * @param l
 *            the literal's language; when {@code null} a literal without language tag is created
 *
 * @return the {@link org.eclipse.rdf4j.model.Literal} produced by the value factory
 */
public static Literal literal(String s, String l) {
    if (l != null) {
        return valueFactory.createLiteral(s, l);
    }
    // HACK: Workaround for ANY23 code that passes null in for language tag
    return valueFactory.createLiteral(s);
}
/**
 * Creates a typed {@link Literal} from a label and a datatype using the shared value factory.
 *
 * @param s
 *            the literal's label
 * @param datatype
 *            the literal's datatype
 *
 * @return the {@link org.eclipse.rdf4j.model.Literal} produced by the value factory
 */
public static Literal literal(String s, org.eclipse.rdf4j.model.IRI datatype) {
    final Literal typed = valueFactory.createLiteral(s, datatype);
    return typed;
}
/**
 * Creates a {@link BNode} carrying exactly the given identifier.
 *
 * @param id
 *            the identifier of the {@link org.eclipse.rdf4j.model.BNode}
 *
 * @return the {@link org.eclipse.rdf4j.model.BNode} produced by the value factory
 */
// TODO: replace this with all occurrences of #getBNode()
public static BNode bnode(String id) {
    final BNode blank = valueFactory.createBNode(id);
    return blank;
}
/**
 * Creates a {@link BNode} whose identifier is chosen by the shared value factory.
 *
 * @return a bnode with unique id.
 */
public static BNode bnode() {
    final BNode blank = valueFactory.createBNode();
    return blank;
}
/**
 * Creates a {@link BNode} whose identifier is {@code "node"} followed by the MD5 digest of the given name.
 *
 * @param id
 *            name of the {@link org.eclipse.rdf4j.model.BNode}, hashed with {@link MathUtils#md5(String)}
 *
 * @return the {@link org.eclipse.rdf4j.model.BNode} produced by the value factory
 */
public static BNode getBNode(String id) {
    final String hashedId = "node" + MathUtils.md5(id);
    return valueFactory.createBNode(hashedId);
}
/**
 * Creates a {@link Statement} from already-built RDF terms.
 *
 * @param s
 *            subject {@link org.eclipse.rdf4j.model.Resource}
 * @param p
 *            predicate {@link org.eclipse.rdf4j.model.IRI}
 * @param o
 *            object {@link org.eclipse.rdf4j.model.Value}
 *
 * @return the {@link org.eclipse.rdf4j.model.Statement} produced by the value factory
 */
public static Statement triple(Resource s, org.eclipse.rdf4j.model.IRI p, Value o) {
    final Statement statement = valueFactory.createStatement(s, p, o);
    return statement;
}
/**
 * Creates a statement of type: toValue(s), toValue(p), toValue(o).
 *
 * <p>
 * Each argument is converted with {@link #toValue(String)}; the subject is then cast to {@link Resource}
 * and the predicate to {@link org.eclipse.rdf4j.model.IRI}.
 *
 * @param s
 *            subject.
 * @param p
 *            predicate.
 * @param o
 *            object.
 *
 * @return a statement instance.
 */
public static Statement triple(String s, String p, String o) {
    final Resource subject = (Resource) toValue(s);
    final org.eclipse.rdf4j.model.IRI predicate = (org.eclipse.rdf4j.model.IRI) toValue(p);
    final Value object = toValue(o);
    return valueFactory.createStatement(subject, predicate, object);
}
/**
 * Creates a {@link Statement} with a context from already-built RDF terms.
 *
 * @param s
 *            subject.
 * @param p
 *            predicate.
 * @param o
 *            object.
 * @param g
 *            quad resource
 *
 * @return a statement instance.
 */
public static Statement quad(Resource s, org.eclipse.rdf4j.model.IRI p, Value o, Resource g) {
    final Statement statement = valueFactory.createStatement(s, p, o, g);
    return statement;
}
/**
 * Creates a statement of type: toValue(s), toValue(p), toValue(o), toValue(g).
 *
 * <p>
 * Each argument is converted with {@link #toValue(String)}; subject and graph are then cast to
 * {@link Resource} and the predicate to {@link org.eclipse.rdf4j.model.IRI}.
 *
 * @param s
 *            subject.
 * @param p
 *            predicate.
 * @param o
 *            object.
 * @param g
 *            quad resource
 *
 * @return a statement instance.
 */
public static Statement quad(String s, String p, String o, String g) {
    final Resource subject = (Resource) toValue(s);
    final org.eclipse.rdf4j.model.IRI predicate = (org.eclipse.rdf4j.model.IRI) toValue(p);
    final Value object = toValue(o);
    final Resource graph = (Resource) toValue(g);
    return valueFactory.createStatement(subject, predicate, object, graph);
}
/**
 * Creates a {@link Value} from a compact string form.
 *
 * <ul>
 * <li>If {@code s} equals {@code "a"} the result is {@link RDF#TYPE}.</li>
 * <li>If {@code s} matches {@code [a-z0-9]+:.*} the prefix is expanded using {@link PopularPrefixes}.</li>
 * <li>Otherwise a plain literal with label {@code s} is created.</li>
 * </ul>
 *
 * @param s
 *            string representation of value.
 *
 * @return a value instance.
 */
public static Value toValue(String s) {
    final Value value;
    if ("a".equals(s)) {
        value = RDF.TYPE;
    } else if (s.matches("[a-z0-9]+:.*")) {
        value = PopularPrefixes.get().expand(s);
    } else {
        value = valueFactory.createLiteral(s);
    }
    return value;
}
/**
 * Returns all the available {@link RDFFormat}s.
 *
 * <p>
 * The formats are the keys registered in the {@link RDFParserRegistry}. The return type is parameterized so
 * callers do not need unchecked casts; code written against the raw {@code Collection} keeps compiling.
 *
 * @return the collection of registered formats; callers should treat it as read-only.
 */
public static Collection<RDFFormat> getFormats() {
    return RDFParserRegistry.getInstance().getKeys();
}
/**
 * Obtains a fresh {@link RDFParser} for the given format through {@link Rio}.
 *
 * @param format
 *            parser format.
 *
 * @return parser instance.
 *
 * @throws IllegalArgumentException
 *             if format is not supported.
 */
public static RDFParser getParser(RDFFormat format) {
    final RDFParser parser = Rio.createParser(format);
    return parser;
}
/**
 * Obtains a fresh {@link RDFWriter} for the given format, emitting to a character {@link Writer}.
 *
 * @param format
 *            output format.
 * @param writer
 *            data output writer.
 *
 * @return writer instance.
 *
 * @throws IllegalArgumentException
 *             if format is not supported.
 */
public static RDFWriter getWriter(RDFFormat format, Writer writer) {
    final RDFWriter rdfWriter = Rio.createWriter(format, writer);
    return rdfWriter;
}
/**
 * Obtains a fresh {@link RDFWriter} for the given format, emitting to a byte {@link OutputStream}.
 *
 * @param format
 *            output format.
 * @param os
 *            output stream.
 *
 * @return writer instance.
 *
 * @throws IllegalArgumentException
 *             if format is not supported.
 */
public static RDFWriter getWriter(RDFFormat format, OutputStream os) {
    final RDFWriter rdfWriter = Rio.createWriter(format, os);
    return rdfWriter;
}
/**
 * Returns the parser format associated with the given file extension.
 *
 * <p>
 * A leading dot is added to the extension when missing, and the result is looked up with
 * {@link Rio#getParserFormatForFileName(String)}. The return type is parameterized so that callers can use
 * {@code orElseThrow(...)} and obtain an {@link RDFFormat} without casting.
 *
 * @param ext
 *            input extension, with or without the leading dot.
 *
 * @return the format matching the extension, or an empty {@link Optional} if none matches.
 */
public static Optional<RDFFormat> getFormatByExtension(String ext) {
    if (!ext.startsWith(".")) {
        ext = "." + ext;
    }
    return Rio.getParserFormatForFileName(ext);
}
/**
 * Parses the content of the {@code is} input stream with a parser for the given format, using
 * {@code baseIRI} as base IRI.
 *
 * <p>
 * The parser is configured to verify datatype values and to preserve blank node identifiers; all parsed
 * statements are collected and returned as an array.
 *
 * @param format
 *            input format type.
 * @param is
 *            input stream containing RDF.
 * @param baseIRI
 *            base IRI.
 *
 * @return list of statements detected within the input stream.
 *
 * @throws IOException
 *             if there is an error reading the {@link java.io.InputStream}
 */
public static Statement[] parseRDF(RDFFormat format, InputStream is, String baseIRI) throws IOException {
    final RDFParser rdfParser = getParser(format);
    rdfParser.getParserConfig().set(BasicParserSettings.VERIFY_DATATYPE_VALUES, true);
    rdfParser.setPreserveBNodeIDs(true);

    // Gather every statement reported by the parser.
    final StatementCollector collector = new StatementCollector();
    rdfParser.setRDFHandler(collector);
    rdfParser.parse(is, baseIRI);

    final Collection<Statement> parsed = collector.getStatements();
    return parsed.toArray(EMPTY_STATEMENTS);
}
/**
 * Parses the content of the {@code is} input stream with a parser for the given format, using the empty
 * string as base IRI.
 *
 * @param format
 *            input format type.
 * @param is
 *            input stream containing RDF.
 *
 * @return list of statements detected within the input stream.
 *
 * @throws IOException
 *             if there is an error reading the {@link java.io.InputStream}
 */
public static Statement[] parseRDF(RDFFormat format, InputStream is) throws IOException {
    final String noBaseIRI = "";
    return parseRDF(format, is, noBaseIRI);
}
/**
 * Parses the content of the {@code in} string with a parser for the given format, using the empty string as
 * base IRI. The string is handed to the parser as its UTF-8 bytes.
 *
 * @param format
 *            input format type.
 * @param in
 *            input string containing RDF.
 *
 * @return list of statements detected within the input string.
 *
 * @throws IOException
 *             if there is an error reading the {@link java.io.InputStream}
 */
public static Statement[] parseRDF(RDFFormat format, String in) throws IOException {
    final byte[] utf8 = in.getBytes(StandardCharsets.UTF_8);
    final InputStream source = new ByteArrayInputStream(utf8);
    return parseRDF(format, source);
}
/**
 * Parses the content of the {@code resource} file, guessing the content format from the extension.
 *
 * <p>
 * The resource is loaded with {@code RDFUtils.class.getResourceAsStream(resource)}. The stream is closed
 * once parsing completes, whether it succeeds or fails.
 *
 * @param resource
 *            resource name.
 *
 * @return the statements declared within the resource file.
 *
 * @throws IllegalArgumentException
 *             if the resource name has no extension.
 * @throws java.io.IOException
 *             if the resource cannot be found or an error occurs while reading it.
 */
public static Statement[] parseRDF(String resource) throws IOException {
    final int extIndex = resource.lastIndexOf('.');
    if (extIndex == -1) {
        throw new IllegalArgumentException("Error while detecting the extension in resource name " + resource);
    }
    final String extension = resource.substring(extIndex + 1);
    // Resolve the format first so an unsupported extension fails before the resource is opened.
    final RDFFormat format = getFormatByExtension(extension).orElseThrow(Rio.unsupportedFormat(extension));
    // try-with-resources: the classpath stream was previously never closed.
    try (InputStream is = RDFUtils.class.getResourceAsStream(resource)) {
        if (is == null) {
            // getResourceAsStream signals a missing resource with null; fail with a clear message
            // instead of handing null to the parser.
            throw new IOException("Cannot find resource " + resource);
        }
        return parseRDF(format, is);
    }
}
/**
 * Checks if {@code href} is absolute or not.
 *
 * <p>
 * The trimmed value must be accepted both by {@code SimpleValueFactory.createIRI} and by the
 * {@link java.net.URI} constructor; a failure of either is logged at trace level and reported as
 * {@code false}.
 *
 * @param href
 *            candidate IRI.
 *
 * @return {@code true} if {@code href} is absolute, {@code false} otherwise.
 */
public static boolean isAbsoluteIRI(String href) {
    boolean absolute;
    try {
        SimpleValueFactory.getInstance().createIRI(href.trim());
        new java.net.URI(href.trim());
        absolute = true;
    } catch (IllegalArgumentException e) {
        LOG.trace("Error processing href: {}", href, e);
        absolute = false;
    } catch (URISyntaxException e) {
        LOG.trace("Error interpreting href: {} as URI.", href, e);
        absolute = false;
    }
    return absolute;
}
/**
 * Shorthand for {@link #makeIRI(java.lang.String, org.eclipse.rdf4j.model.IRI) } with the type
 * {@code "node"}.
 *
 * @param docUri
 *            It is a namespace. If it ends with '/' or '#' it stays unchanged, otherwise the hash character
 *            '#' is added to the end.
 *
 * @return instance of {@link Resource}.
 */
public static Resource makeIRI(IRI docUri) {
    final String defaultType = "node";
    return makeIRI(defaultType, docUri);
}
/**
 * Shorthand for {@link #makeIRI(java.lang.String, org.eclipse.rdf4j.model.IRI, boolean) } with no node
 * identifier appended.
 *
 * @param type
 *            This argument is converted following Java naming conventions with
 *            {@link StringUtils#implementJavaNaming(java.lang.String) }.
 * @param docIRI
 *            It is a namespace. If it ends with '/' or '#' it stays unchanged, otherwise the hash character
 *            '#' is added to the end.
 *
 * @return instance of {@link Resource}.
 */
public static Resource makeIRI(String type, IRI docIRI) {
    final boolean withoutId = false;
    return makeIRI(type, docIRI, withoutId);
}
/**
 * Creates a {@link Resource} from the given type and document IRI.
 *
 * <p>
 * The resulting IRI is the document IRI, a {@code '#'} separator unless the document IRI already ends with
 * {@code '/'} or {@code '#'}, the converted type and, when requested, {@code '_'} followed by the current
 * node counter. The counter is advanced only after the IRI has been created.
 *
 * @param type
 *            This argument is converted following Java naming conventions with
 *            {@link StringUtils#implementJavaNaming(java.lang.String) }.
 * @param docIRI
 *            It is a namespace. If it ends with '/' or '#' it stays unchanged, otherwise the hash character
 *            '#' is added to the end.
 * @param addId
 *            If argument is TRUE then the node identifier is added to the end, formatted as
 *            <code>'_{int}'</code>.
 *
 * @return instance of {@link Resource}.
 */
public static Resource makeIRI(String type, IRI docIRI, boolean addId) {
    // Normalise the type name first, e.g. '-' becomes '_' and "word1 word2" becomes "word1Word2".
    final String javaName = StringUtils.implementJavaNaming(type);

    final String namespace = docIRI.toString();
    final boolean hasSeparator = namespace.endsWith("/") || namespace.endsWith("#");

    final StringBuilder target = new StringBuilder(namespace);
    if (!hasSeparator) {
        target.append("#");
    }
    target.append(javaName);
    if (addId) {
        target.append("_").append(Integer.toString(nodeId));
    }

    final Resource node = RDFUtils.iri(target.toString());
    // Consume the identifier only once the IRI was built successfully.
    if (addId) {
        nodeId++;
    }
    return node;
}
/**
 * Converts a string to either an IRI or a Literal.
 *
 * <p>
 * If {@link #isAbsoluteIRI(String)} accepts the string an {@link IRI} is created, otherwise the string
 * becomes the label of a simple {@link Literal}.
 *
 * @param inString
 *            an input string to manifest as {@link org.eclipse.rdf4j.model.Value}
 *
 * @return either {@link IRI} or {@link Literal}.
 */
public static Value makeIRI(String inString) {
    final boolean absolute = RDFUtils.isAbsoluteIRI(inString);
    if (!absolute) {
        return RDFUtils.literal(inString);
    }
    return RDFUtils.iri(inString);
}
/**
 * Creates a blank node whose identifier is the current value of the node counter, then advances the
 * counter.
 *
 * @return the newly created {@link BNode}, typed as {@link Value}.
 */
public static Value makeIRI() {
    final String currentId = Integer.toString(nodeId);
    final BNode blank = bnode(currentId);
    // Advance the counter only after the node has been created.
    nodeId++;
    return blank;
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy