be.ugent.rml.Utils Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of rmlmapper Show documentation
Show all versions of rmlmapper Show documentation
The RMLMapper executes RML rules to generate high quality Linked Data from multiple originally (semi-)structured data sources.
The newest version!
package be.ugent.rml;
import be.ugent.rml.extractor.ConstantExtractor;
import be.ugent.rml.extractor.Extractor;
import be.ugent.rml.extractor.ReferenceExtractor;
import be.ugent.rml.store.Quad;
import be.ugent.rml.store.QuadStore;
import be.ugent.rml.term.Literal;
import be.ugent.rml.term.NamedNode;
import be.ugent.rml.term.Term;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;
import org.eclipse.rdf4j.common.net.ParsedIRI;
import org.eclipse.rdf4j.model.Model;
import org.eclipse.rdf4j.rio.RDFFormat;
import org.eclipse.rdf4j.rio.Rio;
import org.rdfhdt.hdt.enums.RDFNotation;
import org.rdfhdt.hdt.hdt.HDT;
import org.rdfhdt.hdt.hdt.HDTManager;
import org.rdfhdt.hdt.options.HDTSpecification;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.*;
import java.math.BigDecimal;
import java.net.HttpURLConnection;
import java.net.ServerSocket;
import java.net.URL;
import java.net.URLEncoder;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.security.SecureRandom;
import java.text.DecimalFormat;
import java.text.NumberFormat;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Stream;
/**
* General static utility functions
*/
public class Utils {
// SLF4J logger shared by the static helpers of this class.
private static final Logger logger = LoggerFactory.getLogger(Utils.class);
// Precompiled language-tag pattern; isValidrrLanguage matches whole strings against it.
// Without support for custom registered languages of length 5-8 of the IANA language-subtag-registry
private static final Pattern regexPatternLanguageTag = Pattern.compile("^((?:(en-GB-oed|i-ami|i-bnn|i-default|i-enochian|i-hak|i-klingon|i-lux|i-mingo|i-navajo|i-pwn|i-tao|i-tay|i-tsu|sgn-BE-FR|sgn-BE-NL|sgn-CH-DE)|(art-lojban|cel-gaulish|no-bok|no-nyn|zh-guoyu|zh-hakka|zh-min|zh-min-nan|zh-xiang))|((?:([A-Za-z]{2,3}(-(?:[A-Za-z]{3}(-[A-Za-z]{3}){0,2}))?)|[A-Za-z]{4})(-(?:[A-Za-z]{4}))?(-(?:[A-Za-z]{2}|[0-9]{3}))?(-(?:[A-Za-z0-9]{5,8}|[0-9][A-Za-z0-9]{3}))*(-(?:[0-9A-WY-Za-wy-z](-[A-Za-z0-9]{2,8})+))*(-(?:x(-[A-Za-z0-9]{1,8})+))?)|(?:x(-[A-Za-z0-9]{1,8})+))$");
/**
 * Opens a character reader for a location, using no base path and an empty content type.
 *
 * @param location remote (http/https) URL or local file path
 * @return whatever the three-argument overload returns for these defaults
 * @throws IOException if the three-argument overload throws it
 */
public static Reader getReaderFromLocation(String location) throws IOException {
    final File noBasePath = null;
    final String noContentType = "";
    return getReaderFromLocation(location, noBasePath, noContentType);
}
/**
 * Opens a character reader for a remote or local location.
 * Remote locations (see {@code isRemoteFile}) are fetched over HTTP(S); anything else is
 * resolved as a file via {@code getFile(location, basePath)}.
 *
 * @param location    remote URL or local file path
 * @param basePath    directory used to resolve relative local paths; may be {@code null}
 * @param contentType value sent as the {@code Accept} header for remote locations
 * @return a reader for the content, or {@code null} when opening the remote location
 *         failed with an {@link IOException} (existing best-effort contract, kept as is)
 * @throws IOException if the local file cannot be found or opened
 */
public static Reader getReaderFromLocation(String location, File basePath, String contentType) throws IOException {
    if (!isRemoteFile(location)) {
        return getReaderFromFile(getFile(location, basePath));
    }
    try {
        return getReaderFromURL(new URL(location), contentType);
    } catch (IOException e) {
        // Report through the class logger instead of dumping the stack trace to stderr.
        logger.error("Could not open a reader for remote location {}", location, e);
        return null;
    }
}
/**
 * Opens an input stream for a location without any caller-supplied request headers.
 * A fresh, empty header map is handed to the four-argument overload.
 *
 * @param location    remote URL or local file path
 * @param basePath    directory used to resolve relative local paths; may be {@code null}
 * @param contentType value sent as the {@code Accept} header for remote locations
 * @return whatever the four-argument overload returns
 * @throws IOException if the four-argument overload throws it
 */
public static InputStream getInputStreamFromLocation(String location, File basePath, String contentType) throws IOException {
    final Map<String, String> noExtraHeaders = new HashMap<>();
    return getInputStreamFromLocation(location, basePath, contentType, noExtraHeaders);
}
/**
 * Opens an input stream for a remote or local location.
 * Remote locations (see {@code isRemoteFile}) are requested over HTTP(S) with the given
 * headers; anything else is resolved as a file via {@code getFile(location, basePath)}.
 *
 * @param location    remote URL or local file path
 * @param basePath    directory used to resolve relative local paths; may be {@code null}
 * @param contentType value sent as the {@code Accept} header for remote locations
 * @param headers     request header names mapped to their values (remote locations only)
 * @return the opened stream; for remote locations this is whatever the URL helper returns,
 *         which is {@code null} when the connection fails
 * @throws IOException if the URL is malformed or the local file cannot be found
 */
public static InputStream getInputStreamFromLocation(String location, File basePath, String contentType, Map<String, String> headers) throws IOException {
    if (isRemoteFile(location)) {
        return getInputStreamFromURL(new URL(location), contentType, headers);
    }
    return getInputStreamFromFile(getFile(location, basePath));
}
/**
 * Get an InputStream from a string. This string is either a path (local or remote) to an RDF file,
 * or a raw RDF text.
 * If it's a path, conversion from Windows path separators to UNIX path separators is performed.
 * The file extension selects the RDF format; every format except Turtle is re-serialized to
 * Turtle. When the value cannot be opened as a location it is returned unchanged as UTF-8 bytes.
 *
 * @param mOptionValue input, either RDF file path or raw RDF text
 * @return input stream
 */
public static InputStream getInputStreamFromFileOrContentString(String mOptionValue) {
    logger.debug("{} mapping file", mOptionValue);
    String extension;
    try {
        // will throw illegalArgumentException on a windows NTFS if a ":" is present
        // on Windows a : is the identifier of an alternate data stream
        extension = FilenameUtils.getExtension(mOptionValue);
    } catch (IllegalArgumentException e) {
        return IOUtils.toInputStream(mOptionValue, StandardCharsets.UTF_8);
    }
    // Normalize Windows separators only on the copy used for path lookups. The raw-text
    // fallback below must see the untouched input, otherwise backslash escapes inside RDF
    // literals would be rewritten to slashes.
    final String location = mOptionValue.replace('\\', '/');
    try {
        switch (extension) {
            case "n3":
                return getTurtleInputStreamForFormat(location, RDFFormat.N3);
            case "nt":
                return getTurtleInputStreamForFormat(location, RDFFormat.NTRIPLES);
            case "nq":
                return getTurtleInputStreamForFormat(location, RDFFormat.NQUADS);
            case "rdf":
            case "xml":
                return getTurtleInputStreamForFormat(location, RDFFormat.RDFXML);
            case "json":
            case "jsonld":
                return getTurtleInputStreamForFormat(location, RDFFormat.JSONLD);
            case "ttl":
                return getInputStreamFromLocation(location, null, RDFFormat.TURTLE.getDefaultMIMEType());
            default:
                logger.info("Could not determine extension of file path. Trying Turtle format.");
                return getInputStreamFromLocation(location, null, "text/turtle");
        }
    } catch (IOException e) {
        logger.info("Trying to read mapping as raw input string.");
        return IOUtils.toInputStream(mOptionValue, StandardCharsets.UTF_8);
    }
}
/**
 * Reads RDF in the given format from a location and re-serializes it as Turtle in memory.
 *
 * @param mOptionValue location (path or URL) of the RDF document
 * @param format       RDF format used to parse the document
 * @return stream over the Turtle serialization of the parsed model
 * @throws IOException if the location cannot be opened or read
 */
private static InputStream getTurtleInputStreamForFormat(String mOptionValue, RDFFormat format) throws IOException {
    final String mimeType = format.getDefaultMIMEType();
    try (InputStream source = getInputStreamFromLocation(mOptionValue, null, mimeType)) {
        final Model parsed = Rio.parse(source, "", format);
        final ByteArrayOutputStream turtleBytes = new ByteArrayOutputStream();
        Rio.write(parsed, turtleBytes, RDFFormat.TURTLE);
        return new ByteArrayInputStream(turtleBytes.toByteArray());
    }
}
/**
 * Resolves a path to an existing file without an explicit base path.
 *
 * @param path absolute or relative path
 * @return the file found by the two-argument overload
 * @throws IOException if the two-argument overload cannot find the file
 */
public static File getFile(String path) throws IOException {
    final File defaultBasePath = null;
    return getFile(path, defaultBasePath);
}
/**
 * Get path based on basePath or (if not filled in) the user.dir.
 * This method assumes UNIX path separators.
 * Lookup order: the path itself when absolute, then relative to the base path, then relative
 * to the parent of the base path, and finally {@code MyFileUtils.getResourceAsFile(path)}.
 *
 * @param path     absolute or relative path of the file
 * @param basePath directory to resolve relative paths against; {@code null} means user.dir
 * @return the first existing file found
 * @throws IOException a {@link FileNotFoundException} naming the path when no lookup succeeds
 */
public static File getFile(String path, File basePath) throws IOException {
    // Absolute path?
    File f = new File(path);
    if (f.isAbsolute()) {
        if (f.exists()) {
            return f;
        }
        throw new FileNotFoundException(path);
    }
    if (basePath == null) {
        try {
            basePath = new File(System.getProperty("user.dir"));
        } catch (Exception e) {
            FileNotFoundException notFound = new FileNotFoundException(path);
            notFound.initCause(e);
            throw notFound;
        }
    }
    logger.debug("Looking for file {} in basePath {}", path, basePath);
    // Relative from user dir?
    f = new File(basePath, path);
    if (f.exists()) {
        return f;
    }
    logger.debug("File {} not found in {}", path, basePath);
    logger.debug("Looking for file {} in {} /../", path, basePath);
    // Relative from parent of user dir?
    f = new File(basePath, "../" + path);
    if (f.exists()) {
        return f;
    }
    logger.debug("File {} not found in {} /../", path, basePath);
    logger.debug("Looking for file {} in the resources directory", path);
    // Resource path?
    try {
        return MyFileUtils.getResourceAsFile(path);
    } catch (IOException ignored) {
        // Deliberately ignored: a failed resource lookup is reported below as "not found".
    }
    logger.debug("File {} not found in the resources directory", path);
    throw new FileNotFoundException(path);
}
/**
 * Opens the URL and wraps its stream in a buffered character reader.
 * NOTE(review): no charset is given to the InputStreamReader, so bytes are decoded with the
 * platform default charset.
 *
 * @param url location to read
 * @return buffered reader over the URL's content
 * @throws IOException if the stream cannot be opened
 */
public static Reader getReaderFromURL(URL url) throws IOException {
    final InputStream remote = url.openStream();
    final Reader decoded = new InputStreamReader(remote);
    return new BufferedReader(decoded);
}
/**
 * Requests the URL with the given Accept content type and wraps the response in a
 * buffered character reader. The stream returned by
 * {@code getInputStreamFromURL(url, contentType)} is passed on unchecked.
 *
 * @param url         location to read
 * @param contentType value sent as the {@code Accept} header
 * @return buffered reader over the response body
 * @throws IOException declared for callers; see the stream helper for connection handling
 */
public static Reader getReaderFromURL(URL url, String contentType) throws IOException {
    final InputStream body = getInputStreamFromURL(url, contentType);
    final Reader decoded = new InputStreamReader(body);
    return new BufferedReader(decoded);
}
/**
 * Opens a {@link FileReader} on the given file.
 *
 * @param file file to read
 * @return reader over the file's characters
 * @throws FileNotFoundException if the file cannot be opened for reading
 */
public static Reader getReaderFromFile(File file) throws FileNotFoundException {
    final FileReader fileReader = new FileReader(file);
    return fileReader;
}
/**
 * Opens a raw byte stream on the URL.
 *
 * @param url location to read
 * @return stream over the URL's content
 * @throws IOException if the stream cannot be opened
 */
public static InputStream getInputStreamFromURL(URL url) throws IOException {
    final InputStream opened = url.openStream();
    return opened;
}
/**
 * Performs an HTTP GET on the URL and returns the response body stream.
 * The request carries the given {@code Accept} value and a {@code charset: utf-8} header,
 * and redirects are followed.
 *
 * @param url         HTTP(S) URL; the connection is cast to {@link HttpURLConnection}
 * @param contentType value sent as the {@code Accept} header
 * @return the response stream, or {@code null} when connecting or reading fails with an
 *         {@link IOException} (existing best-effort contract, kept as is)
 */
public static InputStream getInputStreamFromURL(URL url, String contentType) {
    try {
        HttpURLConnection connection = (HttpURLConnection) url.openConnection();
        connection.setDoOutput(true);
        connection.setInstanceFollowRedirects(true);
        connection.setRequestMethod("GET");
        connection.setRequestProperty("Accept", contentType);
        connection.setRequestProperty("charset", "utf-8");
        connection.connect();
        return connection.getInputStream();
    } catch (IOException ex) {
        // Report through the class logger instead of dumping the stack trace to stderr.
        logger.error("Could not open an input stream for {}", url, ex);
        return null;
    }
}
/**
 * Performs an HTTP GET on the URL with extra request headers and returns the response stream.
 * A {@code charset: utf-8} header is sent unless the caller supplies its own {@code charset}
 * entry. The caller's map is only read, never modified.
 *
 * @param url         HTTP(S) URL; the connection is cast to {@link HttpURLConnection}
 * @param contentType value sent as the {@code Accept} header
 * @param headers     request header names mapped to their values
 * @return the response stream, or {@code null} when connecting or reading fails with an
 *         {@link IOException} (existing best-effort contract, kept as is)
 */
public static InputStream getInputStreamFromURL(URL url, String contentType, Map<String, String> headers) {
    try {
        HttpURLConnection connection = (HttpURLConnection) url.openConnection();
        connection.setDoOutput(true);
        connection.setInstanceFollowRedirects(true);
        connection.setRequestMethod("GET");
        connection.setRequestProperty("Accept", contentType);
        // Set encoding if not set before
        if (!headers.containsKey("charset")) {
            connection.setRequestProperty("charset", "utf-8");
        }
        // Apply all headers; only names are logged because values may carry credentials.
        for (Map.Entry<String, String> header : headers.entrySet()) {
            logger.debug("applying request header {}", header.getKey());
            connection.setRequestProperty(header.getKey(), header.getValue());
        }
        logger.debug("trying to connect");
        connection.connect();
        logger.debug("getting inputstream");
        InputStream inputStream = connection.getInputStream();
        logger.debug("got inputstream");
        return inputStream;
    } catch (IOException ex) {
        // Report through the class logger instead of dumping the stack trace to stderr.
        logger.error("Could not open an input stream for {}", url, ex);
        return null;
    }
}
/**
 * Performs an HTTP GET with extra request headers and fails explicitly when the server
 * answers 401 Unauthorized.
 * A {@code charset: utf-8} header is sent unless the caller supplies its own {@code charset}
 * entry. The caller's map is only read, never modified.
 *
 * @param url         HTTP(S) URL; the connection is cast to {@link HttpURLConnection}
 * @param contentType value sent as the {@code Accept} header
 * @param headers     request header names mapped to their values
 * @return the response stream, or {@code null} when connecting or reading fails with an
 *         {@link IOException} (existing best-effort contract, kept as is)
 * @throws Exception with message {@code "not authenticated"} when the response code is 401
 */
public static InputStream getInputStreamFromAuthURL(URL url, String contentType, Map<String, String> headers) throws Exception {
    try {
        HttpURLConnection connection = (HttpURLConnection) url.openConnection();
        connection.setDoOutput(true);
        connection.setInstanceFollowRedirects(true);
        connection.setRequestMethod("GET");
        connection.setRequestProperty("Accept", contentType);
        // Set encoding if not set before
        if (!headers.containsKey("charset")) {
            connection.setRequestProperty("charset", "utf-8");
        }
        // Apply all headers; only names are logged because values may carry credentials.
        for (Map.Entry<String, String> header : headers.entrySet()) {
            logger.debug("applying request header {}", header.getKey());
            connection.setRequestProperty(header.getKey(), header.getValue());
        }
        logger.debug("trying to connect");
        connection.connect();
        if (connection.getResponseCode() == HttpURLConnection.HTTP_UNAUTHORIZED) {
            throw new Exception("not authenticated");
        }
        logger.debug("getting inputstream");
        InputStream inputStream = connection.getInputStream();
        logger.debug("got inputstream");
        return inputStream;
    } catch (IOException ex) {
        // Report through the class logger instead of dumping the stack trace to stderr.
        logger.error("Could not open an input stream for {}", url, ex);
        return null;
    }
}
/**
 * Opens a {@link FileInputStream} on the given file.
 *
 * @param file file to read
 * @return stream over the file's bytes
 * @throws FileNotFoundException if the file cannot be opened for reading
 */
public static InputStream getInputStreamFromFile(File file) throws FileNotFoundException {
    final FileInputStream fileStream = new FileInputStream(file);
    return fileStream;
}
/**
 * Sends an HTTP POST whose body is the given bytes and returns the response stream.
 *
 * @param url         HTTP(S) URL; the connection is cast to {@link HttpURLConnection}
 * @param contentType value sent as the {@code Accept} header
 * @param auth        bytes written as the request body
 * @return the response stream, or {@code null} when the request fails with an
 *         {@link IOException} (existing best-effort contract, kept as is)
 */
public static InputStream getPostRequestResponse(URL url, String contentType, byte[] auth ){
    try {
        HttpURLConnection connection = (HttpURLConnection) url.openConnection();
        connection.setDoOutput(true);
        connection.setInstanceFollowRedirects(true);
        connection.setRequestMethod("POST");
        connection.setRequestProperty("Accept", contentType);
        connection.connect();
        // Close the request body stream once written so it is flushed and not leaked.
        try (OutputStream outputStream = connection.getOutputStream()) {
            outputStream.write(auth);
        }
        return connection.getInputStream();
    } catch (IOException ex) {
        // Report through the class logger instead of dumping the stack trace to stderr.
        logger.error("POST request to {} failed", url, ex);
        return null;
    }
}
/**
 * Tells whether a location is remote, i.e. starts with the {@code https://} or
 * {@code http://} scheme prefix (case-sensitive).
 *
 * @param location location string to inspect
 * @return {@code true} for http(s) locations, {@code false} otherwise
 */
public static boolean isRemoteFile(String location) {
    if (location.startsWith("https://")) {
        return true;
    }
    return location.startsWith("http://");
}
/**
 * Collects the subject of every quad, in input order.
 *
 * @param quads quads to read
 * @return new list holding one subject per quad
 */
public static List<Term> getSubjectsFromQuads(List<Quad> quads) {
    List<Term> subjects = new ArrayList<>(quads.size());
    for (Quad quad : quads) {
        subjects.add(quad.getSubject());
    }
    return subjects;
}
/**
 * Collects the object of every quad, in input order.
 *
 * @param quads quads to read
 * @return new list holding one object per quad
 */
public static List<Term> getObjectsFromQuads(List<Quad> quads) {
    List<Term> objects = new ArrayList<>(quads.size());
    for (Quad quad : quads) {
        objects.add(quad.getObject());
    }
    return objects;
}
/**
 * Collects the value of every quad's object, in input order.
 * Each object is cast to {@link Literal}; a non-literal object therefore raises a
 * {@link ClassCastException}.
 *
 * @param quads quads whose objects are literals
 * @return new list holding one literal value per quad
 */
public static List<String> getLiteralObjectsFromQuads(List<Quad> quads) {
    List<String> objects = new ArrayList<>(quads.size());
    for (Quad quad : quads) {
        objects.add(((Literal) quad.getObject()).getValue());
    }
    return objects;
}
/**
 * Reads the RDF collection starting at {@code first} into a new list.
 *
 * @param store quad store holding the rdf:first / rdf:rest statements
 * @param first head node of the collection
 * @return the collection's members as filled in by the three-argument overload
 */
public static List<Term> getList(QuadStore store, Term first) {
    List<Term> list = new ArrayList<>();
    return getList(store, first, list);
}
/**
 * Appends the members of the RDF collection starting at {@code first} to {@code list}.
 * The collection is walked node by node: the object of rdf:first is the member, the object
 * of rdf:rest is the next node, and rdf:nil ends the walk. A loop is used instead of
 * recursion so long collections do not deepen the call stack.
 * A node lacking rdf:first or rdf:rest raises {@link IndexOutOfBoundsException}, because
 * the first result is read without a presence check.
 *
 * @param store quad store holding the rdf:first / rdf:rest statements
 * @param first head node of the collection
 * @param list  list the members are appended to
 * @return the same {@code list} instance
 */
public static List<Term> getList(QuadStore store, Term first, List<Term> list) {
    final NamedNode nil = new NamedNode(NAMESPACES.RDF + "nil");
    final NamedNode rdfFirst = new NamedNode(NAMESPACES.RDF + "first");
    final NamedNode rdfRest = new NamedNode(NAMESPACES.RDF + "rest");
    Term node = first;
    while (!node.equals(nil)) {
        List<Term> values = Utils.getObjectsFromQuads(store.getQuads(node, rdfFirst, null));
        Term value = values.get(0);
        List<Term> rests = Utils.getObjectsFromQuads(store.getQuads(node, rdfRest, null));
        Term next = rests.get(0);
        // Add only after both lookups succeeded, so a malformed node adds nothing.
        list.add(value);
        node = next;
    }
    return list;
}
/**
 * Checks a language tag against the class's precompiled language-tag pattern,
 * which targets https://tools.ietf.org/html/bcp47#section-2.2.9
 *
 * @param s language tag
 * @return True if the whole string matches the language-tag pattern
 */
public static boolean isValidrrLanguage(String s) {
    final Matcher tagMatcher = regexPatternLanguageTag.matcher(s);
    return tagMatcher.matches();
}
/**
 * Percent-encodes a string for use in a URI.
 * The value is first form-encoded as UTF-8 with {@link URLEncoder}; afterwards every
 * {@code +} becomes {@code %20} and every {@code *} becomes {@code %2A}.
 *
 * @param url text to encode
 * @return encoded text
 */
public static String encodeURI(String url) {
    final String formEncoded = URLEncoder.encode(url, StandardCharsets.UTF_8);
    final StringBuilder out = new StringBuilder();
    // Single pass over the characters; a regex replace is avoided for performance.
    for (int i = 0; i < formEncoded.length(); i++) {
        final char ch = formEncoded.charAt(i);
        switch (ch) {
            case '+':
                out.append("%20");
                break;
            case '*':
                out.append("%2A");
                break;
            default:
                out.append(ch);
                break;
        }
    }
    return out.toString();
}
/**
 * Reads a whole file into a string.
 * The file is decoded by a {@link FileReader}, exactly as {@code getReaderFromFile} does.
 * The reader is closed even when reading fails, and content is accumulated in a
 * {@link StringBuilder} through a buffer rather than by per-character concatenation.
 *
 * @param file file to read
 * @return the file's complete content
 * @throws IOException if the file cannot be opened or read
 */
public static String fileToString(File file) throws IOException {
    final StringBuilder content = new StringBuilder();
    final char[] buffer = new char[8192];
    try (Reader reader = new FileReader(file)) {
        int read;
        while ((read = reader.read(buffer)) != -1) {
            content.append(buffer, 0, read);
        }
    }
    return content.toString();
}
/**
 * Hashes the set of columns selected by an SQL query.
 * Line breaks are replaced by spaces and the query is trimmed; the text between a leading
 * {@code SELECT} and the last {@code FROM} is taken as the column list. {@code DISTINCT}
 * and all spaces are removed from it, the comma-separated names are sorted, concatenated
 * and hashed.
 * todo: Take subqueries into account
 *
 * @param query SQL query starting with upper-case SELECT ... FROM
 * @return hash code of the sorted, concatenated column names
 * @throws Error if the query does not match SELECT ... FROM
 */
public static int selectedColumnHash(String query) {
    final String singleLine = query.replace("\n", " ").replace("\r", " ").trim();
    final Matcher selectClause = Pattern.compile("^SELECT(.*)FROM").matcher(singleLine);
    if (!selectClause.find()) {
        throw new Error("Invalid query: " + query);
    }
    final String[] columnNames = selectClause.group(1)
            .replace("DISTINCT", "")
            .replace(" ", "")
            .split(",");
    Arrays.sort(columnNames);
    final StringBuilder joined = new StringBuilder();
    for (String columnName : columnNames) {
        joined.append(columnName);
    }
    return joined.toString().hashCode();
}
/**
 * Simpler alternative to hashing only the selected columns: hashes the whole query text.
 *
 * @param query query text
 * @return {@link String#hashCode()} of the query
 */
public static int getHash(String query) {
    final int wholeQueryHash = query.hashCode();
    return wholeQueryHash;
}
/**
 * Reads a whole file into a string using the given charset.
 *
 * @param path     path of the file to read
 * @param encoding charset used to decode the bytes; {@code null} selects UTF-8
 * @return the decoded file content
 * @throws IOException if the file cannot be read
 */
public static String readFile(String path, Charset encoding) throws IOException {
    final Charset effective = (encoding != null) ? encoding : StandardCharsets.UTF_8;
    final byte[] raw = Files.readAllBytes(Paths.get(path));
    return new String(raw, effective);
}
/**
 * Builds a URL query string ({@code k1=v1&k2=v2}) from the given parameters.
 * Keys and values are form-encoded as UTF-8; entries appear in the map's iteration order.
 *
 * @param params parameter names mapped to their values
 * @return the encoded query string without a leading {@code ?}; empty for an empty map
 * @throws UnsupportedEncodingException if the UTF-8 encoding is not available
 */
public static String getURLParamsString(Map<String, String> params)
        throws UnsupportedEncodingException {
    StringJoiner query = new StringJoiner("&");
    for (Map.Entry<String, String> entry : params.entrySet()) {
        query.add(URLEncoder.encode(entry.getKey(), "UTF-8")
                + "="
                + URLEncoder.encode(entry.getValue(), "UTF-8"));
    }
    return query.toString();
}
/**
 * Asks the operating system for a currently free TCP port.
 * A server socket is bound to port 0, its assigned port is read, and the socket is closed
 * again (also when an exception occurs). The port is free at the moment of the call; nothing
 * reserves it afterwards.
 *
 * @return the port number that was assigned to the temporary socket
 * @throws IOException if the temporary socket cannot be opened
 */
public static int getFreePortNumber() throws IOException {
    try (ServerSocket temp = new ServerSocket(0)) {
        temp.setReuseAddress(true);
        return temp.getLocalPort();
    }
}
/**
 * This method parses the generic template and returns a list of Extractors
 * that can later be used by the executor
 * to get the data values from the records.
 * Text outside braces becomes a {@link ConstantExtractor}; text between an unescaped
 * <code>{</code> and <code>}</code> becomes a {@link ReferenceExtractor}. A backslash
 * escapes a following brace or backslash.
 *
 * @param template           template string; {@code null} yields an empty list
 * @param ignoreDoubleQuotes flag passed unchanged to every {@link ReferenceExtractor}
 * @return list of extractors
 * @throws Error if braces are nested or a closing brace has no opening brace
 **/
public static List<Extractor> parseTemplate(String template, boolean ignoreDoubleQuotes) {
    List<Extractor> extractors = new ArrayList<>();
    if (template == null) {
        return extractors;
    }
    StringBuilder current = new StringBuilder();
    // NOTE(review): this flag is only cleared by a following brace or backslash, so a
    // backslash before an ordinary character still escapes the next brace. Kept as is.
    boolean previousWasBackslash = false;
    boolean variableBusy = false;
    for (int i = 0; i < template.length(); i++) {
        final char c = template.charAt(i);
        if (c == '{') {
            if (previousWasBackslash) {
                current.append(c);
                previousWasBackslash = false;
            } else if (variableBusy) {
                throw new Error("Parsing of template failed. Probably a { was followed by a second { without first closing the first {. Make sure that you use { and } correctly.");
            } else {
                variableBusy = true;
                if (current.length() > 0) {
                    extractors.add(new ConstantExtractor(current.toString()));
                }
                current.setLength(0);
            }
        } else if (c == '}') {
            if (previousWasBackslash) {
                current.append(c);
                previousWasBackslash = false;
            } else if (variableBusy) {
                extractors.add(new ReferenceExtractor(current.toString(), ignoreDoubleQuotes));
                current.setLength(0);
                variableBusy = false;
            } else {
                throw new Error("Parsing of template failed. Probably a } as used before a { was used. Make sure that you use { and } correctly.");
            }
        } else if (c == '\\') {
            if (previousWasBackslash) {
                previousWasBackslash = false;
                current.append(c);
            } else {
                previousWasBackslash = true;
            }
        } else {
            current.append(c);
        }
    }
    // Whatever remains after the last brace is emitted as a constant.
    if (current.length() > 0) {
        extractors.add(new ConstantExtractor(current.toString()));
    }
    return extractors;
}
/**
 * Generates a random string of digits and ASCII letters using {@link SecureRandom}.
 *
 * @param len number of characters to generate
 * @return random alphanumeric string of length {@code len}
 */
public static String randomString(int len) {
    final char[] alphabet = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz".toCharArray();
    final SecureRandom random = new SecureRandom();
    final StringBuilder result = new StringBuilder(len);
    int remaining = len;
    while (remaining-- > 0) {
        result.append(alphabet[random.nextInt(alphabet.length)]);
    }
    return result.toString();
}
/**
 * Computes a decimal hash string for the given text.
 * For each character the value {@code (c * 31) ^ (length - 1 - index)} is summed, where
 * {@code ^} is bitwise XOR (not exponentiation); the absolute value of the sum is returned
 * as text. The character array is taken once instead of being re-copied several times per
 * iteration; the produced values are unchanged.
 * NOTE(review): the XOR looks like an intended power of 31; it is kept because changing it
 * would change every value this method has produced so far.
 *
 * @param s text to hash
 * @return decimal representation of the hash
 */
public static String hashCode(String s) {
    final char[] chars = s.toCharArray();
    final int lastIndex = chars.length - 1;
    int hash = 0;
    for (int i = 0; i < chars.length; i++) {
        hash += (chars[i] * 31) ^ (lastIndex - i);
    }
    return Integer.toString(Math.abs(hash));
}
/**
 * Converts an N-Triples file to an HDT file.
 * Failures are logged and not propagated. The generated HDT is closed even when saving fails.
 *
 * @param rdfInputPath  path of the N-Triples input file
 * @param hdtOutputPath path the HDT file is written to
 */
public static void ntriples2hdt(String rdfInputPath, String hdtOutputPath) {
    // Configuration variables
    String baseURI = "http://example.com/mydataset";
    String inputType = "ntriples";
    try {
        // Create HDT from RDF file
        HDT hdt = HDTManager.generateHDT(rdfInputPath, baseURI, RDFNotation.parse(inputType), new HDTSpecification(), null);
        try {
            // Save generated HDT to a file
            hdt.saveToHDT(hdtOutputPath, null);
        } finally {
            // IMPORTANT: Free resources
            hdt.close();
        }
    } catch (Exception e) {
        // Report through the class logger instead of dumping the stack trace to stderr.
        logger.error("Could not convert {} to HDT file {}", rdfInputPath, hdtOutputPath, e);
    }
}
/**
 * Tells whether a string can be parsed as an IRI.
 * The string is valid when constructing a {@link ParsedIRI} from it raises no exception.
 *
 * @param iri the IRI to validate.
 * @return true if the IRI is valid, else false.
 */
public static boolean isValidIRI(String iri) {
    boolean valid;
    try {
        new ParsedIRI(iri);
        valid = true;
    } catch (Exception e) {
        valid = false;
    }
    return valid;
}
/**
 * Tells whether a string is a relative IRI.
 * The string is parsed as a {@link ParsedIRI}; it is relative when the parsed IRI is not
 * absolute. Strings that fail to parse are reported as not relative.
 *
 * @param iri the IRI to check.
 * @return true if the IRI is relative, else false.
 */
public static boolean isRelativeIRI(String iri) {
    try {
        final boolean absolute = new ParsedIRI(iri).isAbsolute();
        return !absolute;
    } catch (Exception e) {
        return false;
    }
}
/**
 * Checks whether the parent directory of a path exists.
 * Without a base, an absolute path is checked directly and a relative path is resolved
 * against user.dir. With a base, the path is always resolved against that base.
 * A path that has no parent yields {@code false} rather than a NullPointerException.
 *
 * @param path path whose parent directory is checked
 * @param base base directory for resolving the path; {@code null} selects user.dir
 * @return {@code true} if the parent directory exists, {@code false} otherwise
 */
public static boolean checkPathParent(String path, String base) {
    String effectiveBase = base;
    if (effectiveBase == null) {
        File candidate = new File(path);
        if (candidate.isAbsolute()) {
            return parentDirectoryExists(candidate);
        }
        effectiveBase = System.getProperty("user.dir");
    }
    if (effectiveBase == null) {
        // No base could be determined at all.
        return false;
    }
    File basePath = new File(effectiveBase);
    logger.info("Looking for parent of file {} in basePath {}", path, basePath);
    // Relative from user dir?
    return parentDirectoryExists(new File(basePath, path));
}

/** Returns true when the file has a parent directory and that directory exists. */
private static boolean parentDirectoryExists(File file) {
    File parent = file.getParentFile();
    return parent != null && parent.exists();
}
/**
 * Get the base directive from a turtle file or return the default base.
 * The stream is read fully as UTF-8; when reading fails the content is treated as empty.
 *
 * @param is - input stream of the turtle file
 * @param defaultBase - default base to return if no base directive is found
 * @return - base directive or default base
 */
public static String getBaseDirectiveTurtleOrDefault(InputStream is, String defaultBase) {
    String content = "";
    try {
        content = IOUtils.toString(is, StandardCharsets.UTF_8);
    } catch (IOException e) {
        // Unreadable input is handled like an empty document, so the default base is used.
    }
    final String declaredBase = getBaseDirectiveTurtle(content);
    return declaredBase != null ? declaredBase : defaultBase;
}
/** Matches a Turtle {@code @base} directive; any whitespace may separate it from the IRI. */
private static final Pattern TURTLE_BASE_DIRECTIVE = Pattern.compile("@base\\s*<([^<>]*)>");

/**
 * Extracts the IRI of the first {@code @base} directive in a Turtle document.
 * The keyword may be followed by any amount of whitespace (including none, tabs or line
 * breaks) before the {@code <...>} IRI, as the Turtle grammar allows.
 *
 * @param turtle Turtle document text
 * @return the IRI between the angle brackets, or {@code null} if no directive is found
 */
public static String getBaseDirectiveTurtle(String turtle) {
    Matcher m = TURTLE_BASE_DIRECTIVE.matcher(turtle);
    return m.find() ? m.group(1) : null;
}
/**
 * Normalizes the lexical form of a value for the given XSD datatype IRI.
 * <ul>
 *   <li>decimal: parsed as a double and printed again</li>
 *   <li>integer: parsed with arbitrary precision and printed again; xsd:integer is unbounded,
 *       so values outside the 32-bit range are accepted</li>
 *   <li>double: parsed and rendered by {@code formatToScientific}</li>
 *   <li>boolean: "t", "true", "TRUE" and "1" become "true"; anything else "false"</li>
 *   <li>dateTime: spaces are replaced by "T"</li>
 *   <li>hexBinary, date, time and unknown datatypes: returned unchanged</li>
 * </ul>
 *
 * @param input    lexical value to normalize
 * @param datatype full IRI of the XSD datatype
 * @return the normalized lexical value
 * @throws NumberFormatException if a numeric datatype is given a non-numeric value
 */
public static String transformDatatypeString(String input, String datatype) {
    switch (datatype) {
        case "http://www.w3.org/2001/XMLSchema#hexBinary":
            // TODO
            return input;
        case "http://www.w3.org/2001/XMLSchema#decimal":
            // NOTE(review): going through double can lose precision and can print an
            // exponent for very large or small values; left unchanged to keep current output.
            return "" + Double.parseDouble(input);
        case "http://www.w3.org/2001/XMLSchema#integer":
            // Arbitrary precision: Integer.parseInt rejected anything beyond 32 bits.
            return new java.math.BigInteger(input).toString();
        case "http://www.w3.org/2001/XMLSchema#double":
            return formatToScientific(Double.parseDouble(input));
        case "http://www.w3.org/2001/XMLSchema#boolean":
            switch (input) {
                case "t":
                case "true":
                case "TRUE":
                case "1":
                    return "true";
                default:
                    return "false";
            }
        case "http://www.w3.org/2001/XMLSchema#date":
            return input;
        case "http://www.w3.org/2001/XMLSchema#time":
            return input;
        case "http://www.w3.org/2001/XMLSchema#dateTime":
            return input.replace(" ", "T");
        default:
            return input;
    }
}
/**
 * Computes a polynomial hash of a string: starting from 7, each character updates the
 * hash as {@code hash * 31 + character} (with normal int overflow).
 *
 * @param str text to hash
 * @return the resulting hash
 */
public static int getHashOfString(String str) {
    int accumulator = 7;
    for (char ch : str.toCharArray()) {
        accumulator = 31 * accumulator + ch;
    }
    return accumulator;
}
/**
 * Renders a double in scientific notation with one integer digit, at least one fraction
 * digit and an exponent, using US number symbols.
 * The number of optional fraction digits in the pattern is derived from the precision of
 * the value's decimal representation after stripping trailing zeros.
 *
 * @param d value to format
 * @return the value formatted with a pattern of the form {@code 0.0#...E0}
 */
private static String formatToScientific(Double d) {
    final BigDecimal stripped = BigDecimal.valueOf(d).stripTrailingZeros();
    int significantDigits = stripped.precision();
    if (stripped.scale() < 0) {
        // A negative scale means stripped trailing integer zeros; count them as digits too.
        significantDigits -= stripped.scale();
    }
    final StringBuilder pattern = new StringBuilder("0.0");
    int optionalDigits = significantDigits - 2;
    while (optionalDigits-- > 0) {
        pattern.append("#");
    }
    pattern.append("E0");
    final DecimalFormat scientific = (DecimalFormat) NumberFormat.getNumberInstance(Locale.US);
    scientific.applyPattern(pattern.toString());
    return scientific.format(d);
}
}