org.eclipse.rdf4j.benchmark.rio.RDFTestDataset Maven / Gradle / Ivy
/*******************************************************************************
* Copyright (c) 2021 Eclipse RDF4J contributors.
*
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Distribution License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/org/documents/edl-v10.php.
*
* SPDX-License-Identifier: BSD-3-Clause
*******************************************************************************/
package org.eclipse.rdf4j.benchmark.rio;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UncheckedIOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.zip.GZIPInputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import org.eclipse.rdf4j.common.io.IOUtil;
/**
* This enum holds locations for RDF files from the web. It allows downloading these into a a temporary location in
* {@code {java.io.tmpdir}/rdf4j-benchmark-datasets/{filename} }. Please note
*
* @author Frens Jan Rumph
*/
public enum RDFTestDataset {
/**
*
* A dump of the Berlin SPARQL
* Benchmark made available by the HOBBIT project.
*
*
* 25 GB extracted.
*
*/
BSBM("bsbm.nt", "dataset.nt",
"https://hobbitdata.informatik.uni-leipzig.de/benchmarks-data/datasets-dumps/bsbm-dump.zip"),
/**
*
* A dump of the FishMark
* benchmark made available by the HOBBIT project.
*
*
* 1.8 GB extracted
*
*/
FISHMARK("fishmark.nt", "fishmark-size1.nt",
"https://hobbitdata.informatik.uni-leipzig.de/benchmarks-data/datasets-dumps/fishmark-dump.zip"),
/**
*
* A dump of the
* SP2Bench benchmark
* made available by the HOBBIT project.
*
*
* 2.7 GB extracted
*
*/
SP2BENCH("sp2b.n3", "sp2b.n3",
"https://hobbitdata.informatik.uni-leipzig.de/benchmarks-data/datasets-dumps/sp2bench-dump.zip"),
/**
*
* A dump of the Semantic Web Dog Food dataset made available by the HOBBIT
* project.
*
*
* 49 MB extracted
*
*/
SWDF("swdf.nt", "SWDF.nt",
"https://hobbitdata.informatik.uni-leipzig.de/benchmarks-data/datasets-dumps/swdf-dump.zip"),
/**
*
* A dump of the gene database of National Center for Biotechnology Information made available by the
* Bio2RDF project.
*
*
* 5.6 GB extracted
*
*/
GENE2GO("gene2go.nq",
"https://download.bio2rdf.org/files/release/3/ncbigene/gene2go.nq.gz"),
/**
*
* A dump of the Lexvo.org data.
*
*
* 67 MB extracted
*
*/
LEXVO("lexvo_latest.rdf",
"http://www.lexvo.org/resources/lexvo_latest.rdf.gz"),
/**
*
* A Data Catalogue Vocabulary file from Federal Public Service Policy and
* Support DG Digital Transformation's github repository.
*
*
* 139 MB extracted
*
*/
DATAGOVBE("datagovbe.nt",
"https://github.com/Fedict/dcat/raw/master/all/datagovbe.nt.gz");
/**
* Name of the target file.
*/
private final String fileName;
/**
* Name of the file to extract from the archive; {@code null} if the file is not part of a multi file archive (e.g.
* ZIP file).
*/
private final String archiveEntryName;
/**
* The {@link URL} to download the file from.
*/
private final URL url;
RDFTestDataset(String fileName, String url) {
this(fileName, null, url);
}
RDFTestDataset(String fileName, String archiveEntryName, String url) {
this.archiveEntryName = archiveEntryName;
this.fileName = fileName;
try {
this.url = new URL(url);
} catch (MalformedURLException e) {
throw new RuntimeException("Statically defiled URL " + url + " is unexpectedly malformed", e);
}
}
/**
* Download the dataset file to {@code {java.io.tmpdir}/rdf4j-benchmark-datasets/{fileName}}.
*
* @return The {@link File} to which the dataset was downloaded and extracted.
*/
public File download() {
File downloadDir = new File(System.getProperty("java.io.tmpdir"), "rdf4j-benchmark-datasets");
File dataFile = new File(downloadDir, fileName);
if (dataFile.exists()) {
return dataFile;
}
try {
Files.createDirectories(downloadDir.toPath());
File downloadFile = new File(downloadDir, Paths.get(url.getPath()).getFileName().toString());
if (!downloadFile.exists()) {
downloadTo(downloadFile);
}
if (!downloadFile.equals(dataFile)) {
extract(downloadFile, dataFile);
downloadFile.delete();
}
} catch (IOException e) {
throw new UncheckedIOException(e);
}
return dataFile;
}
private void downloadTo(File downloadFile) throws IOException {
System.out.println("Downloading " + url);
try (InputStream is = new BufferedInputStream(url.openStream())) {
IOUtil.transfer(is, downloadFile);
}
}
private void extract(File downloadFile, File dataFile) throws IOException {
String downloadFileName = downloadFile.getName();
if (downloadFileName.endsWith(".zip")) {
extractFromZip(downloadFile, dataFile);
} else if (downloadFileName.endsWith(".gz")) {
extractFromGzip(downloadFile, dataFile);
}
}
private void extractFromZip(File downloadFile, File dataFile) throws IOException {
System.out.println("Extracting " + archiveEntryName + " from " + downloadFile + " to " + dataFile.getName());
try (ZipFile zf = new ZipFile(downloadFile)) {
ZipEntry entry = zf.getEntry(archiveEntryName);
try (InputStream in = zf.getInputStream(entry)) {
IOUtil.writeStream(in, dataFile);
}
}
}
private void extractFromGzip(File downloadFile, File dataFile) throws IOException {
System.out.println("Extracting " + downloadFile + " to " + dataFile.getName());
GZIPInputStream in = new GZIPInputStream(new FileInputStream(downloadFile));
IOUtil.writeStream(in, dataFile);
}
}