All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.eclipse.rdf4j.benchmark.rio.RDFTestDataset Maven / Gradle / Ivy

The newest version!
/*******************************************************************************
 * Copyright (c) 2021 Eclipse RDF4J contributors.
 *
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Distribution License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/org/documents/edl-v10.php.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 *******************************************************************************/

package org.eclipse.rdf4j.benchmark.rio;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UncheckedIOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.util.zip.GZIPInputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;

import org.eclipse.rdf4j.common.io.IOUtil;

/**
 * This enum holds locations for RDF files from the web. It allows downloading these into a temporary location in
 * {@code {java.io.tmpdir}/rdf4j-benchmark-datasets/{filename}}. Please note that some of these datasets are very
 * large (up to 25 GB extracted).
 *
 * @author Frens Jan Rumph
 */
public enum RDFTestDataset {

	/**
	 * <p>
	 * A dump of the Berlin SPARQL Benchmark made available by the HOBBIT project.
	 * </p>
	 * <p>
	 * 25 GB extracted.
	 * </p>
	 */
	BSBM("bsbm.nt", "dataset.nt",
			"https://hobbitdata.informatik.uni-leipzig.de/benchmarks-data/datasets-dumps/bsbm-dump.zip"),

	/**
	 * <p>
	 * A dump of the FishMark benchmark made available by the HOBBIT project.
	 * </p>
	 * <p>
	 * 1.8 GB extracted
	 * </p>
	 */
	FISHMARK("fishmark.nt", "fishmark-size1.nt",
			"https://hobbitdata.informatik.uni-leipzig.de/benchmarks-data/datasets-dumps/fishmark-dump.zip"),

	/**
	 * <p>
	 * A dump of the SP2Bench benchmark made available by the HOBBIT project.
	 * </p>
	 * <p>
	 * 2.7 GB extracted
	 * </p>
	 */
	SP2BENCH("sp2b.n3", "sp2b.n3",
			"https://hobbitdata.informatik.uni-leipzig.de/benchmarks-data/datasets-dumps/sp2bench-dump.zip"),

	/**
	 * <p>
	 * A dump of the Semantic Web Dog Food dataset made available by the HOBBIT project.
	 * </p>
	 * <p>
	 * 49 MB extracted
	 * </p>
	 */
	SWDF("swdf.nt", "SWDF.nt",
			"https://hobbitdata.informatik.uni-leipzig.de/benchmarks-data/datasets-dumps/swdf-dump.zip"),

	/**
	 * <p>
	 * A dump of the gene database of National Center for Biotechnology Information made available by the Bio2RDF
	 * project.
	 * </p>
	 * <p>
	 * 5.6 GB extracted
	 * </p>
	 */
	GENE2GO("gene2go.nq", "https://download.bio2rdf.org/files/release/3/ncbigene/gene2go.nq.gz"),

	/**
	 * <p>
	 * A dump of the Lexvo.org data.
	 * </p>
	 * <p>
	 * 67 MB extracted
	 * </p>
	 */
	LEXVO("lexvo_latest.rdf", "http://www.lexvo.org/resources/lexvo_latest.rdf.gz"),

	/**
	 * <p>
	 * A Data Catalogue Vocabulary file from Federal Public Service Policy and Support DG Digital Transformation's
	 * github repository.
	 * </p>
	 * <p>
	 * 139 MB extracted
	 * </p>
	 */
	DATAGOVBE("datagovbe.nt", "https://github.com/Fedict/dcat/raw/master/all/datagovbe.nt.gz");

	/**
	 * Suffix of the scratch files that downloads and extractions are written to before being moved to their final
	 * name.
	 */
	private static final String PARTIAL_SUFFIX = ".part";

	/**
	 * Name of the target file.
	 */
	private final String fileName;

	/**
	 * Name of the file to extract from the archive; {@code null} if the file is not part of a multi file archive (e.g.
	 * ZIP file).
	 */
	private final String archiveEntryName;

	/**
	 * The {@link URL} to download the file from.
	 */
	private final URL url;

	/**
	 * Declares a dataset that is not packaged in a multi file archive.
	 *
	 * @param fileName name of the target file
	 * @param url      location to download the dataset from
	 */
	RDFTestDataset(String fileName, String url) {
		this(fileName, null, url);
	}

	/**
	 * Declares a dataset that has to be extracted from an entry of a multi file archive.
	 *
	 * @param fileName         name of the target file
	 * @param archiveEntryName name of the archive entry to extract; may be {@code null}
	 * @param url              location to download the dataset from
	 * @throws IllegalArgumentException if {@code url} cannot be parsed as a {@link URL}
	 */
	RDFTestDataset(String fileName, String archiveEntryName, String url) {
		this.archiveEntryName = archiveEntryName;
		this.fileName = fileName;
		try {
			this.url = new URL(url);
		} catch (MalformedURLException e) {
			throw new IllegalArgumentException("Statically defined URL " + url + " is unexpectedly malformed", e);
		}
	}

	/**
	 * Download the dataset file to {@code {java.io.tmpdir}/rdf4j-benchmark-datasets/{fileName}}.
	 * <p>
	 * If the target file already exists it is returned as is. Downloads and extractions are first written to a
	 * scratch file and only moved to their final name once they completed, so that an interrupted run does not leave
	 * a truncated file behind under the final name.
	 *
	 * @return The {@link File} to which the dataset was downloaded and extracted.
	 * @throws UncheckedIOException if downloading or extracting the dataset fails
	 */
	public File download() {
		File downloadDir = new File(System.getProperty("java.io.tmpdir"), "rdf4j-benchmark-datasets");
		File dataFile = new File(downloadDir, fileName);

		if (dataFile.exists()) {
			return dataFile;
		}

		try {
			Files.createDirectories(downloadDir.toPath());

			// the local name of the download is the last path segment of the URL
			File downloadFile = new File(downloadDir, Paths.get(url.getPath()).getFileName().toString());
			if (!downloadFile.exists()) {
				downloadTo(downloadFile);
			}

			if (!downloadFile.equals(dataFile)) {
				extract(downloadFile, dataFile);
				// best-effort cleanup: a left-over archive only wastes disk space
				downloadFile.delete();
			}
		} catch (IOException e) {
			throw new UncheckedIOException(e);
		}

		return dataFile;
	}

	/**
	 * Downloads {@link #url} to the given file.
	 *
	 * @param downloadFile the file to store the downloaded content in
	 * @throws IOException if the transfer fails
	 */
	private void downloadTo(File downloadFile) throws IOException {
		System.out.println("Downloading " + url);
		File partialFile = partialFileFor(downloadFile);
		try (InputStream is = new BufferedInputStream(url.openStream())) {
			IOUtil.transfer(is, partialFile);
		}
		moveIntoPlace(partialFile, downloadFile);
	}

	/**
	 * Extracts the dataset from the downloaded archive, choosing the archive format by file extension.
	 *
	 * @param downloadFile the downloaded archive
	 * @param dataFile     the file to extract the dataset to
	 * @throws IOException if the archive format is not supported or the extraction fails
	 */
	private void extract(File downloadFile, File dataFile) throws IOException {
		String downloadFileName = downloadFile.getName();
		if (downloadFileName.endsWith(".zip")) {
			extractFromZip(downloadFile, dataFile);
		} else if (downloadFileName.endsWith(".gz")) {
			extractFromGzip(downloadFile, dataFile);
		} else {
			// fail loudly: otherwise the caller would delete the download and return a non-existing data file
			throw new IOException(
					"Unsupported archive type, cannot extract " + dataFile.getName() + " from " + downloadFile);
		}
	}

	/**
	 * Extracts the entry named {@link #archiveEntryName} from a ZIP archive.
	 *
	 * @param downloadFile the downloaded ZIP archive
	 * @param dataFile     the file to extract the entry to
	 * @throws IOException if no entry name is configured, the entry is missing or the extraction fails
	 */
	private void extractFromZip(File downloadFile, File dataFile) throws IOException {
		if (archiveEntryName == null) {
			throw new IOException("No archive entry name configured for " + name() + ", cannot extract from "
					+ downloadFile);
		}

		System.out.println("Extracting " + archiveEntryName + " from " + downloadFile + " to " + dataFile.getName());

		File partialFile = partialFileFor(dataFile);
		try (ZipFile zf = new ZipFile(downloadFile)) {
			ZipEntry entry = zf.getEntry(archiveEntryName);
			if (entry == null) {
				throw new IOException("Entry " + archiveEntryName + " not found in " + downloadFile);
			}
			try (InputStream in = zf.getInputStream(entry)) {
				IOUtil.writeStream(in, partialFile);
			}
		}
		moveIntoPlace(partialFile, dataFile);
	}

	/**
	 * Decompresses a GZIP compressed file.
	 *
	 * @param downloadFile the downloaded GZIP file
	 * @param dataFile     the file to write the decompressed content to
	 * @throws IOException if the decompression fails
	 */
	private void extractFromGzip(File downloadFile, File dataFile) throws IOException {
		System.out.println("Extracting " + downloadFile + " to " + dataFile.getName());

		File partialFile = partialFileFor(dataFile);
		// two resources so that the file stream is closed even if the GZIP header cannot be read
		try (InputStream fileIn = new FileInputStream(downloadFile);
				InputStream in = new GZIPInputStream(fileIn)) {
			IOUtil.writeStream(in, partialFile);
		}
		moveIntoPlace(partialFile, dataFile);
	}

	/**
	 * Returns the scratch file, located next to the given target, that content is written to before the target
	 * itself is created.
	 */
	private static File partialFileFor(File target) {
		return new File(target.getParentFile(), target.getName() + PARTIAL_SUFFIX);
	}

	/**
	 * Moves a completely written scratch file to its final name, replacing the target if it already exists.
	 */
	private static void moveIntoPlace(File partialFile, File target) throws IOException {
		Files.move(partialFile.toPath(), target.toPath(), StandardCopyOption.REPLACE_EXISTING);
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy