All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.semanticweb.owlapi.io.DocumentSources Maven / Gradle / Ivy

The newest version!
package org.semanticweb.owlapi.io;

import static org.semanticweb.owlapi.util.OWLAPIPreconditions.checkNotNull;
import static org.semanticweb.owlapi.util.OWLAPIPreconditions.emptyOptional;
import static org.semanticweb.owlapi.util.OWLAPIPreconditions.optional;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.HttpURLConnection;
import java.net.JarURLConnection;
import java.net.MalformedURLException;
import java.net.SocketTimeoutException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.HashSet;
import java.util.Locale;
import java.util.Optional;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import java.util.zip.Inflater;
import java.util.zip.InflaterInputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;

import javax.annotation.Nullable;

import org.apache.commons.io.ByteOrderMark;
import org.apache.commons.io.input.BOMInputStream;
import org.semanticweb.owlapi.model.IRI;
import org.semanticweb.owlapi.model.OWLOntologyLoaderConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.tukaani.xz.XZInputStream;

/**
 * Static methods from AbstractOWLParser. Mostly used by OWLOntologyDocumentSource implementations.
 */
public class DocumentSources {

    private static final Logger LOGGER = LoggerFactory.getLogger(DocumentSources.class);
    // File name suffixes used to recognise compressed documents.
    private static final String ZIP_FILE_EXTENSION = ".zip";
    private static final String GZ_FILE_EXTENSION = ".gz";
    private static final String XZ_FILE_EXTENSION = ".xz";
    private static final String CONTENT_DISPOSITION_HEADER = "Content-Disposition";
    // Matches a header value containing a double-quoted filename="..."; group 1 is the name
    // without the quotes.
    private static final Pattern CONTENT_DISPOSITION_FILE_NAME_PATTERN =
        Pattern.compile(".*filename=\"([^\\s;]*)\".*");
    private static final int CONTENT_DISPOSITION_FILE_NAME_PATTERN_GROUP = 1;
    // Matches entry names ending in owl, rdf, xml or mos. The alternatives must be grouped:
    // without the parentheses the leading ".*" binds to "owl" only, and a full match would
    // accept "rdf", "xml" and "mos" solely as complete names.
    private static final Pattern ZIP_ENTRY_ONTOLOGY_NAME_PATTERN =
        Pattern.compile(".*(owl|rdf|xml|mos)");
    // Value sent as Accept-Encoding when HTTP compression is enabled.
    private static final String ACCEPTABLE_CONTENT_ENCODING = "xz,gzip,deflate";
    // Low priority fallbacks appended to Accept headers that do not already mention them.
    private static final String TEXTPLAIN_REQUEST_TYPE = ", text/plain; q=0.1";
    private static final String LAST_REQUEST_TYPE = ", */*; q=0.09";
    // Accept header used when the document source does not provide its own.
    private static final String DEFAULT_REQUEST =
        "application/rdf+xml, application/xml; q=0.7, text/xml; q=0.6" + TEXTPLAIN_REQUEST_TYPE
            + LAST_REQUEST_TYPE;

    /** Static utility holder; not meant to be instantiated. */
    private DocumentSources() {}

    /**
     * Obtain a buffered {@link Reader} for an ontology source. If the source supplies its own
     * reader, that reader is buffered and returned and {@code encoding} is not used. Otherwise
     * the input stream selected by {@link #wrapInput} is stripped of any byte order mark and
     * decoded with {@code encoding}.
     *
     * @param source ontology source
     * @param configuration loader configuration to use if the input must be fetched from the
     *        document IRI
     * @param encoding character encoding used when a new Reader has to be created from a stream
     * @return a buffered Reader for the input
     * @throws OWLOntologyInputSourceException if no input can be obtained or an IO related
     *         exception is thrown
     */
    public static Reader wrapInputAsReader(OWLOntologyDocumentSource source,
        OWLOntologyLoaderConfiguration configuration, Charset encoding)
        throws OWLOntologyInputSourceException {
        // The type argument is required: with a raw Optional, get() yields Object and cannot be
        // handed to BufferedReader.
        Optional<Reader> reader = source.getReader();
        if (reader.isPresent()) {
            return new BufferedReader(reader.get());
        }
        InputStream withoutBom = wrap(wrapInput(source, configuration));
        return new BufferedReader(new InputStreamReader(withoutBom, encoding));
    }

    /**
     * Convenience overload of
     * {@link #wrapInputAsReader(OWLOntologyDocumentSource, OWLOntologyLoaderConfiguration, Charset)}
     * that uses UTF-8 as the character encoding.
     *
     * @param source ontology source
     * @param configuration loader configuration to use if the input must be fetched from the
     *        document IRI
     * @return a Reader for the input, as produced by the three argument overload
     * @throws OWLOntologyInputSourceException if an IO related exception is thrown.
     */
    public static Reader wrapInputAsReader(OWLOntologyDocumentSource source,
        OWLOntologyLoaderConfiguration configuration) throws OWLOntologyInputSourceException {
        final Charset defaultEncoding = StandardCharsets.UTF_8;
        return wrapInputAsReader(source, configuration, defaultEncoding);
    }

    /**
     * Select the available input source as an input stream. The input stream will be buffered.
     * <p>
     * The stream offered by the source is preferred. If there is none, and IRI resolution has not
     * already failed for this source, the document IRI is resolved: {@code jar:!} IRIs are looked
     * up as classpath resources relative to this class, other {@code jar:} IRIs are opened as jar
     * connections, and everything else is fetched through
     * {@link #getInputStream(IRI, OWLOntologyLoaderConfiguration, String)} with the source's
     * accept headers (or the default ones).
     *
     * @param source ontology source
     * @param configuration loader configuration to use if the input must be fetched from the
     *        document IRI
     * @return a buffered InputStream for the input, never {@code null}
     * @throws OWLOntologyInputSourceException if no input can be obtained or an IO related
     *         exception is thrown.
     */
    public static InputStream wrapInput(OWLOntologyDocumentSource source,
        OWLOntologyLoaderConfiguration configuration) throws OWLOntologyInputSourceException {
        Optional<InputStream> input = source.getInputStream();
        if (!input.isPresent() && !source.hasAlredyFailedOnIRIResolution()) {
            IRI documentIRI = source.getDocumentIRI();
            String namespace = documentIRI.getNamespace();
            if (namespace.startsWith("jar:")) {
                if (namespace.startsWith("jar:!")) {
                    // Drop the five character "jar:!" prefix; the rest is a classpath location.
                    String name = documentIRI.toString().substring(5);
                    if (!name.startsWith("/")) {
                        name = "/" + name;
                    }
                    InputStream resource = DocumentSources.class.getResourceAsStream(name);
                    if (resource == null) {
                        // getResourceAsStream signals a missing resource with null; report it
                        // through the documented exception instead of leaking null to callers.
                        source.setIRIResolutionFailed(true);
                        throw new OWLOntologyInputSourceException(
                            "Classpath resource not found: " + name);
                    }
                    return new BufferedInputStream(resource);
                }
                try {
                    return new BufferedInputStream(streamFromJar(documentIRI).getInputStream());
                } catch (IOException e) {
                    source.setIRIResolutionFailed(true);
                    throw new OWLParserException(e);
                }
            }

            Optional<String> headers = source.getAcceptHeaders();
            input = getInputStream(documentIRI, configuration, headers.orElse(DEFAULT_REQUEST));
        }
        if (input.isPresent()) {
            return new BufferedInputStream(input.get());
        }
        throw new OWLOntologyInputSourceException("No input reader can be found");
    }

    /**
     * Open a connection for a document IRI that designates a jar resource.
     *
     * @param documentIRI IRI whose string form is used as the URL to open
     * @return the opened connection, cast to {@link JarURLConnection}
     * @throws IOException if the connection cannot be opened
     * @throws MalformedURLException if the IRI string is not a valid URL
     */
    protected static JarURLConnection streamFromJar(IRI documentIRI)
        throws IOException, MalformedURLException {
        URL jarUrl = new URL(documentIRI.toString());
        URLConnection opened = jarUrl.openConnection();
        return (JarURLConnection) opened;
    }

    /**
     * A convenience method that obtains an input stream from a URI using the default accept
     * headers.
     *
     * @param documentIRI The URI from which the input stream should be returned
     * @param config the load configuration
     * @return The input stream obtained from the URI, or an empty Optional if none was produced
     * @throws OWLOntologyInputSourceException if there was an {@code IOException} in obtaining the
     *         input stream from the URI.
     * @deprecated use {@link #getInputStream(IRI, OWLOntologyLoaderConfiguration, String)} instead
     */
    @Deprecated
    public static Optional<InputStream> getInputStream(IRI documentIRI,
        OWLOntologyLoaderConfiguration config) throws OWLOntologyInputSourceException {
        return getInputStream(documentIRI, config, DEFAULT_REQUEST);
    }

    /**
     * A convenience method that obtains an input stream from a URI. The request is sent with the
     * given accept headers, extended with low priority {@code text/plain} and {@code *}{@code /*}
     * entries when they are not already mentioned. An {@code Authorization} header is added only
     * when the configuration carries a non-empty authorization value, and {@code Accept-Encoding}
     * only when the configuration accepts HTTP compression. Redirects are handled by
     * {@link #connect} and the stream is opened by {@link #connectWithFiveRetries}.
     *
     * @param documentIRI The URI from which the input stream should be returned
     * @param config the load configuration
     * @param acceptHeaders accept headers for the connection
     * @return The input stream obtained from the URI, or an empty Optional if none was produced
     * @throws OWLOntologyInputSourceException if there was an {@code IOException} in obtaining the
     *         input stream from the URI.
     */
    @SuppressWarnings("resource")
    public static Optional<InputStream> getInputStream(IRI documentIRI,
        OWLOntologyLoaderConfiguration config, String acceptHeaders)
        throws OWLOntologyInputSourceException {
        try {
            URL originalURL = documentIRI.toURI().toURL();
            URLConnection conn = originalURL.openConnection();
            String actualAcceptHeaders = acceptHeaders;
            if (!acceptHeaders.contains("text/plain")) {
                actualAcceptHeaders += TEXTPLAIN_REQUEST_TYPE;
            }
            if (!acceptHeaders.contains("*/*")) {
                actualAcceptHeaders += LAST_REQUEST_TYPE;
            }
            conn.addRequestProperty("Accept", actualAcceptHeaders);
            if (config.getAuthorizationValue() != null
                && !config.getAuthorizationValue().isEmpty()) {
                conn.setRequestProperty("Authorization", config.getAuthorizationValue());
            }
            if (config.isAcceptingHTTPCompression()) {
                conn.setRequestProperty("Accept-Encoding", ACCEPTABLE_CONTENT_ENCODING);
            }
            int connectionTimeout = config.getConnectionTimeout();
            conn.setConnectTimeout(connectionTimeout);
            // connect() may hand back a different connection if redirects were followed.
            conn = connect(config, conn, connectionTimeout, actualAcceptHeaders, new HashSet<>());
            String contentEncoding = conn.getContentEncoding();
            InputStream is = connectWithFiveRetries(documentIRI, config, conn, connectionTimeout,
                contentEncoding);
            if (is == null) {
                return emptyOptional();
            }
            return optional(is);
        } catch (IOException e) {
            throw new OWLOntologyInputSourceException(e);
        }
    }

    /**
     * Connect and, when the configuration allows it, follow HTTP redirects by hand (which also
     * covers redirects that switch to HTTPS). Connections that are not HTTP connections, or any
     * connection when redirects are disabled, are returned untouched and unconnected.
     *
     * @param config the load configuration
     * @param conn the connection to open
     * @param connectionTimeout connect timeout applied to connections created for redirects
     * @param acceptHeaders accept headers applied to connections created for redirects
     * @param visited redirect targets already followed; updated by this method and used to detect
     *        cycles
     * @return the connection for the final, non-redirecting location
     * @throws IOException if connecting fails, or a redirect response has no usable
     *         {@code Location} header
     * @throws IllegalStateException if a redirect target is seen twice
     */
    protected static URLConnection connect(OWLOntologyLoaderConfiguration config,
        URLConnection conn, int connectionTimeout, String acceptHeaders, Set<String> visited)
        throws IOException {
        if (!(conn instanceof HttpURLConnection) || !config.isFollowRedirects()) {
            return conn;
        }
        // follow redirects to HTTPS
        HttpURLConnection con = (HttpURLConnection) conn;
        con.connect();
        int responseCode = con.getResponseCode();
        boolean redirect = responseCode == HttpURLConnection.HTTP_MOVED_TEMP
            || responseCode == HttpURLConnection.HTTP_MOVED_PERM
            || responseCode == HttpURLConnection.HTTP_SEE_OTHER
            // no constants for temporary and permanent redirect in HttpURLConnection
            || responseCode == 307 || responseCode == 308;
        if (!redirect) {
            return conn;
        }
        String location = con.getHeaderField("Location");
        if (location == null) {
            throw new IOException("Redirect response " + responseCode + " from " + con.getURL()
                + " has no Location header");
        }
        // Location may be relative to the redirecting URL; an absolute value ignores the context.
        URL newURL = new URL(con.getURL(), location);
        if (!visited.add(newURL.toExternalForm())) {
            throw new IllegalStateException("Infinite loop: redirect cycle detected. " + visited);
        }
        return connect(config, rebuildConnection(config, connectionTimeout, newURL, acceptHeaders),
            connectionTimeout, acceptHeaders, visited);
    }

    /**
     * Create a fresh, not yet connected connection for a redirect target. Only the
     * {@code Accept} header, the optional {@code Accept-Encoding} header and the connect timeout
     * are applied; this method does not set an {@code Authorization} header.
     *
     * @param config the load configuration, consulted for HTTP compression support
     * @param connectionTimeout connect timeout for the new connection
     * @param newURL the redirect target
     * @param acceptHeaders value for the {@code Accept} header
     * @return the configured connection
     * @throws IOException if the connection cannot be opened
     */
    protected static URLConnection rebuildConnection(OWLOntologyLoaderConfiguration config,
        int connectionTimeout, URL newURL, String acceptHeaders) throws IOException {
        final URLConnection redirected = newURL.openConnection();
        redirected.addRequestProperty("Accept", acceptHeaders);
        final boolean compressionAccepted = config.isAcceptingHTTPCompression();
        if (compressionAccepted) {
            redirected.setRequestProperty("Accept-Encoding", ACCEPTABLE_CONTENT_ENCODING);
        }
        redirected.setConnectTimeout(connectionTimeout);
        return redirected;
    }

    /**
     * Open the content stream of a connection, retrying on socket timeouts. Despite the method
     * name, the number of attempts is taken from
     * {@link OWLOntologyLoaderConfiguration#getRetriesToAttempt()}; after each timeout the connect
     * timeout is increased by one more multiple of {@code connectionTimeout}.
     *
     * @param documentIRI IRI being loaded, used for file name detection and error messages
     * @param config the load configuration
     * @param conn the connection to read from
     * @param connectionTimeout base connect timeout
     * @param contentEncoding content encoding reported by the connection, may be null
     * @return the content stream, or {@code null} if the configured number of attempts is not
     *         positive
     * @throws IOException if opening the stream fails for a reason other than a timeout
     * @throws OWLOntologyInputSourceException if every attempt timed out
     */
    @Nullable
    protected static InputStream connectWithFiveRetries(IRI documentIRI,
        OWLOntologyLoaderConfiguration config, URLConnection conn, int connectionTimeout,
        String contentEncoding) throws IOException, OWLOntologyInputSourceException {
        InputStream is = null;
        int maxAttempts = config.getRetriesToAttempt();
        int count = 0;
        while (count < maxAttempts && is == null) {
            try {
                is = getInputStreamFromContentEncoding(documentIRI, conn, contentEncoding);
            } catch (SocketTimeoutException e) {
                count++;
                // Compare against the configured limit, not a fixed 5: otherwise a smaller limit
                // ends the loop silently with null and a larger one is cut short.
                if (count >= maxAttempts) {
                    throw new OWLOntologyInputSourceException(
                        "cannot connect to " + documentIRI + "; retry limit exhausted", e);
                }
                conn.setConnectTimeout(connectionTimeout + connectionTimeout * count);
            }
        }
        return is;
    }

    /**
     * Decorate an input stream so that a leading byte order mark is stripped. UTF-8, UTF-16
     * (big and little endian) and UTF-32 (big and little endian) marks are recognised.
     *
     * @param delegate delegate to wrap, must not be null
     * @return wrapped input stream
     */
    public static InputStream wrap(InputStream delegate) {
        checkNotNull(delegate, "delegate cannot be null");
        final ByteOrderMark[] recognisedMarks = {ByteOrderMark.UTF_8, ByteOrderMark.UTF_16BE,
            ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_32BE, ByteOrderMark.UTF_32LE};
        return new BOMInputStream(delegate, recognisedMarks);
    }

    /**
     * Tell whether a zip entry's name fully matches the ontology entry name pattern.
     *
     * @param zipEntry entry to inspect, may be null
     * @return true if the entry is not null and its name matches; false otherwise
     */
    private static boolean couldBeOntology(@Nullable ZipEntry zipEntry) {
        return zipEntry != null
            && ZIP_ENTRY_ONTOLOGY_NAME_PATTERN.matcher(zipEntry.getName()).matches();
    }

    /**
     * Open the content stream of a connection and undo compression. The file name comes from the
     * {@code Content-Disposition} header when it provides one, otherwise from the document IRI
     * (or is empty when the IRI is null). A recognised content encoding is decoded first; if
     * there is none, or it is not recognised, compression is inferred from the file name alone.
     * The result is always wrapped so that a leading byte order mark is stripped, whichever of
     * the two paths produced it.
     *
     * @param documentIRI IRI being loaded, may be null
     * @param conn connection to read from
     * @param contentEncoding content encoding reported by the connection, may be null
     * @return the decoded content stream
     * @throws IOException if the stream cannot be opened or a decompressor cannot be set up
     */
    private static InputStream getInputStreamFromContentEncoding(@Nullable IRI documentIRI,
        URLConnection conn, @Nullable String contentEncoding) throws IOException {
        String fileName = getFileNameFromContentDisposition(conn);
        if (fileName == null) {
            fileName = documentIRI == null ? "" : documentIRI.toString();
        }
        InputStream in = conn.getInputStream();
        InputStream decoded = null;
        if (contentEncoding != null) {
            decoded = handleKnownContentEncodings(contentEncoding, in, fileName);
        }
        if (decoded == null) {
            decoded = checkFileName(in, fileName);
        }
        // Strip the BOM on both paths; transport compression does not change the document.
        return wrap(decoded);
    }

    /**
     * Decompress a stream according to the extension of its file name. {@code .gz} and
     * {@code .xz} names yield a buffered decompressing stream. {@code .zip} names yield a zip
     * stream positioned on the first entry that is not a directory, so reading it returns that
     * entry's content. Any other name returns the input unchanged.
     *
     * @param in stream to inspect
     * @param fileName name used to detect the compression format
     * @return the stream to read the document from
     * @throws IOException if a decompressor cannot be initialised or the zip entries cannot be
     *         read
     */
    private static InputStream checkFileName(InputStream in, String fileName) throws IOException {
        if (isGzFileName(fileName)) {
            LOGGER.info("URL connection has no content encoding but name ends with .gz");
            return new BufferedInputStream(new GZIPInputStream(in));
        }
        if (isXzFileName(fileName)) {
            LOGGER.info("URL connection has no content encoding but name ends with .xz");
            return new BufferedInputStream(new XZInputStream(in));
        }
        if (isZipFileName(fileName)) {
            ZipInputStream zis = new ZipInputStream(in);
            // Directory entries hold no data, so skip them. Entries cannot be revisited on a
            // forward-only stream, which is why the first file entry is used rather than
            // searching by name: a search that found nothing would leave nothing to read.
            ZipEntry entry = zis.getNextEntry();
            while (entry != null && entry.isDirectory()) {
                entry = zis.getNextEntry();
            }
            return zis;
        }
        return in;
    }

    /**
     * Decode a stream according to an HTTP content encoding. {@code xz}, {@code gzip} (also
     * under its alias {@code x-gzip}) and {@code deflate} are recognised; names are compared
     * without regard to case and surrounding whitespace, as content coding names are
     * case-insensitive. After decoding, the file name is checked for a further compression
     * extension.
     *
     * @param contentEncoding value of the content encoding header
     * @param in stream to decode
     * @param fileName name used to detect compression inside the decoded content
     * @return the decoded stream, or {@code null} if the encoding is not one of the recognised
     *         ones
     * @throws IOException if a decompressor cannot be initialised
     */
    @Nullable
    protected static InputStream handleKnownContentEncodings(String contentEncoding, InputStream in,
        String fileName) throws IOException {
        if (contentEncoding == null) {
            return null;
        }
        String coding = contentEncoding.trim();
        if ("xz".equalsIgnoreCase(coding)) {
            LOGGER.info("URL connection input stream is compressed using xz");
            return new BufferedInputStream(checkFileName(new XZInputStream(in), fileName));
        }
        if ("gzip".equalsIgnoreCase(coding) || "x-gzip".equalsIgnoreCase(coding)) {
            LOGGER.info("URL connection input stream is compressed using gzip");
            return new BufferedInputStream(checkFileName(new GZIPInputStream(in), fileName));
        }
        if ("deflate".equalsIgnoreCase(coding)) {
            LOGGER.info("URL connection input stream is compressed using deflate");
            // Inflater(true) reads raw deflate data without the zlib header and checksum.
            return checkFileName(new InflaterInputStream(in, new Inflater(true)), fileName);
        }
        return null;
    }

    /**
     * Extract the file name announced by a connection's {@code Content-Disposition} header.
     *
     * @param connection connection whose header is inspected
     * @return the captured file name, or {@code null} if the header is absent or does not match
     *         the expected pattern
     */
    @Nullable
    private static String getFileNameFromContentDisposition(URLConnection connection) {
        String headerValue = connection.getHeaderField(CONTENT_DISPOSITION_HEADER);
        if (headerValue == null) {
            return null;
        }
        Matcher nameMatcher = CONTENT_DISPOSITION_FILE_NAME_PATTERN.matcher(headerValue);
        return nameMatcher.matches()
            ? nameMatcher.group(CONTENT_DISPOSITION_FILE_NAME_PATTERN_GROUP)
            : null;
    }

    /**
     * Tell whether a file name ends with the zip extension, ignoring case.
     *
     * @param fileName name to check
     * @return true if the name ends with the zip extension
     */
    private static boolean isZipFileName(String fileName) {
        // Locale.ROOT keeps the comparison independent of the default locale; under Turkish
        // rules "I" lowercases to a dotless i and ".ZIP" would no longer match.
        return fileName.toLowerCase(Locale.ROOT).endsWith(ZIP_FILE_EXTENSION);
    }

    /**
     * Tell whether a file name ends with the gzip extension, ignoring case.
     *
     * @param fileName name to check
     * @return true if the name ends with the gzip extension
     */
    private static boolean isGzFileName(String fileName) {
        // Locale.ROOT so that the result never depends on the JVM's default locale.
        return fileName.toLowerCase(Locale.ROOT).endsWith(GZ_FILE_EXTENSION);
    }

    /**
     * Tell whether a file name ends with the xz extension, ignoring case.
     *
     * @param fileName name to check
     * @return true if the name ends with the xz extension
     */
    private static boolean isXzFileName(String fileName) {
        // Locale.ROOT so that the result never depends on the JVM's default locale.
        return fileName.toLowerCase(Locale.ROOT).endsWith(XZ_FILE_EXTENSION);
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy