// org.apache.any23.Any23, from the apache-any23-core artifact (Maven / Gradle / Ivy):
// the core Any23 library implementation.
// Note: this listing is not the latest release; a newer version (2.7) is available.
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.any23;

import org.apache.any23.configuration.Configuration;
import org.apache.any23.configuration.DefaultConfiguration;
import org.apache.any23.extractor.ExtractionException;
import org.apache.any23.extractor.ExtractionParameters;
import org.apache.any23.extractor.ExtractorFactory;
import org.apache.any23.extractor.ExtractorGroup;
import org.apache.any23.extractor.ExtractorRegistryImpl;
import org.apache.any23.extractor.SingleDocumentExtraction;
import org.apache.any23.extractor.SingleDocumentExtractionReport;
import org.apache.any23.http.AcceptHeaderBuilder;
import org.apache.any23.http.DefaultHTTPClient;
import org.apache.any23.http.DefaultHTTPClientConfiguration;
import org.apache.any23.http.HTTPClient;
import org.apache.any23.mime.MIMEType;
import org.apache.any23.mime.MIMETypeDetector;
import org.apache.any23.mime.TikaMIMETypeDetector;
import org.apache.any23.mime.purifier.WhiteSpacesPurifier;
import org.apache.any23.source.DocumentSource;
import org.apache.any23.source.FileDocumentSource;
import org.apache.any23.source.HTTPDocumentSource;
import org.apache.any23.source.LocalCopyFactory;
import org.apache.any23.source.MemCopyFactory;
import org.apache.any23.source.StringDocumentSource;
import org.apache.any23.writer.TripleHandler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Locale;


/**
 * A facade with convenience methods for typical Any23 extraction
 * operations.
 *
 * @author Richard Cyganiak ([email protected])
 * @author Michele Mostarda ([email protected])
 */
public class Any23 {

    /**
     * Any23 core library version.
     * NOTE: there's also a version string in pom.xml, they should match.
     */
    public static final String VERSION = DefaultConfiguration.singleton().getPropertyOrFail("any23.core.version");

    /**
     * Default HTTP User Agent defined in default configuration.
     */
    public static final String DEFAULT_HTTP_CLIENT_USER_AGENT = DefaultConfiguration.singleton().getPropertyOrFail(
            "any23.http.user.agent.default"
    );

    protected static final Logger logger = LoggerFactory.getLogger(Any23.class);

    private final Configuration configuration;
    private final String        defaultUserAgent;

    private MIMETypeDetector mimeTypeDetector = new TikaMIMETypeDetector( new WhiteSpacesPurifier() );

    private HTTPClient httpClient = new DefaultHTTPClient();

    private boolean httpClientInitialized = false;

    private final ExtractorGroup factories;
    private LocalCopyFactory     streamCache;
    private String               userAgent;

    /**
     * Constructor that allows the specification of a
     * custom configuration and of a list of extractors.
     *
     * @param configuration configuration used to build the Any23 instance.
     * @param extractorGroup the group of extractors to be applied.
     */
    public Any23(Configuration configuration, ExtractorGroup extractorGroup) {
        if(configuration == null) throw new NullPointerException("configuration must be not null.");
        this.configuration = configuration;
        logger.debug( configuration.getConfigurationDump() );

        this.defaultUserAgent = configuration.getPropertyOrFail("any23.http.user.agent.default");

        this.factories = (extractorGroup == null)
                ? ExtractorRegistryImpl.getInstance().getExtractorGroup()
                : extractorGroup;
        setCacheFactory(new MemCopyFactory());
    }

    /**
     * Constructor that allows the specification of a list of extractors.
     *
     * @param extractorGroup the group of extractors to be applied.
     */
    public Any23(ExtractorGroup extractorGroup) {
        this(DefaultConfiguration.singleton(), extractorGroup);
    }

    /**
     * Constructor that allows the specification of a
     * custom configuration and of list of extractor names.
     *
     * @param configuration a {@link Configuration} object
     * @param extractorNames list of extractor's names.
     */
    public Any23(Configuration configuration, String... extractorNames) {
        this(
                configuration,
                extractorNames == null
                        ?
                null
                        :
                ExtractorRegistryImpl.getInstance().getExtractorGroup( Arrays.asList(extractorNames))
        );
    }

    /**
     * Constructor that allows the specification of a list of extractor names.
     *
     * @param extractorNames list of extractor's names.
     */
    public Any23(String... extractorNames) {
        this( DefaultConfiguration.singleton(), extractorNames );
    }

    /**
     * Constructor accepting {@link Configuration}.
     * @param configuration a {@link Configuration} object
     */
    public Any23(Configuration configuration) {
        this(configuration, (String[]) null);
    }

    /**
     * Constructor with default configuration.
     */
    public Any23() {
        this( DefaultConfiguration.singleton() );
    }

    /**
     * Sets the HTTP Header User Agent,
     * see RFC 2616-14.43.
     *
     * @param userAgent text describing the user agent.
     */
    public void setHTTPUserAgent(String userAgent) {
        if (httpClientInitialized) {
            throw new IllegalStateException("Cannot change HTTP configuration after client has been initialized");
        }
        if(userAgent == null) {
            userAgent = defaultUserAgent;
        }
        if(userAgent.trim().length() == 0) {
            throw new IllegalArgumentException( String.format("Invalid user agent: '%s'", userAgent) );
        }
        this.userAgent = userAgent;
    }

    /**
     * Returns the HTTP Header User Agent,
     * see RFC 2616-14.43.
     *
     * @return text describing the user agent.
     */
    public String getHTTPUserAgent() {
        return this.userAgent;
    }

    /**
     * Allows to set the {@link org.apache.any23.http.HTTPClient} implementation
     * used to retrieve contents. The default instance is {@link org.apache.any23.http.DefaultHTTPClient}.
     *
     * @param httpClient a valid client instance.
     * @throws IllegalStateException if invoked after client has been initialized.
     */
    public void setHTTPClient(HTTPClient httpClient) {
        if(httpClient == null) {
            throw new NullPointerException("httpClient cannot be null.");
        }
        if (httpClientInitialized) {
            throw new IllegalStateException("Cannot change HTTP configuration after client has been initialized");
        }
        this.httpClient = httpClient;
    }

    /**
     * Returns the current {@link org.apache.any23.http.HTTPClient} implementation.
     *
     * @return instance of HTTPClient.
     * @throws IOException if the HTTP client has not initialized.
     */
    public HTTPClient getHTTPClient() throws IOException {
        if (!httpClientInitialized) {
            if (userAgent == null) {
                throw new IOException("Must call " + Any23.class.getSimpleName() +
                        ".setHTTPUserAgent(String) before extracting from HTTP IRI");
            }
            httpClient.init( new DefaultHTTPClientConfiguration(this.getAcceptHeader()) );
            httpClientInitialized = true;
        }
        return httpClient;
    }

    /**
     * Allows to set a {@link org.apache.any23.source.LocalCopyFactory} instance.
     *
     * @param cache valid cache instance.
     */
    public void setCacheFactory(LocalCopyFactory cache) {
        if(cache == null) {
            throw new NullPointerException("cache cannot be null.");
        }
        this.streamCache = cache;
    }

    /**
     * Allows to set an instance of {@link org.apache.any23.mime.MIMETypeDetector}.
     *
     * @param detector a valid detector instance, if null all the detectors
     *        will be used.
     */
    public void setMIMETypeDetector(MIMETypeDetector detector) {
        this.mimeTypeDetector = detector;
    }

    /**
     * Returns the most appropriate {@link DocumentSource} for the givendocumentIRI.
     * N.B. documentIRI's should contain a protocol.
     * E.g. http:, https:, file:
     * 
     *
     * @param documentIRI the document IRI.
     * @return a new instance of DocumentSource.
     * @throws URISyntaxException if an error occurs while parsing the documentIRI as a IRI.
     * @throws IOException if an error occurs while initializing the internal {@link org.apache.any23.http.HTTPClient}.
     */
    public DocumentSource createDocumentSource(String documentIRI) throws URISyntaxException, IOException {
        if(documentIRI == null) throw new NullPointerException("documentIRI cannot be null.");
        if (documentIRI.toLowerCase().startsWith("file:")) {
            return new FileDocumentSource( new File(new URI(documentIRI)) );
        }
        if (documentIRI.toLowerCase().startsWith("http:") || documentIRI.toLowerCase().startsWith("https:")) {
            return new HTTPDocumentSource(getHTTPClient(), documentIRI);
        }
        throw new IllegalArgumentException(
                String.format("Unsupported protocol for document IRI: '%s' . "
                    + "Check that document IRI contains a protocol.", documentIRI)
        );
    }


    /**
     * Performs metadata extraction from the content of the given
     * in document source, sending the generated events
     * to the specified outputHandler.
     *
     * @param eps the extraction parameters to be applied.
     * @param in the input document source.
     * @param outputHandler handler responsible for collecting of the extracted metadata.
     * @param encoding explicit encoding see
     *        available encodings.
     * @return true if some extraction occurred, false otherwise.
     * @throws IOException if there is an error reading the {@link org.apache.any23.source.DocumentSource}
     * @throws org.apache.any23.extractor.ExtractionException if there is an error during extraction
     */
    public ExtractionReport extract(
            ExtractionParameters eps,
            DocumentSource in,
            TripleHandler outputHandler,
            String encoding
    ) throws IOException, ExtractionException {
        final SingleDocumentExtraction ex = new SingleDocumentExtraction(configuration, in, factories, outputHandler);
        ex.setMIMETypeDetector(mimeTypeDetector);
        ex.setLocalCopyFactory(streamCache);
        ex.setParserEncoding(encoding);
        final SingleDocumentExtractionReport sder = ex.run(eps);
        return new ExtractionReport(
                ex.getMatchingExtractors(),
                ex.getParserEncoding(),
                ex.getDetectedMIMEType(),
                sder.getValidationReport(),
                sder.getExtractorToIssues()
        );
    }

    /**
     * Performs metadata extraction on the in string
     * associated to the documentIRI IRI, declaring
     * contentType and encoding.
     * The generated events are sent to the specified outputHandler.
     *
     * @param in raw data to be analyzed.
     * @param documentIRI IRI from which the raw data has been extracted.
     * @param contentType declared data content type.
     * @param encoding declared data encoding.
     * @param outputHandler handler responsible for collecting of the extracted metadata.
     * @return true if some extraction occurred, false otherwise.
     * @throws IOException if there is an error reading the {@link org.apache.any23.source.DocumentSource}
     * @throws org.apache.any23.extractor.ExtractionException if there is an error during extraction
     */
    public ExtractionReport extract(
            String in,
            String documentIRI,
            String contentType,
            String encoding,
            TripleHandler outputHandler
    ) throws IOException, ExtractionException {
        return extract(new StringDocumentSource(in, documentIRI, contentType, encoding), outputHandler);
    }

    /**
     * Performs metadata extraction on the in string
     * associated to the documentIRI IRI, sending the generated
     * events to the specified outputHandler.
     *
     * @param in raw data to be analyzed.
     * @param documentIRI IRI from which the raw data has been extracted.
     * @param outputHandler handler responsible for collecting of the extracted metadata.
     * @return true if some extraction occurred, false otherwise.
     * @throws IOException if there is an error reading the {@link org.apache.any23.source.DocumentSource}
     * @throws org.apache.any23.extractor.ExtractionException if there is an error during extraction
     */
    public ExtractionReport extract(String in, String documentIRI, TripleHandler outputHandler)
    throws IOException, ExtractionException {
        return extract(new StringDocumentSource(in, documentIRI), outputHandler);
    }

    /**
     * Performs metadata extraction from the content of the given file
     * sending the generated events to the specified outputHandler.
     *
     * @param file file containing raw data.
     * @param outputHandler handler responsible for collecting of the extracted metadata.
     * @return true if some extraction occurred, false otherwise.
     * @throws IOException if there is an error reading the {@link org.apache.any23.source.DocumentSource}
     * @throws org.apache.any23.extractor.ExtractionException if there is an error during extraction
     */
    public ExtractionReport extract(File file, TripleHandler outputHandler)
    throws IOException, ExtractionException {
        return extract(new FileDocumentSource(file), outputHandler);
    }

    /**
     * Performs metadata extraction from the content of the given documentIRI
     * sending the generated events to the specified outputHandler.
     * If the IRI is replied with a redirect, the last will be followed.
     *
     * @param eps the parameters to be applied to the extraction.
     * @param documentIRI the IRI from which retrieve document.
     * @param outputHandler handler responsible for collecting of the extracted metadata.
     * @return true if some extraction occurred, false otherwise.
     * @throws IOException if there is an error reading the {@link org.apache.any23.source.DocumentSource}
     * @throws org.apache.any23.extractor.ExtractionException if there is an error during extraction
     */
    public ExtractionReport extract(ExtractionParameters eps, String documentIRI, TripleHandler outputHandler)
    throws IOException, ExtractionException {
        try {
            return extract(eps, createDocumentSource(documentIRI), outputHandler);
        } catch (URISyntaxException ex) {
            throw new ExtractionException("Error while extracting data from document IRI.", ex);
        }
    }

    /**
     * Performs metadata extraction from the content of the given documentIRI
     * sending the generated events to the specified outputHandler.
     * If the IRI is replied with a redirect, the last will be followed.
     *
     * @param documentIRI the IRI from which retrieve document.
     * @param outputHandler handler responsible for collecting of the extracted metadata.
     * @return true if some extraction occurred, false otherwise.
     * @throws IOException if there is an error reading the {@link org.apache.any23.source.DocumentSource}
     * @throws org.apache.any23.extractor.ExtractionException if there is an error during extraction
     */
    public ExtractionReport extract(String documentIRI, TripleHandler outputHandler)
    throws IOException, ExtractionException {
        return extract((ExtractionParameters) null, documentIRI, outputHandler);
    }

    /**
     * Performs metadata extraction from the content of the given
     * in document source, sending the generated events
     * to the specified outputHandler.
     *
     * @param in the input document source.
     * @param outputHandler handler responsible for collecting of the extracted metadata.
     * @param encoding explicit encoding see
     *        available encodings.
     * @return true if some extraction occurred, false otherwise.
     * @throws IOException if there is an error reading the {@link org.apache.any23.source.DocumentSource}
     * @throws org.apache.any23.extractor.ExtractionException if there is an error during extraction
     */
    public ExtractionReport extract(DocumentSource in, TripleHandler outputHandler, String encoding)
    throws IOException, ExtractionException {
        return extract(null, in, outputHandler, encoding);
    }

    /**
     * Performs metadata extraction from the content of the given
     * in document source, sending the generated events
     * to the specified outputHandler.
     *
     * @param in the input document source.
     * @param outputHandler handler responsible for collecting of the extracted metadata.
     * @return true if some extraction occurred, false otherwise.
     * @throws IOException if there is an error reading the {@link org.apache.any23.source.DocumentSource}
     * @throws org.apache.any23.extractor.ExtractionException if there is an error during extraction
     */
    public ExtractionReport extract(DocumentSource in, TripleHandler outputHandler)
    throws IOException, ExtractionException {
        return extract(null, in, outputHandler, null);
    }

    /**
     * Performs metadata extraction from the content of the given
     * in document source, sending the generated events
     * to the specified outputHandler.
     *
     * @param eps the parameters to be applied for the extraction phase.
     * @param in the input document source.
     * @param outputHandler handler responsible for collecting of the extracted metadata.
     * @return true if some extraction occurred, false otherwise.
     * @throws IOException if there is an error reading the {@link org.apache.any23.source.DocumentSource}
     * @throws org.apache.any23.extractor.ExtractionException if there is an error during extraction
     */
    public ExtractionReport extract(ExtractionParameters eps, DocumentSource in, TripleHandler outputHandler)
    throws IOException, ExtractionException {
        return extract(eps, in, outputHandler, null);
    }

    private String getAcceptHeader() {
        Collection mimeTypes = new ArrayList();
        for (ExtractorFactory factory : factories) {
            mimeTypes.addAll(factory.getSupportedMIMETypes());
        }
        return new AcceptHeaderBuilder(mimeTypes).getAcceptHeader();
    }
    
}