// org.apache.any23.Any23 Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.any23;
import org.apache.any23.configuration.Configuration;
import org.apache.any23.configuration.DefaultConfiguration;
import org.apache.any23.extractor.ExtractionException;
import org.apache.any23.extractor.ExtractionParameters;
import org.apache.any23.extractor.ExtractorFactory;
import org.apache.any23.extractor.ExtractorGroup;
import org.apache.any23.extractor.ExtractorRegistry;
import org.apache.any23.extractor.SingleDocumentExtraction;
import org.apache.any23.extractor.SingleDocumentExtractionReport;
import org.apache.any23.http.AcceptHeaderBuilder;
import org.apache.any23.http.DefaultHTTPClient;
import org.apache.any23.http.DefaultHTTPClientConfiguration;
import org.apache.any23.http.HTTPClient;
import org.apache.any23.mime.MIMEType;
import org.apache.any23.mime.MIMETypeDetector;
import org.apache.any23.mime.TikaMIMETypeDetector;
import org.apache.any23.mime.purifier.WhiteSpacesPurifier;
import org.apache.any23.source.DocumentSource;
import org.apache.any23.source.FileDocumentSource;
import org.apache.any23.source.HTTPDocumentSource;
import org.apache.any23.source.LocalCopyFactory;
import org.apache.any23.source.MemCopyFactory;
import org.apache.any23.source.StringDocumentSource;
import org.apache.any23.writer.TripleHandler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Locale;
/**
* A facade with convenience methods for typical Any23 extraction
* operations.
*
* @author Richard Cyganiak ([email protected])
* @author Michele Mostarda ([email protected])
*/
public class Any23 {
/**
* Any23 core library version.
* NOTE: there's also a version string in pom.xml, they should match.
*/
public static final String VERSION = DefaultConfiguration.singleton().getPropertyOrFail("any23.core.version");
/**
* Default HTTP User Agent defined in default configuration.
*/
public static final String DEFAULT_HTTP_CLIENT_USER_AGENT = DefaultConfiguration.singleton().getPropertyOrFail(
"any23.http.user.agent.default"
);
protected static final Logger logger = LoggerFactory.getLogger(Any23.class);
private final Configuration configuration;
private final String defaultUserAgent;
private MIMETypeDetector mimeTypeDetector = new TikaMIMETypeDetector( new WhiteSpacesPurifier() );
private HTTPClient httpClient = new DefaultHTTPClient();
private boolean httpClientInitialized = false;
private final ExtractorGroup factories;
private LocalCopyFactory streamCache;
private String userAgent;
/**
* Constructor that allows the specification of a
* custom configuration and of a list of extractors.
*
* @param configuration configuration used to build the Any23 instance.
* @param extractorGroup the group of extractors to be applied.
*/
public Any23(Configuration configuration, ExtractorGroup extractorGroup) {
if(configuration == null) throw new NullPointerException("configuration must be not null.");
this.configuration = configuration;
logger.info( configuration.getConfigurationDump() );
this.defaultUserAgent = configuration.getPropertyOrFail("any23.http.user.agent.default");
this.factories = (extractorGroup == null)
? ExtractorRegistry.getInstance().getExtractorGroup()
: extractorGroup;
setCacheFactory(new MemCopyFactory());
}
/**
* Constructor that allows the specification of a list of extractors.
*
* @param extractorGroup the group of extractors to be applied.
*/
public Any23(ExtractorGroup extractorGroup) {
this(DefaultConfiguration.singleton(), extractorGroup);
}
/**
* Constructor that allows the specification of a
* custom configuration and of list of extractor names.
*
* @param extractorNames list of extractor's names.
*/
public Any23(Configuration configuration, String... extractorNames) {
this(
configuration,
extractorNames == null
?
null
:
ExtractorRegistry.getInstance().getExtractorGroup( Arrays.asList(extractorNames))
);
}
/**
* Constructor that allows the specification of a list of extractor names.
*
* @param extractorNames list of extractor's names.
*/
public Any23(String... extractorNames) {
this( DefaultConfiguration.singleton(), extractorNames );
}
/**
* Constructor accepting {@link Configuration}.
*/
public Any23(Configuration configuration) {
this(configuration, (String[]) null);
}
/**
* Constructor with default configuration.
*/
public Any23() {
this( DefaultConfiguration.singleton() );
}
/**
* Sets the HTTP Header User Agent,
* see RFC 2616-14.43.
*
* @param userAgent text describing the user agent.
*/
public void setHTTPUserAgent(String userAgent) {
if (httpClientInitialized) {
throw new IllegalStateException("Cannot change HTTP configuration after client has been initialized");
}
if(userAgent == null) {
userAgent = defaultUserAgent;
}
if(userAgent.trim().length() == 0) {
throw new IllegalArgumentException( String.format("Invalid user agent: '%s'", userAgent) );
}
this.userAgent = userAgent;
}
/**
* Returns the HTTP Header User Agent,
* see RFC 2616-14.43.
*
* @return text describing the user agent.
*/
public String getHTTPUserAgent() {
return this.userAgent;
}
/**
* Allows to set the {@link org.apache.any23.http.HTTPClient} implementation
* used to retrieve contents. The default instance is {@link org.apache.any23.http.DefaultHTTPClient}.
*
* @param httpClient a valid client instance.
* @throws IllegalStateException if invoked after client has been initialized.
*/
public void setHTTPClient(HTTPClient httpClient) {
if(httpClient == null) {
throw new NullPointerException("httpClient cannot be null.");
}
if (httpClientInitialized) {
throw new IllegalStateException("Cannot change HTTP configuration after client has been initialized");
}
this.httpClient = httpClient;
}
/**
* Returns the current {@link org.apache.any23.http.HTTPClient} implementation.
*
* @return instance of HTTPClient.
* @throws IOException if the HTTP client has not initialized.
*/
public HTTPClient getHTTPClient() throws IOException {
if (!httpClientInitialized) {
if (userAgent == null) {
throw new IOException("Must call " + Any23.class.getSimpleName() +
".setHTTPUserAgent(String) before extracting from HTTP URI");
}
httpClient.init( new DefaultHTTPClientConfiguration(this.getAcceptHeader()) );
httpClientInitialized = true;
}
return httpClient;
}
/**
* Allows to set a {@link org.apache.any23.source.LocalCopyFactory} instance.
*
* @param cache valid cache instance.
*/
public void setCacheFactory(LocalCopyFactory cache) {
if(cache == null) {
throw new NullPointerException("cache cannot be null.");
}
this.streamCache = cache;
}
/**
* Allows to set an instance of {@link org.apache.any23.mime.MIMETypeDetector}.
*
* @param detector a valid detector instance, if null
all the detectors
* will be used.
*/
public void setMIMETypeDetector(MIMETypeDetector detector) {
this.mimeTypeDetector = detector;
}
/**
* Returns the most appropriate {@link DocumentSource} for the givendocumentURI
.
*
* @param documentURI the document URI.
* @return a new instance of DocumentSource.
* @throws URISyntaxException if an error occurs while parsing the documentURI
as a URI.
* @throws IOException if an error occurs while initializing the internal {@link org.apache.any23.http.HTTPClient}.
*/
public DocumentSource createDocumentSource(String documentURI) throws URISyntaxException, IOException {
if(documentURI == null) throw new NullPointerException("documentURI cannot be null.");
if (documentURI.toLowerCase().startsWith("file:")) {
return new FileDocumentSource( new File(new URI(documentURI)) );
}
if (documentURI.toLowerCase().startsWith("http:") || documentURI.toLowerCase().startsWith("https:")) {
return new HTTPDocumentSource(getHTTPClient(), documentURI);
}
throw new IllegalArgumentException(
String.format("Unsupported protocol for document URI: '%s' .", documentURI)
);
}
/**
* Performs metadata extraction from the content of the given
* in
document source, sending the generated events
* to the specified outputHandler
.
*
* @param eps the extraction parameters to be applied.
* @param in the input document source.
* @param outputHandler handler responsible for collecting of the extracted metadata.
* @param encoding explicit encoding see
* available encodings.
* @return true
if some extraction occurred, false
otherwise.
* @throws IOException
* @throws org.apache.any23.extractor.ExtractionException
*/
public ExtractionReport extract(
ExtractionParameters eps,
DocumentSource in,
TripleHandler outputHandler,
String encoding
) throws IOException, ExtractionException {
final SingleDocumentExtraction ex = new SingleDocumentExtraction(configuration, in, factories, outputHandler);
ex.setMIMETypeDetector(mimeTypeDetector);
ex.setLocalCopyFactory(streamCache);
ex.setParserEncoding(encoding);
final SingleDocumentExtractionReport sder = ex.run(eps);
return new ExtractionReport(
ex.getMatchingExtractors(),
ex.getParserEncoding(),
ex.getDetectedMIMEType(),
sder.getValidationReport(),
sder.getExtractorToIssues()
);
}
/**
* Performs metadata extraction on the in
string
* associated to the documentURI
URI, declaring
* contentType
and encoding
.
* The generated events are sent to the specified outputHandler
.
*
* @param in raw data to be analyzed.
* @param documentURI URI from which the raw data has been extracted.
* @param contentType declared data content type.
* @param encoding declared data encoding.
* @param outputHandler handler responsible for collecting of the extracted metadata.
* @return true
if some extraction occurred, false
otherwise.
* @throws IOException
* @throws ExtractionException
*/
public ExtractionReport extract(
String in,
String documentURI,
String contentType,
String encoding,
TripleHandler outputHandler
) throws IOException, ExtractionException {
return extract(new StringDocumentSource(in, documentURI, contentType, encoding), outputHandler);
}
/**
* Performs metadata extraction on the in
string
* associated to the documentURI
URI, sending the generated
* events to the specified outputHandler
.
*
* @param in raw data to be analyzed.
* @param documentURI URI from which the raw data has been extracted.
* @param outputHandler handler responsible for collecting of the extracted metadata.
* @return true
if some extraction occurred, false
otherwise.
* @throws IOException
* @throws ExtractionException
*/
public ExtractionReport extract(String in, String documentURI, TripleHandler outputHandler)
throws IOException, ExtractionException {
return extract(new StringDocumentSource(in, documentURI), outputHandler);
}
/**
* Performs metadata extraction from the content of the given file
* sending the generated events to the specified outputHandler
.
*
* @param file file containing raw data.
* @param outputHandler handler responsible for collecting of the extracted metadata.
* @return true
if some extraction occurred, false
otherwise.
* @throws IOException
* @throws ExtractionException
*/
public ExtractionReport extract(File file, TripleHandler outputHandler)
throws IOException, ExtractionException {
return extract(new FileDocumentSource(file), outputHandler);
}
/**
* Performs metadata extraction from the content of the given documentURI
* sending the generated events to the specified outputHandler
.
* If the URI is replied with a redirect, the last will be followed.
*
* @param eps the parameters to be applied to the extraction.
* @param documentURI the URI from which retrieve document.
* @param outputHandler handler responsible for collecting of the extracted metadata.
* @return true
if some extraction occurred, false
otherwise.
* @throws IOException
* @throws ExtractionException
*/
public ExtractionReport extract(ExtractionParameters eps, String documentURI, TripleHandler outputHandler)
throws IOException, ExtractionException {
try {
return extract(eps, createDocumentSource(documentURI), outputHandler);
} catch (URISyntaxException ex) {
throw new ExtractionException("Error while extracting data from document URI.", ex);
}
}
/**
* Performs metadata extraction from the content of the given documentURI
* sending the generated events to the specified outputHandler
.
* If the URI is replied with a redirect, the last will be followed.
*
* @param documentURI the URI from which retrieve document.
* @param outputHandler handler responsible for collecting of the extracted metadata.
* @return true
if some extraction occurred, false
otherwise.
* @throws IOException
* @throws ExtractionException
*/
public ExtractionReport extract(String documentURI, TripleHandler outputHandler)
throws IOException, ExtractionException {
return extract((ExtractionParameters) null, documentURI, outputHandler);
}
/**
* Performs metadata extraction from the content of the given
* in
document source, sending the generated events
* to the specified outputHandler
.
*
* @param in the input document source.
* @param outputHandler handler responsible for collecting of the extracted metadata.
* @param encoding explicit encoding see
* available encodings.
* @return true
if some extraction occurred, false
otherwise.
* @throws IOException
* @throws ExtractionException
*/
public ExtractionReport extract(DocumentSource in, TripleHandler outputHandler, String encoding)
throws IOException, ExtractionException {
return extract(null, in, outputHandler, encoding);
}
/**
* Performs metadata extraction from the content of the given
* in
document source, sending the generated events
* to the specified outputHandler
.
*
* @param in the input document source.
* @param outputHandler handler responsible for collecting of the extracted metadata.
* @return true
if some extraction occurred, false
otherwise.
* @throws IOException
* @throws ExtractionException
*/
public ExtractionReport extract(DocumentSource in, TripleHandler outputHandler)
throws IOException, ExtractionException {
return extract(null, in, outputHandler, null);
}
/**
* Performs metadata extraction from the content of the given
* in
document source, sending the generated events
* to the specified outputHandler
.
*
* @param eps the parameters to be applied for the extraction phase.
* @param in the input document source.
* @param outputHandler handler responsible for collecting of the extracted metadata.
* @return true
if some extraction occurred, false
otherwise.
* @throws IOException
* @throws ExtractionException
*/
public ExtractionReport extract(ExtractionParameters eps, DocumentSource in, TripleHandler outputHandler)
throws IOException, ExtractionException {
return extract(eps, in, outputHandler, null);
}
private String getAcceptHeader() {
Collection mimeTypes = new ArrayList();
for (ExtractorFactory> factory : factories) {
mimeTypes.addAll(factory.getSupportedMIMETypes());
}
return new AcceptHeaderBuilder(mimeTypes).getAcceptHeader();
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy