org.modeshape.jcr.api.text.TextExtractor Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of modeshape-jcr-api Show documentation
Show all versions of modeshape-jcr-api Show documentation
ModeShape JCR Repository API
/*
* ModeShape (http://www.modeshape.org)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.modeshape.jcr.api.text;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashSet;
import java.util.Set;
import javax.jcr.RepositoryException;
import org.modeshape.jcr.api.Binary;
import org.modeshape.jcr.api.Logger;
/**
 * An abstraction for components that are able to extract text content from an input stream.
 * <p>
 * Concrete extractors implement {@link #supportsMimeType(String)} to advertise what they can handle and
 * {@link #extractFrom(Binary, Output, Context)} to perform the extraction. The {@link #processStream(Binary, BinaryOperation)}
 * helper is available to subclasses that need to read the binary's stream and want it closed afterwards.
 * </p>
 */
public abstract class TextExtractor {

    /**
     * A logger instance; remains {@code null} until {@link #setLogger(Logger)} is called.
     */
    private Logger logger;

    /**
     * The name of this text extractor, which can be configured for monitoring purposes; set via reflection.
     */
    private String name;

    /**
     * The MIME types that are explicitly requested to be excluded; set via reflection.
     */
    private Set<String> excludedMimeTypes = new HashSet<String>();

    /**
     * The MIME types that are explicitly requested to be included; set via reflection.
     */
    private Set<String> includedMimeTypes = new HashSet<String>();

    /**
     * Determine if this extractor is capable of processing content with the supplied MIME type.
     *
     * @param mimeType the MIME type; never null
     * @return true if this extractor can process content with the supplied MIME type, or false otherwise.
     */
    public abstract boolean supportsMimeType( String mimeType );

    /**
     * Extract text from the given {@link Binary}, using the given output to record the results.
     *
     * @param binary the binary value that can be used in the extraction process; never null
     * @param output the output from the sequencing operation; never null
     * @param context the context for the sequencing operation; never null
     * @throws Exception if there is a problem during the extraction process
     */
    public abstract void extractFrom( Binary binary,
                                      TextExtractor.Output output,
                                      Context context ) throws Exception;

    /**
     * Allows subclasses to process the stream of binary value property in "safe" fashion, making sure the stream is closed at the
     * end of the operation.
     *
     * @param binary a {@link org.modeshape.jcr.api.Binary} who is expected to contain a non-null binary value.
     * @param operation a {@link org.modeshape.jcr.api.text.TextExtractor.BinaryOperation} which should work with the stream
     * @param <T> the return type of the binary operation
     * @return whatever type of result the stream operation returns
     * @throws IllegalArgumentException if the binary does not provide a stream
     * @throws Exception if there is an error processing the stream
     */
    protected final <T> T processStream( Binary binary,
                                         BinaryOperation<T> operation ) throws Exception {
        InputStream stream = binary.getStream();
        if (stream == null) {
            throw new IllegalArgumentException("The binary value is empty");
        }
        try {
            return operation.execute(stream);
        } finally {
            // Always release the stream, whether the operation completed normally or threw.
            stream.close();
        }
    }

    /**
     * Sets a logger instance.
     *
     * @param logger a {@link Logger}, never {@code null}
     * @throws IllegalArgumentException if {@code logger} is {@code null}
     */
    public final void setLogger( Logger logger ) {
        if (logger == null) {
            throw new IllegalArgumentException("Logger cannot be null");
        }
        this.logger = logger;
    }

    /**
     * Returns the logger previously supplied through {@link #setLogger(Logger)}.
     *
     * @return the {@link Logger}, or {@code null} if no logger has been set yet
     */
    protected final Logger logger() {
        return logger;
    }

    /**
     * Returns the MIME types that are explicitly requested to be excluded.
     *
     * @return the set held by this extractor (not a copy)
     */
    protected Set<String> getExcludedMimeTypes() {
        return excludedMimeTypes;
    }

    /**
     * Returns the MIME types that are explicitly requested to be included.
     *
     * @return the set held by this extractor (not a copy)
     */
    protected Set<String> getIncludedMimeTypes() {
        return includedMimeTypes;
    }

    /**
     * Returns the text extractor name.
     *
     * @return a {@link String}, possibly {@code null}
     */
    public String getName() {
        return name;
    }

    /**
     * Sets the text extractor name.
     *
     * @param name a {@link String}; expected to be non-{@code null}, although this method does not validate it
     */
    public void setName( String name ) {
        this.name = name;
    }

    /**
     * Interface which can be used by subclasses to process the input stream of a binary property.
     *
     * @param <T> the return type of the binary operation
     */
    protected interface BinaryOperation<T> {
        /**
         * Performs the operation against the supplied stream.
         *
         * @param stream the stream to read from; {@link TextExtractor#processStream(Binary, BinaryOperation)} never passes
         *        {@code null} and closes the stream once this method returns or throws
         * @return the result of the operation
         * @throws Exception if there is an error processing the stream
         */
        T execute( InputStream stream ) throws Exception;
    }

    /**
     * Interface which provides additional information to the text extractors, during the extraction operation.
     */
    public interface Context {
        /**
         * Determines the mime-type of the given binary value.
         *
         * @param name a symbolic name of the binary value (e.g. the name of a file); may be {@code null}
         * @param binaryValue a {@link Binary} instance which represents the binary data; may not be {@code null}
         * @return either a valid mime-type or {@code null} if it's not possible to determine the mimetype.
         * @throws RepositoryException if there are any problems accessing the binary value from the repository.
         * @throws IOException if there are any problems reading bytes from the binary value.
         */
        String mimeTypeOf( String name,
                           Binary binaryValue ) throws RepositoryException, IOException;
    }

    /**
     * The interface passed to a TextExtractor to which the extractor should record all text content.
     */
    public interface Output {
        /**
         * Record the text as being extracted. This method can be called multiple times during a single extract.
         *
         * @param text the text extracted from the content.
         */
        void recordText( String text );
    }
}