All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.modeshape.jcr.api.text.TextExtractor Maven / Gradle / Ivy

There is a newer version: 5.4.1.Final
Show newest version
/*
 * ModeShape (http://www.modeshape.org)
 * See the COPYRIGHT.txt file distributed with this work for information
 * regarding copyright ownership.  Some portions may be licensed
 * to Red Hat, Inc. under one or more contributor license agreements.
 * See the AUTHORS.txt file in the distribution for a full listing of 
 * individual contributors.
 *
 * ModeShape is free software. Unless otherwise indicated, all code in ModeShape
 * is licensed to you under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 * 
 * ModeShape is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this software; if not, write to the Free
 * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
 * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
 */
package org.modeshape.jcr.api.text;

import java.io.IOException;
import java.io.InputStream;
import java.util.HashSet;
import java.util.Set;
import javax.jcr.RepositoryException;
import org.modeshape.jcr.api.Binary;
import org.modeshape.jcr.api.Logger;

/**
 * An abstraction for components that are able to extract text content from a {@link Binary} value.
 * <p>
 * Subclasses declare which MIME types they handle via {@link #supportsMimeType(String)} and perform the actual work in
 * {@link #extractFrom(Binary, Output, Context)}, reporting any text they find to the supplied {@link Output}.
 * </p>
 */
public abstract class TextExtractor {

    /**
     * A logger instance; {@code null} until {@link #setLogger(Logger)} has been called.
     */
    private Logger logger;

    /**
     * The name of this text extractor, which can be configured for monitoring purposes; set via reflection.
     */
    private String name;

    /**
     * The MIME types that are explicitly requested to be excluded; set via reflection.
     */
    private Set<String> excludedMimeTypes = new HashSet<String>();

    /**
     * The MIME types that are explicitly requested to be included; set via reflection.
     */
    private Set<String> includedMimeTypes = new HashSet<String>();

    /**
     * Determine if this extractor is capable of processing content with the supplied MIME type.
     * 
     * @param mimeType the MIME type; never null
     * @return true if this extractor can process content with the supplied MIME type, or false otherwise.
     */
    public abstract boolean supportsMimeType( String mimeType );

    /**
     * Extract text from the given {@link Binary}, using the given output to record the results.
     * 
     * @param binary the binary value that can be used in the extraction process; never null
     * @param output the output from the sequencing operation; never null
     * @param context the context for the sequencing operation; never null
     * @throws Exception if there is a problem during the extraction process
     */
    public abstract void extractFrom( Binary binary,
                                      TextExtractor.Output output,
                                      Context context ) throws Exception;

    /**
     * Allows subclasses to process the stream of binary value property in "safe" fashion, making sure the stream is closed at the
     * end of the operation.
     * 
     * @param binary a {@link org.modeshape.jcr.api.Binary} who is expected to contain a non-null binary value.
     * @param operation a {@link org.modeshape.jcr.api.text.TextExtractor.BinaryOperation} which should work with the stream
     * @param <T> the return type of the binary operation
     * @return whatever type of result the stream operation returns
     * @throws IllegalArgumentException if the binary value does not supply a stream
     * @throws Exception if there is an error processing the stream
     */
    protected final <T> T processStream( Binary binary,
                                         BinaryOperation<T> operation ) throws Exception {
        InputStream stream = binary.getStream();
        if (stream == null) {
            throw new IllegalArgumentException("The binary value is empty");
        }

        try {
            return operation.execute(stream);
        } finally {
            // Always release the stream, whether the operation returned normally or threw.
            stream.close();
        }
    }

    /**
     * Sets a logger instance.
     *
     * @param logger a {@link Logger}, never {@code null}
     * @throws IllegalArgumentException if {@code logger} is {@code null}
     */
    public final void setLogger( Logger logger ) {
        if (logger == null) {
            throw new IllegalArgumentException("Logger cannot be null");
        }
        this.logger = logger;
    }

    /**
     * Returns the logger available to subclasses.
     *
     * @return the {@link Logger} supplied via {@link #setLogger(Logger)}, or {@code null} if none has been set yet
     */
    protected final Logger logger() {
        return logger;
    }

    /**
     * Returns the MIME types that were explicitly requested to be excluded.
     *
     * @return the internal (live, mutable) set of excluded MIME types; not a copy
     */
    protected Set<String> getExcludedMimeTypes() {
        return excludedMimeTypes;
    }

    /**
     * Returns the MIME types that were explicitly requested to be included.
     *
     * @return the internal (live, mutable) set of included MIME types; not a copy
     */
    protected Set<String> getIncludedMimeTypes() {
        return includedMimeTypes;
    }

    /**
     * Returns the text extractor name.
     *
     * @return a {@link String}, possibly {@code null}
     */
    public String getName() {
        return name;
    }

    /**
     * Sets the text extractor name.
     *
     * @param name a {@link String}; the value is stored as-is without validation, so a {@code null} argument results in
     *        {@link #getName()} returning {@code null}
     */
    public void setName( String name ) {
        this.name = name;
    }

    /**
     * Interface which can be used by subclasses to process the input stream of a binary property.
     * 
     * @param <T> the return type of the binary operation
     */
    protected interface BinaryOperation<T> {
        /**
         * Performs the operation against the supplied stream.
         *
         * @param stream the stream of the binary value; {@link TextExtractor#processStream(Binary, BinaryOperation)} never
         *        passes {@code null} and closes the stream itself once this method completes
         * @return the result of the operation
         * @throws Exception if there is an error processing the stream
         */
        T execute( InputStream stream ) throws Exception;
    }

    /**
     * Interface which provides additional information to the text extractors, during the extraction operation.
     */
    public interface Context {

        /**
         * Determines the mime-type of the given binary value.
         *
         * @param name a symbolic name of the binary value (e.g. the name of a file); may be {@code null}
         * @param binaryValue a {@link Binary} instance which represents the binary data; may not be {@code null}
         * @return either a valid mime-type or {@code null} if it's not possible to determine the mimetype.
         * @throws RepositoryException if there are any problems accessing the binary value from the repository.
         * @throws IOException if there are any problems reading bytes from the binary value.
         */
        String mimeTypeOf( String name,
                           Binary binaryValue ) throws RepositoryException, IOException;

    }

    /**
     * The interface passed to a TextExtractor to which the extractor should record all text content.
     */
    public interface Output {

        /**
         * Record the text as being extracted. This method can be called multiple times during a single extract.
         * 
         * @param text the text extracted from the content.
         */
        void recordText( String text );
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy