// gov.sandia.cognition.text.document.extractor.DocumentExtractor Maven / Gradle / Ivy
//
// Go to download
// Show more of this group Show more artifacts with this name
// Show all versions of cognitive-foundry Show documentation
// A single jar with all the Cognitive Foundry components.
// There is a newer version: 4.0.1
/*
 * File:                DocumentExtractor.java
 * Authors:             Justin Basilico
 * Company:             Sandia National Laboratories
 * Project:             Cognitive Foundry
 * 
 * Copyright January 19, 2009, Sandia Corporation.
 * Under the terms of Contract DE-AC04-94AL85000, there is a non-exclusive 
 * license for use of this work by or on behalf of the U.S. Government. Export 
 * of this program may require a license from the United States Government. 
 * See CopyrightHistory.txt for complete details.
 * 
 */

package gov.sandia.cognition.text.document.extractor;

import gov.sandia.cognition.text.document.Document;
import java.io.File;
import java.io.IOException;
import java.net.URI;
import java.net.URLConnection;

/**
 * Interface for extracting documents from files. A source can be named by a
 * {@link File}, a {@link URI}, or an open {@link URLConnection}; for each of
 * those there is a {@code canExtract} method to ask whether this extractor
 * supports the source and an {@code extractAll} method to perform the
 * extraction.
 * 
 * @author  Justin Basilico
 * @since   3.0
 */
public interface DocumentExtractor
{
    /**
     * Determines if the given file can be extracted by this extractor.
     *
     * @param   file
     *      The file to check.
     * @return
     *      True if this extractor can extract the file and false otherwise.
     * @throws java.io.IOException
     *      If there is an IO error.
     */
    public boolean canExtract(
        final File file)
        throws IOException;

    /**
     * Determines if the file at the given URI can be extracted by this
     * extractor.
     *
     * @param   uri
     *      The URI of the file to check.
     * @return
     *      True if this extractor can extract the file and false otherwise.
     * @throws java.io.IOException
     *      If there is an IO error.
     */
    public boolean canExtract(
        final URI uri)
        throws IOException;

    /**
     * Determines if the file behind the given connection can be extracted by
     * this extractor.
     *
     * @param   connection
     *      The connection to the file to check.
     * @return
     *      True if this extractor can extract the file and false otherwise.
     * @throws java.io.IOException
     *      If there is an IO error.
     */
    public boolean canExtract(
        final URLConnection connection)
        throws IOException;

    /**
     * Attempts to extract all of the documents from the given file.
     *
     * @param   file
     *      The file to extract.
     * @return
     *      An iterable over the documents extracted from the given file.
     * @throws DocumentExtractionException
     *      If there is an error extracting data from the file.
     * @throws java.io.IOException
     *      If there is an IO error.
     */
    public Iterable<? extends Document> extractAll(
        final File file)
        throws DocumentExtractionException, IOException;

    /**
     * Attempts to extract all of the documents from the file at the given
     * URI.
     *
     * @param   uri
     *      The URI of the file to extract.
     * @return
     *      An iterable over the documents extracted from the given file.
     * @throws DocumentExtractionException
     *      If there is an error extracting data from the file.
     * @throws java.io.IOException
     *      If there is an IO error.
     */
    public Iterable<? extends Document> extractAll(
        final URI uri)
        throws DocumentExtractionException, IOException;

    /**
     * Attempts to extract all of the documents from the file behind the given
     * connection.
     *
     * @param   connection
     *      The connection to the file to extract.
     * @return
     *      An iterable over the documents extracted from the given file.
     * @throws DocumentExtractionException
     *      If there is an error extracting data from the file.
     * @throws java.io.IOException
     *      If there is an IO error.
     */
    public Iterable<? extends Document> extractAll(
        final URLConnection connection)
        throws DocumentExtractionException, IOException;
}